[llvm] [AMDGPU][NFC] Enable gfx942 for more tests (PR #154363)

Janek van Oirschot via llvm-commits llvm-commits at lists.llvm.org
Tue Aug 19 08:31:54 PDT 2025


https://github.com/JanekvO created https://github.com/llvm/llvm-project/pull/154363

Enable gfx942 for tests that are affected by an AMDGPU bitcast constant combine (#154115).

More tests are expected to be affected in the aforementioned PR once it is rebased on top of this PR.

>From 60f026d1241adf8c92f8f1ca50efcb3020e9e4f8 Mon Sep 17 00:00:00 2001
From: Janek van Oirschot <janek.vanoirschot at amd.com>
Date: Tue, 19 Aug 2025 16:24:55 +0100
Subject: [PATCH] [AMDGPU][NFC] Enable gfx942 for more tests

---
 .../AMDGPU/amdgpu-codegenprepare-idiv.ll      | 3519 ++++++++++++++
 .../atomic_optimizations_global_pointer.ll    | 1211 +++++
 .../atomic_optimizations_local_pointer.ll     | 2519 ++++++++++
 llvm/test/CodeGen/AMDGPU/bypass-div.ll        | 1270 +++++
 .../CodeGen/AMDGPU/dagcombine-fmul-sel.ll     |  907 ++++
 llvm/test/CodeGen/AMDGPU/dagcombine-select.ll |  220 +-
 llvm/test/CodeGen/AMDGPU/div_i128.ll          | 4154 +++++++++++++++++
 llvm/test/CodeGen/AMDGPU/div_v2i128.ll        | 3548 ++++++++++++++
 .../CodeGen/AMDGPU/extract_vector_dynelt.ll   | 1034 ++++
 llvm/test/CodeGen/AMDGPU/fcanonicalize.ll     |  605 +++
 llvm/test/CodeGen/AMDGPU/fceil64.ll           |   88 +
 .../CodeGen/AMDGPU/fence-lds-read2-write2.ll  |   71 +-
 llvm/test/CodeGen/AMDGPU/fptoi.i128.ll        | 2236 +++++++++
 .../AMDGPU/global_atomics_scan_fadd.ll        | 1675 +++++++
 .../AMDGPU/global_atomics_scan_fmax.ll        | 1213 +++++
 .../AMDGPU/global_atomics_scan_fmin.ll        | 1213 +++++
 .../AMDGPU/global_atomics_scan_fsub.ll        | 1984 ++++++++
 llvm/test/CodeGen/AMDGPU/imm.ll               |  761 +++
 .../CodeGen/AMDGPU/llvm.amdgcn.writelane.ll   | 1089 +++++
 llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll    |  200 +
 .../AMDGPU/lower-lds-with-alias-scope.ll      |   65 +-
 llvm/test/CodeGen/AMDGPU/lround.ll            |  448 ++
 .../AMDGPU/module-lds-false-sharing.ll        |   33 +
 llvm/test/CodeGen/AMDGPU/rem_i128.ll          | 2688 ++++++++++-
 llvm/test/CodeGen/AMDGPU/sdiv64.ll            | 1931 ++++++++
 llvm/test/CodeGen/AMDGPU/shift-i128.ll        |  550 +++
 .../si-optimize-vgpr-live-range-dbg-instr.ll  |   38 +
 llvm/test/CodeGen/AMDGPU/sibling-call.ll      |  559 +++
 llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll    |  155 +
 llvm/test/CodeGen/AMDGPU/srem.ll              | 1657 +++++++
 llvm/test/CodeGen/AMDGPU/srem64.ll            | 2087 +++++++++
 llvm/test/CodeGen/AMDGPU/udiv64.ll            | 1523 ++++++
 llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll    |  222 +
 llvm/test/CodeGen/AMDGPU/urem64.ll            | 1471 ++++++
 .../CodeGen/AMDGPU/vector_range_metadata.ll   |  135 +-
 .../CodeGen/AMDGPU/waterfall_kills_scc.ll     |   47 +
 .../CodeGen/AMDGPU/widen-vselect-and-mask.ll  |   53 +
 37 files changed, 43060 insertions(+), 119 deletions(-)

diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
index c7385e4324e2c..c6ad5c93fb7fa 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
@@ -2,6 +2,7 @@
 ; RUN: opt -S -mtriple=amdgcn-- -mcpu=tahiti -amdgpu-codegenprepare -amdgpu-bypass-slow-div=0 %s | FileCheck %s
 ; RUN: llc -mtriple=amdgcn-- -mcpu=tahiti -amdgpu-bypass-slow-div=0 < %s | FileCheck -check-prefix=GFX6 %s
 ; RUN: llc -mtriple=amdgcn-- -mcpu=gfx900 -amdgpu-bypass-slow-div=0 < %s | FileCheck -check-prefix=GFX9 %s
+; RUN: llc -mtriple=amdgcn-- -mcpu=gfx942 -amdgpu-bypass-slow-div=0 < %s | FileCheck -check-prefix=GFX942 %s
 
 define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) {
 ; CHECK-LABEL: @udiv_i32(
@@ -98,6 +99,37 @@ define amdgpu_kernel void @udiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) {
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX942-LABEL: udiv_i32:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v0, s3
+; GFX942-NEXT:    s_sub_i32 s4, 0, s3
+; GFX942-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_readfirstlane_b32 s5, v0
+; GFX942-NEXT:    s_mul_i32 s4, s4, s5
+; GFX942-NEXT:    s_mul_hi_u32 s4, s5, s4
+; GFX942-NEXT:    s_add_i32 s5, s5, s4
+; GFX942-NEXT:    s_mul_hi_u32 s4, s2, s5
+; GFX942-NEXT:    s_mul_i32 s5, s4, s3
+; GFX942-NEXT:    s_sub_i32 s2, s2, s5
+; GFX942-NEXT:    s_add_i32 s6, s4, 1
+; GFX942-NEXT:    s_sub_i32 s5, s2, s3
+; GFX942-NEXT:    s_cmp_ge_u32 s2, s3
+; GFX942-NEXT:    s_cselect_b32 s4, s6, s4
+; GFX942-NEXT:    s_cselect_b32 s2, s5, s2
+; GFX942-NEXT:    s_add_i32 s5, s4, 1
+; GFX942-NEXT:    s_cmp_ge_u32 s2, s3
+; GFX942-NEXT:    s_cselect_b32 s2, s5, s4
+; GFX942-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX942-NEXT:    s_endpgm
   %r = udiv i32 %x, %y
   store i32 %r, ptr addrspace(1) %out
   ret void
@@ -191,6 +223,35 @@ define amdgpu_kernel void @urem_i32(ptr addrspace(1) %out, i32 %x, i32 %y) {
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX942-LABEL: urem_i32:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v0, s3
+; GFX942-NEXT:    s_sub_i32 s4, 0, s3
+; GFX942-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_readfirstlane_b32 s5, v0
+; GFX942-NEXT:    s_mul_i32 s4, s4, s5
+; GFX942-NEXT:    s_mul_hi_u32 s4, s5, s4
+; GFX942-NEXT:    s_add_i32 s5, s5, s4
+; GFX942-NEXT:    s_mul_hi_u32 s4, s2, s5
+; GFX942-NEXT:    s_mul_i32 s4, s4, s3
+; GFX942-NEXT:    s_sub_i32 s2, s2, s4
+; GFX942-NEXT:    s_sub_i32 s4, s2, s3
+; GFX942-NEXT:    s_cmp_ge_u32 s2, s3
+; GFX942-NEXT:    s_cselect_b32 s2, s4, s2
+; GFX942-NEXT:    s_sub_i32 s4, s2, s3
+; GFX942-NEXT:    s_cmp_ge_u32 s2, s3
+; GFX942-NEXT:    s_cselect_b32 s2, s4, s2
+; GFX942-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX942-NEXT:    s_endpgm
   %r = urem i32 %x, %y
   store i32 %r, ptr addrspace(1) %out
   ret void
@@ -312,6 +373,42 @@ define amdgpu_kernel void @sdiv_i32(ptr addrspace(1) %out, i32 %x, i32 %y) {
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX942-LABEL: sdiv_i32:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_abs_i32 s4, s3
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v0, s4
+; GFX942-NEXT:    s_sub_i32 s5, 0, s4
+; GFX942-NEXT:    s_xor_b32 s3, s2, s3
+; GFX942-NEXT:    s_abs_i32 s2, s2
+; GFX942-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; GFX942-NEXT:    s_ashr_i32 s3, s3, 31
+; GFX942-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_readfirstlane_b32 s6, v0
+; GFX942-NEXT:    s_mul_i32 s5, s5, s6
+; GFX942-NEXT:    s_mul_hi_u32 s5, s6, s5
+; GFX942-NEXT:    s_add_i32 s6, s6, s5
+; GFX942-NEXT:    s_mul_hi_u32 s5, s2, s6
+; GFX942-NEXT:    s_mul_i32 s6, s5, s4
+; GFX942-NEXT:    s_sub_i32 s2, s2, s6
+; GFX942-NEXT:    s_add_i32 s7, s5, 1
+; GFX942-NEXT:    s_sub_i32 s6, s2, s4
+; GFX942-NEXT:    s_cmp_ge_u32 s2, s4
+; GFX942-NEXT:    s_cselect_b32 s5, s7, s5
+; GFX942-NEXT:    s_cselect_b32 s2, s6, s2
+; GFX942-NEXT:    s_add_i32 s6, s5, 1
+; GFX942-NEXT:    s_cmp_ge_u32 s2, s4
+; GFX942-NEXT:    s_cselect_b32 s2, s6, s5
+; GFX942-NEXT:    s_xor_b32 s2, s2, s3
+; GFX942-NEXT:    s_sub_i32 s2, s2, s3
+; GFX942-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX942-NEXT:    s_endpgm
   %r = sdiv i32 %x, %y
   store i32 %r, ptr addrspace(1) %out
   ret void
@@ -423,6 +520,40 @@ define amdgpu_kernel void @srem_i32(ptr addrspace(1) %out, i32 %x, i32 %y) {
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX942-LABEL: srem_i32:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_abs_i32 s3, s3
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v0, s3
+; GFX942-NEXT:    s_sub_i32 s5, 0, s3
+; GFX942-NEXT:    s_ashr_i32 s4, s2, 31
+; GFX942-NEXT:    s_abs_i32 s2, s2
+; GFX942-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_readfirstlane_b32 s6, v0
+; GFX942-NEXT:    s_mul_i32 s5, s5, s6
+; GFX942-NEXT:    s_mul_hi_u32 s5, s6, s5
+; GFX942-NEXT:    s_add_i32 s6, s6, s5
+; GFX942-NEXT:    s_mul_hi_u32 s5, s2, s6
+; GFX942-NEXT:    s_mul_i32 s5, s5, s3
+; GFX942-NEXT:    s_sub_i32 s2, s2, s5
+; GFX942-NEXT:    s_sub_i32 s5, s2, s3
+; GFX942-NEXT:    s_cmp_ge_u32 s2, s3
+; GFX942-NEXT:    s_cselect_b32 s2, s5, s2
+; GFX942-NEXT:    s_sub_i32 s5, s2, s3
+; GFX942-NEXT:    s_cmp_ge_u32 s2, s3
+; GFX942-NEXT:    s_cselect_b32 s2, s5, s2
+; GFX942-NEXT:    s_xor_b32 s2, s2, s4
+; GFX942-NEXT:    s_sub_i32 s2, s2, s4
+; GFX942-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX942-NEXT:    s_endpgm
   %r = srem i32 %x, %y
   store i32 %r, ptr addrspace(1) %out
   ret void
@@ -492,6 +623,29 @@ define amdgpu_kernel void @udiv_i16(ptr addrspace(1) %out, i16 %x, i16 %y) {
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    global_store_short v3, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX942-LABEL: udiv_i16:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dword s0, s[4:5], 0x2c
+; GFX942-NEXT:    v_mov_b32_e32 v3, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_lshr_b32 s1, s0, 16
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v0, s1
+; GFX942-NEXT:    s_and_b32 s0, s0, 0xffff
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v1, s0
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    v_rcp_iflag_f32_e32 v2, v0
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_mul_f32_e32 v2, v1, v2
+; GFX942-NEXT:    v_trunc_f32_e32 v2, v2
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v4, v2
+; GFX942-NEXT:    v_fma_f32 v1, -v2, v0, v1
+; GFX942-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v4, vcc
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    global_store_short v3, v0, s[0:1]
+; GFX942-NEXT:    s_endpgm
   %r = udiv i16 %x, %y
   store i16 %r, ptr addrspace(1) %out
   ret void
@@ -567,6 +721,31 @@ define amdgpu_kernel void @urem_i16(ptr addrspace(1) %out, i16 %x, i16 %y) {
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    global_store_short v1, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX942-LABEL: urem_i16:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dword s2, s[4:5], 0x2c
+; GFX942-NEXT:    v_mov_b32_e32 v3, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_lshr_b32 s3, s2, 16
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v0, s3
+; GFX942-NEXT:    s_and_b32 s0, s2, 0xffff
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v1, s0
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    v_rcp_iflag_f32_e32 v2, v0
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_mul_f32_e32 v2, v1, v2
+; GFX942-NEXT:    v_trunc_f32_e32 v2, v2
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v4, v2
+; GFX942-NEXT:    v_fma_f32 v1, -v2, v0, v1
+; GFX942-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v4, vcc
+; GFX942-NEXT:    v_mul_lo_u32 v0, v0, s3
+; GFX942-NEXT:    v_sub_u32_e32 v0, s2, v0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    global_store_short v3, v0, s[0:1]
+; GFX942-NEXT:    s_endpgm
   %r = urem i16 %x, %y
   store i16 %r, ptr addrspace(1) %out
   ret void
@@ -648,6 +827,31 @@ define amdgpu_kernel void @sdiv_i16(ptr addrspace(1) %out, i16 %x, i16 %y) {
 ; GFX9-NEXT:    v_add_u32_e32 v0, s2, v3
 ; GFX9-NEXT:    global_store_short v1, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX942-LABEL: sdiv_i16:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dword s2, s[4:5], 0x2c
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_ashr_i32 s3, s2, 16
+; GFX942-NEXT:    v_cvt_f32_i32_e32 v0, s3
+; GFX942-NEXT:    s_sext_i32_i16 s2, s2
+; GFX942-NEXT:    v_cvt_f32_i32_e32 v2, s2
+; GFX942-NEXT:    s_xor_b32 s2, s2, s3
+; GFX942-NEXT:    v_rcp_iflag_f32_e32 v3, v0
+; GFX942-NEXT:    s_ashr_i32 s2, s2, 30
+; GFX942-NEXT:    s_or_b32 s4, s2, 1
+; GFX942-NEXT:    v_mul_f32_e32 v3, v2, v3
+; GFX942-NEXT:    v_trunc_f32_e32 v3, v3
+; GFX942-NEXT:    v_fma_f32 v2, -v3, v0, v2
+; GFX942-NEXT:    v_cvt_i32_f32_e32 v3, v3
+; GFX942-NEXT:    v_cmp_ge_f32_e64 s[2:3], |v2|, |v0|
+; GFX942-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GFX942-NEXT:    s_cselect_b32 s2, s4, 0
+; GFX942-NEXT:    v_add_u32_e32 v0, s2, v3
+; GFX942-NEXT:    global_store_short v1, v0, s[0:1]
+; GFX942-NEXT:    s_endpgm
   %r = sdiv i16 %x, %y
   store i16 %r, ptr addrspace(1) %out
   ret void
@@ -735,6 +939,33 @@ define amdgpu_kernel void @srem_i16(ptr addrspace(1) %out, i16 %x, i16 %y) {
 ; GFX9-NEXT:    v_sub_u32_e32 v0, s6, v0
 ; GFX9-NEXT:    global_store_short v1, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX942-LABEL: srem_i16:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dword s6, s[4:5], 0x2c
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_ashr_i32 s4, s6, 16
+; GFX942-NEXT:    v_cvt_f32_i32_e32 v0, s4
+; GFX942-NEXT:    s_sext_i32_i16 s2, s6
+; GFX942-NEXT:    v_cvt_f32_i32_e32 v2, s2
+; GFX942-NEXT:    s_xor_b32 s2, s2, s4
+; GFX942-NEXT:    v_rcp_iflag_f32_e32 v3, v0
+; GFX942-NEXT:    s_ashr_i32 s2, s2, 30
+; GFX942-NEXT:    s_or_b32 s5, s2, 1
+; GFX942-NEXT:    v_mul_f32_e32 v3, v2, v3
+; GFX942-NEXT:    v_trunc_f32_e32 v3, v3
+; GFX942-NEXT:    v_fma_f32 v2, -v3, v0, v2
+; GFX942-NEXT:    v_cvt_i32_f32_e32 v3, v3
+; GFX942-NEXT:    v_cmp_ge_f32_e64 s[2:3], |v2|, |v0|
+; GFX942-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GFX942-NEXT:    s_cselect_b32 s2, s5, 0
+; GFX942-NEXT:    v_add_u32_e32 v0, s2, v3
+; GFX942-NEXT:    v_mul_lo_u32 v0, v0, s4
+; GFX942-NEXT:    v_sub_u32_e32 v0, s6, v0
+; GFX942-NEXT:    global_store_short v1, v0, s[0:1]
+; GFX942-NEXT:    s_endpgm
   %r = srem i16 %x, %y
   store i16 %r, ptr addrspace(1) %out
   ret void
@@ -798,6 +1029,25 @@ define amdgpu_kernel void @udiv_i8(ptr addrspace(1) %out, i8 %x, i8 %y) {
 ; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v4, vcc
 ; GFX9-NEXT:    global_store_byte v2, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX942-LABEL: udiv_i8:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dword s2, s[4:5], 0x2c
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    v_cvt_f32_ubyte1_e32 v0, s2
+; GFX942-NEXT:    v_rcp_iflag_f32_e32 v1, v0
+; GFX942-NEXT:    v_cvt_f32_ubyte0_e32 v3, s2
+; GFX942-NEXT:    v_mul_f32_e32 v1, v3, v1
+; GFX942-NEXT:    v_trunc_f32_e32 v1, v1
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v4, v1
+; GFX942-NEXT:    v_fma_f32 v1, -v1, v0, v3
+; GFX942-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v4, vcc
+; GFX942-NEXT:    global_store_byte v2, v0, s[0:1]
+; GFX942-NEXT:    s_endpgm
   %r = udiv i8 %x, %y
   store i8 %r, ptr addrspace(1) %out
   ret void
@@ -869,6 +1119,28 @@ define amdgpu_kernel void @urem_i8(ptr addrspace(1) %out, i8 %x, i8 %y) {
 ; GFX9-NEXT:    v_sub_u32_e32 v0, s2, v0
 ; GFX9-NEXT:    global_store_byte v1, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX942-LABEL: urem_i8:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dword s2, s[4:5], 0x2c
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    v_cvt_f32_ubyte1_e32 v0, s2
+; GFX942-NEXT:    v_rcp_iflag_f32_e32 v1, v0
+; GFX942-NEXT:    v_cvt_f32_ubyte0_e32 v3, s2
+; GFX942-NEXT:    s_lshr_b32 s3, s2, 8
+; GFX942-NEXT:    v_mul_f32_e32 v1, v3, v1
+; GFX942-NEXT:    v_trunc_f32_e32 v1, v1
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v4, v1
+; GFX942-NEXT:    v_fma_f32 v1, -v1, v0, v3
+; GFX942-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v4, vcc
+; GFX942-NEXT:    v_mul_lo_u32 v0, v0, s3
+; GFX942-NEXT:    v_sub_u32_e32 v0, s2, v0
+; GFX942-NEXT:    global_store_byte v2, v0, s[0:1]
+; GFX942-NEXT:    s_endpgm
   %r = urem i8 %x, %y
   store i8 %r, ptr addrspace(1) %out
   ret void
@@ -950,6 +1222,31 @@ define amdgpu_kernel void @sdiv_i8(ptr addrspace(1) %out, i8 %x, i8 %y) {
 ; GFX9-NEXT:    v_add_u32_e32 v0, s2, v3
 ; GFX9-NEXT:    global_store_byte v1, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX942-LABEL: sdiv_i8:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dword s2, s[4:5], 0x2c
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_bfe_i32 s3, s2, 0x80008
+; GFX942-NEXT:    v_cvt_f32_i32_e32 v0, s3
+; GFX942-NEXT:    s_sext_i32_i8 s2, s2
+; GFX942-NEXT:    v_cvt_f32_i32_e32 v2, s2
+; GFX942-NEXT:    s_xor_b32 s2, s2, s3
+; GFX942-NEXT:    v_rcp_iflag_f32_e32 v3, v0
+; GFX942-NEXT:    s_ashr_i32 s2, s2, 30
+; GFX942-NEXT:    s_or_b32 s4, s2, 1
+; GFX942-NEXT:    v_mul_f32_e32 v3, v2, v3
+; GFX942-NEXT:    v_trunc_f32_e32 v3, v3
+; GFX942-NEXT:    v_fma_f32 v2, -v3, v0, v2
+; GFX942-NEXT:    v_cvt_i32_f32_e32 v3, v3
+; GFX942-NEXT:    v_cmp_ge_f32_e64 s[2:3], |v2|, |v0|
+; GFX942-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GFX942-NEXT:    s_cselect_b32 s2, s4, 0
+; GFX942-NEXT:    v_add_u32_e32 v0, s2, v3
+; GFX942-NEXT:    global_store_byte v1, v0, s[0:1]
+; GFX942-NEXT:    s_endpgm
   %r = sdiv i8 %x, %y
   store i8 %r, ptr addrspace(1) %out
   ret void
@@ -1039,6 +1336,34 @@ define amdgpu_kernel void @srem_i8(ptr addrspace(1) %out, i8 %x, i8 %y) {
 ; GFX9-NEXT:    v_sub_u32_e32 v0, s6, v0
 ; GFX9-NEXT:    global_store_byte v1, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX942-LABEL: srem_i8:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dword s6, s[4:5], 0x2c
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_bfe_i32 s2, s6, 0x80008
+; GFX942-NEXT:    v_cvt_f32_i32_e32 v1, s2
+; GFX942-NEXT:    s_sext_i32_i8 s3, s6
+; GFX942-NEXT:    v_cvt_f32_i32_e32 v2, s3
+; GFX942-NEXT:    s_xor_b32 s2, s3, s2
+; GFX942-NEXT:    v_rcp_iflag_f32_e32 v3, v1
+; GFX942-NEXT:    s_ashr_i32 s2, s2, 30
+; GFX942-NEXT:    s_lshr_b32 s4, s6, 8
+; GFX942-NEXT:    s_or_b32 s5, s2, 1
+; GFX942-NEXT:    v_mul_f32_e32 v3, v2, v3
+; GFX942-NEXT:    v_trunc_f32_e32 v3, v3
+; GFX942-NEXT:    v_fma_f32 v2, -v3, v1, v2
+; GFX942-NEXT:    v_cvt_i32_f32_e32 v3, v3
+; GFX942-NEXT:    v_cmp_ge_f32_e64 s[2:3], |v2|, |v1|
+; GFX942-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GFX942-NEXT:    s_cselect_b32 s2, s5, 0
+; GFX942-NEXT:    v_add_u32_e32 v1, s2, v3
+; GFX942-NEXT:    v_mul_lo_u32 v1, v1, s4
+; GFX942-NEXT:    v_sub_u32_e32 v1, s6, v1
+; GFX942-NEXT:    global_store_byte v0, v1, s[0:1]
+; GFX942-NEXT:    s_endpgm
   %r = srem i8 %x, %y
   store i8 %r, ptr addrspace(1) %out
   ret void
@@ -1367,6 +1692,99 @@ define amdgpu_kernel void @udiv_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s5
 ; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX942-LABEL: udiv_v4i32:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x34
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v0, s12
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v1, s13
+; GFX942-NEXT:    s_sub_i32 s2, 0, s12
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v3, s14
+; GFX942-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; GFX942-NEXT:    v_rcp_iflag_f32_e32 v1, v1
+; GFX942-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX942-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX942-NEXT:    v_readfirstlane_b32 s3, v0
+; GFX942-NEXT:    s_mul_i32 s2, s2, s3
+; GFX942-NEXT:    s_mul_hi_u32 s2, s3, s2
+; GFX942-NEXT:    s_add_i32 s3, s3, s2
+; GFX942-NEXT:    s_mul_hi_u32 s2, s8, s3
+; GFX942-NEXT:    s_mul_i32 s3, s2, s12
+; GFX942-NEXT:    s_sub_i32 s3, s8, s3
+; GFX942-NEXT:    s_add_i32 s5, s2, 1
+; GFX942-NEXT:    s_sub_i32 s6, s3, s12
+; GFX942-NEXT:    s_cmp_ge_u32 s3, s12
+; GFX942-NEXT:    s_cselect_b32 s2, s5, s2
+; GFX942-NEXT:    s_cselect_b32 s3, s6, s3
+; GFX942-NEXT:    s_add_i32 s5, s2, 1
+; GFX942-NEXT:    s_cmp_ge_u32 s3, s12
+; GFX942-NEXT:    v_readfirstlane_b32 s4, v1
+; GFX942-NEXT:    s_cselect_b32 s2, s5, s2
+; GFX942-NEXT:    s_sub_i32 s3, 0, s13
+; GFX942-NEXT:    s_mul_i32 s3, s3, s4
+; GFX942-NEXT:    s_mul_hi_u32 s3, s4, s3
+; GFX942-NEXT:    s_add_i32 s4, s4, s3
+; GFX942-NEXT:    v_rcp_iflag_f32_e32 v0, v3
+; GFX942-NEXT:    s_mul_hi_u32 s3, s9, s4
+; GFX942-NEXT:    s_mul_i32 s4, s3, s13
+; GFX942-NEXT:    s_sub_i32 s4, s9, s4
+; GFX942-NEXT:    s_add_i32 s5, s3, 1
+; GFX942-NEXT:    s_sub_i32 s6, s4, s13
+; GFX942-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX942-NEXT:    s_cmp_ge_u32 s4, s13
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX942-NEXT:    s_cselect_b32 s3, s5, s3
+; GFX942-NEXT:    s_cselect_b32 s4, s6, s4
+; GFX942-NEXT:    s_add_i32 s5, s3, 1
+; GFX942-NEXT:    s_cmp_ge_u32 s4, s13
+; GFX942-NEXT:    s_cselect_b32 s3, s5, s3
+; GFX942-NEXT:    v_readfirstlane_b32 s5, v0
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v0, s15
+; GFX942-NEXT:    s_sub_i32 s4, 0, s14
+; GFX942-NEXT:    s_mul_i32 s4, s4, s5
+; GFX942-NEXT:    s_mul_hi_u32 s4, s5, s4
+; GFX942-NEXT:    s_add_i32 s5, s5, s4
+; GFX942-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; GFX942-NEXT:    s_mul_hi_u32 s4, s10, s5
+; GFX942-NEXT:    s_mul_i32 s5, s4, s14
+; GFX942-NEXT:    s_sub_i32 s5, s10, s5
+; GFX942-NEXT:    s_add_i32 s6, s4, 1
+; GFX942-NEXT:    s_sub_i32 s7, s5, s14
+; GFX942-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX942-NEXT:    s_cmp_ge_u32 s5, s14
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX942-NEXT:    s_cselect_b32 s4, s6, s4
+; GFX942-NEXT:    s_cselect_b32 s5, s7, s5
+; GFX942-NEXT:    s_add_i32 s6, s4, 1
+; GFX942-NEXT:    s_cmp_ge_u32 s5, s14
+; GFX942-NEXT:    s_cselect_b32 s4, s6, s4
+; GFX942-NEXT:    s_sub_i32 s5, 0, s15
+; GFX942-NEXT:    v_readfirstlane_b32 s6, v0
+; GFX942-NEXT:    s_mul_i32 s5, s5, s6
+; GFX942-NEXT:    s_mul_hi_u32 s5, s6, s5
+; GFX942-NEXT:    s_add_i32 s6, s6, s5
+; GFX942-NEXT:    s_mul_hi_u32 s5, s11, s6
+; GFX942-NEXT:    s_mul_i32 s6, s5, s15
+; GFX942-NEXT:    s_sub_i32 s6, s11, s6
+; GFX942-NEXT:    s_add_i32 s7, s5, 1
+; GFX942-NEXT:    s_sub_i32 s8, s6, s15
+; GFX942-NEXT:    s_cmp_ge_u32 s6, s15
+; GFX942-NEXT:    s_cselect_b32 s5, s7, s5
+; GFX942-NEXT:    s_cselect_b32 s6, s8, s6
+; GFX942-NEXT:    s_add_i32 s7, s5, 1
+; GFX942-NEXT:    s_cmp_ge_u32 s6, s15
+; GFX942-NEXT:    s_cselect_b32 s5, s7, s5
+; GFX942-NEXT:    v_mov_b32_e32 v4, s2
+; GFX942-NEXT:    v_mov_b32_e32 v5, s3
+; GFX942-NEXT:    v_mov_b32_e32 v6, s4
+; GFX942-NEXT:    v_mov_b32_e32 v7, s5
+; GFX942-NEXT:    global_store_dwordx4 v2, v[4:7], s[0:1]
+; GFX942-NEXT:    s_endpgm
   %r = udiv <4 x i32> %x, %y
   store <4 x i32> %r, ptr addrspace(1) %out
   ret void
@@ -1668,6 +2086,91 @@ define amdgpu_kernel void @urem_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s5
 ; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX942-LABEL: urem_v4i32:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x34
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v1, s12
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v2, s13
+; GFX942-NEXT:    s_sub_i32 s2, 0, s12
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v3, s14
+; GFX942-NEXT:    v_rcp_iflag_f32_e32 v1, v1
+; GFX942-NEXT:    v_rcp_iflag_f32_e32 v2, v2
+; GFX942-NEXT:    v_rcp_iflag_f32_e32 v3, v3
+; GFX942-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX942-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v2
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; GFX942-NEXT:    v_mul_f32_e32 v3, 0x4f7ffffe, v3
+; GFX942-NEXT:    v_readfirstlane_b32 s3, v1
+; GFX942-NEXT:    s_mul_i32 s2, s2, s3
+; GFX942-NEXT:    s_mul_hi_u32 s2, s3, s2
+; GFX942-NEXT:    s_add_i32 s3, s3, s2
+; GFX942-NEXT:    s_mul_hi_u32 s2, s8, s3
+; GFX942-NEXT:    s_mul_i32 s2, s2, s12
+; GFX942-NEXT:    s_sub_i32 s2, s8, s2
+; GFX942-NEXT:    s_sub_i32 s3, s2, s12
+; GFX942-NEXT:    s_cmp_ge_u32 s2, s12
+; GFX942-NEXT:    s_cselect_b32 s2, s3, s2
+; GFX942-NEXT:    s_sub_i32 s3, s2, s12
+; GFX942-NEXT:    s_cmp_ge_u32 s2, s12
+; GFX942-NEXT:    v_readfirstlane_b32 s4, v2
+; GFX942-NEXT:    s_cselect_b32 s2, s3, s2
+; GFX942-NEXT:    s_sub_i32 s3, 0, s13
+; GFX942-NEXT:    s_mul_i32 s3, s3, s4
+; GFX942-NEXT:    s_mul_hi_u32 s3, s4, s3
+; GFX942-NEXT:    s_add_i32 s4, s4, s3
+; GFX942-NEXT:    s_mul_hi_u32 s3, s9, s4
+; GFX942-NEXT:    s_mul_i32 s3, s3, s13
+; GFX942-NEXT:    s_sub_i32 s3, s9, s3
+; GFX942-NEXT:    s_sub_i32 s4, s3, s13
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v3, v3
+; GFX942-NEXT:    s_cmp_ge_u32 s3, s13
+; GFX942-NEXT:    s_cselect_b32 s3, s4, s3
+; GFX942-NEXT:    s_sub_i32 s4, s3, s13
+; GFX942-NEXT:    s_cmp_ge_u32 s3, s13
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v1, s15
+; GFX942-NEXT:    s_cselect_b32 s3, s4, s3
+; GFX942-NEXT:    s_sub_i32 s4, 0, s14
+; GFX942-NEXT:    v_readfirstlane_b32 s5, v3
+; GFX942-NEXT:    s_mul_i32 s4, s4, s5
+; GFX942-NEXT:    s_mul_hi_u32 s4, s5, s4
+; GFX942-NEXT:    s_add_i32 s5, s5, s4
+; GFX942-NEXT:    v_rcp_iflag_f32_e32 v1, v1
+; GFX942-NEXT:    s_mul_hi_u32 s4, s10, s5
+; GFX942-NEXT:    s_mul_i32 s4, s4, s14
+; GFX942-NEXT:    s_sub_i32 s4, s10, s4
+; GFX942-NEXT:    s_sub_i32 s5, s4, s14
+; GFX942-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
+; GFX942-NEXT:    s_cmp_ge_u32 s4, s14
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX942-NEXT:    s_cselect_b32 s4, s5, s4
+; GFX942-NEXT:    s_sub_i32 s5, s4, s14
+; GFX942-NEXT:    s_cmp_ge_u32 s4, s14
+; GFX942-NEXT:    s_cselect_b32 s4, s5, s4
+; GFX942-NEXT:    s_sub_i32 s5, 0, s15
+; GFX942-NEXT:    v_readfirstlane_b32 s6, v1
+; GFX942-NEXT:    s_mul_i32 s5, s5, s6
+; GFX942-NEXT:    s_mul_hi_u32 s5, s6, s5
+; GFX942-NEXT:    s_add_i32 s6, s6, s5
+; GFX942-NEXT:    s_mul_hi_u32 s5, s11, s6
+; GFX942-NEXT:    s_mul_i32 s5, s5, s15
+; GFX942-NEXT:    s_sub_i32 s5, s11, s5
+; GFX942-NEXT:    s_sub_i32 s6, s5, s15
+; GFX942-NEXT:    s_cmp_ge_u32 s5, s15
+; GFX942-NEXT:    s_cselect_b32 s5, s6, s5
+; GFX942-NEXT:    s_sub_i32 s6, s5, s15
+; GFX942-NEXT:    s_cmp_ge_u32 s5, s15
+; GFX942-NEXT:    s_cselect_b32 s5, s6, s5
+; GFX942-NEXT:    v_mov_b32_e32 v2, s2
+; GFX942-NEXT:    v_mov_b32_e32 v3, s3
+; GFX942-NEXT:    v_mov_b32_e32 v4, s4
+; GFX942-NEXT:    v_mov_b32_e32 v5, s5
+; GFX942-NEXT:    global_store_dwordx4 v0, v[2:5], s[0:1]
+; GFX942-NEXT:    s_endpgm
   %r = urem <4 x i32> %x, %y
   store <4 x i32> %r, ptr addrspace(1) %out
   ret void
@@ -2081,6 +2584,125 @@ define amdgpu_kernel void @sdiv_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX942-LABEL: sdiv_v4i32:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x34
+; GFX942-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_abs_i32 s0, s12
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v0, s0
+; GFX942-NEXT:    s_sub_i32 s3, 0, s0
+; GFX942-NEXT:    s_abs_i32 s2, s8
+; GFX942-NEXT:    s_xor_b32 s1, s8, s12
+; GFX942-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; GFX942-NEXT:    s_ashr_i32 s1, s1, 31
+; GFX942-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_readfirstlane_b32 s6, v0
+; GFX942-NEXT:    s_mul_i32 s3, s3, s6
+; GFX942-NEXT:    s_mul_hi_u32 s3, s6, s3
+; GFX942-NEXT:    s_add_i32 s6, s6, s3
+; GFX942-NEXT:    s_mul_hi_u32 s3, s2, s6
+; GFX942-NEXT:    s_mul_i32 s6, s3, s0
+; GFX942-NEXT:    s_sub_i32 s2, s2, s6
+; GFX942-NEXT:    s_add_i32 s7, s3, 1
+; GFX942-NEXT:    s_sub_i32 s6, s2, s0
+; GFX942-NEXT:    s_cmp_ge_u32 s2, s0
+; GFX942-NEXT:    s_cselect_b32 s3, s7, s3
+; GFX942-NEXT:    s_cselect_b32 s2, s6, s2
+; GFX942-NEXT:    s_add_i32 s6, s3, 1
+; GFX942-NEXT:    s_cmp_ge_u32 s2, s0
+; GFX942-NEXT:    s_cselect_b32 s0, s6, s3
+; GFX942-NEXT:    s_abs_i32 s2, s13
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v0, s2
+; GFX942-NEXT:    s_xor_b32 s0, s0, s1
+; GFX942-NEXT:    s_sub_i32 s7, 0, s2
+; GFX942-NEXT:    s_sub_i32 s8, s0, s1
+; GFX942-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; GFX942-NEXT:    s_abs_i32 s6, s9
+; GFX942-NEXT:    s_xor_b32 s3, s9, s13
+; GFX942-NEXT:    s_ashr_i32 s3, s3, 31
+; GFX942-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX942-NEXT:    v_mov_b32_e32 v2, s8
+; GFX942-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX942-NEXT:    s_mul_i32 s7, s7, s0
+; GFX942-NEXT:    s_mul_hi_u32 s1, s0, s7
+; GFX942-NEXT:    s_add_i32 s0, s0, s1
+; GFX942-NEXT:    s_mul_hi_u32 s0, s6, s0
+; GFX942-NEXT:    s_mul_i32 s1, s0, s2
+; GFX942-NEXT:    s_sub_i32 s1, s6, s1
+; GFX942-NEXT:    s_add_i32 s7, s0, 1
+; GFX942-NEXT:    s_sub_i32 s6, s1, s2
+; GFX942-NEXT:    s_cmp_ge_u32 s1, s2
+; GFX942-NEXT:    s_cselect_b32 s0, s7, s0
+; GFX942-NEXT:    s_cselect_b32 s1, s6, s1
+; GFX942-NEXT:    s_add_i32 s6, s0, 1
+; GFX942-NEXT:    s_cmp_ge_u32 s1, s2
+; GFX942-NEXT:    s_cselect_b32 s0, s6, s0
+; GFX942-NEXT:    s_abs_i32 s1, s14
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v0, s1
+; GFX942-NEXT:    s_xor_b32 s0, s0, s3
+; GFX942-NEXT:    s_sub_i32 s7, 0, s1
+; GFX942-NEXT:    s_sub_i32 s3, s0, s3
+; GFX942-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; GFX942-NEXT:    s_abs_i32 s6, s10
+; GFX942-NEXT:    s_xor_b32 s2, s10, s14
+; GFX942-NEXT:    s_ashr_i32 s2, s2, 31
+; GFX942-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX942-NEXT:    v_mov_b32_e32 v3, s3
+; GFX942-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX942-NEXT:    s_mul_i32 s7, s7, s0
+; GFX942-NEXT:    s_mul_hi_u32 s7, s0, s7
+; GFX942-NEXT:    s_add_i32 s0, s0, s7
+; GFX942-NEXT:    s_mul_hi_u32 s0, s6, s0
+; GFX942-NEXT:    s_mul_i32 s7, s0, s1
+; GFX942-NEXT:    s_sub_i32 s6, s6, s7
+; GFX942-NEXT:    s_add_i32 s9, s0, 1
+; GFX942-NEXT:    s_sub_i32 s7, s6, s1
+; GFX942-NEXT:    s_cmp_ge_u32 s6, s1
+; GFX942-NEXT:    s_cselect_b32 s0, s9, s0
+; GFX942-NEXT:    s_cselect_b32 s6, s7, s6
+; GFX942-NEXT:    s_add_i32 s7, s0, 1
+; GFX942-NEXT:    s_cmp_ge_u32 s6, s1
+; GFX942-NEXT:    s_cselect_b32 s6, s7, s0
+; GFX942-NEXT:    s_abs_i32 s7, s15
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v0, s7
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    s_xor_b32 s5, s6, s2
+; GFX942-NEXT:    s_sub_i32 s6, 0, s7
+; GFX942-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; GFX942-NEXT:    s_sub_i32 s2, s5, s2
+; GFX942-NEXT:    s_abs_i32 s4, s11
+; GFX942-NEXT:    s_xor_b32 s3, s11, s15
+; GFX942-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX942-NEXT:    s_ashr_i32 s3, s3, 31
+; GFX942-NEXT:    v_mov_b32_e32 v4, s2
+; GFX942-NEXT:    v_readfirstlane_b32 s5, v0
+; GFX942-NEXT:    s_mul_i32 s6, s6, s5
+; GFX942-NEXT:    s_mul_hi_u32 s6, s5, s6
+; GFX942-NEXT:    s_add_i32 s5, s5, s6
+; GFX942-NEXT:    s_mul_hi_u32 s5, s4, s5
+; GFX942-NEXT:    s_mul_i32 s6, s5, s7
+; GFX942-NEXT:    s_sub_i32 s4, s4, s6
+; GFX942-NEXT:    s_add_i32 s8, s5, 1
+; GFX942-NEXT:    s_sub_i32 s6, s4, s7
+; GFX942-NEXT:    s_cmp_ge_u32 s4, s7
+; GFX942-NEXT:    s_cselect_b32 s5, s8, s5
+; GFX942-NEXT:    s_cselect_b32 s4, s6, s4
+; GFX942-NEXT:    s_add_i32 s6, s5, 1
+; GFX942-NEXT:    s_cmp_ge_u32 s4, s7
+; GFX942-NEXT:    s_cselect_b32 s4, s6, s5
+; GFX942-NEXT:    s_xor_b32 s4, s4, s3
+; GFX942-NEXT:    s_sub_i32 s3, s4, s3
+; GFX942-NEXT:    v_mov_b32_e32 v5, s3
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    global_store_dwordx4 v1, v[2:5], s[0:1]
+; GFX942-NEXT:    s_endpgm
   %r = sdiv <4 x i32> %x, %y
   store <4 x i32> %r, ptr addrspace(1) %out
   ret void
@@ -2455,6 +3077,116 @@ define amdgpu_kernel void @srem_v4i32(ptr addrspace(1) %out, <4 x i32> %x, <4 x
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX942-LABEL: srem_v4i32:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x34
+; GFX942-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_abs_i32 s0, s12
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v0, s0
+; GFX942-NEXT:    s_sub_i32 s3, 0, s0
+; GFX942-NEXT:    s_abs_i32 s2, s8
+; GFX942-NEXT:    s_ashr_i32 s1, s8, 31
+; GFX942-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_readfirstlane_b32 s6, v0
+; GFX942-NEXT:    s_mul_i32 s3, s3, s6
+; GFX942-NEXT:    s_mul_hi_u32 s3, s6, s3
+; GFX942-NEXT:    s_add_i32 s6, s6, s3
+; GFX942-NEXT:    s_mul_hi_u32 s3, s2, s6
+; GFX942-NEXT:    s_mul_i32 s3, s3, s0
+; GFX942-NEXT:    s_sub_i32 s2, s2, s3
+; GFX942-NEXT:    s_sub_i32 s3, s2, s0
+; GFX942-NEXT:    s_cmp_ge_u32 s2, s0
+; GFX942-NEXT:    s_cselect_b32 s2, s3, s2
+; GFX942-NEXT:    s_sub_i32 s3, s2, s0
+; GFX942-NEXT:    s_cmp_ge_u32 s2, s0
+; GFX942-NEXT:    s_cselect_b32 s0, s3, s2
+; GFX942-NEXT:    s_abs_i32 s2, s13
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v0, s2
+; GFX942-NEXT:    s_xor_b32 s0, s0, s1
+; GFX942-NEXT:    s_sub_i32 s7, 0, s2
+; GFX942-NEXT:    s_sub_i32 s8, s0, s1
+; GFX942-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; GFX942-NEXT:    s_abs_i32 s6, s9
+; GFX942-NEXT:    s_ashr_i32 s3, s9, 31
+; GFX942-NEXT:    v_mov_b32_e32 v2, s8
+; GFX942-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX942-NEXT:    s_mul_i32 s7, s7, s0
+; GFX942-NEXT:    s_mul_hi_u32 s1, s0, s7
+; GFX942-NEXT:    s_add_i32 s0, s0, s1
+; GFX942-NEXT:    s_mul_hi_u32 s0, s6, s0
+; GFX942-NEXT:    s_mul_i32 s0, s0, s2
+; GFX942-NEXT:    s_sub_i32 s0, s6, s0
+; GFX942-NEXT:    s_sub_i32 s1, s0, s2
+; GFX942-NEXT:    s_cmp_ge_u32 s0, s2
+; GFX942-NEXT:    s_cselect_b32 s0, s1, s0
+; GFX942-NEXT:    s_sub_i32 s1, s0, s2
+; GFX942-NEXT:    s_cmp_ge_u32 s0, s2
+; GFX942-NEXT:    s_cselect_b32 s0, s1, s0
+; GFX942-NEXT:    s_abs_i32 s1, s14
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v0, s1
+; GFX942-NEXT:    s_xor_b32 s0, s0, s3
+; GFX942-NEXT:    s_sub_i32 s7, 0, s1
+; GFX942-NEXT:    s_sub_i32 s3, s0, s3
+; GFX942-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; GFX942-NEXT:    s_abs_i32 s6, s10
+; GFX942-NEXT:    s_ashr_i32 s2, s10, 31
+; GFX942-NEXT:    v_mov_b32_e32 v3, s3
+; GFX942-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX942-NEXT:    s_mul_i32 s7, s7, s0
+; GFX942-NEXT:    s_mul_hi_u32 s7, s0, s7
+; GFX942-NEXT:    s_add_i32 s0, s0, s7
+; GFX942-NEXT:    s_mul_hi_u32 s0, s6, s0
+; GFX942-NEXT:    s_mul_i32 s0, s0, s1
+; GFX942-NEXT:    s_sub_i32 s0, s6, s0
+; GFX942-NEXT:    s_sub_i32 s6, s0, s1
+; GFX942-NEXT:    s_cmp_ge_u32 s0, s1
+; GFX942-NEXT:    s_cselect_b32 s0, s6, s0
+; GFX942-NEXT:    s_sub_i32 s6, s0, s1
+; GFX942-NEXT:    s_cmp_ge_u32 s0, s1
+; GFX942-NEXT:    s_cselect_b32 s6, s6, s0
+; GFX942-NEXT:    s_abs_i32 s7, s15
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v0, s7
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    s_xor_b32 s5, s6, s2
+; GFX942-NEXT:    s_sub_i32 s6, 0, s7
+; GFX942-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; GFX942-NEXT:    s_sub_i32 s2, s5, s2
+; GFX942-NEXT:    s_abs_i32 s4, s11
+; GFX942-NEXT:    s_ashr_i32 s3, s11, 31
+; GFX942-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX942-NEXT:    v_mov_b32_e32 v4, s2
+; GFX942-NEXT:    v_readfirstlane_b32 s5, v0
+; GFX942-NEXT:    s_mul_i32 s6, s6, s5
+; GFX942-NEXT:    s_mul_hi_u32 s6, s5, s6
+; GFX942-NEXT:    s_add_i32 s5, s5, s6
+; GFX942-NEXT:    s_mul_hi_u32 s5, s4, s5
+; GFX942-NEXT:    s_mul_i32 s5, s5, s7
+; GFX942-NEXT:    s_sub_i32 s4, s4, s5
+; GFX942-NEXT:    s_sub_i32 s5, s4, s7
+; GFX942-NEXT:    s_cmp_ge_u32 s4, s7
+; GFX942-NEXT:    s_cselect_b32 s4, s5, s4
+; GFX942-NEXT:    s_sub_i32 s5, s4, s7
+; GFX942-NEXT:    s_cmp_ge_u32 s4, s7
+; GFX942-NEXT:    s_cselect_b32 s4, s5, s4
+; GFX942-NEXT:    s_xor_b32 s4, s4, s3
+; GFX942-NEXT:    s_sub_i32 s3, s4, s3
+; GFX942-NEXT:    v_mov_b32_e32 v5, s3
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    global_store_dwordx4 v1, v[2:5], s[0:1]
+; GFX942-NEXT:    s_endpgm
   %r = srem <4 x i32> %x, %y
   store <4 x i32> %r, ptr addrspace(1) %out
   ret void
@@ -2662,6 +3394,64 @@ define amdgpu_kernel void @udiv_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    global_store_dwordx2 v6, v[0:1], s[6:7]
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX942-LABEL: udiv_v4i16:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
+; GFX942-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_and_b32 s7, s2, 0xffff
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v0, s7
+; GFX942-NEXT:    s_and_b32 s6, s0, 0xffff
+; GFX942-NEXT:    s_lshr_b32 s2, s2, 16
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v2, s6
+; GFX942-NEXT:    v_rcp_iflag_f32_e32 v4, v0
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v1, s2
+; GFX942-NEXT:    s_lshr_b32 s0, s0, 16
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v3, s0
+; GFX942-NEXT:    v_mul_f32_e32 v4, v2, v4
+; GFX942-NEXT:    v_rcp_iflag_f32_e32 v5, v1
+; GFX942-NEXT:    v_trunc_f32_e32 v4, v4
+; GFX942-NEXT:    s_and_b32 s0, s3, 0xffff
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v7, v4
+; GFX942-NEXT:    v_fma_f32 v2, -v4, v0, v2
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v4, s0
+; GFX942-NEXT:    v_mul_f32_e32 v5, v3, v5
+; GFX942-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v0
+; GFX942-NEXT:    s_and_b32 s0, s1, 0xffff
+; GFX942-NEXT:    v_trunc_f32_e32 v2, v5
+; GFX942-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v7, vcc
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v5, s0
+; GFX942-NEXT:    v_rcp_iflag_f32_e32 v7, v4
+; GFX942-NEXT:    v_fma_f32 v3, -v2, v1, v3
+; GFX942-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v1
+; GFX942-NEXT:    s_lshr_b32 s0, s3, 16
+; GFX942-NEXT:    v_mul_f32_e32 v1, v5, v7
+; GFX942-NEXT:    v_trunc_f32_e32 v1, v1
+; GFX942-NEXT:    v_fma_f32 v3, -v1, v4, v5
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v5, s0
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; GFX942-NEXT:    s_lshr_b32 s0, s1, 16
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v7, s0
+; GFX942-NEXT:    v_rcp_iflag_f32_e32 v8, v5
+; GFX942-NEXT:    v_addc_co_u32_e32 v2, vcc, 0, v2, vcc
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX942-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v4
+; GFX942-NEXT:    v_mul_f32_e32 v3, v7, v8
+; GFX942-NEXT:    v_trunc_f32_e32 v3, v3
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v4, v3
+; GFX942-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX942-NEXT:    v_fma_f32 v3, -v3, v5, v7
+; GFX942-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v5
+; GFX942-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX942-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX942-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v4, vcc
+; GFX942-NEXT:    v_lshl_or_b32 v1, v3, 16, v1
+; GFX942-NEXT:    v_lshl_or_b32 v0, v2, 16, v0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    global_store_dwordx2 v6, v[0:1], s[6:7]
+; GFX942-NEXT:    s_endpgm
   %r = udiv <4 x i16> %x, %y
   store <4 x i16> %r, ptr addrspace(1) %out
   ret void
@@ -2892,6 +3682,71 @@ define amdgpu_kernel void @urem_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x
 ; GFX9-NEXT:    v_lshl_or_b32 v0, v4, 16, v0
 ; GFX9-NEXT:    global_store_dwordx2 v6, v[0:1], s[6:7]
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX942-LABEL: urem_v4i16:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
+; GFX942-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_and_b32 s9, s2, 0xffff
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v0, s9
+; GFX942-NEXT:    s_lshr_b32 s2, s2, 16
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v1, s2
+; GFX942-NEXT:    s_and_b32 s8, s0, 0xffff
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v2, s8
+; GFX942-NEXT:    v_rcp_iflag_f32_e32 v4, v0
+; GFX942-NEXT:    s_lshr_b32 s0, s0, 16
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v3, s0
+; GFX942-NEXT:    v_rcp_iflag_f32_e32 v5, v1
+; GFX942-NEXT:    v_mul_f32_e32 v4, v2, v4
+; GFX942-NEXT:    v_trunc_f32_e32 v4, v4
+; GFX942-NEXT:    s_and_b32 s4, s3, 0xffff
+; GFX942-NEXT:    v_mul_f32_e32 v5, v3, v5
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v7, v4
+; GFX942-NEXT:    v_fma_f32 v2, -v4, v0, v2
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v4, s4
+; GFX942-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v0
+; GFX942-NEXT:    v_trunc_f32_e32 v2, v5
+; GFX942-NEXT:    v_fma_f32 v3, -v2, v1, v3
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; GFX942-NEXT:    s_and_b32 s5, s1, 0xffff
+; GFX942-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v7, vcc
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v5, s5
+; GFX942-NEXT:    v_rcp_iflag_f32_e32 v7, v4
+; GFX942-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v1
+; GFX942-NEXT:    s_lshr_b32 s1, s1, 16
+; GFX942-NEXT:    v_mul_lo_u32 v0, v0, s9
+; GFX942-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v2, vcc
+; GFX942-NEXT:    v_mul_lo_u32 v1, v1, s2
+; GFX942-NEXT:    v_sub_u32_e32 v2, s0, v1
+; GFX942-NEXT:    v_mul_f32_e32 v1, v5, v7
+; GFX942-NEXT:    v_trunc_f32_e32 v1, v1
+; GFX942-NEXT:    s_lshr_b32 s0, s3, 16
+; GFX942-NEXT:    v_fma_f32 v3, -v1, v4, v5
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v5, s0
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v7, s1
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX942-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v4
+; GFX942-NEXT:    v_rcp_iflag_f32_e32 v8, v5
+; GFX942-NEXT:    v_sub_u32_e32 v0, s8, v0
+; GFX942-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX942-NEXT:    v_mul_f32_e32 v3, v7, v8
+; GFX942-NEXT:    v_trunc_f32_e32 v3, v3
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v4, v3
+; GFX942-NEXT:    v_fma_f32 v3, -v3, v5, v7
+; GFX942-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v5
+; GFX942-NEXT:    v_mul_lo_u32 v1, v1, s4
+; GFX942-NEXT:    v_sub_u32_e32 v1, s5, v1
+; GFX942-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v4, vcc
+; GFX942-NEXT:    v_mul_lo_u32 v3, v3, s0
+; GFX942-NEXT:    v_sub_u32_e32 v3, s1, v3
+; GFX942-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX942-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX942-NEXT:    v_lshl_or_b32 v1, v3, 16, v1
+; GFX942-NEXT:    v_lshl_or_b32 v0, v2, 16, v0
+; GFX942-NEXT:    global_store_dwordx2 v6, v[0:1], s[6:7]
+; GFX942-NEXT:    s_endpgm
   %r = urem <4 x i16> %x, %y
   store <4 x i16> %r, ptr addrspace(1) %out
   ret void
@@ -3154,6 +4009,83 @@ define amdgpu_kernel void @sdiv_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x
 ; GFX9-NEXT:    v_lshl_or_b32 v0, v4, 16, v0
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX942-LABEL: sdiv_v4i16:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
+; GFX942-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_sext_i32_i16 s4, s2
+; GFX942-NEXT:    v_cvt_f32_i32_e32 v0, s4
+; GFX942-NEXT:    s_sext_i32_i16 s5, s0
+; GFX942-NEXT:    v_cvt_f32_i32_e32 v1, s5
+; GFX942-NEXT:    s_xor_b32 s4, s5, s4
+; GFX942-NEXT:    v_rcp_iflag_f32_e32 v3, v0
+; GFX942-NEXT:    s_ashr_i32 s4, s4, 30
+; GFX942-NEXT:    s_or_b32 s8, s4, 1
+; GFX942-NEXT:    v_mul_f32_e32 v3, v1, v3
+; GFX942-NEXT:    v_trunc_f32_e32 v3, v3
+; GFX942-NEXT:    v_fma_f32 v1, -v3, v0, v1
+; GFX942-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v1|, |v0|
+; GFX942-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; GFX942-NEXT:    s_cselect_b32 s4, s8, 0
+; GFX942-NEXT:    s_ashr_i32 s2, s2, 16
+; GFX942-NEXT:    v_cvt_f32_i32_e32 v0, s2
+; GFX942-NEXT:    s_ashr_i32 s0, s0, 16
+; GFX942-NEXT:    v_cvt_f32_i32_e32 v1, s0
+; GFX942-NEXT:    v_cvt_i32_f32_e32 v3, v3
+; GFX942-NEXT:    v_rcp_iflag_f32_e32 v4, v0
+; GFX942-NEXT:    s_xor_b32 s0, s0, s2
+; GFX942-NEXT:    s_ashr_i32 s0, s0, 30
+; GFX942-NEXT:    s_sext_i32_i16 s2, s3
+; GFX942-NEXT:    v_mul_f32_e32 v4, v1, v4
+; GFX942-NEXT:    v_trunc_f32_e32 v4, v4
+; GFX942-NEXT:    v_fma_f32 v1, -v4, v0, v1
+; GFX942-NEXT:    v_cvt_i32_f32_e32 v4, v4
+; GFX942-NEXT:    v_add_u32_e32 v3, s4, v3
+; GFX942-NEXT:    s_or_b32 s0, s0, 1
+; GFX942-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v1|, |v0|
+; GFX942-NEXT:    v_cvt_f32_i32_e32 v0, s2
+; GFX942-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; GFX942-NEXT:    s_cselect_b32 s0, s0, 0
+; GFX942-NEXT:    v_add_u32_e32 v4, s0, v4
+; GFX942-NEXT:    s_sext_i32_i16 s0, s1
+; GFX942-NEXT:    v_cvt_f32_i32_e32 v1, s0
+; GFX942-NEXT:    v_rcp_iflag_f32_e32 v5, v0
+; GFX942-NEXT:    s_xor_b32 s0, s0, s2
+; GFX942-NEXT:    s_ashr_i32 s0, s0, 30
+; GFX942-NEXT:    s_or_b32 s0, s0, 1
+; GFX942-NEXT:    v_mul_f32_e32 v5, v1, v5
+; GFX942-NEXT:    v_trunc_f32_e32 v5, v5
+; GFX942-NEXT:    v_fma_f32 v1, -v5, v0, v1
+; GFX942-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v1|, |v0|
+; GFX942-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; GFX942-NEXT:    v_cvt_i32_f32_e32 v5, v5
+; GFX942-NEXT:    s_cselect_b32 s0, s0, 0
+; GFX942-NEXT:    s_ashr_i32 s2, s3, 16
+; GFX942-NEXT:    v_cvt_f32_i32_e32 v0, s2
+; GFX942-NEXT:    v_add_u32_e32 v1, s0, v5
+; GFX942-NEXT:    s_ashr_i32 s0, s1, 16
+; GFX942-NEXT:    v_cvt_f32_i32_e32 v5, s0
+; GFX942-NEXT:    v_rcp_iflag_f32_e32 v6, v0
+; GFX942-NEXT:    s_xor_b32 s0, s0, s2
+; GFX942-NEXT:    s_ashr_i32 s0, s0, 30
+; GFX942-NEXT:    s_or_b32 s2, s0, 1
+; GFX942-NEXT:    v_mul_f32_e32 v6, v5, v6
+; GFX942-NEXT:    v_trunc_f32_e32 v6, v6
+; GFX942-NEXT:    v_fma_f32 v5, -v6, v0, v5
+; GFX942-NEXT:    v_cvt_i32_f32_e32 v6, v6
+; GFX942-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v5|, |v0|
+; GFX942-NEXT:    s_and_b64 s[0:1], s[0:1], exec
+; GFX942-NEXT:    s_cselect_b32 s0, s2, 0
+; GFX942-NEXT:    v_add_u32_e32 v0, s0, v6
+; GFX942-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX942-NEXT:    v_lshl_or_b32 v1, v0, 16, v1
+; GFX942-NEXT:    v_and_b32_e32 v0, 0xffff, v3
+; GFX942-NEXT:    v_lshl_or_b32 v0, v4, 16, v0
+; GFX942-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
+; GFX942-NEXT:    s_endpgm
   %r = sdiv <4 x i16> %x, %y
   store <4 x i16> %r, ptr addrspace(1) %out
   ret void
@@ -3444,6 +4376,91 @@ define amdgpu_kernel void @srem_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x
 ; GFX9-NEXT:    v_lshl_or_b32 v0, v0, 16, v3
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX942-LABEL: srem_v4i16:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
+; GFX942-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_sext_i32_i16 s8, s2
+; GFX942-NEXT:    v_cvt_f32_i32_e32 v0, s8
+; GFX942-NEXT:    s_sext_i32_i16 s9, s0
+; GFX942-NEXT:    v_cvt_f32_i32_e32 v1, s9
+; GFX942-NEXT:    s_xor_b32 s4, s9, s8
+; GFX942-NEXT:    v_rcp_iflag_f32_e32 v3, v0
+; GFX942-NEXT:    s_ashr_i32 s4, s4, 30
+; GFX942-NEXT:    s_or_b32 s10, s4, 1
+; GFX942-NEXT:    v_mul_f32_e32 v3, v1, v3
+; GFX942-NEXT:    v_trunc_f32_e32 v3, v3
+; GFX942-NEXT:    v_fma_f32 v1, -v3, v0, v1
+; GFX942-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v1|, |v0|
+; GFX942-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; GFX942-NEXT:    s_cselect_b32 s4, s10, 0
+; GFX942-NEXT:    s_ashr_i32 s2, s2, 16
+; GFX942-NEXT:    v_cvt_i32_f32_e32 v3, v3
+; GFX942-NEXT:    v_cvt_f32_i32_e32 v1, s2
+; GFX942-NEXT:    s_ashr_i32 s0, s0, 16
+; GFX942-NEXT:    v_add_u32_e32 v0, s4, v3
+; GFX942-NEXT:    v_cvt_f32_i32_e32 v3, s0
+; GFX942-NEXT:    v_rcp_iflag_f32_e32 v4, v1
+; GFX942-NEXT:    s_xor_b32 s4, s0, s2
+; GFX942-NEXT:    s_ashr_i32 s4, s4, 30
+; GFX942-NEXT:    v_mul_lo_u32 v0, v0, s8
+; GFX942-NEXT:    v_mul_f32_e32 v4, v3, v4
+; GFX942-NEXT:    v_trunc_f32_e32 v4, v4
+; GFX942-NEXT:    v_fma_f32 v3, -v4, v1, v3
+; GFX942-NEXT:    v_cvt_i32_f32_e32 v4, v4
+; GFX942-NEXT:    s_or_b32 s8, s4, 1
+; GFX942-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v3|, |v1|
+; GFX942-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; GFX942-NEXT:    s_cselect_b32 s4, s8, 0
+; GFX942-NEXT:    v_add_u32_e32 v1, s4, v4
+; GFX942-NEXT:    v_mul_lo_u32 v1, v1, s2
+; GFX942-NEXT:    s_sext_i32_i16 s2, s3
+; GFX942-NEXT:    v_cvt_f32_i32_e32 v3, s2
+; GFX942-NEXT:    v_sub_u32_e32 v4, s0, v1
+; GFX942-NEXT:    s_sext_i32_i16 s0, s1
+; GFX942-NEXT:    v_cvt_f32_i32_e32 v1, s0
+; GFX942-NEXT:    v_rcp_iflag_f32_e32 v5, v3
+; GFX942-NEXT:    s_xor_b32 s4, s0, s2
+; GFX942-NEXT:    s_ashr_i32 s4, s4, 30
+; GFX942-NEXT:    s_or_b32 s8, s4, 1
+; GFX942-NEXT:    v_mul_f32_e32 v5, v1, v5
+; GFX942-NEXT:    v_trunc_f32_e32 v5, v5
+; GFX942-NEXT:    v_fma_f32 v1, -v5, v3, v1
+; GFX942-NEXT:    v_cvt_i32_f32_e32 v5, v5
+; GFX942-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v1|, |v3|
+; GFX942-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; GFX942-NEXT:    s_cselect_b32 s4, s8, 0
+; GFX942-NEXT:    v_add_u32_e32 v1, s4, v5
+; GFX942-NEXT:    v_mul_lo_u32 v1, v1, s2
+; GFX942-NEXT:    s_ashr_i32 s2, s3, 16
+; GFX942-NEXT:    v_cvt_f32_i32_e32 v3, s2
+; GFX942-NEXT:    s_ashr_i32 s3, s1, 16
+; GFX942-NEXT:    v_cvt_f32_i32_e32 v5, s3
+; GFX942-NEXT:    v_sub_u32_e32 v1, s0, v1
+; GFX942-NEXT:    v_rcp_iflag_f32_e32 v6, v3
+; GFX942-NEXT:    s_xor_b32 s0, s3, s2
+; GFX942-NEXT:    s_ashr_i32 s0, s0, 30
+; GFX942-NEXT:    s_or_b32 s4, s0, 1
+; GFX942-NEXT:    v_mul_f32_e32 v6, v5, v6
+; GFX942-NEXT:    v_trunc_f32_e32 v6, v6
+; GFX942-NEXT:    v_fma_f32 v5, -v6, v3, v5
+; GFX942-NEXT:    v_cvt_i32_f32_e32 v6, v6
+; GFX942-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v5|, |v3|
+; GFX942-NEXT:    s_and_b64 s[0:1], s[0:1], exec
+; GFX942-NEXT:    s_cselect_b32 s0, s4, 0
+; GFX942-NEXT:    v_add_u32_e32 v3, s0, v6
+; GFX942-NEXT:    v_sub_u32_e32 v0, s9, v0
+; GFX942-NEXT:    v_mul_lo_u32 v3, v3, s2
+; GFX942-NEXT:    v_sub_u32_e32 v3, s3, v3
+; GFX942-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX942-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX942-NEXT:    v_lshl_or_b32 v1, v3, 16, v1
+; GFX942-NEXT:    v_lshl_or_b32 v0, v4, 16, v0
+; GFX942-NEXT:    global_store_dwordx2 v2, v[0:1], s[6:7]
+; GFX942-NEXT:    s_endpgm
   %r = srem <4 x i16> %x, %y
   store <4 x i16> %r, ptr addrspace(1) %out
   ret void
@@ -3513,6 +4530,28 @@ define amdgpu_kernel void @udiv_i3(ptr addrspace(1) %out, i3 %x, i3 %y) {
 ; GFX9-NEXT:    v_and_b32_e32 v0, 7, v0
 ; GFX9-NEXT:    global_store_byte v2, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX942-LABEL: udiv_i3:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dword s2, s[4:5], 0x2c
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_bfe_u32 s3, s2, 0x30008
+; GFX942-NEXT:    v_cvt_f32_ubyte0_e32 v0, s3
+; GFX942-NEXT:    v_rcp_iflag_f32_e32 v1, v0
+; GFX942-NEXT:    s_and_b32 s2, s2, 7
+; GFX942-NEXT:    v_cvt_f32_ubyte0_e32 v3, s2
+; GFX942-NEXT:    v_mul_f32_e32 v1, v3, v1
+; GFX942-NEXT:    v_trunc_f32_e32 v1, v1
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v4, v1
+; GFX942-NEXT:    v_fma_f32 v1, -v1, v0, v3
+; GFX942-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v4, vcc
+; GFX942-NEXT:    v_and_b32_e32 v0, 7, v0
+; GFX942-NEXT:    global_store_byte v2, v0, s[0:1]
+; GFX942-NEXT:    s_endpgm
   %r = udiv i3 %x, %y
   store i3 %r, ptr addrspace(1) %out
   ret void
@@ -3591,6 +4630,31 @@ define amdgpu_kernel void @urem_i3(ptr addrspace(1) %out, i3 %x, i3 %y) {
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    global_store_byte v1, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX942-LABEL: urem_i3:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dword s2, s[4:5], 0x2c
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_bfe_u32 s3, s2, 0x30008
+; GFX942-NEXT:    v_cvt_f32_ubyte0_e32 v1, s3
+; GFX942-NEXT:    v_rcp_iflag_f32_e32 v2, v1
+; GFX942-NEXT:    s_and_b32 s4, s2, 7
+; GFX942-NEXT:    v_cvt_f32_ubyte0_e32 v3, s4
+; GFX942-NEXT:    s_lshr_b32 s3, s2, 8
+; GFX942-NEXT:    v_mul_f32_e32 v2, v3, v2
+; GFX942-NEXT:    v_trunc_f32_e32 v2, v2
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v4, v2
+; GFX942-NEXT:    v_fma_f32 v2, -v2, v1, v3
+; GFX942-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v1
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v4, vcc
+; GFX942-NEXT:    v_mul_lo_u32 v1, v1, s3
+; GFX942-NEXT:    v_sub_u32_e32 v1, s2, v1
+; GFX942-NEXT:    v_and_b32_e32 v1, 7, v1
+; GFX942-NEXT:    global_store_byte v0, v1, s[0:1]
+; GFX942-NEXT:    s_endpgm
   %r = urem i3 %x, %y
   store i3 %r, ptr addrspace(1) %out
   ret void
@@ -3674,6 +4738,32 @@ define amdgpu_kernel void @sdiv_i3(ptr addrspace(1) %out, i3 %x, i3 %y) {
 ; GFX9-NEXT:    v_and_b32_e32 v0, 7, v0
 ; GFX9-NEXT:    global_store_byte v1, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX942-LABEL: sdiv_i3:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dword s2, s[4:5], 0x2c
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_bfe_i32 s3, s2, 0x30008
+; GFX942-NEXT:    v_cvt_f32_i32_e32 v0, s3
+; GFX942-NEXT:    s_bfe_i32 s2, s2, 0x30000
+; GFX942-NEXT:    v_cvt_f32_i32_e32 v2, s2
+; GFX942-NEXT:    s_xor_b32 s2, s2, s3
+; GFX942-NEXT:    v_rcp_iflag_f32_e32 v3, v0
+; GFX942-NEXT:    s_ashr_i32 s2, s2, 30
+; GFX942-NEXT:    s_or_b32 s4, s2, 1
+; GFX942-NEXT:    v_mul_f32_e32 v3, v2, v3
+; GFX942-NEXT:    v_trunc_f32_e32 v3, v3
+; GFX942-NEXT:    v_fma_f32 v2, -v3, v0, v2
+; GFX942-NEXT:    v_cvt_i32_f32_e32 v3, v3
+; GFX942-NEXT:    v_cmp_ge_f32_e64 s[2:3], |v2|, |v0|
+; GFX942-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GFX942-NEXT:    s_cselect_b32 s2, s4, 0
+; GFX942-NEXT:    v_add_u32_e32 v0, s2, v3
+; GFX942-NEXT:    v_and_b32_e32 v0, 7, v0
+; GFX942-NEXT:    global_store_byte v1, v0, s[0:1]
+; GFX942-NEXT:    s_endpgm
   %r = sdiv i3 %x, %y
   store i3 %r, ptr addrspace(1) %out
   ret void
@@ -3766,6 +4856,35 @@ define amdgpu_kernel void @srem_i3(ptr addrspace(1) %out, i3 %x, i3 %y) {
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    global_store_byte v1, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX942-LABEL: srem_i3:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dword s6, s[4:5], 0x2c
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_bfe_i32 s2, s6, 0x30008
+; GFX942-NEXT:    v_cvt_f32_i32_e32 v1, s2
+; GFX942-NEXT:    s_bfe_i32 s3, s6, 0x30000
+; GFX942-NEXT:    v_cvt_f32_i32_e32 v2, s3
+; GFX942-NEXT:    s_xor_b32 s2, s3, s2
+; GFX942-NEXT:    v_rcp_iflag_f32_e32 v3, v1
+; GFX942-NEXT:    s_ashr_i32 s2, s2, 30
+; GFX942-NEXT:    s_lshr_b32 s4, s6, 8
+; GFX942-NEXT:    s_or_b32 s5, s2, 1
+; GFX942-NEXT:    v_mul_f32_e32 v3, v2, v3
+; GFX942-NEXT:    v_trunc_f32_e32 v3, v3
+; GFX942-NEXT:    v_fma_f32 v2, -v3, v1, v2
+; GFX942-NEXT:    v_cvt_i32_f32_e32 v3, v3
+; GFX942-NEXT:    v_cmp_ge_f32_e64 s[2:3], |v2|, |v1|
+; GFX942-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GFX942-NEXT:    s_cselect_b32 s2, s5, 0
+; GFX942-NEXT:    v_add_u32_e32 v1, s2, v3
+; GFX942-NEXT:    v_mul_lo_u32 v1, v1, s4
+; GFX942-NEXT:    v_sub_u32_e32 v1, s6, v1
+; GFX942-NEXT:    v_and_b32_e32 v1, 7, v1
+; GFX942-NEXT:    global_store_byte v0, v1, s[0:1]
+; GFX942-NEXT:    s_endpgm
   %r = srem i3 %x, %y
   store i3 %r, ptr addrspace(1) %out
   ret void
@@ -3928,6 +5047,53 @@ define amdgpu_kernel void @udiv_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x
 ; GFX9-NEXT:    global_store_short v6, v2, s[6:7] offset:4
 ; GFX9-NEXT:    global_store_dword v6, v0, s[6:7]
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX942-LABEL: udiv_v3i16:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
+; GFX942-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_and_b32 s7, s2, 0xffff
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v0, s7
+; GFX942-NEXT:    s_and_b32 s6, s0, 0xffff
+; GFX942-NEXT:    s_lshr_b32 s2, s2, 16
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v1, s2
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v2, s6
+; GFX942-NEXT:    v_rcp_iflag_f32_e32 v4, v0
+; GFX942-NEXT:    s_lshr_b32 s0, s0, 16
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v3, s0
+; GFX942-NEXT:    v_rcp_iflag_f32_e32 v5, v1
+; GFX942-NEXT:    v_mul_f32_e32 v4, v2, v4
+; GFX942-NEXT:    v_trunc_f32_e32 v4, v4
+; GFX942-NEXT:    s_and_b32 s0, s3, 0xffff
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v7, v4
+; GFX942-NEXT:    v_fma_f32 v2, -v4, v0, v2
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v4, s0
+; GFX942-NEXT:    v_mul_f32_e32 v5, v3, v5
+; GFX942-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v0
+; GFX942-NEXT:    v_trunc_f32_e32 v2, v5
+; GFX942-NEXT:    s_and_b32 s0, s1, 0xffff
+; GFX942-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v7, vcc
+; GFX942-NEXT:    v_fma_f32 v3, -v2, v1, v3
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v5, s0
+; GFX942-NEXT:    v_rcp_iflag_f32_e32 v7, v4
+; GFX942-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v1
+; GFX942-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX942-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v2, vcc
+; GFX942-NEXT:    v_mul_f32_e32 v2, v5, v7
+; GFX942-NEXT:    v_trunc_f32_e32 v2, v2
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v3, v2
+; GFX942-NEXT:    v_fma_f32 v2, -v2, v4, v5
+; GFX942-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v4
+; GFX942-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_addc_co_u32_e32 v2, vcc, 0, v3, vcc
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    global_store_short v6, v2, s[6:7] offset:4
+; GFX942-NEXT:    global_store_dword v6, v0, s[6:7]
+; GFX942-NEXT:    s_endpgm
   %r = udiv <3 x i16> %x, %y
   store <3 x i16> %r, ptr addrspace(1) %out
   ret void
@@ -4107,6 +5273,57 @@ define amdgpu_kernel void @urem_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x
 ; GFX9-NEXT:    global_store_short v3, v2, s[6:7] offset:4
 ; GFX9-NEXT:    global_store_dword v3, v0, s[6:7]
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX942-LABEL: urem_v3i16:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
+; GFX942-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_and_b32 s9, s2, 0xffff
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v0, s9
+; GFX942-NEXT:    s_and_b32 s8, s0, 0xffff
+; GFX942-NEXT:    s_lshr_b32 s2, s2, 16
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v1, s2
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v2, s8
+; GFX942-NEXT:    v_rcp_iflag_f32_e32 v4, v0
+; GFX942-NEXT:    s_lshr_b32 s0, s0, 16
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v3, s0
+; GFX942-NEXT:    v_rcp_iflag_f32_e32 v5, v1
+; GFX942-NEXT:    v_mul_f32_e32 v4, v2, v4
+; GFX942-NEXT:    v_trunc_f32_e32 v4, v4
+; GFX942-NEXT:    s_and_b32 s3, s3, 0xffff
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v7, v4
+; GFX942-NEXT:    v_fma_f32 v2, -v4, v0, v2
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v4, s3
+; GFX942-NEXT:    v_mul_f32_e32 v5, v3, v5
+; GFX942-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v0
+; GFX942-NEXT:    v_trunc_f32_e32 v2, v5
+; GFX942-NEXT:    s_and_b32 s1, s1, 0xffff
+; GFX942-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v7, vcc
+; GFX942-NEXT:    v_fma_f32 v3, -v2, v1, v3
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v5, s1
+; GFX942-NEXT:    v_rcp_iflag_f32_e32 v7, v4
+; GFX942-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v1
+; GFX942-NEXT:    v_mul_lo_u32 v0, v0, s9
+; GFX942-NEXT:    v_sub_u32_e32 v0, s8, v0
+; GFX942-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v2, vcc
+; GFX942-NEXT:    v_mul_f32_e32 v2, v5, v7
+; GFX942-NEXT:    v_trunc_f32_e32 v2, v2
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v3, v2
+; GFX942-NEXT:    v_fma_f32 v2, -v2, v4, v5
+; GFX942-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v4
+; GFX942-NEXT:    v_mul_lo_u32 v1, v1, s2
+; GFX942-NEXT:    v_sub_u32_e32 v1, s0, v1
+; GFX942-NEXT:    v_addc_co_u32_e32 v2, vcc, 0, v3, vcc
+; GFX942-NEXT:    v_mul_lo_u32 v2, v2, s3
+; GFX942-NEXT:    v_sub_u32_e32 v2, s1, v2
+; GFX942-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX942-NEXT:    v_lshl_or_b32 v0, v1, 16, v0
+; GFX942-NEXT:    global_store_short v6, v2, s[6:7] offset:4
+; GFX942-NEXT:    global_store_dword v6, v0, s[6:7]
+; GFX942-NEXT:    s_endpgm
   %r = urem <3 x i16> %x, %y
   store <3 x i16> %r, ptr addrspace(1) %out
   ret void
@@ -4310,6 +5527,66 @@ define amdgpu_kernel void @sdiv_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x
 ; GFX9-NEXT:    global_store_short v1, v0, s[6:7] offset:4
 ; GFX9-NEXT:    global_store_dword v1, v2, s[6:7]
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX942-LABEL: sdiv_v3i16:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
+; GFX942-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_sext_i32_i16 s4, s2
+; GFX942-NEXT:    v_cvt_f32_i32_e32 v0, s4
+; GFX942-NEXT:    s_sext_i32_i16 s5, s0
+; GFX942-NEXT:    v_cvt_f32_i32_e32 v2, s5
+; GFX942-NEXT:    s_xor_b32 s4, s5, s4
+; GFX942-NEXT:    v_rcp_iflag_f32_e32 v3, v0
+; GFX942-NEXT:    s_ashr_i32 s4, s4, 30
+; GFX942-NEXT:    s_or_b32 s8, s4, 1
+; GFX942-NEXT:    v_mul_f32_e32 v3, v2, v3
+; GFX942-NEXT:    v_trunc_f32_e32 v3, v3
+; GFX942-NEXT:    v_fma_f32 v2, -v3, v0, v2
+; GFX942-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v2|, |v0|
+; GFX942-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; GFX942-NEXT:    s_cselect_b32 s4, s8, 0
+; GFX942-NEXT:    s_ashr_i32 s2, s2, 16
+; GFX942-NEXT:    v_cvt_i32_f32_e32 v3, v3
+; GFX942-NEXT:    v_cvt_f32_i32_e32 v0, s2
+; GFX942-NEXT:    s_ashr_i32 s0, s0, 16
+; GFX942-NEXT:    v_add_u32_e32 v2, s4, v3
+; GFX942-NEXT:    v_cvt_f32_i32_e32 v3, s0
+; GFX942-NEXT:    v_rcp_iflag_f32_e32 v4, v0
+; GFX942-NEXT:    s_xor_b32 s0, s0, s2
+; GFX942-NEXT:    s_ashr_i32 s0, s0, 30
+; GFX942-NEXT:    s_sext_i32_i16 s2, s3
+; GFX942-NEXT:    v_mul_f32_e32 v4, v3, v4
+; GFX942-NEXT:    v_trunc_f32_e32 v4, v4
+; GFX942-NEXT:    v_fma_f32 v3, -v4, v0, v3
+; GFX942-NEXT:    v_cvt_i32_f32_e32 v4, v4
+; GFX942-NEXT:    s_or_b32 s0, s0, 1
+; GFX942-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v3|, |v0|
+; GFX942-NEXT:    v_cvt_f32_i32_e32 v0, s2
+; GFX942-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; GFX942-NEXT:    s_cselect_b32 s0, s0, 0
+; GFX942-NEXT:    v_add_u32_e32 v3, s0, v4
+; GFX942-NEXT:    s_sext_i32_i16 s0, s1
+; GFX942-NEXT:    v_cvt_f32_i32_e32 v4, s0
+; GFX942-NEXT:    v_rcp_iflag_f32_e32 v5, v0
+; GFX942-NEXT:    s_xor_b32 s0, s0, s2
+; GFX942-NEXT:    s_ashr_i32 s0, s0, 30
+; GFX942-NEXT:    s_or_b32 s2, s0, 1
+; GFX942-NEXT:    v_mul_f32_e32 v5, v4, v5
+; GFX942-NEXT:    v_trunc_f32_e32 v5, v5
+; GFX942-NEXT:    v_fma_f32 v4, -v5, v0, v4
+; GFX942-NEXT:    v_cvt_i32_f32_e32 v5, v5
+; GFX942-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v4|, |v0|
+; GFX942-NEXT:    s_and_b64 s[0:1], s[0:1], exec
+; GFX942-NEXT:    s_cselect_b32 s0, s2, 0
+; GFX942-NEXT:    v_add_u32_e32 v0, s0, v5
+; GFX942-NEXT:    v_and_b32_e32 v2, 0xffff, v2
+; GFX942-NEXT:    v_lshl_or_b32 v2, v3, 16, v2
+; GFX942-NEXT:    global_store_short v1, v0, s[6:7] offset:4
+; GFX942-NEXT:    global_store_dword v1, v2, s[6:7]
+; GFX942-NEXT:    s_endpgm
   %r = sdiv <3 x i16> %x, %y
   store <3 x i16> %r, ptr addrspace(1) %out
   ret void
@@ -4533,6 +5810,72 @@ define amdgpu_kernel void @srem_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x
 ; GFX9-NEXT:    global_store_short v3, v2, s[6:7] offset:4
 ; GFX9-NEXT:    global_store_dword v3, v0, s[6:7]
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX942-LABEL: srem_v3i16:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
+; GFX942-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_sext_i32_i16 s8, s2
+; GFX942-NEXT:    v_cvt_f32_i32_e32 v0, s8
+; GFX942-NEXT:    s_sext_i32_i16 s9, s0
+; GFX942-NEXT:    v_cvt_f32_i32_e32 v2, s9
+; GFX942-NEXT:    s_xor_b32 s4, s9, s8
+; GFX942-NEXT:    v_rcp_iflag_f32_e32 v3, v0
+; GFX942-NEXT:    s_ashr_i32 s4, s4, 30
+; GFX942-NEXT:    s_or_b32 s10, s4, 1
+; GFX942-NEXT:    v_mul_f32_e32 v3, v2, v3
+; GFX942-NEXT:    v_trunc_f32_e32 v3, v3
+; GFX942-NEXT:    v_fma_f32 v2, -v3, v0, v2
+; GFX942-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v2|, |v0|
+; GFX942-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; GFX942-NEXT:    s_cselect_b32 s4, s10, 0
+; GFX942-NEXT:    s_ashr_i32 s2, s2, 16
+; GFX942-NEXT:    v_cvt_i32_f32_e32 v3, v3
+; GFX942-NEXT:    v_cvt_f32_i32_e32 v2, s2
+; GFX942-NEXT:    s_ashr_i32 s0, s0, 16
+; GFX942-NEXT:    v_add_u32_e32 v0, s4, v3
+; GFX942-NEXT:    v_cvt_f32_i32_e32 v3, s0
+; GFX942-NEXT:    v_rcp_iflag_f32_e32 v4, v2
+; GFX942-NEXT:    s_xor_b32 s4, s0, s2
+; GFX942-NEXT:    s_ashr_i32 s4, s4, 30
+; GFX942-NEXT:    v_mul_lo_u32 v0, v0, s8
+; GFX942-NEXT:    v_mul_f32_e32 v4, v3, v4
+; GFX942-NEXT:    v_trunc_f32_e32 v4, v4
+; GFX942-NEXT:    v_fma_f32 v3, -v4, v2, v3
+; GFX942-NEXT:    v_cvt_i32_f32_e32 v4, v4
+; GFX942-NEXT:    s_or_b32 s8, s4, 1
+; GFX942-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v3|, |v2|
+; GFX942-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; GFX942-NEXT:    s_cselect_b32 s4, s8, 0
+; GFX942-NEXT:    v_add_u32_e32 v2, s4, v4
+; GFX942-NEXT:    v_mul_lo_u32 v2, v2, s2
+; GFX942-NEXT:    s_sext_i32_i16 s2, s3
+; GFX942-NEXT:    v_cvt_f32_i32_e32 v3, s2
+; GFX942-NEXT:    s_sext_i32_i16 s3, s1
+; GFX942-NEXT:    v_cvt_f32_i32_e32 v4, s3
+; GFX942-NEXT:    v_sub_u32_e32 v2, s0, v2
+; GFX942-NEXT:    v_rcp_iflag_f32_e32 v5, v3
+; GFX942-NEXT:    s_xor_b32 s0, s3, s2
+; GFX942-NEXT:    s_ashr_i32 s0, s0, 30
+; GFX942-NEXT:    s_or_b32 s4, s0, 1
+; GFX942-NEXT:    v_mul_f32_e32 v5, v4, v5
+; GFX942-NEXT:    v_trunc_f32_e32 v5, v5
+; GFX942-NEXT:    v_fma_f32 v4, -v5, v3, v4
+; GFX942-NEXT:    v_cvt_i32_f32_e32 v5, v5
+; GFX942-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v4|, |v3|
+; GFX942-NEXT:    s_and_b64 s[0:1], s[0:1], exec
+; GFX942-NEXT:    s_cselect_b32 s0, s4, 0
+; GFX942-NEXT:    v_add_u32_e32 v3, s0, v5
+; GFX942-NEXT:    v_sub_u32_e32 v0, s9, v0
+; GFX942-NEXT:    v_mul_lo_u32 v3, v3, s2
+; GFX942-NEXT:    v_sub_u32_e32 v3, s3, v3
+; GFX942-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX942-NEXT:    v_lshl_or_b32 v0, v2, 16, v0
+; GFX942-NEXT:    global_store_short v1, v3, s[6:7] offset:4
+; GFX942-NEXT:    global_store_dword v1, v0, s[6:7]
+; GFX942-NEXT:    s_endpgm
   %r = srem <3 x i16> %x, %y
   store <3 x i16> %r, ptr addrspace(1) %out
   ret void
@@ -4714,6 +6057,61 @@ define amdgpu_kernel void @udiv_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0x1fff, v1
 ; GFX9-NEXT:    global_store_short v2, v0, s[0:1] offset:4
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX942-LABEL: udiv_v3i15:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-NEXT:    s_and_b32 s5, s6, 0x7fff
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v1, s5
+; GFX942-NEXT:    s_and_b32 s4, s2, 0x7fff
+; GFX942-NEXT:    v_alignbit_b32 v0, s3, v0, 30
+; GFX942-NEXT:    s_bfe_u32 s3, s6, 0xf000f
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v4, s4
+; GFX942-NEXT:    v_rcp_iflag_f32_e32 v5, v1
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v6, s3
+; GFX942-NEXT:    s_bfe_u32 s2, s2, 0xf000f
+; GFX942-NEXT:    v_mov_b32_e32 v3, s6
+; GFX942-NEXT:    v_alignbit_b32 v3, s7, v3, 30
+; GFX942-NEXT:    v_mul_f32_e32 v5, v4, v5
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v7, s2
+; GFX942-NEXT:    v_rcp_iflag_f32_e32 v8, v6
+; GFX942-NEXT:    v_and_b32_e32 v3, 0x7fff, v3
+; GFX942-NEXT:    v_trunc_f32_e32 v5, v5
+; GFX942-NEXT:    v_fma_f32 v4, -v5, v1, v4
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v5, v5
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v3, v3
+; GFX942-NEXT:    v_cmp_ge_f32_e64 vcc, |v4|, v1
+; GFX942-NEXT:    v_mul_f32_e32 v1, v7, v8
+; GFX942-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
+; GFX942-NEXT:    v_trunc_f32_e32 v1, v1
+; GFX942-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v5, vcc
+; GFX942-NEXT:    v_fma_f32 v5, -v1, v6, v7
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v0, v0
+; GFX942-NEXT:    v_rcp_iflag_f32_e32 v7, v3
+; GFX942-NEXT:    v_cmp_ge_f32_e64 vcc, |v5|, v6
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v1, vcc
+; GFX942-NEXT:    v_mul_f32_e32 v1, v0, v7
+; GFX942-NEXT:    v_trunc_f32_e32 v1, v1
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v6, v1
+; GFX942-NEXT:    v_fma_f32 v0, -v1, v3, v0
+; GFX942-NEXT:    v_cmp_ge_f32_e64 vcc, |v0|, v3
+; GFX942-NEXT:    v_and_b32_e32 v3, 0x7fff, v4
+; GFX942-NEXT:    v_and_b32_e32 v4, 0x7fff, v5
+; GFX942-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v6, vcc
+; GFX942-NEXT:    v_lshlrev_b32_e32 v4, 15, v4
+; GFX942-NEXT:    v_lshlrev_b64 v[0:1], 30, v[0:1]
+; GFX942-NEXT:    v_or_b32_e32 v3, v3, v4
+; GFX942-NEXT:    v_or_b32_e32 v0, v3, v0
+; GFX942-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX942-NEXT:    v_and_b32_e32 v0, 0x1fff, v1
+; GFX942-NEXT:    global_store_short v2, v0, s[0:1] offset:4
+; GFX942-NEXT:    s_endpgm
   %r = udiv <3 x i15> %x, %y
   store <3 x i15> %r, ptr addrspace(1) %out
   ret void
@@ -4917,6 +6315,69 @@ define amdgpu_kernel void @urem_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0x1fff, v1
 ; GFX9-NEXT:    global_store_short v2, v0, s[0:1] offset:4
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX942-LABEL: urem_v3i15:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_and_b32 s5, s2, 0x7fff
+; GFX942-NEXT:    s_and_b32 s8, s6, 0x7fff
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v1, s8
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v4, s5
+; GFX942-NEXT:    v_mov_b32_e32 v3, s6
+; GFX942-NEXT:    v_alignbit_b32 v3, s7, v3, 30
+; GFX942-NEXT:    v_rcp_iflag_f32_e32 v5, v1
+; GFX942-NEXT:    s_bfe_u32 s7, s6, 0xf000f
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v6, s7
+; GFX942-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-NEXT:    v_mul_f32_e32 v5, v4, v5
+; GFX942-NEXT:    v_trunc_f32_e32 v5, v5
+; GFX942-NEXT:    v_fma_f32 v4, -v5, v1, v4
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v5, v5
+; GFX942-NEXT:    v_alignbit_b32 v0, s3, v0, 30
+; GFX942-NEXT:    s_bfe_u32 s3, s2, 0xf000f
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v7, s3
+; GFX942-NEXT:    v_rcp_iflag_f32_e32 v8, v6
+; GFX942-NEXT:    v_cmp_ge_f32_e64 vcc, |v4|, v1
+; GFX942-NEXT:    v_and_b32_e32 v3, 0x7fff, v3
+; GFX942-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
+; GFX942-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v5, vcc
+; GFX942-NEXT:    v_mul_lo_u32 v1, v1, s6
+; GFX942-NEXT:    v_sub_u32_e32 v4, s2, v1
+; GFX942-NEXT:    v_mul_f32_e32 v1, v7, v8
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v5, v3
+; GFX942-NEXT:    v_trunc_f32_e32 v1, v1
+; GFX942-NEXT:    v_fma_f32 v7, -v1, v6, v7
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v8, v0
+; GFX942-NEXT:    v_rcp_iflag_f32_e32 v9, v5
+; GFX942-NEXT:    v_cmp_ge_f32_e64 vcc, |v7|, v6
+; GFX942-NEXT:    s_lshr_b32 s5, s6, 15
+; GFX942-NEXT:    s_lshr_b32 s4, s2, 15
+; GFX942-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX942-NEXT:    v_mul_lo_u32 v1, v1, s5
+; GFX942-NEXT:    v_sub_u32_e32 v6, s4, v1
+; GFX942-NEXT:    v_mul_f32_e32 v1, v8, v9
+; GFX942-NEXT:    v_trunc_f32_e32 v1, v1
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v7, v1
+; GFX942-NEXT:    v_fma_f32 v1, -v1, v5, v8
+; GFX942-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v5
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v7, vcc
+; GFX942-NEXT:    v_mul_lo_u32 v1, v1, v3
+; GFX942-NEXT:    v_and_b32_e32 v3, 0x7fff, v4
+; GFX942-NEXT:    v_and_b32_e32 v4, 0x7fff, v6
+; GFX942-NEXT:    v_sub_u32_e32 v0, v0, v1
+; GFX942-NEXT:    v_lshlrev_b32_e32 v4, 15, v4
+; GFX942-NEXT:    v_lshlrev_b64 v[0:1], 30, v[0:1]
+; GFX942-NEXT:    v_or_b32_e32 v3, v3, v4
+; GFX942-NEXT:    v_or_b32_e32 v0, v3, v0
+; GFX942-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX942-NEXT:    v_and_b32_e32 v0, 0x1fff, v1
+; GFX942-NEXT:    global_store_short v2, v0, s[0:1] offset:4
+; GFX942-NEXT:    s_endpgm
   %r = urem <3 x i15> %x, %y
   store <3 x i15> %r, ptr addrspace(1) %out
   ret void
@@ -5138,6 +6599,74 @@ define amdgpu_kernel void @sdiv_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0x1fff, v1
 ; GFX9-NEXT:    global_store_short v2, v0, s[0:1] offset:4
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX942-LABEL: sdiv_v3i15:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-NEXT:    s_bfe_i32 s4, s6, 0xf0000
+; GFX942-NEXT:    v_cvt_f32_i32_e32 v3, s4
+; GFX942-NEXT:    v_alignbit_b32 v0, s3, v0, 30
+; GFX942-NEXT:    s_bfe_i32 s3, s2, 0xf0000
+; GFX942-NEXT:    v_cvt_f32_i32_e32 v4, s3
+; GFX942-NEXT:    v_rcp_iflag_f32_e32 v5, v3
+; GFX942-NEXT:    s_xor_b32 s3, s3, s4
+; GFX942-NEXT:    s_ashr_i32 s3, s3, 30
+; GFX942-NEXT:    s_or_b32 s3, s3, 1
+; GFX942-NEXT:    v_mul_f32_e32 v5, v4, v5
+; GFX942-NEXT:    v_trunc_f32_e32 v5, v5
+; GFX942-NEXT:    v_fma_f32 v4, -v5, v3, v4
+; GFX942-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v4|, |v3|
+; GFX942-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; GFX942-NEXT:    s_cselect_b32 s3, s3, 0
+; GFX942-NEXT:    s_bfe_i32 s4, s6, 0xf000f
+; GFX942-NEXT:    v_cvt_i32_f32_e32 v5, v5
+; GFX942-NEXT:    v_cvt_f32_i32_e32 v3, s4
+; GFX942-NEXT:    s_bfe_i32 s2, s2, 0xf000f
+; GFX942-NEXT:    v_mov_b32_e32 v1, s6
+; GFX942-NEXT:    v_add_u32_e32 v4, s3, v5
+; GFX942-NEXT:    v_cvt_f32_i32_e32 v5, s2
+; GFX942-NEXT:    v_rcp_iflag_f32_e32 v6, v3
+; GFX942-NEXT:    v_alignbit_b32 v1, s7, v1, 30
+; GFX942-NEXT:    s_xor_b32 s2, s2, s4
+; GFX942-NEXT:    s_ashr_i32 s2, s2, 30
+; GFX942-NEXT:    v_mul_f32_e32 v6, v5, v6
+; GFX942-NEXT:    v_trunc_f32_e32 v6, v6
+; GFX942-NEXT:    v_fma_f32 v5, -v6, v3, v5
+; GFX942-NEXT:    v_bfe_i32 v1, v1, 0, 15
+; GFX942-NEXT:    s_or_b32 s4, s2, 1
+; GFX942-NEXT:    v_cvt_i32_f32_e32 v6, v6
+; GFX942-NEXT:    v_cmp_ge_f32_e64 s[2:3], |v5|, |v3|
+; GFX942-NEXT:    v_cvt_f32_i32_e32 v3, v1
+; GFX942-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GFX942-NEXT:    s_cselect_b32 s2, s4, 0
+; GFX942-NEXT:    v_bfe_i32 v0, v0, 0, 15
+; GFX942-NEXT:    v_add_u32_e32 v5, s2, v6
+; GFX942-NEXT:    v_cvt_f32_i32_e32 v6, v0
+; GFX942-NEXT:    v_rcp_iflag_f32_e32 v7, v3
+; GFX942-NEXT:    v_xor_b32_e32 v0, v0, v1
+; GFX942-NEXT:    v_ashrrev_i32_e32 v0, 30, v0
+; GFX942-NEXT:    v_or_b32_e32 v0, 1, v0
+; GFX942-NEXT:    v_mul_f32_e32 v1, v6, v7
+; GFX942-NEXT:    v_trunc_f32_e32 v1, v1
+; GFX942-NEXT:    v_cvt_i32_f32_e32 v7, v1
+; GFX942-NEXT:    v_fma_f32 v1, -v1, v3, v6
+; GFX942-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v3|
+; GFX942-NEXT:    v_and_b32_e32 v3, 0x7fff, v4
+; GFX942-NEXT:    v_and_b32_e32 v4, 0x7fff, v5
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
+; GFX942-NEXT:    v_add_u32_e32 v0, v7, v0
+; GFX942-NEXT:    v_lshlrev_b32_e32 v4, 15, v4
+; GFX942-NEXT:    v_lshlrev_b64 v[0:1], 30, v[0:1]
+; GFX942-NEXT:    v_or_b32_e32 v3, v3, v4
+; GFX942-NEXT:    v_or_b32_e32 v0, v3, v0
+; GFX942-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX942-NEXT:    v_and_b32_e32 v0, 0x1fff, v1
+; GFX942-NEXT:    global_store_short v2, v0, s[0:1] offset:4
+; GFX942-NEXT:    s_endpgm
   %r = sdiv <3 x i15> %x, %y
   store <3 x i15> %r, ptr addrspace(1) %out
   ret void
@@ -5385,6 +6914,84 @@ define amdgpu_kernel void @srem_v3i15(ptr addrspace(1) %out, <3 x i15> %x, <3 x
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0x1fff, v1
 ; GFX9-NEXT:    global_store_short v2, v0, s[0:1] offset:4
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX942-LABEL: srem_v3i15:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-NEXT:    v_alignbit_b32 v0, s3, v0, 30
+; GFX942-NEXT:    s_bfe_i32 s3, s6, 0xf0000
+; GFX942-NEXT:    v_cvt_f32_i32_e32 v5, s3
+; GFX942-NEXT:    s_bfe_i32 s4, s2, 0xf0000
+; GFX942-NEXT:    v_cvt_f32_i32_e32 v6, s4
+; GFX942-NEXT:    s_xor_b32 s3, s4, s3
+; GFX942-NEXT:    v_rcp_iflag_f32_e32 v7, v5
+; GFX942-NEXT:    v_mov_b32_e32 v1, s6
+; GFX942-NEXT:    s_ashr_i32 s3, s3, 30
+; GFX942-NEXT:    s_lshr_b32 s8, s2, 15
+; GFX942-NEXT:    v_mul_f32_e32 v7, v6, v7
+; GFX942-NEXT:    v_trunc_f32_e32 v7, v7
+; GFX942-NEXT:    v_fma_f32 v6, -v7, v5, v6
+; GFX942-NEXT:    v_cvt_i32_f32_e32 v7, v7
+; GFX942-NEXT:    v_alignbit_b32 v1, s7, v1, 30
+; GFX942-NEXT:    s_lshr_b32 s7, s6, 15
+; GFX942-NEXT:    s_or_b32 s3, s3, 1
+; GFX942-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v6|, |v5|
+; GFX942-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; GFX942-NEXT:    s_cselect_b32 s3, s3, 0
+; GFX942-NEXT:    v_add_u32_e32 v5, s3, v7
+; GFX942-NEXT:    s_bfe_i32 s3, s6, 0xf000f
+; GFX942-NEXT:    v_cvt_f32_i32_e32 v6, s3
+; GFX942-NEXT:    v_mul_lo_u32 v5, v5, s6
+; GFX942-NEXT:    v_sub_u32_e32 v5, s2, v5
+; GFX942-NEXT:    s_bfe_i32 s2, s2, 0xf000f
+; GFX942-NEXT:    v_cvt_f32_i32_e32 v7, s2
+; GFX942-NEXT:    v_rcp_iflag_f32_e32 v8, v6
+; GFX942-NEXT:    s_xor_b32 s2, s2, s3
+; GFX942-NEXT:    v_and_b32_e32 v4, 0x7fff, v1
+; GFX942-NEXT:    s_ashr_i32 s2, s2, 30
+; GFX942-NEXT:    v_mul_f32_e32 v8, v7, v8
+; GFX942-NEXT:    v_trunc_f32_e32 v8, v8
+; GFX942-NEXT:    v_fma_f32 v7, -v8, v6, v7
+; GFX942-NEXT:    v_bfe_i32 v1, v1, 0, 15
+; GFX942-NEXT:    s_or_b32 s4, s2, 1
+; GFX942-NEXT:    v_cvt_i32_f32_e32 v8, v8
+; GFX942-NEXT:    v_cmp_ge_f32_e64 s[2:3], |v7|, |v6|
+; GFX942-NEXT:    v_cvt_f32_i32_e32 v7, v1
+; GFX942-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GFX942-NEXT:    v_and_b32_e32 v3, 0x7fff, v0
+; GFX942-NEXT:    s_cselect_b32 s2, s4, 0
+; GFX942-NEXT:    v_bfe_i32 v0, v0, 0, 15
+; GFX942-NEXT:    v_add_u32_e32 v6, s2, v8
+; GFX942-NEXT:    v_cvt_f32_i32_e32 v8, v0
+; GFX942-NEXT:    v_rcp_iflag_f32_e32 v9, v7
+; GFX942-NEXT:    v_xor_b32_e32 v0, v0, v1
+; GFX942-NEXT:    v_ashrrev_i32_e32 v0, 30, v0
+; GFX942-NEXT:    v_or_b32_e32 v0, 1, v0
+; GFX942-NEXT:    v_mul_f32_e32 v1, v8, v9
+; GFX942-NEXT:    v_trunc_f32_e32 v1, v1
+; GFX942-NEXT:    v_cvt_i32_f32_e32 v9, v1
+; GFX942-NEXT:    v_fma_f32 v1, -v1, v7, v8
+; GFX942-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, |v7|
+; GFX942-NEXT:    v_mul_lo_u32 v6, v6, s7
+; GFX942-NEXT:    v_sub_u32_e32 v6, s8, v6
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
+; GFX942-NEXT:    v_add_u32_e32 v0, v9, v0
+; GFX942-NEXT:    v_mul_lo_u32 v0, v0, v4
+; GFX942-NEXT:    v_and_b32_e32 v4, 0x7fff, v6
+; GFX942-NEXT:    v_sub_u32_e32 v0, v3, v0
+; GFX942-NEXT:    v_and_b32_e32 v3, 0x7fff, v5
+; GFX942-NEXT:    v_lshlrev_b32_e32 v4, 15, v4
+; GFX942-NEXT:    v_lshlrev_b64 v[0:1], 30, v[0:1]
+; GFX942-NEXT:    v_or_b32_e32 v3, v3, v4
+; GFX942-NEXT:    v_or_b32_e32 v0, v3, v0
+; GFX942-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX942-NEXT:    v_and_b32_e32 v0, 0x1fff, v1
+; GFX942-NEXT:    global_store_short v2, v0, s[0:1] offset:4
+; GFX942-NEXT:    s_endpgm
   %r = srem <3 x i15> %x, %y
   store <3 x i15> %r, ptr addrspace(1) %out
   ret void
@@ -5426,6 +7033,21 @@ define amdgpu_kernel void @udiv_i32_oddk_denom(ptr addrspace(1) %out, i32 %x) {
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX942-LABEL: udiv_i32_oddk_denom:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dword s2, s[4:5], 0x2c
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_mul_hi_u32 s3, s2, 0xb2a50881
+; GFX942-NEXT:    s_sub_i32 s2, s2, s3
+; GFX942-NEXT:    s_lshr_b32 s2, s2, 1
+; GFX942-NEXT:    s_add_i32 s2, s2, s3
+; GFX942-NEXT:    s_lshr_b32 s2, s2, 20
+; GFX942-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX942-NEXT:    s_endpgm
   %r = udiv i32 %x, 1235195
   store i32 %r, ptr addrspace(1) %out
   ret void
@@ -5459,6 +7081,17 @@ define amdgpu_kernel void @udiv_i32_pow2k_denom(ptr addrspace(1) %out, i32 %x) {
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX942-LABEL: udiv_i32_pow2k_denom:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dword s2, s[4:5], 0x2c
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_lshr_b32 s2, s2, 12
+; GFX942-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX942-NEXT:    s_endpgm
   %r = udiv i32 %x, 4096
   store i32 %r, ptr addrspace(1) %out
   ret void
@@ -5495,6 +7128,17 @@ define amdgpu_kernel void @udiv_i32_pow2_shl_denom(ptr addrspace(1) %out, i32 %x
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX942-LABEL: udiv_i32_pow2_shl_denom:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_add_i32 s3, s3, 12
+; GFX942-NEXT:    s_lshr_b32 s2, s2, s3
+; GFX942-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX942-NEXT:    s_endpgm
   %shl.y = shl i32 4096, %y
   %r = udiv i32 %x, %shl.y
   store i32 %r, ptr addrspace(1) %out
@@ -5538,6 +7182,18 @@ define amdgpu_kernel void @udiv_v2i32_pow2k_denom(ptr addrspace(1) %out, <2 x i3
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX942-LABEL: udiv_v2i32_pow2k_denom:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_lshr_b32 s2, s2, 12
+; GFX942-NEXT:    s_lshr_b32 s3, s3, 12
+; GFX942-NEXT:    v_mov_b32_e32 v2, s2
+; GFX942-NEXT:    v_mov_b32_e32 v3, s3
+; GFX942-NEXT:    global_store_dwordx2 v0, v[2:3], s[0:1]
+; GFX942-NEXT:    s_endpgm
   %r = udiv <2 x i32> %x, <i32 4096, i32 4096>
   store <2 x i32> %r, ptr addrspace(1) %out
   ret void
@@ -5588,6 +7244,22 @@ define amdgpu_kernel void @udiv_v2i32_mixed_pow2k_denom(ptr addrspace(1) %out, <
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX942-LABEL: udiv_v2i32_mixed_pow2k_denom:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_mul_hi_u32 s4, s3, 0x100101
+; GFX942-NEXT:    s_sub_i32 s3, s3, s4
+; GFX942-NEXT:    s_lshr_b32 s3, s3, 1
+; GFX942-NEXT:    s_add_i32 s3, s3, s4
+; GFX942-NEXT:    s_lshr_b32 s2, s2, 12
+; GFX942-NEXT:    s_lshr_b32 s3, s3, 11
+; GFX942-NEXT:    v_mov_b32_e32 v2, s2
+; GFX942-NEXT:    v_mov_b32_e32 v3, s3
+; GFX942-NEXT:    global_store_dwordx2 v0, v[2:3], s[0:1]
+; GFX942-NEXT:    s_endpgm
   %r = udiv <2 x i32> %x, <i32 4096, i32 4095>
   store <2 x i32> %r, ptr addrspace(1) %out
   ret void
@@ -5772,6 +7444,60 @@ define amdgpu_kernel void @udiv_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX942-LABEL: udiv_v2i32_pow2_shl_denom:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_lshl_b32 s7, 0x1000, s2
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v0, s7
+; GFX942-NEXT:    s_lshl_b32 s6, 0x1000, s3
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v1, s6
+; GFX942-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GFX942-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; GFX942-NEXT:    s_sub_i32 s4, 0, s7
+; GFX942-NEXT:    v_rcp_iflag_f32_e32 v1, v1
+; GFX942-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX942-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX942-NEXT:    v_readfirstlane_b32 s5, v0
+; GFX942-NEXT:    s_mul_i32 s4, s4, s5
+; GFX942-NEXT:    s_mul_hi_u32 s4, s5, s4
+; GFX942-NEXT:    s_add_i32 s5, s5, s4
+; GFX942-NEXT:    s_mul_hi_u32 s4, s0, s5
+; GFX942-NEXT:    s_mul_i32 s5, s4, s7
+; GFX942-NEXT:    s_sub_i32 s0, s0, s5
+; GFX942-NEXT:    s_add_i32 s9, s4, 1
+; GFX942-NEXT:    s_sub_i32 s5, s0, s7
+; GFX942-NEXT:    s_cmp_ge_u32 s0, s7
+; GFX942-NEXT:    s_cselect_b32 s4, s9, s4
+; GFX942-NEXT:    s_cselect_b32 s0, s5, s0
+; GFX942-NEXT:    s_add_i32 s5, s4, 1
+; GFX942-NEXT:    s_cmp_ge_u32 s0, s7
+; GFX942-NEXT:    v_readfirstlane_b32 s8, v1
+; GFX942-NEXT:    s_cselect_b32 s0, s5, s4
+; GFX942-NEXT:    s_sub_i32 s4, 0, s6
+; GFX942-NEXT:    s_mul_i32 s4, s4, s8
+; GFX942-NEXT:    s_mul_hi_u32 s4, s8, s4
+; GFX942-NEXT:    s_add_i32 s8, s8, s4
+; GFX942-NEXT:    s_mul_hi_u32 s4, s1, s8
+; GFX942-NEXT:    s_mul_i32 s5, s4, s6
+; GFX942-NEXT:    s_sub_i32 s1, s1, s5
+; GFX942-NEXT:    s_add_i32 s7, s4, 1
+; GFX942-NEXT:    s_sub_i32 s5, s1, s6
+; GFX942-NEXT:    s_cmp_ge_u32 s1, s6
+; GFX942-NEXT:    s_cselect_b32 s4, s7, s4
+; GFX942-NEXT:    s_cselect_b32 s1, s5, s1
+; GFX942-NEXT:    s_add_i32 s5, s4, 1
+; GFX942-NEXT:    s_cmp_ge_u32 s1, s6
+; GFX942-NEXT:    s_cselect_b32 s1, s5, s4
+; GFX942-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-NEXT:    v_mov_b32_e32 v1, s1
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX942-NEXT:    s_endpgm
   %shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y
   %r = udiv <2 x i32> %x, %shl.y
   store <2 x i32> %r, ptr addrspace(1) %out
@@ -5819,6 +7545,23 @@ define amdgpu_kernel void @urem_i32_oddk_denom(ptr addrspace(1) %out, i32 %x) {
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX942-LABEL: urem_i32_oddk_denom:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dword s2, s[4:5], 0x2c
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_mul_hi_u32 s3, s2, 0xb2a50881
+; GFX942-NEXT:    s_sub_i32 s4, s2, s3
+; GFX942-NEXT:    s_lshr_b32 s4, s4, 1
+; GFX942-NEXT:    s_add_i32 s4, s4, s3
+; GFX942-NEXT:    s_lshr_b32 s3, s4, 20
+; GFX942-NEXT:    s_mul_i32 s3, s3, 0x12d8fb
+; GFX942-NEXT:    s_sub_i32 s2, s2, s3
+; GFX942-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX942-NEXT:    s_endpgm
   %r = urem i32 %x, 1235195
   store i32 %r, ptr addrspace(1) %out
   ret void
@@ -5852,6 +7595,17 @@ define amdgpu_kernel void @urem_i32_pow2k_denom(ptr addrspace(1) %out, i32 %x) {
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX942-LABEL: urem_i32_pow2k_denom:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dword s2, s[4:5], 0x2c
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_and_b32 s2, s2, 0xfff
+; GFX942-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX942-NEXT:    s_endpgm
   %r = urem i32 %x, 4096
   store i32 %r, ptr addrspace(1) %out
   ret void
@@ -5890,6 +7644,18 @@ define amdgpu_kernel void @urem_i32_pow2_shl_denom(ptr addrspace(1) %out, i32 %x
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX942-LABEL: urem_i32_pow2_shl_denom:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_lshl_b32 s3, 0x1000, s3
+; GFX942-NEXT:    s_add_i32 s3, s3, -1
+; GFX942-NEXT:    s_and_b32 s2, s2, s3
+; GFX942-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX942-NEXT:    s_endpgm
   %shl.y = shl i32 4096, %y
   %r = urem i32 %x, %shl.y
   store i32 %r, ptr addrspace(1) %out
@@ -5933,6 +7699,18 @@ define amdgpu_kernel void @urem_v2i32_pow2k_denom(ptr addrspace(1) %out, <2 x i3
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX942-LABEL: urem_v2i32_pow2k_denom:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_and_b32 s2, s2, 0xfff
+; GFX942-NEXT:    s_and_b32 s3, s3, 0xfff
+; GFX942-NEXT:    v_mov_b32_e32 v2, s2
+; GFX942-NEXT:    v_mov_b32_e32 v3, s3
+; GFX942-NEXT:    global_store_dwordx2 v0, v[2:3], s[0:1]
+; GFX942-NEXT:    s_endpgm
   %r = urem <2 x i32> %x, <i32 4096, i32 4096>
   store <2 x i32> %r, ptr addrspace(1) %out
   ret void
@@ -6103,6 +7881,56 @@ define amdgpu_kernel void @urem_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX942-LABEL: urem_v2i32_pow2_shl_denom:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_lshl_b32 s7, 0x1000, s2
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v0, s7
+; GFX942-NEXT:    s_lshl_b32 s6, 0x1000, s3
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v1, s6
+; GFX942-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GFX942-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; GFX942-NEXT:    s_sub_i32 s4, 0, s7
+; GFX942-NEXT:    v_rcp_iflag_f32_e32 v1, v1
+; GFX942-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX942-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX942-NEXT:    v_readfirstlane_b32 s5, v0
+; GFX942-NEXT:    s_mul_i32 s4, s4, s5
+; GFX942-NEXT:    s_mul_hi_u32 s4, s5, s4
+; GFX942-NEXT:    s_add_i32 s5, s5, s4
+; GFX942-NEXT:    s_mul_hi_u32 s4, s0, s5
+; GFX942-NEXT:    s_mul_i32 s4, s4, s7
+; GFX942-NEXT:    s_sub_i32 s0, s0, s4
+; GFX942-NEXT:    s_sub_i32 s4, s0, s7
+; GFX942-NEXT:    s_cmp_ge_u32 s0, s7
+; GFX942-NEXT:    s_cselect_b32 s0, s4, s0
+; GFX942-NEXT:    s_sub_i32 s4, s0, s7
+; GFX942-NEXT:    s_cmp_ge_u32 s0, s7
+; GFX942-NEXT:    v_readfirstlane_b32 s8, v1
+; GFX942-NEXT:    s_cselect_b32 s0, s4, s0
+; GFX942-NEXT:    s_sub_i32 s4, 0, s6
+; GFX942-NEXT:    s_mul_i32 s4, s4, s8
+; GFX942-NEXT:    s_mul_hi_u32 s4, s8, s4
+; GFX942-NEXT:    s_add_i32 s8, s8, s4
+; GFX942-NEXT:    s_mul_hi_u32 s4, s1, s8
+; GFX942-NEXT:    s_mul_i32 s4, s4, s6
+; GFX942-NEXT:    s_sub_i32 s1, s1, s4
+; GFX942-NEXT:    s_sub_i32 s4, s1, s6
+; GFX942-NEXT:    s_cmp_ge_u32 s1, s6
+; GFX942-NEXT:    s_cselect_b32 s1, s4, s1
+; GFX942-NEXT:    s_sub_i32 s4, s1, s6
+; GFX942-NEXT:    s_cmp_ge_u32 s1, s6
+; GFX942-NEXT:    s_cselect_b32 s1, s4, s1
+; GFX942-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-NEXT:    v_mov_b32_e32 v1, s1
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX942-NEXT:    s_endpgm
   %shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y
   %r = urem <2 x i32> %x, %shl.y
   store <2 x i32> %r, ptr addrspace(1) %out
@@ -6145,6 +7973,21 @@ define amdgpu_kernel void @sdiv_i32_oddk_denom(ptr addrspace(1) %out, i32 %x) {
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX942-LABEL: sdiv_i32_oddk_denom:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dword s2, s[4:5], 0x2c
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_mul_hi_i32 s3, s2, 0xd9528441
+; GFX942-NEXT:    s_add_i32 s3, s3, s2
+; GFX942-NEXT:    s_lshr_b32 s2, s3, 31
+; GFX942-NEXT:    s_ashr_i32 s3, s3, 20
+; GFX942-NEXT:    s_add_i32 s2, s3, s2
+; GFX942-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX942-NEXT:    s_endpgm
   %r = sdiv i32 %x, 1235195
   store i32 %r, ptr addrspace(1) %out
   ret void
@@ -6184,6 +8027,20 @@ define amdgpu_kernel void @sdiv_i32_pow2k_denom(ptr addrspace(1) %out, i32 %x) {
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX942-LABEL: sdiv_i32_pow2k_denom:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dword s2, s[4:5], 0x2c
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_ashr_i32 s3, s2, 31
+; GFX942-NEXT:    s_lshr_b32 s3, s3, 20
+; GFX942-NEXT:    s_add_i32 s2, s2, s3
+; GFX942-NEXT:    s_ashr_i32 s2, s2, 12
+; GFX942-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX942-NEXT:    s_endpgm
   %r = sdiv i32 %x, 4096
   store i32 %r, ptr addrspace(1) %out
   ret void
@@ -6277,6 +8134,46 @@ define amdgpu_kernel void @sdiv_i32_pow2_shl_denom(ptr addrspace(1) %out, i32 %x
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX942-LABEL: sdiv_i32_pow2_shl_denom:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_lshl_b32 s3, 0x1000, s3
+; GFX942-NEXT:    s_ashr_i32 s4, s3, 31
+; GFX942-NEXT:    s_add_i32 s3, s3, s4
+; GFX942-NEXT:    s_xor_b32 s3, s3, s4
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v0, s3
+; GFX942-NEXT:    s_sub_i32 s6, 0, s3
+; GFX942-NEXT:    s_ashr_i32 s5, s2, 31
+; GFX942-NEXT:    s_add_i32 s2, s2, s5
+; GFX942-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; GFX942-NEXT:    s_xor_b32 s2, s2, s5
+; GFX942-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_readfirstlane_b32 s7, v0
+; GFX942-NEXT:    s_mul_i32 s6, s6, s7
+; GFX942-NEXT:    s_mul_hi_u32 s6, s7, s6
+; GFX942-NEXT:    s_add_i32 s7, s7, s6
+; GFX942-NEXT:    s_mul_hi_u32 s6, s2, s7
+; GFX942-NEXT:    s_mul_i32 s8, s6, s3
+; GFX942-NEXT:    s_sub_i32 s2, s2, s8
+; GFX942-NEXT:    s_add_i32 s7, s6, 1
+; GFX942-NEXT:    s_sub_i32 s8, s2, s3
+; GFX942-NEXT:    s_cmp_ge_u32 s2, s3
+; GFX942-NEXT:    s_cselect_b32 s6, s7, s6
+; GFX942-NEXT:    s_cselect_b32 s2, s8, s2
+; GFX942-NEXT:    s_add_i32 s7, s6, 1
+; GFX942-NEXT:    s_cmp_ge_u32 s2, s3
+; GFX942-NEXT:    s_cselect_b32 s2, s7, s6
+; GFX942-NEXT:    s_xor_b32 s3, s5, s4
+; GFX942-NEXT:    s_xor_b32 s2, s2, s3
+; GFX942-NEXT:    s_sub_i32 s2, s2, s3
+; GFX942-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX942-NEXT:    s_endpgm
   %shl.y = shl i32 4096, %y
   %r = sdiv i32 %x, %shl.y
   store i32 %r, ptr addrspace(1) %out
@@ -6332,6 +8229,24 @@ define amdgpu_kernel void @sdiv_v2i32_pow2k_denom(ptr addrspace(1) %out, <2 x i3
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX942-LABEL: sdiv_v2i32_pow2k_denom:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_ashr_i32 s4, s2, 31
+; GFX942-NEXT:    s_ashr_i32 s5, s3, 31
+; GFX942-NEXT:    s_lshr_b32 s4, s4, 20
+; GFX942-NEXT:    s_lshr_b32 s5, s5, 20
+; GFX942-NEXT:    s_add_i32 s2, s2, s4
+; GFX942-NEXT:    s_add_i32 s3, s3, s5
+; GFX942-NEXT:    s_ashr_i32 s2, s2, 12
+; GFX942-NEXT:    s_ashr_i32 s3, s3, 12
+; GFX942-NEXT:    v_mov_b32_e32 v2, s2
+; GFX942-NEXT:    v_mov_b32_e32 v3, s3
+; GFX942-NEXT:    global_store_dwordx2 v0, v[2:3], s[0:1]
+; GFX942-NEXT:    s_endpgm
   %r = sdiv <2 x i32> %x, <i32 4096, i32 4096>
   store <2 x i32> %r, ptr addrspace(1) %out
   ret void
@@ -6388,6 +8303,25 @@ define amdgpu_kernel void @ssdiv_v2i32_mixed_pow2k_denom(ptr addrspace(1) %out,
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s4
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX942-LABEL: ssdiv_v2i32_mixed_pow2k_denom:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_ashr_i32 s4, s2, 31
+; GFX942-NEXT:    s_mul_hi_i32 s5, s3, 0x80080081
+; GFX942-NEXT:    s_lshr_b32 s4, s4, 20
+; GFX942-NEXT:    s_add_i32 s5, s5, s3
+; GFX942-NEXT:    s_add_i32 s2, s2, s4
+; GFX942-NEXT:    s_lshr_b32 s3, s5, 31
+; GFX942-NEXT:    s_ashr_i32 s4, s5, 11
+; GFX942-NEXT:    s_ashr_i32 s2, s2, 12
+; GFX942-NEXT:    s_add_i32 s4, s4, s3
+; GFX942-NEXT:    v_mov_b32_e32 v2, s2
+; GFX942-NEXT:    v_mov_b32_e32 v3, s4
+; GFX942-NEXT:    global_store_dwordx2 v0, v[2:3], s[0:1]
+; GFX942-NEXT:    s_endpgm
   %r = sdiv <2 x i32> %x, <i32 4096, i32 4095>
   store <2 x i32> %r, ptr addrspace(1) %out
   ret void
@@ -6614,6 +8548,73 @@ define amdgpu_kernel void @sdiv_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX942-LABEL: sdiv_v2i32_pow2_shl_denom:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
+; GFX942-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_lshl_b32 s2, 0x1000, s2
+; GFX942-NEXT:    s_abs_i32 s6, s2
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v0, s6
+; GFX942-NEXT:    s_lshl_b32 s7, 0x1000, s3
+; GFX942-NEXT:    s_abs_i32 s3, s0
+; GFX942-NEXT:    s_xor_b32 s0, s0, s2
+; GFX942-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; GFX942-NEXT:    s_sub_i32 s2, 0, s6
+; GFX942-NEXT:    s_ashr_i32 s0, s0, 31
+; GFX942-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_readfirstlane_b32 s8, v0
+; GFX942-NEXT:    s_mul_i32 s2, s2, s8
+; GFX942-NEXT:    s_mul_hi_u32 s2, s8, s2
+; GFX942-NEXT:    s_add_i32 s8, s8, s2
+; GFX942-NEXT:    s_mul_hi_u32 s2, s3, s8
+; GFX942-NEXT:    s_mul_i32 s8, s2, s6
+; GFX942-NEXT:    s_sub_i32 s3, s3, s8
+; GFX942-NEXT:    s_add_i32 s9, s2, 1
+; GFX942-NEXT:    s_sub_i32 s8, s3, s6
+; GFX942-NEXT:    s_cmp_ge_u32 s3, s6
+; GFX942-NEXT:    s_cselect_b32 s2, s9, s2
+; GFX942-NEXT:    s_cselect_b32 s3, s8, s3
+; GFX942-NEXT:    s_add_i32 s8, s2, 1
+; GFX942-NEXT:    s_cmp_ge_u32 s3, s6
+; GFX942-NEXT:    s_cselect_b32 s6, s8, s2
+; GFX942-NEXT:    s_abs_i32 s8, s7
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v0, s8
+; GFX942-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GFX942-NEXT:    s_xor_b32 s5, s6, s0
+; GFX942-NEXT:    s_sub_i32 s6, 0, s8
+; GFX942-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; GFX942-NEXT:    s_sub_i32 s0, s5, s0
+; GFX942-NEXT:    s_xor_b32 s4, s1, s7
+; GFX942-NEXT:    s_abs_i32 s1, s1
+; GFX942-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX942-NEXT:    s_ashr_i32 s4, s4, 31
+; GFX942-NEXT:    v_mov_b32_e32 v2, s0
+; GFX942-NEXT:    v_readfirstlane_b32 s5, v0
+; GFX942-NEXT:    s_mul_i32 s6, s6, s5
+; GFX942-NEXT:    s_mul_hi_u32 s6, s5, s6
+; GFX942-NEXT:    s_add_i32 s5, s5, s6
+; GFX942-NEXT:    s_mul_hi_u32 s5, s1, s5
+; GFX942-NEXT:    s_mul_i32 s6, s5, s8
+; GFX942-NEXT:    s_sub_i32 s1, s1, s6
+; GFX942-NEXT:    s_add_i32 s7, s5, 1
+; GFX942-NEXT:    s_sub_i32 s6, s1, s8
+; GFX942-NEXT:    s_cmp_ge_u32 s1, s8
+; GFX942-NEXT:    s_cselect_b32 s5, s7, s5
+; GFX942-NEXT:    s_cselect_b32 s1, s6, s1
+; GFX942-NEXT:    s_add_i32 s6, s5, 1
+; GFX942-NEXT:    s_cmp_ge_u32 s1, s8
+; GFX942-NEXT:    s_cselect_b32 s1, s6, s5
+; GFX942-NEXT:    s_xor_b32 s1, s1, s4
+; GFX942-NEXT:    s_sub_i32 s1, s1, s4
+; GFX942-NEXT:    v_mov_b32_e32 v3, s1
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    global_store_dwordx2 v1, v[2:3], s[2:3]
+; GFX942-NEXT:    s_endpgm
   %shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y
   %r = sdiv <2 x i32> %x, %shl.y
   store <2 x i32> %r, ptr addrspace(1) %out
@@ -6662,6 +8663,23 @@ define amdgpu_kernel void @srem_i32_oddk_denom(ptr addrspace(1) %out, i32 %x) {
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX942-LABEL: srem_i32_oddk_denom:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dword s2, s[4:5], 0x2c
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_mul_hi_i32 s3, s2, 0xd9528441
+; GFX942-NEXT:    s_add_i32 s3, s3, s2
+; GFX942-NEXT:    s_lshr_b32 s4, s3, 31
+; GFX942-NEXT:    s_ashr_i32 s3, s3, 20
+; GFX942-NEXT:    s_add_i32 s3, s3, s4
+; GFX942-NEXT:    s_mul_i32 s3, s3, 0x12d8fb
+; GFX942-NEXT:    s_sub_i32 s2, s2, s3
+; GFX942-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX942-NEXT:    s_endpgm
   %r = srem i32 %x, 1235195
   store i32 %r, ptr addrspace(1) %out
   ret void
@@ -6703,6 +8721,21 @@ define amdgpu_kernel void @srem_i32_pow2k_denom(ptr addrspace(1) %out, i32 %x) {
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX942-LABEL: srem_i32_pow2k_denom:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dword s2, s[4:5], 0x2c
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_ashr_i32 s3, s2, 31
+; GFX942-NEXT:    s_lshr_b32 s3, s3, 20
+; GFX942-NEXT:    s_add_i32 s3, s2, s3
+; GFX942-NEXT:    s_and_b32 s3, s3, 0xfffff000
+; GFX942-NEXT:    s_sub_i32 s2, s2, s3
+; GFX942-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX942-NEXT:    s_endpgm
   %r = srem i32 %x, 4096
   store i32 %r, ptr addrspace(1) %out
   ret void
@@ -6787,6 +8820,43 @@ define amdgpu_kernel void @srem_i32_pow2_shl_denom(ptr addrspace(1) %out, i32 %x
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX9-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX942-LABEL: srem_i32_pow2_shl_denom:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_lshl_b32 s3, 0x1000, s3
+; GFX942-NEXT:    s_ashr_i32 s4, s3, 31
+; GFX942-NEXT:    s_add_i32 s3, s3, s4
+; GFX942-NEXT:    s_xor_b32 s3, s3, s4
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v0, s3
+; GFX942-NEXT:    s_sub_i32 s5, 0, s3
+; GFX942-NEXT:    s_ashr_i32 s4, s2, 31
+; GFX942-NEXT:    s_add_i32 s2, s2, s4
+; GFX942-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; GFX942-NEXT:    s_xor_b32 s2, s2, s4
+; GFX942-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_readfirstlane_b32 s6, v0
+; GFX942-NEXT:    s_mul_i32 s5, s5, s6
+; GFX942-NEXT:    s_mul_hi_u32 s5, s6, s5
+; GFX942-NEXT:    s_add_i32 s6, s6, s5
+; GFX942-NEXT:    s_mul_hi_u32 s5, s2, s6
+; GFX942-NEXT:    s_mul_i32 s5, s5, s3
+; GFX942-NEXT:    s_sub_i32 s2, s2, s5
+; GFX942-NEXT:    s_sub_i32 s5, s2, s3
+; GFX942-NEXT:    s_cmp_ge_u32 s2, s3
+; GFX942-NEXT:    s_cselect_b32 s2, s5, s2
+; GFX942-NEXT:    s_sub_i32 s5, s2, s3
+; GFX942-NEXT:    s_cmp_ge_u32 s2, s3
+; GFX942-NEXT:    s_cselect_b32 s2, s5, s2
+; GFX942-NEXT:    s_xor_b32 s2, s2, s4
+; GFX942-NEXT:    s_sub_i32 s2, s2, s4
+; GFX942-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX942-NEXT:    s_endpgm
   %shl.y = shl i32 4096, %y
   %r = srem i32 %x, %shl.y
   store i32 %r, ptr addrspace(1) %out
@@ -6846,6 +8916,26 @@ define amdgpu_kernel void @srem_v2i32_pow2k_denom(ptr addrspace(1) %out, <2 x i3
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX942-LABEL: srem_v2i32_pow2k_denom:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_ashr_i32 s4, s2, 31
+; GFX942-NEXT:    s_ashr_i32 s5, s3, 31
+; GFX942-NEXT:    s_lshr_b32 s4, s4, 20
+; GFX942-NEXT:    s_lshr_b32 s5, s5, 20
+; GFX942-NEXT:    s_add_i32 s4, s2, s4
+; GFX942-NEXT:    s_add_i32 s5, s3, s5
+; GFX942-NEXT:    s_and_b32 s4, s4, 0xfffff000
+; GFX942-NEXT:    s_sub_i32 s2, s2, s4
+; GFX942-NEXT:    s_and_b32 s4, s5, 0xfffff000
+; GFX942-NEXT:    s_sub_i32 s3, s3, s4
+; GFX942-NEXT:    v_mov_b32_e32 v2, s2
+; GFX942-NEXT:    v_mov_b32_e32 v3, s3
+; GFX942-NEXT:    global_store_dwordx2 v0, v[2:3], s[0:1]
+; GFX942-NEXT:    s_endpgm
   %r = srem <2 x i32> %x, <i32 4096, i32 4096>
   store <2 x i32> %r, ptr addrspace(1) %out
   ret void
@@ -7052,6 +9142,67 @@ define amdgpu_kernel void @srem_v2i32_pow2_shl_denom(ptr addrspace(1) %out, <2 x
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX942-LABEL: srem_v2i32_pow2_shl_denom:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x2c
+; GFX942-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_lshl_b32 s2, 0x1000, s2
+; GFX942-NEXT:    s_abs_i32 s2, s2
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v0, s2
+; GFX942-NEXT:    s_sub_i32 s7, 0, s2
+; GFX942-NEXT:    s_ashr_i32 s6, s0, 31
+; GFX942-NEXT:    s_abs_i32 s0, s0
+; GFX942-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; GFX942-NEXT:    s_lshl_b32 s3, 0x1000, s3
+; GFX942-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_readfirstlane_b32 s8, v0
+; GFX942-NEXT:    s_mul_i32 s7, s7, s8
+; GFX942-NEXT:    s_mul_hi_u32 s7, s8, s7
+; GFX942-NEXT:    s_add_i32 s8, s8, s7
+; GFX942-NEXT:    s_mul_hi_u32 s7, s0, s8
+; GFX942-NEXT:    s_mul_i32 s7, s7, s2
+; GFX942-NEXT:    s_sub_i32 s0, s0, s7
+; GFX942-NEXT:    s_sub_i32 s7, s0, s2
+; GFX942-NEXT:    s_cmp_ge_u32 s0, s2
+; GFX942-NEXT:    s_cselect_b32 s0, s7, s0
+; GFX942-NEXT:    s_sub_i32 s7, s0, s2
+; GFX942-NEXT:    s_cmp_ge_u32 s0, s2
+; GFX942-NEXT:    s_cselect_b32 s0, s7, s0
+; GFX942-NEXT:    s_abs_i32 s7, s3
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v0, s7
+; GFX942-NEXT:    s_xor_b32 s0, s0, s6
+; GFX942-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GFX942-NEXT:    s_sub_i32 s5, 0, s7
+; GFX942-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; GFX942-NEXT:    s_sub_i32 s0, s0, s6
+; GFX942-NEXT:    s_ashr_i32 s4, s1, 31
+; GFX942-NEXT:    s_abs_i32 s1, s1
+; GFX942-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX942-NEXT:    v_mov_b32_e32 v2, s0
+; GFX942-NEXT:    v_readfirstlane_b32 s6, v0
+; GFX942-NEXT:    s_mul_i32 s5, s5, s6
+; GFX942-NEXT:    s_mul_hi_u32 s5, s6, s5
+; GFX942-NEXT:    s_add_i32 s6, s6, s5
+; GFX942-NEXT:    s_mul_hi_u32 s5, s1, s6
+; GFX942-NEXT:    s_mul_i32 s5, s5, s7
+; GFX942-NEXT:    s_sub_i32 s1, s1, s5
+; GFX942-NEXT:    s_sub_i32 s5, s1, s7
+; GFX942-NEXT:    s_cmp_ge_u32 s1, s7
+; GFX942-NEXT:    s_cselect_b32 s1, s5, s1
+; GFX942-NEXT:    s_sub_i32 s5, s1, s7
+; GFX942-NEXT:    s_cmp_ge_u32 s1, s7
+; GFX942-NEXT:    s_cselect_b32 s1, s5, s1
+; GFX942-NEXT:    s_xor_b32 s1, s1, s4
+; GFX942-NEXT:    s_sub_i32 s1, s1, s4
+; GFX942-NEXT:    v_mov_b32_e32 v3, s1
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    global_store_dwordx2 v1, v[2:3], s[2:3]
+; GFX942-NEXT:    s_endpgm
   %shl.y = shl <2 x i32> <i32 4096, i32 4096>, %y
   %r = srem <2 x i32> %x, %shl.y
   store <2 x i32> %r, ptr addrspace(1) %out
@@ -7118,6 +9269,31 @@ define amdgpu_kernel void @udiv_i64_oddk_denom(ptr addrspace(1) %out, i64 %x) {
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX9-NEXT:    global_store_dwordx2 v1, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX942-LABEL: udiv_i64_oddk_denom:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_mul_hi_u32 s4, s2, 0x38f83e5
+; GFX942-NEXT:    s_mul_i32 s5, s2, 0x38f83e5
+; GFX942-NEXT:    s_mul_i32 s7, s3, 0x64c139ef
+; GFX942-NEXT:    s_mul_hi_u32 s2, s2, 0x64c139ef
+; GFX942-NEXT:    s_mul_hi_u32 s6, s3, 0x64c139ef
+; GFX942-NEXT:    s_add_u32 s2, s7, s2
+; GFX942-NEXT:    s_addc_u32 s6, s6, 0
+; GFX942-NEXT:    s_add_u32 s2, s5, s2
+; GFX942-NEXT:    s_addc_u32 s2, s4, 0
+; GFX942-NEXT:    s_add_u32 s2, s6, s2
+; GFX942-NEXT:    s_addc_u32 s4, 0, 0
+; GFX942-NEXT:    s_mul_hi_u32 s5, s3, 0x38f83e5
+; GFX942-NEXT:    s_mul_i32 s3, s3, 0x38f83e5
+; GFX942-NEXT:    s_add_u32 s2, s3, s2
+; GFX942-NEXT:    s_addc_u32 s2, s5, s4
+; GFX942-NEXT:    s_lshr_b32 s2, s2, 2
+; GFX942-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-NEXT:    global_store_dwordx2 v1, v[0:1], s[0:1]
+; GFX942-NEXT:    s_endpgm
   %r = udiv i64 %x, 1235195949943
   store i64 %r, ptr addrspace(1) %out
   ret void
@@ -7153,6 +9329,16 @@ define amdgpu_kernel void @udiv_i64_pow2k_denom(ptr addrspace(1) %out, i64 %x) {
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX942-LABEL: udiv_i64_pow2k_denom:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_lshr_b64 s[2:3], s[2:3], 12
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX942-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX942-NEXT:    s_endpgm
   %r = udiv i64 %x, 4096
   store i64 %r, ptr addrspace(1) %out
   ret void
@@ -7193,6 +9379,18 @@ define amdgpu_kernel void @udiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX942-LABEL: udiv_i64_pow2_shl_denom:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dword s6, s[4:5], 0x34
+; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_add_i32 s6, s6, 12
+; GFX942-NEXT:    s_lshr_b64 s[2:3], s[2:3], s6
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX942-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX942-NEXT:    s_endpgm
   %shl.y = shl i64 4096, %y
   %r = udiv i64 %x, %shl.y
   store i64 %r, ptr addrspace(1) %out
@@ -7240,6 +9438,21 @@ define amdgpu_kernel void @udiv_v2i64_pow2k_denom(ptr addrspace(1) %out, <2 x i6
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s3
 ; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7]
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX942-LABEL: udiv_v2i64_pow2k_denom:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x34
+; GFX942-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_lshr_b64 s[0:1], s[0:1], 12
+; GFX942-NEXT:    s_lshr_b64 s[2:3], s[2:3], 12
+; GFX942-NEXT:    v_mov_b32_e32 v2, s0
+; GFX942-NEXT:    v_mov_b32_e32 v3, s1
+; GFX942-NEXT:    v_mov_b32_e32 v4, s2
+; GFX942-NEXT:    v_mov_b32_e32 v5, s3
+; GFX942-NEXT:    global_store_dwordx4 v0, v[2:5], s[6:7]
+; GFX942-NEXT:    s_endpgm
   %r = udiv <2 x i64> %x, <i64 4096, i64 4096>
   store <2 x i64> %r, ptr addrspace(1) %out
   ret void
@@ -7327,6 +9540,41 @@ define amdgpu_kernel void @udiv_v2i64_mixed_pow2k_denom(ptr addrspace(1) %out, <
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s3
 ; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7]
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX942-LABEL: udiv_v2i64_mixed_pow2k_denom:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x34
+; GFX942-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_lshr_b64 s[0:1], s[0:1], 12
+; GFX942-NEXT:    s_mul_i32 s9, s3, 0x10010011
+; GFX942-NEXT:    s_mul_hi_u32 s10, s2, 0x10010011
+; GFX942-NEXT:    s_mul_hi_u32 s8, s3, 0x10010011
+; GFX942-NEXT:    s_add_u32 s9, s9, s10
+; GFX942-NEXT:    s_mul_i32 s5, s2, 0x100100
+; GFX942-NEXT:    s_addc_u32 s8, s8, 0
+; GFX942-NEXT:    s_mul_hi_u32 s4, s2, 0x100100
+; GFX942-NEXT:    s_add_u32 s5, s5, s9
+; GFX942-NEXT:    s_addc_u32 s4, s4, 0
+; GFX942-NEXT:    s_add_u32 s4, s8, s4
+; GFX942-NEXT:    s_addc_u32 s5, 0, 0
+; GFX942-NEXT:    s_mul_i32 s9, s3, 0x100100
+; GFX942-NEXT:    s_mul_hi_u32 s8, s3, 0x100100
+; GFX942-NEXT:    s_add_u32 s4, s9, s4
+; GFX942-NEXT:    s_addc_u32 s5, s8, s5
+; GFX942-NEXT:    s_sub_u32 s2, s2, s4
+; GFX942-NEXT:    s_subb_u32 s3, s3, s5
+; GFX942-NEXT:    s_lshr_b64 s[2:3], s[2:3], 1
+; GFX942-NEXT:    s_add_u32 s2, s2, s4
+; GFX942-NEXT:    s_addc_u32 s3, s3, s5
+; GFX942-NEXT:    s_lshr_b64 s[2:3], s[2:3], 11
+; GFX942-NEXT:    v_mov_b32_e32 v2, s0
+; GFX942-NEXT:    v_mov_b32_e32 v3, s1
+; GFX942-NEXT:    v_mov_b32_e32 v4, s2
+; GFX942-NEXT:    v_mov_b32_e32 v5, s3
+; GFX942-NEXT:    global_store_dwordx4 v0, v[2:5], s[6:7]
+; GFX942-NEXT:    s_endpgm
   %r = udiv <2 x i64> %x, <i64 4096, i64 4095>
   store <2 x i64> %r, ptr addrspace(1) %out
   ret void
@@ -7380,6 +9628,23 @@ define amdgpu_kernel void @udiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s5
 ; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX942-LABEL: udiv_v2i64_pow2_shl_denom:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x34
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_add_i32 s2, s12, 12
+; GFX942-NEXT:    s_add_i32 s4, s14, 12
+; GFX942-NEXT:    s_lshr_b64 s[2:3], s[8:9], s2
+; GFX942-NEXT:    s_lshr_b64 s[4:5], s[10:11], s4
+; GFX942-NEXT:    v_mov_b32_e32 v2, s2
+; GFX942-NEXT:    v_mov_b32_e32 v3, s3
+; GFX942-NEXT:    v_mov_b32_e32 v4, s4
+; GFX942-NEXT:    v_mov_b32_e32 v5, s5
+; GFX942-NEXT:    global_store_dwordx4 v0, v[2:5], s[0:1]
+; GFX942-NEXT:    s_endpgm
   %shl.y = shl <2 x i64> <i64 4096, i64 4096>, %y
   %r = udiv <2 x i64> %x, %shl.y
   store <2 x i64> %r, ptr addrspace(1) %out
@@ -7461,6 +9726,37 @@ define amdgpu_kernel void @urem_i64_oddk_denom(ptr addrspace(1) %out, i64 %x) {
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX942-LABEL: urem_i64_oddk_denom:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_mul_i32 s7, s3, 0xf6841139
+; GFX942-NEXT:    s_mul_hi_u32 s8, s2, 0xf6841139
+; GFX942-NEXT:    s_mul_hi_u32 s6, s3, 0xf6841139
+; GFX942-NEXT:    s_add_u32 s7, s7, s8
+; GFX942-NEXT:    s_mul_i32 s5, s2, 0xe3e10011
+; GFX942-NEXT:    s_addc_u32 s6, s6, 0
+; GFX942-NEXT:    s_mul_hi_u32 s4, s2, 0xe3e10011
+; GFX942-NEXT:    s_add_u32 s5, s5, s7
+; GFX942-NEXT:    s_addc_u32 s4, s4, 0
+; GFX942-NEXT:    s_add_u32 s4, s6, s4
+; GFX942-NEXT:    s_addc_u32 s5, 0, 0
+; GFX942-NEXT:    s_mul_i32 s7, s3, 0xe3e10011
+; GFX942-NEXT:    s_mul_hi_u32 s6, s3, 0xe3e10011
+; GFX942-NEXT:    s_add_u32 s4, s7, s4
+; GFX942-NEXT:    s_addc_u32 s4, s6, s5
+; GFX942-NEXT:    s_lshr_b32 s4, s4, 8
+; GFX942-NEXT:    s_mul_i32 s5, s4, 0x11f
+; GFX942-NEXT:    s_mul_hi_u32 s6, s4, 0x9761f7c9
+; GFX942-NEXT:    s_add_i32 s6, s6, s5
+; GFX942-NEXT:    s_mul_i32 s4, s4, 0x9761f7c9
+; GFX942-NEXT:    s_sub_u32 s2, s2, s4
+; GFX942-NEXT:    s_subb_u32 s3, s3, s6
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX942-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX942-NEXT:    s_endpgm
   %r = urem i64 %x, 1235195393993
   store i64 %r, ptr addrspace(1) %out
   ret void
@@ -7495,6 +9791,16 @@ define amdgpu_kernel void @urem_i64_pow2k_denom(ptr addrspace(1) %out, i64 %x) {
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX9-NEXT:    global_store_dwordx2 v1, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX942-LABEL: urem_i64_pow2k_denom:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_and_b32 s2, s2, 0xfff
+; GFX942-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-NEXT:    global_store_dwordx2 v1, v[0:1], s[0:1]
+; GFX942-NEXT:    s_endpgm
   %r = urem i64 %x, 4096
   store i64 %r, ptr addrspace(1) %out
   ret void
@@ -7539,6 +9845,20 @@ define amdgpu_kernel void @urem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX942-LABEL: urem_i64_pow2_shl_denom:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dword s6, s[4:5], 0x34
+; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_lshl_b64 s[4:5], 0x1000, s6
+; GFX942-NEXT:    s_add_u32 s4, s4, -1
+; GFX942-NEXT:    s_addc_u32 s5, s5, -1
+; GFX942-NEXT:    s_and_b64 s[2:3], s[2:3], s[4:5]
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX942-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX942-NEXT:    s_endpgm
   %shl.y = shl i64 4096, %y
   %r = urem i64 %x, %shl.y
   store i64 %r, ptr addrspace(1) %out
@@ -7585,6 +9905,20 @@ define amdgpu_kernel void @urem_v2i64_pow2k_denom(ptr addrspace(1) %out, <2 x i6
 ; GFX9-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX9-NEXT:    global_store_dwordx4 v1, v[0:3], s[6:7]
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX942-LABEL: urem_v2i64_pow2k_denom:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x34
+; GFX942-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_and_b32 s0, s0, 0xfff
+; GFX942-NEXT:    s_and_b32 s1, s2, 0xfff
+; GFX942-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-NEXT:    v_mov_b32_e32 v2, s1
+; GFX942-NEXT:    global_store_dwordx4 v1, v[0:3], s[6:7]
+; GFX942-NEXT:    s_endpgm
   %r = urem <2 x i64> %x, <i64 4096, i64 4096>
   store <2 x i64> %r, ptr addrspace(1) %out
   ret void
@@ -7646,6 +9980,27 @@ define amdgpu_kernel void @urem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s3
 ; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX942-LABEL: urem_v2i64_pow2_shl_denom:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x34
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_lshl_b64 s[2:3], 0x1000, s14
+; GFX942-NEXT:    s_lshl_b64 s[4:5], 0x1000, s12
+; GFX942-NEXT:    s_add_u32 s4, s4, -1
+; GFX942-NEXT:    s_addc_u32 s5, s5, -1
+; GFX942-NEXT:    s_and_b64 s[4:5], s[8:9], s[4:5]
+; GFX942-NEXT:    s_add_u32 s2, s2, -1
+; GFX942-NEXT:    s_addc_u32 s3, s3, -1
+; GFX942-NEXT:    s_and_b64 s[2:3], s[10:11], s[2:3]
+; GFX942-NEXT:    v_mov_b32_e32 v2, s4
+; GFX942-NEXT:    v_mov_b32_e32 v3, s5
+; GFX942-NEXT:    v_mov_b32_e32 v4, s2
+; GFX942-NEXT:    v_mov_b32_e32 v5, s3
+; GFX942-NEXT:    global_store_dwordx4 v0, v[2:5], s[0:1]
+; GFX942-NEXT:    s_endpgm
   %shl.y = shl <2 x i64> <i64 4096, i64 4096>, %y
   %r = urem <2 x i64> %x, %shl.y
   store <2 x i64> %r, ptr addrspace(1) %out
@@ -7734,6 +10089,42 @@ define amdgpu_kernel void @sdiv_i64_oddk_denom(ptr addrspace(1) %out, i64 %x) {
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX942-LABEL: sdiv_i64_oddk_denom:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_mul_hi_u32 s4, s2, 0x6ca94220
+; GFX942-NEXT:    s_mul_i32 s5, s2, 0x6ca94220
+; GFX942-NEXT:    s_mul_i32 s7, s3, 0xfd81e19
+; GFX942-NEXT:    s_mul_hi_u32 s2, s2, 0xfd81e19
+; GFX942-NEXT:    s_mul_hi_u32 s6, s3, 0xfd81e19
+; GFX942-NEXT:    s_add_u32 s2, s7, s2
+; GFX942-NEXT:    s_addc_u32 s6, s6, 0
+; GFX942-NEXT:    s_add_u32 s2, s5, s2
+; GFX942-NEXT:    s_addc_u32 s2, s4, 0
+; GFX942-NEXT:    s_add_u32 s2, s6, s2
+; GFX942-NEXT:    s_addc_u32 s4, 0, 0
+; GFX942-NEXT:    s_mul_i32 s6, s3, 0x6ca94220
+; GFX942-NEXT:    s_mul_hi_u32 s5, s3, 0x6ca94220
+; GFX942-NEXT:    s_add_u32 s2, s6, s2
+; GFX942-NEXT:    s_addc_u32 s4, s5, s4
+; GFX942-NEXT:    s_ashr_i32 s3, s3, 31
+; GFX942-NEXT:    s_mul_i32 s5, s3, 0x6ca94220
+; GFX942-NEXT:    s_mul_hi_u32 s6, s3, 0xfd81e19
+; GFX942-NEXT:    s_add_i32 s5, s6, s5
+; GFX942-NEXT:    s_mul_i32 s3, s3, 0xfd81e19
+; GFX942-NEXT:    s_add_i32 s5, s5, s3
+; GFX942-NEXT:    s_add_u32 s2, s2, s3
+; GFX942-NEXT:    s_addc_u32 s3, s4, s5
+; GFX942-NEXT:    s_ashr_i64 s[4:5], s[2:3], 19
+; GFX942-NEXT:    s_lshr_b32 s2, s3, 31
+; GFX942-NEXT:    s_add_u32 s2, s4, s2
+; GFX942-NEXT:    s_addc_u32 s3, s5, 0
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX942-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX942-NEXT:    s_endpgm
   %r = sdiv i64 %x, 1235195
   store i64 %r, ptr addrspace(1) %out
   ret void
@@ -7777,6 +10168,20 @@ define amdgpu_kernel void @sdiv_i64_pow2k_denom(ptr addrspace(1) %out, i64 %x) {
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX942-LABEL: sdiv_i64_pow2k_denom:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_ashr_i32 s4, s3, 31
+; GFX942-NEXT:    s_lshr_b32 s4, s4, 20
+; GFX942-NEXT:    s_add_u32 s2, s2, s4
+; GFX942-NEXT:    s_addc_u32 s3, s3, 0
+; GFX942-NEXT:    s_ashr_i64 s[2:3], s[2:3], 12
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX942-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX942-NEXT:    s_endpgm
   %r = sdiv i64 %x, 4096
   store i64 %r, ptr addrspace(1) %out
   ret void
@@ -8074,6 +10479,158 @@ define amdgpu_kernel void @sdiv_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
 ; GFX9-NEXT:    v_subb_co_u32_e32 v2, vcc, v3, v4, vcc
 ; GFX9-NEXT:    global_store_dwordx2 v0, v[1:2], s[8:9]
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX942-LABEL: sdiv_i64_pow2_shl_denom:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
+; GFX942-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x24
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_lshl_b64 s[0:1], 0x1000, s0
+; GFX942-NEXT:    s_ashr_i32 s2, s1, 31
+; GFX942-NEXT:    s_add_u32 s0, s0, s2
+; GFX942-NEXT:    s_mov_b32 s3, s2
+; GFX942-NEXT:    s_addc_u32 s1, s1, s2
+; GFX942-NEXT:    s_xor_b64 s[6:7], s[0:1], s[2:3]
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v0, s6
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v1, s7
+; GFX942-NEXT:    s_sub_u32 s0, 0, s6
+; GFX942-NEXT:    s_subb_u32 s1, 0, s7
+; GFX942-NEXT:    v_fmamk_f32 v0, v1, 0x4f800000, v0
+; GFX942-NEXT:    v_rcp_f32_e32 v1, v0
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    v_mul_f32_e32 v1, 0x5f7ffffc, v1
+; GFX942-NEXT:    v_mul_f32_e32 v2, 0x2f800000, v1
+; GFX942-NEXT:    v_trunc_f32_e32 v2, v2
+; GFX942-NEXT:    v_fmamk_f32 v1, v2, 0xcf800000, v1
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX942-NEXT:    v_readfirstlane_b32 s4, v2
+; GFX942-NEXT:    v_readfirstlane_b32 s5, v1
+; GFX942-NEXT:    s_mul_i32 s12, s0, s4
+; GFX942-NEXT:    s_mul_hi_u32 s14, s0, s5
+; GFX942-NEXT:    s_mul_i32 s13, s1, s5
+; GFX942-NEXT:    s_add_i32 s12, s14, s12
+; GFX942-NEXT:    s_mul_i32 s15, s0, s5
+; GFX942-NEXT:    s_add_i32 s12, s12, s13
+; GFX942-NEXT:    s_mul_hi_u32 s14, s5, s15
+; GFX942-NEXT:    s_mul_hi_u32 s13, s5, s12
+; GFX942-NEXT:    s_mul_i32 s5, s5, s12
+; GFX942-NEXT:    s_add_u32 s5, s14, s5
+; GFX942-NEXT:    s_addc_u32 s13, 0, s13
+; GFX942-NEXT:    s_mul_hi_u32 s16, s4, s15
+; GFX942-NEXT:    s_mul_i32 s15, s4, s15
+; GFX942-NEXT:    s_add_u32 s5, s5, s15
+; GFX942-NEXT:    s_mul_hi_u32 s14, s4, s12
+; GFX942-NEXT:    s_addc_u32 s5, s13, s16
+; GFX942-NEXT:    s_addc_u32 s13, s14, 0
+; GFX942-NEXT:    s_mul_i32 s12, s4, s12
+; GFX942-NEXT:    s_add_u32 s5, s5, s12
+; GFX942-NEXT:    s_addc_u32 s12, 0, s13
+; GFX942-NEXT:    v_add_co_u32_e32 v1, vcc, s5, v1
+; GFX942-NEXT:    s_cmp_lg_u64 vcc, 0
+; GFX942-NEXT:    s_addc_u32 s4, s4, s12
+; GFX942-NEXT:    v_readfirstlane_b32 s12, v1
+; GFX942-NEXT:    s_mul_i32 s5, s0, s4
+; GFX942-NEXT:    s_mul_hi_u32 s13, s0, s12
+; GFX942-NEXT:    s_add_i32 s5, s13, s5
+; GFX942-NEXT:    s_mul_i32 s1, s1, s12
+; GFX942-NEXT:    s_add_i32 s5, s5, s1
+; GFX942-NEXT:    s_mul_i32 s0, s0, s12
+; GFX942-NEXT:    s_mul_hi_u32 s13, s4, s0
+; GFX942-NEXT:    s_mul_i32 s14, s4, s0
+; GFX942-NEXT:    s_mul_i32 s16, s12, s5
+; GFX942-NEXT:    s_mul_hi_u32 s0, s12, s0
+; GFX942-NEXT:    s_mul_hi_u32 s15, s12, s5
+; GFX942-NEXT:    s_add_u32 s0, s0, s16
+; GFX942-NEXT:    s_addc_u32 s12, 0, s15
+; GFX942-NEXT:    s_add_u32 s0, s0, s14
+; GFX942-NEXT:    s_mul_hi_u32 s1, s4, s5
+; GFX942-NEXT:    s_addc_u32 s0, s12, s13
+; GFX942-NEXT:    s_addc_u32 s1, s1, 0
+; GFX942-NEXT:    s_mul_i32 s5, s4, s5
+; GFX942-NEXT:    s_add_u32 s0, s0, s5
+; GFX942-NEXT:    s_addc_u32 s1, 0, s1
+; GFX942-NEXT:    v_add_co_u32_e32 v1, vcc, s0, v1
+; GFX942-NEXT:    s_cmp_lg_u64 vcc, 0
+; GFX942-NEXT:    s_addc_u32 s12, s4, s1
+; GFX942-NEXT:    s_ashr_i32 s4, s11, 31
+; GFX942-NEXT:    s_add_u32 s0, s10, s4
+; GFX942-NEXT:    s_mov_b32 s5, s4
+; GFX942-NEXT:    s_addc_u32 s1, s11, s4
+; GFX942-NEXT:    s_xor_b64 s[10:11], s[0:1], s[4:5]
+; GFX942-NEXT:    v_readfirstlane_b32 s13, v1
+; GFX942-NEXT:    s_mul_i32 s1, s10, s12
+; GFX942-NEXT:    s_mul_hi_u32 s14, s10, s13
+; GFX942-NEXT:    s_mul_hi_u32 s0, s10, s12
+; GFX942-NEXT:    s_add_u32 s1, s14, s1
+; GFX942-NEXT:    s_addc_u32 s0, 0, s0
+; GFX942-NEXT:    s_mul_hi_u32 s15, s11, s13
+; GFX942-NEXT:    s_mul_i32 s13, s11, s13
+; GFX942-NEXT:    s_add_u32 s1, s1, s13
+; GFX942-NEXT:    s_mul_hi_u32 s14, s11, s12
+; GFX942-NEXT:    s_addc_u32 s0, s0, s15
+; GFX942-NEXT:    s_addc_u32 s1, s14, 0
+; GFX942-NEXT:    s_mul_i32 s12, s11, s12
+; GFX942-NEXT:    s_add_u32 s12, s0, s12
+; GFX942-NEXT:    s_addc_u32 s13, 0, s1
+; GFX942-NEXT:    s_mul_i32 s0, s6, s13
+; GFX942-NEXT:    s_mul_hi_u32 s1, s6, s12
+; GFX942-NEXT:    s_add_i32 s0, s1, s0
+; GFX942-NEXT:    s_mul_i32 s1, s7, s12
+; GFX942-NEXT:    s_add_i32 s14, s0, s1
+; GFX942-NEXT:    s_mul_i32 s1, s6, s12
+; GFX942-NEXT:    v_mov_b32_e32 v1, s1
+; GFX942-NEXT:    s_sub_i32 s0, s11, s14
+; GFX942-NEXT:    v_sub_co_u32_e32 v1, vcc, s10, v1
+; GFX942-NEXT:    s_cmp_lg_u64 vcc, 0
+; GFX942-NEXT:    s_subb_u32 s10, s0, s7
+; GFX942-NEXT:    v_subrev_co_u32_e64 v2, s[0:1], s6, v1
+; GFX942-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX942-NEXT:    s_subb_u32 s10, s10, 0
+; GFX942-NEXT:    s_cmp_ge_u32 s10, s7
+; GFX942-NEXT:    s_cselect_b32 s15, -1, 0
+; GFX942-NEXT:    v_cmp_le_u32_e64 s[0:1], s6, v2
+; GFX942-NEXT:    s_cmp_eq_u32 s10, s7
+; GFX942-NEXT:    v_mov_b32_e32 v3, s15
+; GFX942-NEXT:    v_cndmask_b32_e64 v2, 0, -1, s[0:1]
+; GFX942-NEXT:    s_cselect_b64 s[0:1], -1, 0
+; GFX942-NEXT:    v_cndmask_b32_e64 v2, v3, v2, s[0:1]
+; GFX942-NEXT:    s_add_u32 s0, s12, 1
+; GFX942-NEXT:    s_addc_u32 s10, s13, 0
+; GFX942-NEXT:    s_add_u32 s1, s12, 2
+; GFX942-NEXT:    s_addc_u32 s15, s13, 0
+; GFX942-NEXT:    v_mov_b32_e32 v3, s0
+; GFX942-NEXT:    v_mov_b32_e32 v4, s1
+; GFX942-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v2
+; GFX942-NEXT:    s_cmp_lg_u64 vcc, 0
+; GFX942-NEXT:    v_cmp_le_u32_e32 vcc, s6, v1
+; GFX942-NEXT:    v_cndmask_b32_e64 v2, v3, v4, s[0:1]
+; GFX942-NEXT:    v_mov_b32_e32 v3, s10
+; GFX942-NEXT:    v_mov_b32_e32 v4, s15
+; GFX942-NEXT:    v_cndmask_b32_e64 v3, v3, v4, s[0:1]
+; GFX942-NEXT:    s_subb_u32 s0, s11, s14
+; GFX942-NEXT:    s_cmp_ge_u32 s0, s7
+; GFX942-NEXT:    s_cselect_b32 s1, -1, 0
+; GFX942-NEXT:    s_cmp_eq_u32 s0, s7
+; GFX942-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
+; GFX942-NEXT:    v_mov_b32_e32 v4, s1
+; GFX942-NEXT:    s_cselect_b64 vcc, -1, 0
+; GFX942-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
+; GFX942-NEXT:    v_mov_b32_e32 v4, s13
+; GFX942-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
+; GFX942-NEXT:    s_xor_b64 s[0:1], s[4:5], s[2:3]
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
+; GFX942-NEXT:    v_mov_b32_e32 v3, s12
+; GFX942-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
+; GFX942-NEXT:    v_xor_b32_e32 v2, s0, v2
+; GFX942-NEXT:    v_xor_b32_e32 v1, s1, v1
+; GFX942-NEXT:    v_mov_b32_e32 v3, s1
+; GFX942-NEXT:    v_subrev_co_u32_e32 v2, vcc, s0, v2
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_subb_co_u32_e32 v3, vcc, v1, v3, vcc
+; GFX942-NEXT:    global_store_dwordx2 v0, v[2:3], s[8:9]
+; GFX942-NEXT:    s_endpgm
   %shl.y = shl i64 4096, %y
   %r = sdiv i64 %x, %shl.y
   store i64 %r, ptr addrspace(1) %out
@@ -8137,6 +10694,29 @@ define amdgpu_kernel void @sdiv_v2i64_pow2k_denom(ptr addrspace(1) %out, <2 x i6
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s3
 ; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7]
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX942-LABEL: sdiv_v2i64_pow2k_denom:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x34
+; GFX942-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_ashr_i32 s4, s1, 31
+; GFX942-NEXT:    s_lshr_b32 s4, s4, 20
+; GFX942-NEXT:    s_add_u32 s0, s0, s4
+; GFX942-NEXT:    s_addc_u32 s1, s1, 0
+; GFX942-NEXT:    s_ashr_i32 s4, s3, 31
+; GFX942-NEXT:    s_ashr_i64 s[0:1], s[0:1], 12
+; GFX942-NEXT:    s_lshr_b32 s4, s4, 20
+; GFX942-NEXT:    s_add_u32 s2, s2, s4
+; GFX942-NEXT:    s_addc_u32 s3, s3, 0
+; GFX942-NEXT:    s_ashr_i64 s[2:3], s[2:3], 12
+; GFX942-NEXT:    v_mov_b32_e32 v2, s0
+; GFX942-NEXT:    v_mov_b32_e32 v3, s1
+; GFX942-NEXT:    v_mov_b32_e32 v4, s2
+; GFX942-NEXT:    v_mov_b32_e32 v5, s3
+; GFX942-NEXT:    global_store_dwordx4 v0, v[2:5], s[6:7]
+; GFX942-NEXT:    s_endpgm
   %r = sdiv <2 x i64> %x, <i64 4096, i64 4096>
   store <2 x i64> %r, ptr addrspace(1) %out
   ret void
@@ -8254,6 +10834,55 @@ define amdgpu_kernel void @ssdiv_v2i64_mixed_pow2k_denom(ptr addrspace(1) %out,
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s3
 ; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7]
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX942-LABEL: ssdiv_v2i64_mixed_pow2k_denom:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x34
+; GFX942-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_ashr_i32 s4, s1, 31
+; GFX942-NEXT:    s_lshr_b32 s4, s4, 20
+; GFX942-NEXT:    s_add_u32 s0, s0, s4
+; GFX942-NEXT:    s_addc_u32 s1, s1, 0
+; GFX942-NEXT:    s_ashr_i64 s[0:1], s[0:1], 12
+; GFX942-NEXT:    s_mul_i32 s9, s3, 0x8008009
+; GFX942-NEXT:    s_mul_hi_u32 s10, s2, 0x8008009
+; GFX942-NEXT:    s_mul_hi_u32 s4, s3, 0x8008009
+; GFX942-NEXT:    s_add_u32 s9, s9, s10
+; GFX942-NEXT:    s_mul_i32 s8, s2, 0x80080080
+; GFX942-NEXT:    s_addc_u32 s4, s4, 0
+; GFX942-NEXT:    s_mul_hi_u32 s5, s2, 0x80080080
+; GFX942-NEXT:    s_add_u32 s8, s8, s9
+; GFX942-NEXT:    s_addc_u32 s5, s5, 0
+; GFX942-NEXT:    s_add_u32 s4, s4, s5
+; GFX942-NEXT:    s_addc_u32 s5, 0, 0
+; GFX942-NEXT:    s_mul_i32 s9, s3, 0x80080080
+; GFX942-NEXT:    s_mul_hi_u32 s8, s3, 0x80080080
+; GFX942-NEXT:    s_add_u32 s4, s9, s4
+; GFX942-NEXT:    s_addc_u32 s5, s8, s5
+; GFX942-NEXT:    s_ashr_i32 s8, s3, 31
+; GFX942-NEXT:    s_mul_i32 s9, s8, 0x80080080
+; GFX942-NEXT:    s_mul_hi_u32 s10, s8, 0x8008009
+; GFX942-NEXT:    s_add_i32 s9, s10, s9
+; GFX942-NEXT:    s_mul_i32 s8, s8, 0x8008009
+; GFX942-NEXT:    s_add_i32 s9, s9, s8
+; GFX942-NEXT:    s_sub_u32 s8, s8, s2
+; GFX942-NEXT:    s_subb_u32 s9, s9, s3
+; GFX942-NEXT:    s_add_u32 s4, s4, s8
+; GFX942-NEXT:    s_addc_u32 s5, s5, s9
+; GFX942-NEXT:    s_add_u32 s2, s4, s2
+; GFX942-NEXT:    s_addc_u32 s3, s5, s3
+; GFX942-NEXT:    s_ashr_i64 s[4:5], s[2:3], 11
+; GFX942-NEXT:    s_lshr_b32 s2, s3, 31
+; GFX942-NEXT:    s_add_u32 s2, s4, s2
+; GFX942-NEXT:    s_addc_u32 s3, s5, 0
+; GFX942-NEXT:    v_mov_b32_e32 v2, s0
+; GFX942-NEXT:    v_mov_b32_e32 v3, s1
+; GFX942-NEXT:    v_mov_b32_e32 v4, s2
+; GFX942-NEXT:    v_mov_b32_e32 v5, s3
+; GFX942-NEXT:    global_store_dwordx4 v0, v[2:5], s[6:7]
+; GFX942-NEXT:    s_endpgm
   %r = sdiv <2 x i64> %x, <i64 4096, i64 4095>
   store <2 x i64> %r, ptr addrspace(1) %out
   ret void
@@ -8819,6 +11448,300 @@ define amdgpu_kernel void @sdiv_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
 ; GFX9-NEXT:    v_subb_co_u32_e32 v3, vcc, v5, v6, vcc
 ; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3]
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX942-LABEL: sdiv_v2i64_pow2_shl_denom:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x34
+; GFX942-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v4, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_lshl_b64 s[0:1], 0x1000, s12
+; GFX942-NEXT:    s_lshl_b64 s[6:7], 0x1000, s14
+; GFX942-NEXT:    s_ashr_i32 s12, s1, 31
+; GFX942-NEXT:    s_add_u32 s0, s0, s12
+; GFX942-NEXT:    s_mov_b32 s13, s12
+; GFX942-NEXT:    s_addc_u32 s1, s1, s12
+; GFX942-NEXT:    s_xor_b64 s[14:15], s[0:1], s[12:13]
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v0, s14
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v1, s15
+; GFX942-NEXT:    s_sub_u32 s0, 0, s14
+; GFX942-NEXT:    s_subb_u32 s1, 0, s15
+; GFX942-NEXT:    v_fmac_f32_e32 v0, 0x4f800000, v1
+; GFX942-NEXT:    v_rcp_f32_e32 v0, v0
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
+; GFX942-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
+; GFX942-NEXT:    v_trunc_f32_e32 v1, v1
+; GFX942-NEXT:    v_fmac_f32_e32 v0, 0xcf800000, v1
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX942-NEXT:    v_readfirstlane_b32 s4, v1
+; GFX942-NEXT:    v_readfirstlane_b32 s5, v0
+; GFX942-NEXT:    s_mul_i32 s16, s0, s4
+; GFX942-NEXT:    s_mul_hi_u32 s18, s0, s5
+; GFX942-NEXT:    s_mul_i32 s17, s1, s5
+; GFX942-NEXT:    s_add_i32 s16, s18, s16
+; GFX942-NEXT:    s_mul_i32 s19, s0, s5
+; GFX942-NEXT:    s_add_i32 s16, s16, s17
+; GFX942-NEXT:    s_mul_hi_u32 s17, s5, s16
+; GFX942-NEXT:    s_mul_i32 s18, s5, s16
+; GFX942-NEXT:    s_mul_hi_u32 s5, s5, s19
+; GFX942-NEXT:    s_add_u32 s5, s5, s18
+; GFX942-NEXT:    s_addc_u32 s17, 0, s17
+; GFX942-NEXT:    s_mul_hi_u32 s20, s4, s19
+; GFX942-NEXT:    s_mul_i32 s19, s4, s19
+; GFX942-NEXT:    s_add_u32 s5, s5, s19
+; GFX942-NEXT:    s_mul_hi_u32 s18, s4, s16
+; GFX942-NEXT:    s_addc_u32 s5, s17, s20
+; GFX942-NEXT:    s_addc_u32 s17, s18, 0
+; GFX942-NEXT:    s_mul_i32 s16, s4, s16
+; GFX942-NEXT:    s_add_u32 s5, s5, s16
+; GFX942-NEXT:    s_addc_u32 s16, 0, s17
+; GFX942-NEXT:    v_add_co_u32_e32 v0, vcc, s5, v0
+; GFX942-NEXT:    s_cmp_lg_u64 vcc, 0
+; GFX942-NEXT:    s_addc_u32 s4, s4, s16
+; GFX942-NEXT:    v_readfirstlane_b32 s16, v0
+; GFX942-NEXT:    s_mul_i32 s5, s0, s4
+; GFX942-NEXT:    s_mul_hi_u32 s17, s0, s16
+; GFX942-NEXT:    s_add_i32 s5, s17, s5
+; GFX942-NEXT:    s_mul_i32 s1, s1, s16
+; GFX942-NEXT:    s_add_i32 s5, s5, s1
+; GFX942-NEXT:    s_mul_i32 s0, s0, s16
+; GFX942-NEXT:    s_mul_hi_u32 s17, s4, s0
+; GFX942-NEXT:    s_mul_i32 s18, s4, s0
+; GFX942-NEXT:    s_mul_i32 s20, s16, s5
+; GFX942-NEXT:    s_mul_hi_u32 s0, s16, s0
+; GFX942-NEXT:    s_mul_hi_u32 s19, s16, s5
+; GFX942-NEXT:    s_add_u32 s0, s0, s20
+; GFX942-NEXT:    s_addc_u32 s16, 0, s19
+; GFX942-NEXT:    s_add_u32 s0, s0, s18
+; GFX942-NEXT:    s_mul_hi_u32 s1, s4, s5
+; GFX942-NEXT:    s_addc_u32 s0, s16, s17
+; GFX942-NEXT:    s_addc_u32 s1, s1, 0
+; GFX942-NEXT:    s_mul_i32 s5, s4, s5
+; GFX942-NEXT:    s_add_u32 s0, s0, s5
+; GFX942-NEXT:    s_addc_u32 s1, 0, s1
+; GFX942-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v0
+; GFX942-NEXT:    s_cmp_lg_u64 vcc, 0
+; GFX942-NEXT:    s_addc_u32 s16, s4, s1
+; GFX942-NEXT:    s_ashr_i32 s4, s9, 31
+; GFX942-NEXT:    s_add_u32 s0, s8, s4
+; GFX942-NEXT:    s_mov_b32 s5, s4
+; GFX942-NEXT:    s_addc_u32 s1, s9, s4
+; GFX942-NEXT:    s_xor_b64 s[8:9], s[0:1], s[4:5]
+; GFX942-NEXT:    v_readfirstlane_b32 s17, v0
+; GFX942-NEXT:    s_mul_i32 s1, s8, s16
+; GFX942-NEXT:    s_mul_hi_u32 s18, s8, s17
+; GFX942-NEXT:    s_mul_hi_u32 s0, s8, s16
+; GFX942-NEXT:    s_add_u32 s1, s18, s1
+; GFX942-NEXT:    s_addc_u32 s0, 0, s0
+; GFX942-NEXT:    s_mul_hi_u32 s19, s9, s17
+; GFX942-NEXT:    s_mul_i32 s17, s9, s17
+; GFX942-NEXT:    s_add_u32 s1, s1, s17
+; GFX942-NEXT:    s_mul_hi_u32 s18, s9, s16
+; GFX942-NEXT:    s_addc_u32 s0, s0, s19
+; GFX942-NEXT:    s_addc_u32 s1, s18, 0
+; GFX942-NEXT:    s_mul_i32 s16, s9, s16
+; GFX942-NEXT:    s_add_u32 s16, s0, s16
+; GFX942-NEXT:    s_addc_u32 s17, 0, s1
+; GFX942-NEXT:    s_mul_i32 s0, s14, s17
+; GFX942-NEXT:    s_mul_hi_u32 s1, s14, s16
+; GFX942-NEXT:    s_add_i32 s0, s1, s0
+; GFX942-NEXT:    s_mul_i32 s1, s15, s16
+; GFX942-NEXT:    s_add_i32 s18, s0, s1
+; GFX942-NEXT:    s_mul_i32 s1, s14, s16
+; GFX942-NEXT:    v_mov_b32_e32 v0, s1
+; GFX942-NEXT:    s_sub_i32 s0, s9, s18
+; GFX942-NEXT:    v_sub_co_u32_e32 v0, vcc, s8, v0
+; GFX942-NEXT:    s_cmp_lg_u64 vcc, 0
+; GFX942-NEXT:    s_subb_u32 s8, s0, s15
+; GFX942-NEXT:    v_subrev_co_u32_e64 v1, s[0:1], s14, v0
+; GFX942-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX942-NEXT:    s_subb_u32 s8, s8, 0
+; GFX942-NEXT:    s_cmp_ge_u32 s8, s15
+; GFX942-NEXT:    s_cselect_b32 s19, -1, 0
+; GFX942-NEXT:    v_cmp_le_u32_e64 s[0:1], s14, v1
+; GFX942-NEXT:    s_cmp_eq_u32 s8, s15
+; GFX942-NEXT:    v_mov_b32_e32 v2, s19
+; GFX942-NEXT:    v_cndmask_b32_e64 v1, 0, -1, s[0:1]
+; GFX942-NEXT:    s_cselect_b64 s[0:1], -1, 0
+; GFX942-NEXT:    v_cndmask_b32_e64 v1, v2, v1, s[0:1]
+; GFX942-NEXT:    s_add_u32 s0, s16, 1
+; GFX942-NEXT:    s_addc_u32 s8, s17, 0
+; GFX942-NEXT:    s_add_u32 s1, s16, 2
+; GFX942-NEXT:    s_addc_u32 s19, s17, 0
+; GFX942-NEXT:    v_mov_b32_e32 v2, s0
+; GFX942-NEXT:    v_mov_b32_e32 v3, s1
+; GFX942-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v1
+; GFX942-NEXT:    s_cmp_lg_u64 vcc, 0
+; GFX942-NEXT:    v_cmp_le_u32_e32 vcc, s14, v0
+; GFX942-NEXT:    v_cndmask_b32_e64 v1, v2, v3, s[0:1]
+; GFX942-NEXT:    v_mov_b32_e32 v2, s8
+; GFX942-NEXT:    v_mov_b32_e32 v3, s19
+; GFX942-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[0:1]
+; GFX942-NEXT:    s_subb_u32 s0, s9, s18
+; GFX942-NEXT:    s_cmp_ge_u32 s0, s15
+; GFX942-NEXT:    s_cselect_b32 s1, -1, 0
+; GFX942-NEXT:    s_cmp_eq_u32 s0, s15
+; GFX942-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
+; GFX942-NEXT:    v_mov_b32_e32 v3, s1
+; GFX942-NEXT:    s_cselect_b64 vcc, -1, 0
+; GFX942-NEXT:    s_xor_b64 s[0:1], s[4:5], s[12:13]
+; GFX942-NEXT:    s_ashr_i32 s4, s7, 31
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX942-NEXT:    s_add_u32 s6, s6, s4
+; GFX942-NEXT:    v_mov_b32_e32 v3, s17
+; GFX942-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
+; GFX942-NEXT:    s_mov_b32 s5, s4
+; GFX942-NEXT:    s_addc_u32 s7, s7, s4
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
+; GFX942-NEXT:    v_mov_b32_e32 v2, s16
+; GFX942-NEXT:    s_xor_b64 s[6:7], s[6:7], s[4:5]
+; GFX942-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v2, s6
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v3, s7
+; GFX942-NEXT:    v_xor_b32_e32 v1, s0, v1
+; GFX942-NEXT:    v_xor_b32_e32 v5, s1, v0
+; GFX942-NEXT:    v_subrev_co_u32_e32 v0, vcc, s0, v1
+; GFX942-NEXT:    v_fmac_f32_e32 v2, 0x4f800000, v3
+; GFX942-NEXT:    v_rcp_f32_e32 v2, v2
+; GFX942-NEXT:    s_sub_u32 s0, 0, s6
+; GFX942-NEXT:    v_mov_b32_e32 v6, s1
+; GFX942-NEXT:    s_subb_u32 s1, 0, s7
+; GFX942-NEXT:    v_mul_f32_e32 v2, 0x5f7ffffc, v2
+; GFX942-NEXT:    v_mul_f32_e32 v3, 0x2f800000, v2
+; GFX942-NEXT:    v_trunc_f32_e32 v3, v3
+; GFX942-NEXT:    v_fmac_f32_e32 v2, 0xcf800000, v3
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v3, v3
+; GFX942-NEXT:    v_subb_co_u32_e32 v1, vcc, v5, v6, vcc
+; GFX942-NEXT:    v_readfirstlane_b32 s8, v2
+; GFX942-NEXT:    v_readfirstlane_b32 s13, v3
+; GFX942-NEXT:    s_mul_hi_u32 s12, s0, s8
+; GFX942-NEXT:    s_mul_i32 s14, s0, s13
+; GFX942-NEXT:    s_mul_i32 s9, s1, s8
+; GFX942-NEXT:    s_add_i32 s12, s12, s14
+; GFX942-NEXT:    s_add_i32 s12, s12, s9
+; GFX942-NEXT:    s_mul_i32 s15, s0, s8
+; GFX942-NEXT:    s_mul_hi_u32 s9, s8, s12
+; GFX942-NEXT:    s_mul_i32 s14, s8, s12
+; GFX942-NEXT:    s_mul_hi_u32 s8, s8, s15
+; GFX942-NEXT:    s_add_u32 s8, s8, s14
+; GFX942-NEXT:    s_addc_u32 s9, 0, s9
+; GFX942-NEXT:    s_mul_hi_u32 s16, s13, s15
+; GFX942-NEXT:    s_mul_i32 s15, s13, s15
+; GFX942-NEXT:    s_add_u32 s8, s8, s15
+; GFX942-NEXT:    s_mul_hi_u32 s14, s13, s12
+; GFX942-NEXT:    s_addc_u32 s8, s9, s16
+; GFX942-NEXT:    s_addc_u32 s9, s14, 0
+; GFX942-NEXT:    s_mul_i32 s12, s13, s12
+; GFX942-NEXT:    s_add_u32 s8, s8, s12
+; GFX942-NEXT:    s_addc_u32 s9, 0, s9
+; GFX942-NEXT:    v_add_co_u32_e32 v2, vcc, s8, v2
+; GFX942-NEXT:    s_cmp_lg_u64 vcc, 0
+; GFX942-NEXT:    s_addc_u32 s8, s13, s9
+; GFX942-NEXT:    v_readfirstlane_b32 s12, v2
+; GFX942-NEXT:    s_mul_i32 s9, s0, s8
+; GFX942-NEXT:    s_mul_hi_u32 s13, s0, s12
+; GFX942-NEXT:    s_add_i32 s9, s13, s9
+; GFX942-NEXT:    s_mul_i32 s1, s1, s12
+; GFX942-NEXT:    s_add_i32 s9, s9, s1
+; GFX942-NEXT:    s_mul_i32 s0, s0, s12
+; GFX942-NEXT:    s_mul_hi_u32 s13, s8, s0
+; GFX942-NEXT:    s_mul_i32 s14, s8, s0
+; GFX942-NEXT:    s_mul_i32 s16, s12, s9
+; GFX942-NEXT:    s_mul_hi_u32 s0, s12, s0
+; GFX942-NEXT:    s_mul_hi_u32 s15, s12, s9
+; GFX942-NEXT:    s_add_u32 s0, s0, s16
+; GFX942-NEXT:    s_addc_u32 s12, 0, s15
+; GFX942-NEXT:    s_add_u32 s0, s0, s14
+; GFX942-NEXT:    s_mul_hi_u32 s1, s8, s9
+; GFX942-NEXT:    s_addc_u32 s0, s12, s13
+; GFX942-NEXT:    s_addc_u32 s1, s1, 0
+; GFX942-NEXT:    s_mul_i32 s9, s8, s9
+; GFX942-NEXT:    s_add_u32 s0, s0, s9
+; GFX942-NEXT:    s_addc_u32 s1, 0, s1
+; GFX942-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v2
+; GFX942-NEXT:    s_cmp_lg_u64 vcc, 0
+; GFX942-NEXT:    s_addc_u32 s12, s8, s1
+; GFX942-NEXT:    s_ashr_i32 s8, s11, 31
+; GFX942-NEXT:    s_add_u32 s0, s10, s8
+; GFX942-NEXT:    s_mov_b32 s9, s8
+; GFX942-NEXT:    s_addc_u32 s1, s11, s8
+; GFX942-NEXT:    s_xor_b64 s[10:11], s[0:1], s[8:9]
+; GFX942-NEXT:    v_readfirstlane_b32 s13, v2
+; GFX942-NEXT:    s_mul_i32 s1, s10, s12
+; GFX942-NEXT:    s_mul_hi_u32 s14, s10, s13
+; GFX942-NEXT:    s_mul_hi_u32 s0, s10, s12
+; GFX942-NEXT:    s_add_u32 s1, s14, s1
+; GFX942-NEXT:    s_addc_u32 s0, 0, s0
+; GFX942-NEXT:    s_mul_hi_u32 s15, s11, s13
+; GFX942-NEXT:    s_mul_i32 s13, s11, s13
+; GFX942-NEXT:    s_add_u32 s1, s1, s13
+; GFX942-NEXT:    s_mul_hi_u32 s14, s11, s12
+; GFX942-NEXT:    s_addc_u32 s0, s0, s15
+; GFX942-NEXT:    s_addc_u32 s1, s14, 0
+; GFX942-NEXT:    s_mul_i32 s12, s11, s12
+; GFX942-NEXT:    s_add_u32 s12, s0, s12
+; GFX942-NEXT:    s_addc_u32 s13, 0, s1
+; GFX942-NEXT:    s_mul_i32 s0, s6, s13
+; GFX942-NEXT:    s_mul_hi_u32 s1, s6, s12
+; GFX942-NEXT:    s_add_i32 s0, s1, s0
+; GFX942-NEXT:    s_mul_i32 s1, s7, s12
+; GFX942-NEXT:    s_add_i32 s14, s0, s1
+; GFX942-NEXT:    s_mul_i32 s1, s6, s12
+; GFX942-NEXT:    v_mov_b32_e32 v2, s1
+; GFX942-NEXT:    s_sub_i32 s0, s11, s14
+; GFX942-NEXT:    v_sub_co_u32_e32 v2, vcc, s10, v2
+; GFX942-NEXT:    s_cmp_lg_u64 vcc, 0
+; GFX942-NEXT:    s_subb_u32 s10, s0, s7
+; GFX942-NEXT:    v_subrev_co_u32_e64 v3, s[0:1], s6, v2
+; GFX942-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX942-NEXT:    s_subb_u32 s10, s10, 0
+; GFX942-NEXT:    s_cmp_ge_u32 s10, s7
+; GFX942-NEXT:    s_cselect_b32 s15, -1, 0
+; GFX942-NEXT:    v_cmp_le_u32_e64 s[0:1], s6, v3
+; GFX942-NEXT:    s_cmp_eq_u32 s10, s7
+; GFX942-NEXT:    v_mov_b32_e32 v5, s15
+; GFX942-NEXT:    v_cndmask_b32_e64 v3, 0, -1, s[0:1]
+; GFX942-NEXT:    s_cselect_b64 s[0:1], -1, 0
+; GFX942-NEXT:    v_cndmask_b32_e64 v3, v5, v3, s[0:1]
+; GFX942-NEXT:    s_add_u32 s0, s12, 1
+; GFX942-NEXT:    s_addc_u32 s10, s13, 0
+; GFX942-NEXT:    s_add_u32 s1, s12, 2
+; GFX942-NEXT:    s_addc_u32 s15, s13, 0
+; GFX942-NEXT:    v_mov_b32_e32 v5, s0
+; GFX942-NEXT:    v_mov_b32_e32 v6, s1
+; GFX942-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v3
+; GFX942-NEXT:    s_cmp_lg_u64 vcc, 0
+; GFX942-NEXT:    v_cmp_le_u32_e32 vcc, s6, v2
+; GFX942-NEXT:    v_cndmask_b32_e64 v3, v5, v6, s[0:1]
+; GFX942-NEXT:    v_mov_b32_e32 v5, s10
+; GFX942-NEXT:    v_mov_b32_e32 v6, s15
+; GFX942-NEXT:    v_cndmask_b32_e64 v5, v5, v6, s[0:1]
+; GFX942-NEXT:    s_subb_u32 s0, s11, s14
+; GFX942-NEXT:    s_cmp_ge_u32 s0, s7
+; GFX942-NEXT:    s_cselect_b32 s1, -1, 0
+; GFX942-NEXT:    s_cmp_eq_u32 s0, s7
+; GFX942-NEXT:    v_cndmask_b32_e64 v2, 0, -1, vcc
+; GFX942-NEXT:    v_mov_b32_e32 v6, s1
+; GFX942-NEXT:    s_cselect_b64 vcc, -1, 0
+; GFX942-NEXT:    v_cndmask_b32_e32 v2, v6, v2, vcc
+; GFX942-NEXT:    v_mov_b32_e32 v6, s13
+; GFX942-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
+; GFX942-NEXT:    s_xor_b64 s[0:1], s[8:9], s[4:5]
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_cndmask_b32_e32 v2, v6, v5, vcc
+; GFX942-NEXT:    v_mov_b32_e32 v5, s12
+; GFX942-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX942-NEXT:    v_xor_b32_e32 v3, s0, v3
+; GFX942-NEXT:    v_xor_b32_e32 v5, s1, v2
+; GFX942-NEXT:    v_mov_b32_e32 v6, s1
+; GFX942-NEXT:    v_subrev_co_u32_e32 v2, vcc, s0, v3
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_subb_co_u32_e32 v3, vcc, v5, v6, vcc
+; GFX942-NEXT:    global_store_dwordx4 v4, v[0:3], s[2:3]
+; GFX942-NEXT:    s_endpgm
   %shl.y = shl <2 x i64> <i64 4096, i64 4096>, %y
   %r = sdiv <2 x i64> %x, %shl.y
   store <2 x i64> %r, ptr addrspace(1) %out
@@ -8921,6 +11844,48 @@ define amdgpu_kernel void @srem_i64_oddk_denom(ptr addrspace(1) %out, i64 %x) {
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX942-LABEL: srem_i64_oddk_denom:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_mul_i32 s7, s3, 0xfd81e19
+; GFX942-NEXT:    s_mul_hi_u32 s8, s2, 0xfd81e19
+; GFX942-NEXT:    s_mul_hi_u32 s6, s3, 0xfd81e19
+; GFX942-NEXT:    s_add_u32 s7, s7, s8
+; GFX942-NEXT:    s_mul_i32 s5, s2, 0x6ca94220
+; GFX942-NEXT:    s_addc_u32 s6, s6, 0
+; GFX942-NEXT:    s_mul_hi_u32 s4, s2, 0x6ca94220
+; GFX942-NEXT:    s_add_u32 s5, s5, s7
+; GFX942-NEXT:    s_addc_u32 s4, s4, 0
+; GFX942-NEXT:    s_add_u32 s4, s6, s4
+; GFX942-NEXT:    s_addc_u32 s5, 0, 0
+; GFX942-NEXT:    s_mul_i32 s7, s3, 0x6ca94220
+; GFX942-NEXT:    s_mul_hi_u32 s6, s3, 0x6ca94220
+; GFX942-NEXT:    s_add_u32 s4, s7, s4
+; GFX942-NEXT:    s_addc_u32 s5, s6, s5
+; GFX942-NEXT:    s_ashr_i32 s6, s3, 31
+; GFX942-NEXT:    s_mul_i32 s7, s6, 0x6ca94220
+; GFX942-NEXT:    s_mul_hi_u32 s8, s6, 0xfd81e19
+; GFX942-NEXT:    s_add_i32 s7, s8, s7
+; GFX942-NEXT:    s_mul_i32 s6, s6, 0xfd81e19
+; GFX942-NEXT:    s_add_i32 s7, s7, s6
+; GFX942-NEXT:    s_add_u32 s4, s4, s6
+; GFX942-NEXT:    s_addc_u32 s5, s5, s7
+; GFX942-NEXT:    s_ashr_i64 s[6:7], s[4:5], 19
+; GFX942-NEXT:    s_lshr_b32 s4, s5, 31
+; GFX942-NEXT:    s_add_u32 s4, s6, s4
+; GFX942-NEXT:    s_addc_u32 s5, s7, 0
+; GFX942-NEXT:    s_mul_i32 s5, s5, 0x12d8fb
+; GFX942-NEXT:    s_mul_hi_u32 s6, s4, 0x12d8fb
+; GFX942-NEXT:    s_add_i32 s6, s6, s5
+; GFX942-NEXT:    s_mul_i32 s4, s4, 0x12d8fb
+; GFX942-NEXT:    s_sub_u32 s2, s2, s4
+; GFX942-NEXT:    s_subb_u32 s3, s3, s6
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX942-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX942-NEXT:    s_endpgm
   %r = srem i64 %x, 1235195
   store i64 %r, ptr addrspace(1) %out
   ret void
@@ -8968,6 +11933,22 @@ define amdgpu_kernel void @srem_i64_pow2k_denom(ptr addrspace(1) %out, i64 %x) {
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s3
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX942-LABEL: srem_i64_pow2k_denom:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_ashr_i32 s4, s3, 31
+; GFX942-NEXT:    s_lshr_b32 s4, s4, 20
+; GFX942-NEXT:    s_add_u32 s4, s2, s4
+; GFX942-NEXT:    s_addc_u32 s5, s3, 0
+; GFX942-NEXT:    s_and_b32 s4, s4, 0xfffff000
+; GFX942-NEXT:    s_sub_u32 s2, s2, s4
+; GFX942-NEXT:    s_subb_u32 s3, s3, s5
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX942-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX942-NEXT:    s_endpgm
   %r = srem i64 %x, 4096
   store i64 %r, ptr addrspace(1) %out
   ret void
@@ -9260,6 +12241,154 @@ define amdgpu_kernel void @srem_i64_pow2_shl_denom(ptr addrspace(1) %out, i64 %x
 ; GFX9-NEXT:    v_subb_co_u32_e32 v2, vcc, v2, v3, vcc
 ; GFX9-NEXT:    global_store_dwordx2 v0, v[1:2], s[8:9]
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX942-LABEL: srem_i64_pow2_shl_denom:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
+; GFX942-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x24
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_lshl_b64 s[0:1], 0x1000, s0
+; GFX942-NEXT:    s_ashr_i32 s2, s1, 31
+; GFX942-NEXT:    s_add_u32 s0, s0, s2
+; GFX942-NEXT:    s_mov_b32 s3, s2
+; GFX942-NEXT:    s_addc_u32 s1, s1, s2
+; GFX942-NEXT:    s_xor_b64 s[6:7], s[0:1], s[2:3]
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v0, s6
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v1, s7
+; GFX942-NEXT:    s_sub_u32 s0, 0, s6
+; GFX942-NEXT:    s_subb_u32 s1, 0, s7
+; GFX942-NEXT:    v_fmamk_f32 v0, v1, 0x4f800000, v0
+; GFX942-NEXT:    v_rcp_f32_e32 v1, v0
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    v_mul_f32_e32 v1, 0x5f7ffffc, v1
+; GFX942-NEXT:    v_mul_f32_e32 v2, 0x2f800000, v1
+; GFX942-NEXT:    v_trunc_f32_e32 v2, v2
+; GFX942-NEXT:    v_fmamk_f32 v1, v2, 0xcf800000, v1
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX942-NEXT:    v_readfirstlane_b32 s2, v2
+; GFX942-NEXT:    v_readfirstlane_b32 s3, v1
+; GFX942-NEXT:    s_mul_i32 s4, s0, s2
+; GFX942-NEXT:    s_mul_hi_u32 s12, s0, s3
+; GFX942-NEXT:    s_mul_i32 s5, s1, s3
+; GFX942-NEXT:    s_add_i32 s4, s12, s4
+; GFX942-NEXT:    s_mul_i32 s13, s0, s3
+; GFX942-NEXT:    s_add_i32 s4, s4, s5
+; GFX942-NEXT:    s_mul_hi_u32 s12, s3, s13
+; GFX942-NEXT:    s_mul_hi_u32 s5, s3, s4
+; GFX942-NEXT:    s_mul_i32 s3, s3, s4
+; GFX942-NEXT:    s_add_u32 s3, s12, s3
+; GFX942-NEXT:    s_addc_u32 s5, 0, s5
+; GFX942-NEXT:    s_mul_hi_u32 s14, s2, s13
+; GFX942-NEXT:    s_mul_i32 s13, s2, s13
+; GFX942-NEXT:    s_add_u32 s3, s3, s13
+; GFX942-NEXT:    s_mul_hi_u32 s12, s2, s4
+; GFX942-NEXT:    s_addc_u32 s3, s5, s14
+; GFX942-NEXT:    s_addc_u32 s5, s12, 0
+; GFX942-NEXT:    s_mul_i32 s4, s2, s4
+; GFX942-NEXT:    s_add_u32 s3, s3, s4
+; GFX942-NEXT:    s_addc_u32 s4, 0, s5
+; GFX942-NEXT:    v_add_co_u32_e32 v1, vcc, s3, v1
+; GFX942-NEXT:    s_cmp_lg_u64 vcc, 0
+; GFX942-NEXT:    s_addc_u32 s2, s2, s4
+; GFX942-NEXT:    v_readfirstlane_b32 s4, v1
+; GFX942-NEXT:    s_mul_i32 s3, s0, s2
+; GFX942-NEXT:    s_mul_hi_u32 s5, s0, s4
+; GFX942-NEXT:    s_add_i32 s3, s5, s3
+; GFX942-NEXT:    s_mul_i32 s1, s1, s4
+; GFX942-NEXT:    s_add_i32 s3, s3, s1
+; GFX942-NEXT:    s_mul_i32 s0, s0, s4
+; GFX942-NEXT:    s_mul_hi_u32 s5, s2, s0
+; GFX942-NEXT:    s_mul_i32 s12, s2, s0
+; GFX942-NEXT:    s_mul_i32 s14, s4, s3
+; GFX942-NEXT:    s_mul_hi_u32 s0, s4, s0
+; GFX942-NEXT:    s_mul_hi_u32 s13, s4, s3
+; GFX942-NEXT:    s_add_u32 s0, s0, s14
+; GFX942-NEXT:    s_addc_u32 s4, 0, s13
+; GFX942-NEXT:    s_add_u32 s0, s0, s12
+; GFX942-NEXT:    s_mul_hi_u32 s1, s2, s3
+; GFX942-NEXT:    s_addc_u32 s0, s4, s5
+; GFX942-NEXT:    s_addc_u32 s1, s1, 0
+; GFX942-NEXT:    s_mul_i32 s3, s2, s3
+; GFX942-NEXT:    s_add_u32 s0, s0, s3
+; GFX942-NEXT:    s_addc_u32 s1, 0, s1
+; GFX942-NEXT:    v_add_co_u32_e32 v1, vcc, s0, v1
+; GFX942-NEXT:    s_cmp_lg_u64 vcc, 0
+; GFX942-NEXT:    s_addc_u32 s2, s2, s1
+; GFX942-NEXT:    s_ashr_i32 s4, s11, 31
+; GFX942-NEXT:    s_add_u32 s0, s10, s4
+; GFX942-NEXT:    s_mov_b32 s5, s4
+; GFX942-NEXT:    s_addc_u32 s1, s11, s4
+; GFX942-NEXT:    s_xor_b64 s[10:11], s[0:1], s[4:5]
+; GFX942-NEXT:    v_readfirstlane_b32 s3, v1
+; GFX942-NEXT:    s_mul_i32 s1, s10, s2
+; GFX942-NEXT:    s_mul_hi_u32 s5, s10, s3
+; GFX942-NEXT:    s_mul_hi_u32 s0, s10, s2
+; GFX942-NEXT:    s_add_u32 s1, s5, s1
+; GFX942-NEXT:    s_addc_u32 s0, 0, s0
+; GFX942-NEXT:    s_mul_hi_u32 s12, s11, s3
+; GFX942-NEXT:    s_mul_i32 s3, s11, s3
+; GFX942-NEXT:    s_add_u32 s1, s1, s3
+; GFX942-NEXT:    s_mul_hi_u32 s5, s11, s2
+; GFX942-NEXT:    s_addc_u32 s0, s0, s12
+; GFX942-NEXT:    s_addc_u32 s1, s5, 0
+; GFX942-NEXT:    s_mul_i32 s2, s11, s2
+; GFX942-NEXT:    s_add_u32 s0, s0, s2
+; GFX942-NEXT:    s_addc_u32 s1, 0, s1
+; GFX942-NEXT:    s_mul_i32 s1, s6, s1
+; GFX942-NEXT:    s_mul_hi_u32 s2, s6, s0
+; GFX942-NEXT:    s_add_i32 s1, s2, s1
+; GFX942-NEXT:    s_mul_i32 s2, s7, s0
+; GFX942-NEXT:    s_mul_i32 s0, s6, s0
+; GFX942-NEXT:    s_add_i32 s5, s1, s2
+; GFX942-NEXT:    v_mov_b32_e32 v1, s0
+; GFX942-NEXT:    s_sub_i32 s1, s11, s5
+; GFX942-NEXT:    v_sub_co_u32_e32 v1, vcc, s10, v1
+; GFX942-NEXT:    s_cmp_lg_u64 vcc, 0
+; GFX942-NEXT:    s_subb_u32 s10, s1, s7
+; GFX942-NEXT:    v_subrev_co_u32_e64 v2, s[0:1], s6, v1
+; GFX942-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX942-NEXT:    s_subb_u32 s12, s10, 0
+; GFX942-NEXT:    s_cmp_ge_u32 s12, s7
+; GFX942-NEXT:    s_cselect_b32 s13, -1, 0
+; GFX942-NEXT:    v_cmp_le_u32_e64 s[2:3], s6, v2
+; GFX942-NEXT:    s_cmp_eq_u32 s12, s7
+; GFX942-NEXT:    v_mov_b32_e32 v4, s13
+; GFX942-NEXT:    v_cndmask_b32_e64 v3, 0, -1, s[2:3]
+; GFX942-NEXT:    s_cselect_b64 s[2:3], -1, 0
+; GFX942-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX942-NEXT:    v_cndmask_b32_e64 v3, v4, v3, s[2:3]
+; GFX942-NEXT:    s_subb_u32 s2, s10, s7
+; GFX942-NEXT:    v_subrev_co_u32_e64 v4, s[0:1], s6, v2
+; GFX942-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX942-NEXT:    s_subb_u32 s2, s2, 0
+; GFX942-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v3
+; GFX942-NEXT:    v_mov_b32_e32 v3, s12
+; GFX942-NEXT:    s_cmp_lg_u64 vcc, 0
+; GFX942-NEXT:    v_cndmask_b32_e64 v2, v2, v4, s[0:1]
+; GFX942-NEXT:    v_mov_b32_e32 v4, s2
+; GFX942-NEXT:    v_cndmask_b32_e64 v3, v3, v4, s[0:1]
+; GFX942-NEXT:    s_subb_u32 s0, s11, s5
+; GFX942-NEXT:    s_cmp_ge_u32 s0, s7
+; GFX942-NEXT:    s_cselect_b32 s1, -1, 0
+; GFX942-NEXT:    v_cmp_le_u32_e32 vcc, s6, v1
+; GFX942-NEXT:    s_cmp_eq_u32 s0, s7
+; GFX942-NEXT:    v_mov_b32_e32 v5, s1
+; GFX942-NEXT:    v_cndmask_b32_e64 v4, 0, -1, vcc
+; GFX942-NEXT:    s_cselect_b64 vcc, -1, 0
+; GFX942-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
+; GFX942-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
+; GFX942-NEXT:    v_mov_b32_e32 v5, s0
+; GFX942-NEXT:    v_mov_b32_e32 v4, s4
+; GFX942-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX942-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX942-NEXT:    v_xor_b32_e32 v1, s4, v1
+; GFX942-NEXT:    v_xor_b32_e32 v3, s4, v3
+; GFX942-NEXT:    v_subrev_co_u32_e32 v2, vcc, s4, v1
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_subb_co_u32_e32 v3, vcc, v3, v4, vcc
+; GFX942-NEXT:    global_store_dwordx2 v0, v[2:3], s[8:9]
+; GFX942-NEXT:    s_endpgm
   %shl.y = shl i64 4096, %y
   %r = srem i64 %x, %shl.y
   store i64 %r, ptr addrspace(1) %out
@@ -9331,6 +12460,33 @@ define amdgpu_kernel void @srem_v2i64_pow2k_denom(ptr addrspace(1) %out, <2 x i6
 ; GFX9-NEXT:    v_mov_b32_e32 v3, s3
 ; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7]
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX942-LABEL: srem_v2i64_pow2k_denom:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x34
+; GFX942-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_ashr_i32 s4, s1, 31
+; GFX942-NEXT:    s_lshr_b32 s4, s4, 20
+; GFX942-NEXT:    s_add_u32 s4, s0, s4
+; GFX942-NEXT:    s_addc_u32 s5, s1, 0
+; GFX942-NEXT:    s_and_b32 s4, s4, 0xfffff000
+; GFX942-NEXT:    s_sub_u32 s0, s0, s4
+; GFX942-NEXT:    s_subb_u32 s1, s1, s5
+; GFX942-NEXT:    s_ashr_i32 s4, s3, 31
+; GFX942-NEXT:    s_lshr_b32 s4, s4, 20
+; GFX942-NEXT:    s_add_u32 s4, s2, s4
+; GFX942-NEXT:    s_addc_u32 s5, s3, 0
+; GFX942-NEXT:    s_and_b32 s4, s4, 0xfffff000
+; GFX942-NEXT:    s_sub_u32 s2, s2, s4
+; GFX942-NEXT:    s_subb_u32 s3, s3, s5
+; GFX942-NEXT:    v_mov_b32_e32 v2, s0
+; GFX942-NEXT:    v_mov_b32_e32 v3, s1
+; GFX942-NEXT:    v_mov_b32_e32 v4, s2
+; GFX942-NEXT:    v_mov_b32_e32 v5, s3
+; GFX942-NEXT:    global_store_dwordx4 v0, v[2:5], s[6:7]
+; GFX942-NEXT:    s_endpgm
   %r = srem <2 x i64> %x, <i64 4096, i64 4096>
   store <2 x i64> %r, ptr addrspace(1) %out
   ret void
@@ -9886,6 +13042,295 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
 ; GFX9-NEXT:    v_subb_co_u32_e32 v3, vcc, v3, v5, vcc
 ; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7]
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX942-LABEL: srem_v2i64_pow2_shl_denom:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x34
+; GFX942-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v4, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_lshl_b64 s[0:1], 0x1000, s12
+; GFX942-NEXT:    s_lshl_b64 s[14:15], 0x1000, s14
+; GFX942-NEXT:    s_ashr_i32 s2, s1, 31
+; GFX942-NEXT:    s_add_u32 s0, s0, s2
+; GFX942-NEXT:    s_mov_b32 s3, s2
+; GFX942-NEXT:    s_addc_u32 s1, s1, s2
+; GFX942-NEXT:    s_xor_b64 s[12:13], s[0:1], s[2:3]
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v0, s12
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v1, s13
+; GFX942-NEXT:    s_sub_u32 s0, 0, s12
+; GFX942-NEXT:    s_subb_u32 s1, 0, s13
+; GFX942-NEXT:    v_fmac_f32_e32 v0, 0x4f800000, v1
+; GFX942-NEXT:    v_rcp_f32_e32 v0, v0
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
+; GFX942-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
+; GFX942-NEXT:    v_trunc_f32_e32 v1, v1
+; GFX942-NEXT:    v_fmac_f32_e32 v0, 0xcf800000, v1
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX942-NEXT:    v_readfirstlane_b32 s2, v1
+; GFX942-NEXT:    v_readfirstlane_b32 s3, v0
+; GFX942-NEXT:    s_mul_i32 s4, s0, s2
+; GFX942-NEXT:    s_mul_hi_u32 s16, s0, s3
+; GFX942-NEXT:    s_mul_i32 s5, s1, s3
+; GFX942-NEXT:    s_add_i32 s4, s16, s4
+; GFX942-NEXT:    s_mul_i32 s17, s0, s3
+; GFX942-NEXT:    s_add_i32 s4, s4, s5
+; GFX942-NEXT:    s_mul_hi_u32 s5, s3, s4
+; GFX942-NEXT:    s_mul_i32 s16, s3, s4
+; GFX942-NEXT:    s_mul_hi_u32 s3, s3, s17
+; GFX942-NEXT:    s_add_u32 s3, s3, s16
+; GFX942-NEXT:    s_addc_u32 s5, 0, s5
+; GFX942-NEXT:    s_mul_hi_u32 s18, s2, s17
+; GFX942-NEXT:    s_mul_i32 s17, s2, s17
+; GFX942-NEXT:    s_add_u32 s3, s3, s17
+; GFX942-NEXT:    s_mul_hi_u32 s16, s2, s4
+; GFX942-NEXT:    s_addc_u32 s3, s5, s18
+; GFX942-NEXT:    s_addc_u32 s5, s16, 0
+; GFX942-NEXT:    s_mul_i32 s4, s2, s4
+; GFX942-NEXT:    s_add_u32 s3, s3, s4
+; GFX942-NEXT:    s_addc_u32 s4, 0, s5
+; GFX942-NEXT:    v_add_co_u32_e32 v0, vcc, s3, v0
+; GFX942-NEXT:    s_cmp_lg_u64 vcc, 0
+; GFX942-NEXT:    s_addc_u32 s2, s2, s4
+; GFX942-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX942-NEXT:    s_mul_i32 s3, s0, s2
+; GFX942-NEXT:    s_mul_hi_u32 s5, s0, s4
+; GFX942-NEXT:    s_add_i32 s3, s5, s3
+; GFX942-NEXT:    s_mul_i32 s1, s1, s4
+; GFX942-NEXT:    s_add_i32 s3, s3, s1
+; GFX942-NEXT:    s_mul_i32 s0, s0, s4
+; GFX942-NEXT:    s_mul_hi_u32 s5, s2, s0
+; GFX942-NEXT:    s_mul_i32 s16, s2, s0
+; GFX942-NEXT:    s_mul_i32 s18, s4, s3
+; GFX942-NEXT:    s_mul_hi_u32 s0, s4, s0
+; GFX942-NEXT:    s_mul_hi_u32 s17, s4, s3
+; GFX942-NEXT:    s_add_u32 s0, s0, s18
+; GFX942-NEXT:    s_addc_u32 s4, 0, s17
+; GFX942-NEXT:    s_add_u32 s0, s0, s16
+; GFX942-NEXT:    s_mul_hi_u32 s1, s2, s3
+; GFX942-NEXT:    s_addc_u32 s0, s4, s5
+; GFX942-NEXT:    s_addc_u32 s1, s1, 0
+; GFX942-NEXT:    s_mul_i32 s3, s2, s3
+; GFX942-NEXT:    s_add_u32 s0, s0, s3
+; GFX942-NEXT:    s_addc_u32 s1, 0, s1
+; GFX942-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v0
+; GFX942-NEXT:    s_cmp_lg_u64 vcc, 0
+; GFX942-NEXT:    s_addc_u32 s2, s2, s1
+; GFX942-NEXT:    s_ashr_i32 s16, s9, 31
+; GFX942-NEXT:    s_add_u32 s0, s8, s16
+; GFX942-NEXT:    s_mov_b32 s17, s16
+; GFX942-NEXT:    s_addc_u32 s1, s9, s16
+; GFX942-NEXT:    s_xor_b64 s[4:5], s[0:1], s[16:17]
+; GFX942-NEXT:    v_readfirstlane_b32 s3, v0
+; GFX942-NEXT:    s_mul_i32 s1, s4, s2
+; GFX942-NEXT:    s_mul_hi_u32 s8, s4, s3
+; GFX942-NEXT:    s_mul_hi_u32 s0, s4, s2
+; GFX942-NEXT:    s_add_u32 s1, s8, s1
+; GFX942-NEXT:    s_addc_u32 s0, 0, s0
+; GFX942-NEXT:    s_mul_hi_u32 s9, s5, s3
+; GFX942-NEXT:    s_mul_i32 s3, s5, s3
+; GFX942-NEXT:    s_add_u32 s1, s1, s3
+; GFX942-NEXT:    s_mul_hi_u32 s8, s5, s2
+; GFX942-NEXT:    s_addc_u32 s0, s0, s9
+; GFX942-NEXT:    s_addc_u32 s1, s8, 0
+; GFX942-NEXT:    s_mul_i32 s2, s5, s2
+; GFX942-NEXT:    s_add_u32 s0, s0, s2
+; GFX942-NEXT:    s_addc_u32 s1, 0, s1
+; GFX942-NEXT:    s_mul_i32 s1, s12, s1
+; GFX942-NEXT:    s_mul_hi_u32 s2, s12, s0
+; GFX942-NEXT:    s_add_i32 s1, s2, s1
+; GFX942-NEXT:    s_mul_i32 s2, s13, s0
+; GFX942-NEXT:    s_mul_i32 s0, s12, s0
+; GFX942-NEXT:    s_add_i32 s8, s1, s2
+; GFX942-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-NEXT:    s_sub_i32 s1, s5, s8
+; GFX942-NEXT:    v_sub_co_u32_e32 v0, vcc, s4, v0
+; GFX942-NEXT:    s_cmp_lg_u64 vcc, 0
+; GFX942-NEXT:    s_subb_u32 s4, s1, s13
+; GFX942-NEXT:    v_subrev_co_u32_e64 v1, s[0:1], s12, v0
+; GFX942-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX942-NEXT:    s_subb_u32 s9, s4, 0
+; GFX942-NEXT:    s_cmp_ge_u32 s9, s13
+; GFX942-NEXT:    s_cselect_b32 s17, -1, 0
+; GFX942-NEXT:    v_cmp_le_u32_e64 s[2:3], s12, v1
+; GFX942-NEXT:    s_cmp_eq_u32 s9, s13
+; GFX942-NEXT:    v_mov_b32_e32 v3, s17
+; GFX942-NEXT:    v_cndmask_b32_e64 v2, 0, -1, s[2:3]
+; GFX942-NEXT:    s_cselect_b64 s[2:3], -1, 0
+; GFX942-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX942-NEXT:    v_cndmask_b32_e64 v2, v3, v2, s[2:3]
+; GFX942-NEXT:    s_subb_u32 s2, s4, s13
+; GFX942-NEXT:    v_subrev_co_u32_e64 v3, s[0:1], s12, v1
+; GFX942-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX942-NEXT:    s_subb_u32 s2, s2, 0
+; GFX942-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v2
+; GFX942-NEXT:    v_mov_b32_e32 v2, s9
+; GFX942-NEXT:    s_cmp_lg_u64 vcc, 0
+; GFX942-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
+; GFX942-NEXT:    v_mov_b32_e32 v3, s2
+; GFX942-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[0:1]
+; GFX942-NEXT:    s_subb_u32 s0, s5, s8
+; GFX942-NEXT:    s_cmp_ge_u32 s0, s13
+; GFX942-NEXT:    s_cselect_b32 s1, -1, 0
+; GFX942-NEXT:    v_cmp_le_u32_e32 vcc, s12, v0
+; GFX942-NEXT:    s_cmp_eq_u32 s0, s13
+; GFX942-NEXT:    v_mov_b32_e32 v5, s1
+; GFX942-NEXT:    v_cndmask_b32_e64 v3, 0, -1, vcc
+; GFX942-NEXT:    s_cselect_b64 vcc, -1, 0
+; GFX942-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX942-NEXT:    v_mov_b32_e32 v5, s0
+; GFX942-NEXT:    s_ashr_i32 s0, s15, 31
+; GFX942-NEXT:    s_add_u32 s2, s14, s0
+; GFX942-NEXT:    s_mov_b32 s1, s0
+; GFX942-NEXT:    s_addc_u32 s3, s15, s0
+; GFX942-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
+; GFX942-NEXT:    s_xor_b64 s[4:5], s[2:3], s[0:1]
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v3, s5
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v1, s4
+; GFX942-NEXT:    v_cndmask_b32_e32 v2, v5, v2, vcc
+; GFX942-NEXT:    v_xor_b32_e32 v0, s16, v0
+; GFX942-NEXT:    v_xor_b32_e32 v2, s16, v2
+; GFX942-NEXT:    v_fmac_f32_e32 v1, 0x4f800000, v3
+; GFX942-NEXT:    v_rcp_f32_e32 v3, v1
+; GFX942-NEXT:    v_mov_b32_e32 v5, s16
+; GFX942-NEXT:    v_subrev_co_u32_e32 v0, vcc, s16, v0
+; GFX942-NEXT:    s_sub_u32 s0, 0, s4
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_subb_co_u32_e32 v1, vcc, v2, v5, vcc
+; GFX942-NEXT:    v_mul_f32_e32 v2, 0x5f7ffffc, v3
+; GFX942-NEXT:    v_mul_f32_e32 v3, 0x2f800000, v2
+; GFX942-NEXT:    v_trunc_f32_e32 v3, v3
+; GFX942-NEXT:    v_fmac_f32_e32 v2, 0xcf800000, v3
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v3, v3
+; GFX942-NEXT:    s_subb_u32 s1, 0, s5
+; GFX942-NEXT:    v_readfirstlane_b32 s2, v2
+; GFX942-NEXT:    v_readfirstlane_b32 s9, v3
+; GFX942-NEXT:    s_mul_hi_u32 s8, s0, s2
+; GFX942-NEXT:    s_mul_i32 s12, s0, s9
+; GFX942-NEXT:    s_mul_i32 s3, s1, s2
+; GFX942-NEXT:    s_add_i32 s8, s8, s12
+; GFX942-NEXT:    s_add_i32 s8, s8, s3
+; GFX942-NEXT:    s_mul_i32 s13, s0, s2
+; GFX942-NEXT:    s_mul_hi_u32 s3, s2, s8
+; GFX942-NEXT:    s_mul_i32 s12, s2, s8
+; GFX942-NEXT:    s_mul_hi_u32 s2, s2, s13
+; GFX942-NEXT:    s_add_u32 s2, s2, s12
+; GFX942-NEXT:    s_addc_u32 s3, 0, s3
+; GFX942-NEXT:    s_mul_hi_u32 s14, s9, s13
+; GFX942-NEXT:    s_mul_i32 s13, s9, s13
+; GFX942-NEXT:    s_add_u32 s2, s2, s13
+; GFX942-NEXT:    s_mul_hi_u32 s12, s9, s8
+; GFX942-NEXT:    s_addc_u32 s2, s3, s14
+; GFX942-NEXT:    s_addc_u32 s3, s12, 0
+; GFX942-NEXT:    s_mul_i32 s8, s9, s8
+; GFX942-NEXT:    s_add_u32 s2, s2, s8
+; GFX942-NEXT:    s_addc_u32 s3, 0, s3
+; GFX942-NEXT:    v_add_co_u32_e32 v2, vcc, s2, v2
+; GFX942-NEXT:    s_cmp_lg_u64 vcc, 0
+; GFX942-NEXT:    s_addc_u32 s2, s9, s3
+; GFX942-NEXT:    v_readfirstlane_b32 s8, v2
+; GFX942-NEXT:    s_mul_i32 s3, s0, s2
+; GFX942-NEXT:    s_mul_hi_u32 s9, s0, s8
+; GFX942-NEXT:    s_add_i32 s3, s9, s3
+; GFX942-NEXT:    s_mul_i32 s1, s1, s8
+; GFX942-NEXT:    s_add_i32 s3, s3, s1
+; GFX942-NEXT:    s_mul_i32 s0, s0, s8
+; GFX942-NEXT:    s_mul_hi_u32 s9, s2, s0
+; GFX942-NEXT:    s_mul_i32 s12, s2, s0
+; GFX942-NEXT:    s_mul_i32 s14, s8, s3
+; GFX942-NEXT:    s_mul_hi_u32 s0, s8, s0
+; GFX942-NEXT:    s_mul_hi_u32 s13, s8, s3
+; GFX942-NEXT:    s_add_u32 s0, s0, s14
+; GFX942-NEXT:    s_addc_u32 s8, 0, s13
+; GFX942-NEXT:    s_add_u32 s0, s0, s12
+; GFX942-NEXT:    s_mul_hi_u32 s1, s2, s3
+; GFX942-NEXT:    s_addc_u32 s0, s8, s9
+; GFX942-NEXT:    s_addc_u32 s1, s1, 0
+; GFX942-NEXT:    s_mul_i32 s3, s2, s3
+; GFX942-NEXT:    s_add_u32 s0, s0, s3
+; GFX942-NEXT:    s_addc_u32 s1, 0, s1
+; GFX942-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v2
+; GFX942-NEXT:    s_cmp_lg_u64 vcc, 0
+; GFX942-NEXT:    s_addc_u32 s2, s2, s1
+; GFX942-NEXT:    s_ashr_i32 s8, s11, 31
+; GFX942-NEXT:    s_add_u32 s0, s10, s8
+; GFX942-NEXT:    s_mov_b32 s9, s8
+; GFX942-NEXT:    s_addc_u32 s1, s11, s8
+; GFX942-NEXT:    s_xor_b64 s[10:11], s[0:1], s[8:9]
+; GFX942-NEXT:    v_readfirstlane_b32 s3, v2
+; GFX942-NEXT:    s_mul_i32 s1, s10, s2
+; GFX942-NEXT:    s_mul_hi_u32 s9, s10, s3
+; GFX942-NEXT:    s_mul_hi_u32 s0, s10, s2
+; GFX942-NEXT:    s_add_u32 s1, s9, s1
+; GFX942-NEXT:    s_addc_u32 s0, 0, s0
+; GFX942-NEXT:    s_mul_hi_u32 s12, s11, s3
+; GFX942-NEXT:    s_mul_i32 s3, s11, s3
+; GFX942-NEXT:    s_add_u32 s1, s1, s3
+; GFX942-NEXT:    s_mul_hi_u32 s9, s11, s2
+; GFX942-NEXT:    s_addc_u32 s0, s0, s12
+; GFX942-NEXT:    s_addc_u32 s1, s9, 0
+; GFX942-NEXT:    s_mul_i32 s2, s11, s2
+; GFX942-NEXT:    s_add_u32 s0, s0, s2
+; GFX942-NEXT:    s_addc_u32 s1, 0, s1
+; GFX942-NEXT:    s_mul_i32 s1, s4, s1
+; GFX942-NEXT:    s_mul_hi_u32 s2, s4, s0
+; GFX942-NEXT:    s_add_i32 s1, s2, s1
+; GFX942-NEXT:    s_mul_i32 s2, s5, s0
+; GFX942-NEXT:    s_mul_i32 s0, s4, s0
+; GFX942-NEXT:    s_add_i32 s9, s1, s2
+; GFX942-NEXT:    v_mov_b32_e32 v2, s0
+; GFX942-NEXT:    s_sub_i32 s1, s11, s9
+; GFX942-NEXT:    v_sub_co_u32_e32 v2, vcc, s10, v2
+; GFX942-NEXT:    s_cmp_lg_u64 vcc, 0
+; GFX942-NEXT:    s_subb_u32 s10, s1, s5
+; GFX942-NEXT:    v_subrev_co_u32_e64 v3, s[0:1], s4, v2
+; GFX942-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX942-NEXT:    s_subb_u32 s12, s10, 0
+; GFX942-NEXT:    s_cmp_ge_u32 s12, s5
+; GFX942-NEXT:    s_cselect_b32 s13, -1, 0
+; GFX942-NEXT:    v_cmp_le_u32_e64 s[2:3], s4, v3
+; GFX942-NEXT:    s_cmp_eq_u32 s12, s5
+; GFX942-NEXT:    v_mov_b32_e32 v6, s13
+; GFX942-NEXT:    v_cndmask_b32_e64 v5, 0, -1, s[2:3]
+; GFX942-NEXT:    s_cselect_b64 s[2:3], -1, 0
+; GFX942-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX942-NEXT:    v_cndmask_b32_e64 v5, v6, v5, s[2:3]
+; GFX942-NEXT:    s_subb_u32 s2, s10, s5
+; GFX942-NEXT:    v_subrev_co_u32_e64 v6, s[0:1], s4, v3
+; GFX942-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX942-NEXT:    s_subb_u32 s2, s2, 0
+; GFX942-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v5
+; GFX942-NEXT:    v_mov_b32_e32 v5, s12
+; GFX942-NEXT:    s_cmp_lg_u64 vcc, 0
+; GFX942-NEXT:    v_cndmask_b32_e64 v3, v3, v6, s[0:1]
+; GFX942-NEXT:    v_mov_b32_e32 v6, s2
+; GFX942-NEXT:    v_cndmask_b32_e64 v5, v5, v6, s[0:1]
+; GFX942-NEXT:    s_subb_u32 s0, s11, s9
+; GFX942-NEXT:    s_cmp_ge_u32 s0, s5
+; GFX942-NEXT:    s_cselect_b32 s1, -1, 0
+; GFX942-NEXT:    v_cmp_le_u32_e32 vcc, s4, v2
+; GFX942-NEXT:    s_cmp_eq_u32 s0, s5
+; GFX942-NEXT:    v_mov_b32_e32 v7, s1
+; GFX942-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
+; GFX942-NEXT:    s_cselect_b64 vcc, -1, 0
+; GFX942-NEXT:    v_cndmask_b32_e32 v6, v7, v6, vcc
+; GFX942-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v6
+; GFX942-NEXT:    v_mov_b32_e32 v7, s0
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
+; GFX942-NEXT:    v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX942-NEXT:    v_xor_b32_e32 v2, s8, v2
+; GFX942-NEXT:    v_xor_b32_e32 v3, s8, v5
+; GFX942-NEXT:    v_mov_b32_e32 v5, s8
+; GFX942-NEXT:    v_subrev_co_u32_e32 v2, vcc, s8, v2
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_subb_co_u32_e32 v3, vcc, v3, v5, vcc
+; GFX942-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7]
+; GFX942-NEXT:    s_endpgm
   %shl.y = shl <2 x i64> <i64 4096, i64 4096>, %y
   %r = srem <2 x i64> %x, %shl.y
   store <2 x i64> %r, ptr addrspace(1) %out
@@ -9915,6 +13360,13 @@ define <2 x i32> @v_sdiv_i32_exact(<2 x i32> %num) {
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v0, 12, v0
 ; GFX9-NEXT:    v_ashrrev_i32_e32 v1, 10, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: v_sdiv_i32_exact:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_ashrrev_i32_e32 v0, 12, v0
+; GFX942-NEXT:    v_ashrrev_i32_e32 v1, 10, v1
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
    %result = sdiv exact <2 x i32> %num, <i32 4096, i32 1024>
    ret <2 x i32> %result
 }
@@ -9942,6 +13394,13 @@ define <2 x i64> @v_sdiv_i64_exact(<2 x i64> %num) {
 ; GFX9-NEXT:    v_ashrrev_i64 v[0:1], 12, v[0:1]
 ; GFX9-NEXT:    v_ashrrev_i64 v[2:3], 10, v[2:3]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: v_sdiv_i64_exact:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_ashrrev_i64 v[0:1], 12, v[0:1]
+; GFX942-NEXT:    v_ashrrev_i64 v[2:3], 10, v[2:3]
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
    %result = sdiv exact <2 x i64> %num, <i64 4096, i64 1024>
    ret <2 x i64> %result
 }
@@ -9969,6 +13428,13 @@ define <2 x i32> @v_udiv_i32_exact(<2 x i32> %num) {
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 12, v0
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 10, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: v_udiv_i32_exact:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_lshrrev_b32_e32 v0, 12, v0
+; GFX942-NEXT:    v_lshrrev_b32_e32 v1, 10, v1
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
    %result = udiv exact <2 x i32> %num, <i32 4096, i32 1024>
    ret <2 x i32> %result
 }
@@ -9996,6 +13462,13 @@ define <2 x i64> @v_udiv_i64_exact(<2 x i64> %num) {
 ; GFX9-NEXT:    v_lshrrev_b64 v[0:1], 12, v[0:1]
 ; GFX9-NEXT:    v_lshrrev_b64 v[2:3], 10, v[2:3]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: v_udiv_i64_exact:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_lshrrev_b64 v[0:1], 12, v[0:1]
+; GFX942-NEXT:    v_lshrrev_b64 v[2:3], 10, v[2:3]
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
    %result = udiv exact <2 x i64> %num, <i64 4096, i64 1024>
    ret <2 x i64> %result
 }
@@ -10049,6 +13522,29 @@ define i64 @udiv_i64_gt_smax(i8 %size) {
 ; GFX9-NEXT:    v_alignbit_b32 v0, v1, v0, 3
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 3, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: udiv_i64_gt_smax:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b32_e32 v1, 31
+; GFX942-NEXT:    v_ashrrev_i32_sdwa v1, v1, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX942-NEXT:    v_not_b32_sdwa v5, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
+; GFX942-NEXT:    s_mov_b32 s0, 0xcccccccd
+; GFX942-NEXT:    v_not_b32_e32 v4, v1
+; GFX942-NEXT:    v_mul_hi_u32 v0, v5, s0
+; GFX942-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-NEXT:    v_mad_u64_u32 v[2:3], s[0:1], v4, s0, v[0:1]
+; GFX942-NEXT:    v_mov_b32_e32 v0, v3
+; GFX942-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-NEXT:    s_mov_b32 s2, 0xcccccccc
+; GFX942-NEXT:    v_mad_u64_u32 v[2:3], s[0:1], v5, s2, v[2:3]
+; GFX942-NEXT:    v_mov_b32_e32 v2, v3
+; GFX942-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
+; GFX942-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], v4, s2, v[0:1]
+; GFX942-NEXT:    v_alignbit_b32 v0, v1, v0, 3
+; GFX942-NEXT:    v_lshrrev_b32_e32 v1, 3, v1
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %esize = sext i8 %size to i64
   %minus = sub nuw nsw i64 -1, %esize
   %div = udiv i64 %minus, 10
@@ -10089,6 +13585,24 @@ define i64 @udiv_i64_9divbits(i8 %size) {
 ; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v2, vcc
 ; GFX9-NEXT:    v_and_b32_e32 v0, 0x1ff, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: udiv_i64_9divbits:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b32_e32 v1, 1
+; GFX942-NEXT:    v_add_u32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v0, v0
+; GFX942-NEXT:    s_mov_b32 s0, 0x41200000
+; GFX942-NEXT:    v_mul_f32_e32 v1, 0x3dcccccd, v0
+; GFX942-NEXT:    v_trunc_f32_e32 v1, v1
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v2, v1
+; GFX942-NEXT:    v_fmamk_f32 v0, v1, 0xc1200000, v0
+; GFX942-NEXT:    v_cmp_ge_f32_e64 vcc, |v0|, s0
+; GFX942-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v2, vcc
+; GFX942-NEXT:    v_and_b32_e32 v0, 0x1ff, v0
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %zextend = zext i8 %size to i64
   %num = add nuw nsw i64 1, %zextend
   %div = udiv i64 %num, 10
@@ -10105,6 +13619,11 @@ define <2 x i64> @srem_zero_zero() {
 ; GFX9:       ; %bb.0: ; %entry
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: srem_zero_zero:
+; GFX942:       ; %bb.0: ; %entry
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %B = srem <2 x i64> zeroinitializer, zeroinitializer
   ret <2 x i64> %B
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
index 4cc39d93854a0..b4a15d4e00a6a 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
@@ -2,6 +2,7 @@
 ; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX7LESS,GFX7LESS_ITERATIVE %s
 ; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX8,GFX8_ITERATIVE %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX9_ITERATIVE %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX942,GFX942_ITERATIVE %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064,GFX1064_ITERATIVE %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032,GFX1032_ITERATIVE %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164,GFX1164-TRUE16,GFX1164_ITERATIVE,GFX1164_ITERATIVE-TRUE16 %s
@@ -15,6 +16,7 @@
 ; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX7LESS,GFX7LESS_DPP %s
 ; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX8,GFX8_DPP %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX9_DPP %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX942,GFX942_DPP %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064,GFX1064_DPP %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032,GFX1032_DPP %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=+wavefrontsize64 -mattr=-flat-for-global -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164,GFX1164-TRUE16,GFX1164_DPP,GFX1164_DPP-TRUE16 %s
@@ -127,6 +129,39 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX9-NEXT:    s_endpgm
 ;
+; GFX942-LABEL: add_i32_constant:
+; GFX942:       ; %bb.0: ; %entry
+; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-NEXT:    s_mov_b64 s[6:7], exec
+; GFX942-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
+; GFX942-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX942-NEXT:    ; implicit-def: $vgpr1
+; GFX942-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX942-NEXT:    s_cbranch_execz .LBB0_2
+; GFX942-NEXT:  ; %bb.1:
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_mov_b32 s8, s2
+; GFX942-NEXT:    s_bcnt1_i32_b64 s2, s[6:7]
+; GFX942-NEXT:    s_mul_i32 s2, s2, 5
+; GFX942-NEXT:    s_mov_b32 s11, 0xf000
+; GFX942-NEXT:    s_mov_b32 s10, -1
+; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-NEXT:    buffer_wbl2 sc1
+; GFX942-NEXT:    buffer_atomic_add v1, off, s[8:11], 0 sc0
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    buffer_inv sc1
+; GFX942-NEXT:  .LBB0_2:
+; GFX942-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX942-NEXT:    v_readfirstlane_b32 s4, v1
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_mov_b32 s3, 0xf000
+; GFX942-NEXT:    s_mov_b32 s2, -1
+; GFX942-NEXT:    v_mad_u32_u24 v0, v0, 5, s4
+; GFX942-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX942-NEXT:    s_endpgm
+;
 ; GFX1064-LABEL: add_i32_constant:
 ; GFX1064:       ; %bb.0: ; %entry
 ; GFX1064-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -436,6 +471,41 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX9-NEXT:    s_endpgm
 ;
+; GFX942-LABEL: add_i32_uniform:
+; GFX942:       ; %bb.0: ; %entry
+; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-NEXT:    s_load_dword s8, s[4:5], 0x34
+; GFX942-NEXT:    s_mov_b64 s[6:7], exec
+; GFX942-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
+; GFX942-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX942-NEXT:    ; implicit-def: $vgpr1
+; GFX942-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX942-NEXT:    s_cbranch_execz .LBB1_2
+; GFX942-NEXT:  ; %bb.1:
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_mov_b32 s12, s2
+; GFX942-NEXT:    s_bcnt1_i32_b64 s2, s[6:7]
+; GFX942-NEXT:    s_mul_i32 s2, s8, s2
+; GFX942-NEXT:    s_mov_b32 s15, 0xf000
+; GFX942-NEXT:    s_mov_b32 s14, -1
+; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-NEXT:    buffer_wbl2 sc1
+; GFX942-NEXT:    buffer_atomic_add v1, off, s[12:15], 0 sc0
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    buffer_inv sc1
+; GFX942-NEXT:  .LBB1_2:
+; GFX942-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX942-NEXT:    v_readfirstlane_b32 s4, v1
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    v_mul_lo_u32 v0, s8, v0
+; GFX942-NEXT:    s_mov_b32 s3, 0xf000
+; GFX942-NEXT:    s_mov_b32 s2, -1
+; GFX942-NEXT:    v_add_u32_e32 v0, s4, v0
+; GFX942-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX942-NEXT:    s_endpgm
+;
 ; GFX1064-LABEL: add_i32_uniform:
 ; GFX1064:       ; %bb.0: ; %entry
 ; GFX1064-NEXT:    s_clause 0x1
@@ -798,6 +868,53 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX9_ITERATIVE-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX9_ITERATIVE-NEXT:    s_endpgm
 ;
+; GFX942_ITERATIVE-LABEL: add_i32_varying:
+; GFX942_ITERATIVE:       ; %bb.0: ; %entry
+; GFX942_ITERATIVE-NEXT:    s_mov_b64 s[0:1], exec
+; GFX942_ITERATIVE-NEXT:    v_and_b32_e32 v1, 0x3ff, v0
+; GFX942_ITERATIVE-NEXT:    s_mov_b32 s6, 0
+; GFX942_ITERATIVE-NEXT:    ; implicit-def: $vgpr0
+; GFX942_ITERATIVE-NEXT:  .LBB2_1: ; %ComputeLoop
+; GFX942_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942_ITERATIVE-NEXT:    s_ff1_i32_b64 s2, s[0:1]
+; GFX942_ITERATIVE-NEXT:    s_mov_b32 m0, s2
+; GFX942_ITERATIVE-NEXT:    v_readlane_b32 s7, v1, s2
+; GFX942_ITERATIVE-NEXT:    s_lshl_b64 s[2:3], 1, s2
+; GFX942_ITERATIVE-NEXT:    v_writelane_b32 v0, s6, m0
+; GFX942_ITERATIVE-NEXT:    s_add_i32 s6, s6, s7
+; GFX942_ITERATIVE-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX942_ITERATIVE-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX942_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB2_1
+; GFX942_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX942_ITERATIVE-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
+; GFX942_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32 v1, exec_hi, v1
+; GFX942_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
+; GFX942_ITERATIVE-NEXT:    ; implicit-def: $vgpr1
+; GFX942_ITERATIVE-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX942_ITERATIVE-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
+; GFX942_ITERATIVE-NEXT:    s_cbranch_execz .LBB2_4
+; GFX942_ITERATIVE-NEXT:  ; %bb.3:
+; GFX942_ITERATIVE-NEXT:    s_mov_b32 s11, 0xf000
+; GFX942_ITERATIVE-NEXT:    s_mov_b32 s10, -1
+; GFX942_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942_ITERATIVE-NEXT:    s_mov_b32 s8, s2
+; GFX942_ITERATIVE-NEXT:    s_mov_b32 s9, s3
+; GFX942_ITERATIVE-NEXT:    v_mov_b32_e32 v1, s6
+; GFX942_ITERATIVE-NEXT:    buffer_wbl2 sc1
+; GFX942_ITERATIVE-NEXT:    buffer_atomic_add v1, off, s[8:11], 0 sc0
+; GFX942_ITERATIVE-NEXT:    s_waitcnt vmcnt(0)
+; GFX942_ITERATIVE-NEXT:    buffer_inv sc1
+; GFX942_ITERATIVE-NEXT:  .LBB2_4:
+; GFX942_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX942_ITERATIVE-NEXT:    v_readfirstlane_b32 s4, v1
+; GFX942_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942_ITERATIVE-NEXT:    s_mov_b32 s3, 0xf000
+; GFX942_ITERATIVE-NEXT:    s_mov_b32 s2, -1
+; GFX942_ITERATIVE-NEXT:    v_add_u32_e32 v0, s4, v0
+; GFX942_ITERATIVE-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX942_ITERATIVE-NEXT:    s_endpgm
+;
 ; GFX1064_ITERATIVE-LABEL: add_i32_varying:
 ; GFX1064_ITERATIVE:       ; %bb.0: ; %entry
 ; GFX1064_ITERATIVE-NEXT:    s_mov_b64 s[0:1], exec
@@ -1200,6 +1317,59 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX9_DPP-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX9_DPP-NEXT:    s_endpgm
 ;
+; GFX942_DPP-LABEL: add_i32_varying:
+; GFX942_DPP:       ; %bb.0: ; %entry
+; GFX942_DPP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942_DPP-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942_DPP-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX942_DPP-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
+; GFX942_DPP-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX942_DPP-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
+; GFX942_DPP-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX942_DPP-NEXT:    v_cndmask_b32_e64 v2, 0, v0, s[4:5]
+; GFX942_DPP-NEXT:    s_nop 1
+; GFX942_DPP-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX942_DPP-NEXT:    s_nop 1
+; GFX942_DPP-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX942_DPP-NEXT:    s_nop 1
+; GFX942_DPP-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX942_DPP-NEXT:    s_nop 1
+; GFX942_DPP-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX942_DPP-NEXT:    s_nop 1
+; GFX942_DPP-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX942_DPP-NEXT:    s_nop 1
+; GFX942_DPP-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX942_DPP-NEXT:    s_nop 0
+; GFX942_DPP-NEXT:    v_readlane_b32 s6, v2, 63
+; GFX942_DPP-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX942_DPP-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX942_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
+; GFX942_DPP-NEXT:    ; implicit-def: $vgpr0
+; GFX942_DPP-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX942_DPP-NEXT:    s_cbranch_execz .LBB2_2
+; GFX942_DPP-NEXT:  ; %bb.1:
+; GFX942_DPP-NEXT:    s_mov_b32 s11, 0xf000
+; GFX942_DPP-NEXT:    s_mov_b32 s10, -1
+; GFX942_DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942_DPP-NEXT:    s_mov_b32 s8, s2
+; GFX942_DPP-NEXT:    s_mov_b32 s9, s3
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX942_DPP-NEXT:    buffer_wbl2 sc1
+; GFX942_DPP-NEXT:    buffer_atomic_add v0, off, s[8:11], 0 sc0
+; GFX942_DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX942_DPP-NEXT:    buffer_inv sc1
+; GFX942_DPP-NEXT:  .LBB2_2:
+; GFX942_DPP-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX942_DPP-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v0, v1
+; GFX942_DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942_DPP-NEXT:    s_mov_b32 s3, 0xf000
+; GFX942_DPP-NEXT:    s_mov_b32 s2, -1
+; GFX942_DPP-NEXT:    v_add_u32_e32 v0, s4, v0
+; GFX942_DPP-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX942_DPP-NEXT:    s_endpgm
+;
 ; GFX1064_DPP-LABEL: add_i32_varying:
 ; GFX1064_DPP:       ; %bb.0: ; %entry
 ; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
@@ -1684,6 +1854,43 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GFX9-NEXT:    s_endpgm
 ;
+; GFX942-LABEL: add_i64_constant:
+; GFX942:       ; %bb.0: ; %entry
+; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-NEXT:    s_mov_b64 s[6:7], exec
+; GFX942-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
+; GFX942-NEXT:    v_mbcnt_hi_u32_b32 v2, s7, v0
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX942-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; GFX942-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX942-NEXT:    s_cbranch_execz .LBB3_2
+; GFX942-NEXT:  ; %bb.1:
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_mov_b32 s8, s2
+; GFX942-NEXT:    s_bcnt1_i32_b64 s2, s[6:7]
+; GFX942-NEXT:    s_mul_i32 s2, s2, 5
+; GFX942-NEXT:    s_mov_b32 s11, 0xf000
+; GFX942-NEXT:    s_mov_b32 s10, -1
+; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-NEXT:    buffer_wbl2 sc1
+; GFX942-NEXT:    buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 sc0
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    buffer_inv sc1
+; GFX942-NEXT:  .LBB3_2:
+; GFX942-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX942-NEXT:    v_readfirstlane_b32 s4, v1
+; GFX942-NEXT:    v_readfirstlane_b32 s5, v0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_mov_b32 s3, 0xf000
+; GFX942-NEXT:    v_mov_b32_e32 v1, s4
+; GFX942-NEXT:    v_mov_b32_e32 v0, s5
+; GFX942-NEXT:    s_mov_b32 s2, -1
+; GFX942-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v2, 5, v[0:1]
+; GFX942-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX942-NEXT:    s_endpgm
+;
 ; GFX1064-LABEL: add_i64_constant:
 ; GFX1064:       ; %bb.0: ; %entry
 ; GFX1064-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -2033,6 +2240,50 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
 ; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GFX9-NEXT:    s_endpgm
 ;
+; GFX942-LABEL: add_i64_uniform:
+; GFX942:       ; %bb.0: ; %entry
+; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX942-NEXT:    s_mov_b64 s[8:9], exec
+; GFX942-NEXT:    v_mbcnt_lo_u32_b32 v0, s8, 0
+; GFX942-NEXT:    v_mbcnt_hi_u32_b32 v2, s9, v0
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX942-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; GFX942-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX942-NEXT:    s_cbranch_execz .LBB4_2
+; GFX942-NEXT:  ; %bb.1:
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_mov_b32 s12, s2
+; GFX942-NEXT:    s_bcnt1_i32_b64 s2, s[8:9]
+; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_mul_i32 s3, s7, s2
+; GFX942-NEXT:    s_mul_hi_u32 s8, s6, s2
+; GFX942-NEXT:    s_add_i32 s8, s8, s3
+; GFX942-NEXT:    s_mul_i32 s2, s6, s2
+; GFX942-NEXT:    s_mov_b32 s15, 0xf000
+; GFX942-NEXT:    s_mov_b32 s14, -1
+; GFX942-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-NEXT:    v_mov_b32_e32 v1, s8
+; GFX942-NEXT:    buffer_wbl2 sc1
+; GFX942-NEXT:    buffer_atomic_add_x2 v[0:1], off, s[12:15], 0 sc0
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    buffer_inv sc1
+; GFX942-NEXT:  .LBB4_2:
+; GFX942-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX942-NEXT:    v_readfirstlane_b32 s4, v1
+; GFX942-NEXT:    v_readfirstlane_b32 s5, v0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_mov_b32 s3, 0xf000
+; GFX942-NEXT:    v_mov_b32_e32 v1, s4
+; GFX942-NEXT:    v_mov_b32_e32 v0, s5
+; GFX942-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], s6, v2, v[0:1]
+; GFX942-NEXT:    v_mov_b32_e32 v4, v1
+; GFX942-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], s7, v2, v[4:5]
+; GFX942-NEXT:    s_mov_b32 s2, -1
+; GFX942-NEXT:    v_mov_b32_e32 v1, v2
+; GFX942-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX942-NEXT:    s_endpgm
+;
 ; GFX1064-LABEL: add_i64_uniform:
 ; GFX1064:       ; %bb.0: ; %entry
 ; GFX1064-NEXT:    s_clause 0x1
@@ -2451,6 +2702,58 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX9_ITERATIVE-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GFX9_ITERATIVE-NEXT:    s_endpgm
 ;
+; GFX942_ITERATIVE-LABEL: add_i64_varying:
+; GFX942_ITERATIVE:       ; %bb.0: ; %entry
+; GFX942_ITERATIVE-NEXT:    s_mov_b64 s[0:1], exec
+; GFX942_ITERATIVE-NEXT:    v_and_b32_e32 v2, 0x3ff, v0
+; GFX942_ITERATIVE-NEXT:    v_mov_b32_e32 v3, 0
+; GFX942_ITERATIVE-NEXT:    s_mov_b64 s[6:7], 0
+; GFX942_ITERATIVE-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; GFX942_ITERATIVE-NEXT:  .LBB5_1: ; %ComputeLoop
+; GFX942_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942_ITERATIVE-NEXT:    s_ff1_i32_b64 s2, s[0:1]
+; GFX942_ITERATIVE-NEXT:    s_mov_b32 m0, s2
+; GFX942_ITERATIVE-NEXT:    v_readlane_b32 s8, v2, s2
+; GFX942_ITERATIVE-NEXT:    v_readlane_b32 s3, v3, s2
+; GFX942_ITERATIVE-NEXT:    v_writelane_b32 v0, s6, m0
+; GFX942_ITERATIVE-NEXT:    s_add_u32 s6, s6, s8
+; GFX942_ITERATIVE-NEXT:    v_writelane_b32 v1, s7, m0
+; GFX942_ITERATIVE-NEXT:    s_addc_u32 s7, s7, s3
+; GFX942_ITERATIVE-NEXT:    s_lshl_b64 s[2:3], 1, s2
+; GFX942_ITERATIVE-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX942_ITERATIVE-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX942_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB5_1
+; GFX942_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX942_ITERATIVE-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX942_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v2
+; GFX942_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX942_ITERATIVE-NEXT:    ; implicit-def: $vgpr2_vgpr3
+; GFX942_ITERATIVE-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX942_ITERATIVE-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
+; GFX942_ITERATIVE-NEXT:    s_cbranch_execz .LBB5_4
+; GFX942_ITERATIVE-NEXT:  ; %bb.3:
+; GFX942_ITERATIVE-NEXT:    s_mov_b32 s11, 0xf000
+; GFX942_ITERATIVE-NEXT:    s_mov_b32 s10, -1
+; GFX942_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942_ITERATIVE-NEXT:    s_mov_b32 s8, s2
+; GFX942_ITERATIVE-NEXT:    s_mov_b32 s9, s3
+; GFX942_ITERATIVE-NEXT:    v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942_ITERATIVE-NEXT:    buffer_wbl2 sc1
+; GFX942_ITERATIVE-NEXT:    buffer_atomic_add_x2 v[2:3], off, s[8:11], 0 sc0
+; GFX942_ITERATIVE-NEXT:    s_waitcnt vmcnt(0)
+; GFX942_ITERATIVE-NEXT:    buffer_inv sc1
+; GFX942_ITERATIVE-NEXT:  .LBB5_4:
+; GFX942_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX942_ITERATIVE-NEXT:    v_readfirstlane_b32 s5, v3
+; GFX942_ITERATIVE-NEXT:    v_readfirstlane_b32 s4, v2
+; GFX942_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942_ITERATIVE-NEXT:    s_mov_b32 s3, 0xf000
+; GFX942_ITERATIVE-NEXT:    s_mov_b32 s2, -1
+; GFX942_ITERATIVE-NEXT:    v_lshl_add_u64 v[0:1], s[4:5], 0, v[0:1]
+; GFX942_ITERATIVE-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX942_ITERATIVE-NEXT:    s_endpgm
+;
 ; GFX1064_ITERATIVE-LABEL: add_i64_varying:
 ; GFX1064_ITERATIVE:       ; %bb.0: ; %entry
 ; GFX1064_ITERATIVE-NEXT:    v_mov_b32_e32 v3, 0
@@ -2960,6 +3263,87 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX9_DPP-NEXT:    buffer_store_dwordx2 v[6:7], off, s[0:3], 0
 ; GFX9_DPP-NEXT:    s_endpgm
 ;
+; GFX942_DPP-LABEL: add_i64_varying:
+; GFX942_DPP:       ; %bb.0: ; %entry
+; GFX942_DPP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942_DPP-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX942_DPP-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942_DPP-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX942_DPP-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
+; GFX942_DPP-NEXT:    v_mbcnt_hi_u32_b32 v1, exec_hi, v1
+; GFX942_DPP-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX942_DPP-NEXT:    v_cndmask_b32_e64 v5, 0, 0, s[4:5]
+; GFX942_DPP-NEXT:    v_cndmask_b32_e64 v4, 0, v0, s[4:5]
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v7, 0
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v3, 0
+; GFX942_DPP-NEXT:    v_mov_b32_dpp v7, v5 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX942_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX942_DPP-NEXT:    v_lshl_add_u64 v[4:5], v[4:5], 0, v[6:7]
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v7, 0
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942_DPP-NEXT:    s_nop 0
+; GFX942_DPP-NEXT:    v_mov_b32_dpp v7, v5 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX942_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX942_DPP-NEXT:    v_lshl_add_u64 v[4:5], v[4:5], 0, v[6:7]
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v7, 0
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942_DPP-NEXT:    s_nop 0
+; GFX942_DPP-NEXT:    v_mov_b32_dpp v7, v5 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX942_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX942_DPP-NEXT:    v_lshl_add_u64 v[4:5], v[4:5], 0, v[6:7]
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v7, 0
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942_DPP-NEXT:    s_nop 0
+; GFX942_DPP-NEXT:    v_mov_b32_dpp v7, v5 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX942_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX942_DPP-NEXT:    v_lshl_add_u64 v[4:5], v[4:5], 0, v[6:7]
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v7, 0
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942_DPP-NEXT:    s_nop 0
+; GFX942_DPP-NEXT:    v_mov_b32_dpp v7, v5 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX942_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX942_DPP-NEXT:    v_lshl_add_u64 v[4:5], v[4:5], 0, v[6:7]
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v7, 0
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942_DPP-NEXT:    s_nop 0
+; GFX942_DPP-NEXT:    v_mov_b32_dpp v7, v5 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX942_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX942_DPP-NEXT:    v_lshl_add_u64 v[4:5], v[4:5], 0, v[6:7]
+; GFX942_DPP-NEXT:    s_nop 0
+; GFX942_DPP-NEXT:    v_readlane_b32 s7, v5, 63
+; GFX942_DPP-NEXT:    v_readlane_b32 s6, v4, 63
+; GFX942_DPP-NEXT:    v_mov_b32_dpp v3, v5 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX942_DPP-NEXT:    v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX942_DPP-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX942_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
+; GFX942_DPP-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; GFX942_DPP-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX942_DPP-NEXT:    s_cbranch_execz .LBB5_2
+; GFX942_DPP-NEXT:  ; %bb.1:
+; GFX942_DPP-NEXT:    s_mov_b32 s11, 0xf000
+; GFX942_DPP-NEXT:    s_mov_b32 s10, -1
+; GFX942_DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942_DPP-NEXT:    s_mov_b32 s8, s2
+; GFX942_DPP-NEXT:    s_mov_b32 s9, s3
+; GFX942_DPP-NEXT:    v_mov_b64_e32 v[0:1], s[6:7]
+; GFX942_DPP-NEXT:    buffer_wbl2 sc1
+; GFX942_DPP-NEXT:    buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 sc0
+; GFX942_DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX942_DPP-NEXT:    buffer_inv sc1
+; GFX942_DPP-NEXT:  .LBB5_2:
+; GFX942_DPP-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX942_DPP-NEXT:    v_readfirstlane_b32 s5, v1
+; GFX942_DPP-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX942_DPP-NEXT:    v_mov_b64_e32 v[0:1], v[2:3]
+; GFX942_DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942_DPP-NEXT:    s_mov_b32 s3, 0xf000
+; GFX942_DPP-NEXT:    s_mov_b32 s2, -1
+; GFX942_DPP-NEXT:    v_lshl_add_u64 v[0:1], s[4:5], 0, v[0:1]
+; GFX942_DPP-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX942_DPP-NEXT:    s_endpgm
+;
 ; GFX1064_DPP-LABEL: add_i64_varying:
 ; GFX1064_DPP:       ; %bb.0: ; %entry
 ; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
@@ -3650,6 +4034,40 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX9-NEXT:    s_endpgm
 ;
+; GFX942-LABEL: sub_i32_constant:
+; GFX942:       ; %bb.0: ; %entry
+; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-NEXT:    s_mov_b64 s[6:7], exec
+; GFX942-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
+; GFX942-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX942-NEXT:    ; implicit-def: $vgpr1
+; GFX942-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX942-NEXT:    s_cbranch_execz .LBB6_2
+; GFX942-NEXT:  ; %bb.1:
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_mov_b32 s8, s2
+; GFX942-NEXT:    s_bcnt1_i32_b64 s2, s[6:7]
+; GFX942-NEXT:    s_mul_i32 s2, s2, 5
+; GFX942-NEXT:    s_mov_b32 s11, 0xf000
+; GFX942-NEXT:    s_mov_b32 s10, -1
+; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-NEXT:    buffer_wbl2 sc1
+; GFX942-NEXT:    buffer_atomic_sub v1, off, s[8:11], 0 sc0
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    buffer_inv sc1
+; GFX942-NEXT:  .LBB6_2:
+; GFX942-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX942-NEXT:    v_readfirstlane_b32 s4, v1
+; GFX942-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_mov_b32 s3, 0xf000
+; GFX942-NEXT:    s_mov_b32 s2, -1
+; GFX942-NEXT:    v_sub_u32_e32 v0, s4, v0
+; GFX942-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX942-NEXT:    s_endpgm
+;
 ; GFX1064-LABEL: sub_i32_constant:
 ; GFX1064:       ; %bb.0: ; %entry
 ; GFX1064-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -3969,6 +4387,41 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, ptr addrspace(
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX9-NEXT:    s_endpgm
 ;
+; GFX942-LABEL: sub_i32_uniform:
+; GFX942:       ; %bb.0: ; %entry
+; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-NEXT:    s_load_dword s8, s[4:5], 0x34
+; GFX942-NEXT:    s_mov_b64 s[6:7], exec
+; GFX942-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
+; GFX942-NEXT:    v_mbcnt_hi_u32_b32 v0, s7, v0
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX942-NEXT:    ; implicit-def: $vgpr1
+; GFX942-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX942-NEXT:    s_cbranch_execz .LBB7_2
+; GFX942-NEXT:  ; %bb.1:
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_mov_b32 s12, s2
+; GFX942-NEXT:    s_bcnt1_i32_b64 s2, s[6:7]
+; GFX942-NEXT:    s_mul_i32 s2, s8, s2
+; GFX942-NEXT:    s_mov_b32 s15, 0xf000
+; GFX942-NEXT:    s_mov_b32 s14, -1
+; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-NEXT:    buffer_wbl2 sc1
+; GFX942-NEXT:    buffer_atomic_sub v1, off, s[12:15], 0 sc0
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    buffer_inv sc1
+; GFX942-NEXT:  .LBB7_2:
+; GFX942-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX942-NEXT:    v_readfirstlane_b32 s4, v1
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    v_mul_lo_u32 v0, s8, v0
+; GFX942-NEXT:    s_mov_b32 s3, 0xf000
+; GFX942-NEXT:    s_mov_b32 s2, -1
+; GFX942-NEXT:    v_sub_u32_e32 v0, s4, v0
+; GFX942-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX942-NEXT:    s_endpgm
+;
 ; GFX1064-LABEL: sub_i32_uniform:
 ; GFX1064:       ; %bb.0: ; %entry
 ; GFX1064-NEXT:    s_clause 0x1
@@ -4333,6 +4786,53 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX9_ITERATIVE-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX9_ITERATIVE-NEXT:    s_endpgm
 ;
+; GFX942_ITERATIVE-LABEL: sub_i32_varying:
+; GFX942_ITERATIVE:       ; %bb.0: ; %entry
+; GFX942_ITERATIVE-NEXT:    s_mov_b64 s[0:1], exec
+; GFX942_ITERATIVE-NEXT:    v_and_b32_e32 v1, 0x3ff, v0
+; GFX942_ITERATIVE-NEXT:    s_mov_b32 s6, 0
+; GFX942_ITERATIVE-NEXT:    ; implicit-def: $vgpr0
+; GFX942_ITERATIVE-NEXT:  .LBB8_1: ; %ComputeLoop
+; GFX942_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942_ITERATIVE-NEXT:    s_ff1_i32_b64 s2, s[0:1]
+; GFX942_ITERATIVE-NEXT:    s_mov_b32 m0, s2
+; GFX942_ITERATIVE-NEXT:    v_readlane_b32 s7, v1, s2
+; GFX942_ITERATIVE-NEXT:    s_lshl_b64 s[2:3], 1, s2
+; GFX942_ITERATIVE-NEXT:    v_writelane_b32 v0, s6, m0
+; GFX942_ITERATIVE-NEXT:    s_add_i32 s6, s6, s7
+; GFX942_ITERATIVE-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX942_ITERATIVE-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX942_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB8_1
+; GFX942_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX942_ITERATIVE-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
+; GFX942_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32 v1, exec_hi, v1
+; GFX942_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
+; GFX942_ITERATIVE-NEXT:    ; implicit-def: $vgpr1
+; GFX942_ITERATIVE-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX942_ITERATIVE-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
+; GFX942_ITERATIVE-NEXT:    s_cbranch_execz .LBB8_4
+; GFX942_ITERATIVE-NEXT:  ; %bb.3:
+; GFX942_ITERATIVE-NEXT:    s_mov_b32 s11, 0xf000
+; GFX942_ITERATIVE-NEXT:    s_mov_b32 s10, -1
+; GFX942_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942_ITERATIVE-NEXT:    s_mov_b32 s8, s2
+; GFX942_ITERATIVE-NEXT:    s_mov_b32 s9, s3
+; GFX942_ITERATIVE-NEXT:    v_mov_b32_e32 v1, s6
+; GFX942_ITERATIVE-NEXT:    buffer_wbl2 sc1
+; GFX942_ITERATIVE-NEXT:    buffer_atomic_sub v1, off, s[8:11], 0 sc0
+; GFX942_ITERATIVE-NEXT:    s_waitcnt vmcnt(0)
+; GFX942_ITERATIVE-NEXT:    buffer_inv sc1
+; GFX942_ITERATIVE-NEXT:  .LBB8_4:
+; GFX942_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX942_ITERATIVE-NEXT:    v_readfirstlane_b32 s4, v1
+; GFX942_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942_ITERATIVE-NEXT:    s_mov_b32 s3, 0xf000
+; GFX942_ITERATIVE-NEXT:    s_mov_b32 s2, -1
+; GFX942_ITERATIVE-NEXT:    v_sub_u32_e32 v0, s4, v0
+; GFX942_ITERATIVE-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX942_ITERATIVE-NEXT:    s_endpgm
+;
 ; GFX1064_ITERATIVE-LABEL: sub_i32_varying:
 ; GFX1064_ITERATIVE:       ; %bb.0: ; %entry
 ; GFX1064_ITERATIVE-NEXT:    s_mov_b64 s[0:1], exec
@@ -4735,6 +5235,59 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX9_DPP-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX9_DPP-NEXT:    s_endpgm
 ;
+; GFX942_DPP-LABEL: sub_i32_varying:
+; GFX942_DPP:       ; %bb.0: ; %entry
+; GFX942_DPP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942_DPP-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942_DPP-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX942_DPP-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
+; GFX942_DPP-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX942_DPP-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
+; GFX942_DPP-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX942_DPP-NEXT:    v_cndmask_b32_e64 v2, 0, v0, s[4:5]
+; GFX942_DPP-NEXT:    s_nop 1
+; GFX942_DPP-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX942_DPP-NEXT:    s_nop 1
+; GFX942_DPP-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX942_DPP-NEXT:    s_nop 1
+; GFX942_DPP-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX942_DPP-NEXT:    s_nop 1
+; GFX942_DPP-NEXT:    v_add_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX942_DPP-NEXT:    s_nop 1
+; GFX942_DPP-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX942_DPP-NEXT:    s_nop 1
+; GFX942_DPP-NEXT:    v_add_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX942_DPP-NEXT:    s_nop 0
+; GFX942_DPP-NEXT:    v_readlane_b32 s6, v2, 63
+; GFX942_DPP-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX942_DPP-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX942_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
+; GFX942_DPP-NEXT:    ; implicit-def: $vgpr0
+; GFX942_DPP-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX942_DPP-NEXT:    s_cbranch_execz .LBB8_2
+; GFX942_DPP-NEXT:  ; %bb.1:
+; GFX942_DPP-NEXT:    s_mov_b32 s11, 0xf000
+; GFX942_DPP-NEXT:    s_mov_b32 s10, -1
+; GFX942_DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942_DPP-NEXT:    s_mov_b32 s8, s2
+; GFX942_DPP-NEXT:    s_mov_b32 s9, s3
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v0, s6
+; GFX942_DPP-NEXT:    buffer_wbl2 sc1
+; GFX942_DPP-NEXT:    buffer_atomic_sub v0, off, s[8:11], 0 sc0
+; GFX942_DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX942_DPP-NEXT:    buffer_inv sc1
+; GFX942_DPP-NEXT:  .LBB8_2:
+; GFX942_DPP-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX942_DPP-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v0, v1
+; GFX942_DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942_DPP-NEXT:    s_mov_b32 s3, 0xf000
+; GFX942_DPP-NEXT:    s_mov_b32 s2, -1
+; GFX942_DPP-NEXT:    v_sub_u32_e32 v0, s4, v0
+; GFX942_DPP-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX942_DPP-NEXT:    s_endpgm
+;
 ; GFX1064_DPP-LABEL: sub_i32_varying:
 ; GFX1064_DPP:       ; %bb.0: ; %entry
 ; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
@@ -5221,6 +5774,45 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out, ptr addrspace
 ; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GFX9-NEXT:    s_endpgm
 ;
+; GFX942-LABEL: sub_i64_constant:
+; GFX942:       ; %bb.0: ; %entry
+; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-NEXT:    s_mov_b64 s[6:7], exec
+; GFX942-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
+; GFX942-NEXT:    v_mbcnt_hi_u32_b32 v2, s7, v0
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX942-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; GFX942-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX942-NEXT:    s_cbranch_execz .LBB9_2
+; GFX942-NEXT:  ; %bb.1:
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_mov_b32 s8, s2
+; GFX942-NEXT:    s_bcnt1_i32_b64 s2, s[6:7]
+; GFX942-NEXT:    s_mul_i32 s2, s2, 5
+; GFX942-NEXT:    s_mov_b32 s11, 0xf000
+; GFX942-NEXT:    s_mov_b32 s10, -1
+; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-NEXT:    buffer_wbl2 sc1
+; GFX942-NEXT:    buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 sc0
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    buffer_inv sc1
+; GFX942-NEXT:  .LBB9_2:
+; GFX942-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX942-NEXT:    v_readfirstlane_b32 s4, v1
+; GFX942-NEXT:    v_readfirstlane_b32 s5, v0
+; GFX942-NEXT:    v_mul_u32_u24_e32 v0, 5, v2
+; GFX942-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v2
+; GFX942-NEXT:    v_mov_b32_e32 v2, s4
+; GFX942-NEXT:    v_sub_co_u32_e32 v0, vcc, s5, v0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_mov_b32 s3, 0xf000
+; GFX942-NEXT:    s_mov_b32 s2, -1
+; GFX942-NEXT:    v_subb_co_u32_e32 v1, vcc, v2, v1, vcc
+; GFX942-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX942-NEXT:    s_endpgm
+;
 ; GFX1064-LABEL: sub_i64_constant:
 ; GFX1064:       ; %bb.0: ; %entry
 ; GFX1064-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -5589,6 +6181,50 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, ptr addrspace(
 ; GFX9-NEXT:    buffer_store_dwordx2 v[1:2], off, s[0:3], 0
 ; GFX9-NEXT:    s_endpgm
 ;
+; GFX942-LABEL: sub_i64_uniform:
+; GFX942:       ; %bb.0: ; %entry
+; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX942-NEXT:    s_mov_b64 s[8:9], exec
+; GFX942-NEXT:    v_mbcnt_lo_u32_b32 v0, s8, 0
+; GFX942-NEXT:    v_mbcnt_hi_u32_b32 v2, s9, v0
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX942-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; GFX942-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX942-NEXT:    s_cbranch_execz .LBB10_2
+; GFX942-NEXT:  ; %bb.1:
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_mov_b32 s12, s2
+; GFX942-NEXT:    s_bcnt1_i32_b64 s2, s[8:9]
+; GFX942-NEXT:    s_mov_b32 s13, s3
+; GFX942-NEXT:    s_mul_i32 s3, s7, s2
+; GFX942-NEXT:    s_mul_hi_u32 s8, s6, s2
+; GFX942-NEXT:    s_add_i32 s8, s8, s3
+; GFX942-NEXT:    s_mul_i32 s2, s6, s2
+; GFX942-NEXT:    s_mov_b32 s15, 0xf000
+; GFX942-NEXT:    s_mov_b32 s14, -1
+; GFX942-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-NEXT:    v_mov_b32_e32 v1, s8
+; GFX942-NEXT:    buffer_wbl2 sc1
+; GFX942-NEXT:    buffer_atomic_sub_x2 v[0:1], off, s[12:15], 0 sc0
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    buffer_inv sc1
+; GFX942-NEXT:  .LBB10_2:
+; GFX942-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX942-NEXT:    v_readfirstlane_b32 s8, v1
+; GFX942-NEXT:    v_readfirstlane_b32 s9, v0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], s6, v2, 0
+; GFX942-NEXT:    v_mov_b32_e32 v4, v1
+; GFX942-NEXT:    v_mad_u64_u32 v[2:3], s[4:5], s7, v2, v[4:5]
+; GFX942-NEXT:    v_mov_b32_e32 v1, s8
+; GFX942-NEXT:    v_sub_co_u32_e32 v0, vcc, s9, v0
+; GFX942-NEXT:    s_mov_b32 s3, 0xf000
+; GFX942-NEXT:    s_mov_b32 s2, -1
+; GFX942-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v2, vcc
+; GFX942-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX942-NEXT:    s_endpgm
+;
 ; GFX1064-LABEL: sub_i64_uniform:
 ; GFX1064:       ; %bb.0: ; %entry
 ; GFX1064-NEXT:    s_clause 0x1
@@ -6017,6 +6653,61 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX9_ITERATIVE-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GFX9_ITERATIVE-NEXT:    s_endpgm
 ;
+; GFX942_ITERATIVE-LABEL: sub_i64_varying:
+; GFX942_ITERATIVE:       ; %bb.0: ; %entry
+; GFX942_ITERATIVE-NEXT:    s_mov_b64 s[0:1], exec
+; GFX942_ITERATIVE-NEXT:    v_and_b32_e32 v2, 0x3ff, v0
+; GFX942_ITERATIVE-NEXT:    v_mov_b32_e32 v3, 0
+; GFX942_ITERATIVE-NEXT:    s_mov_b64 s[6:7], 0
+; GFX942_ITERATIVE-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; GFX942_ITERATIVE-NEXT:  .LBB11_1: ; %ComputeLoop
+; GFX942_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942_ITERATIVE-NEXT:    s_ff1_i32_b64 s2, s[0:1]
+; GFX942_ITERATIVE-NEXT:    s_mov_b32 m0, s2
+; GFX942_ITERATIVE-NEXT:    v_readlane_b32 s8, v2, s2
+; GFX942_ITERATIVE-NEXT:    v_readlane_b32 s3, v3, s2
+; GFX942_ITERATIVE-NEXT:    v_writelane_b32 v0, s6, m0
+; GFX942_ITERATIVE-NEXT:    s_add_u32 s6, s6, s8
+; GFX942_ITERATIVE-NEXT:    v_writelane_b32 v1, s7, m0
+; GFX942_ITERATIVE-NEXT:    s_addc_u32 s7, s7, s3
+; GFX942_ITERATIVE-NEXT:    s_lshl_b64 s[2:3], 1, s2
+; GFX942_ITERATIVE-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX942_ITERATIVE-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX942_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB11_1
+; GFX942_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX942_ITERATIVE-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX942_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v2
+; GFX942_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX942_ITERATIVE-NEXT:    ; implicit-def: $vgpr2_vgpr3
+; GFX942_ITERATIVE-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX942_ITERATIVE-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
+; GFX942_ITERATIVE-NEXT:    s_cbranch_execz .LBB11_4
+; GFX942_ITERATIVE-NEXT:  ; %bb.3:
+; GFX942_ITERATIVE-NEXT:    s_mov_b32 s11, 0xf000
+; GFX942_ITERATIVE-NEXT:    s_mov_b32 s10, -1
+; GFX942_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942_ITERATIVE-NEXT:    s_mov_b32 s8, s2
+; GFX942_ITERATIVE-NEXT:    s_mov_b32 s9, s3
+; GFX942_ITERATIVE-NEXT:    v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942_ITERATIVE-NEXT:    buffer_wbl2 sc1
+; GFX942_ITERATIVE-NEXT:    buffer_atomic_sub_x2 v[2:3], off, s[8:11], 0 sc0
+; GFX942_ITERATIVE-NEXT:    s_waitcnt vmcnt(0)
+; GFX942_ITERATIVE-NEXT:    buffer_inv sc1
+; GFX942_ITERATIVE-NEXT:  .LBB11_4:
+; GFX942_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX942_ITERATIVE-NEXT:    v_readfirstlane_b32 s4, v3
+; GFX942_ITERATIVE-NEXT:    v_readfirstlane_b32 s5, v2
+; GFX942_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942_ITERATIVE-NEXT:    s_mov_b32 s3, 0xf000
+; GFX942_ITERATIVE-NEXT:    v_mov_b32_e32 v2, s4
+; GFX942_ITERATIVE-NEXT:    v_sub_co_u32_e32 v0, vcc, s5, v0
+; GFX942_ITERATIVE-NEXT:    s_mov_b32 s2, -1
+; GFX942_ITERATIVE-NEXT:    s_nop 0
+; GFX942_ITERATIVE-NEXT:    v_subb_co_u32_e32 v1, vcc, v2, v1, vcc
+; GFX942_ITERATIVE-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX942_ITERATIVE-NEXT:    s_endpgm
+;
 ; GFX1064_ITERATIVE-LABEL: sub_i64_varying:
 ; GFX1064_ITERATIVE:       ; %bb.0: ; %entry
 ; GFX1064_ITERATIVE-NEXT:    v_mov_b32_e32 v3, 0
@@ -6526,6 +7217,89 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out, ptr addrspace(
 ; GFX9_DPP-NEXT:    buffer_store_dwordx2 v[6:7], off, s[0:3], 0
 ; GFX9_DPP-NEXT:    s_endpgm
 ;
+; GFX942_DPP-LABEL: sub_i64_varying:
+; GFX942_DPP:       ; %bb.0: ; %entry
+; GFX942_DPP-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942_DPP-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX942_DPP-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942_DPP-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX942_DPP-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
+; GFX942_DPP-NEXT:    v_mbcnt_hi_u32_b32 v1, exec_hi, v1
+; GFX942_DPP-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX942_DPP-NEXT:    v_cndmask_b32_e64 v5, 0, 0, s[4:5]
+; GFX942_DPP-NEXT:    v_cndmask_b32_e64 v4, 0, v0, s[4:5]
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v7, 0
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v3, 0
+; GFX942_DPP-NEXT:    v_mov_b32_dpp v7, v5 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX942_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX942_DPP-NEXT:    v_lshl_add_u64 v[4:5], v[4:5], 0, v[6:7]
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v7, 0
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942_DPP-NEXT:    s_nop 0
+; GFX942_DPP-NEXT:    v_mov_b32_dpp v7, v5 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX942_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX942_DPP-NEXT:    v_lshl_add_u64 v[4:5], v[4:5], 0, v[6:7]
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v7, 0
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942_DPP-NEXT:    s_nop 0
+; GFX942_DPP-NEXT:    v_mov_b32_dpp v7, v5 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX942_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX942_DPP-NEXT:    v_lshl_add_u64 v[4:5], v[4:5], 0, v[6:7]
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v7, 0
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942_DPP-NEXT:    s_nop 0
+; GFX942_DPP-NEXT:    v_mov_b32_dpp v7, v5 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX942_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX942_DPP-NEXT:    v_lshl_add_u64 v[4:5], v[4:5], 0, v[6:7]
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v7, 0
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942_DPP-NEXT:    s_nop 0
+; GFX942_DPP-NEXT:    v_mov_b32_dpp v7, v5 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX942_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX942_DPP-NEXT:    v_lshl_add_u64 v[4:5], v[4:5], 0, v[6:7]
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v7, 0
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942_DPP-NEXT:    s_nop 0
+; GFX942_DPP-NEXT:    v_mov_b32_dpp v7, v5 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX942_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX942_DPP-NEXT:    v_lshl_add_u64 v[4:5], v[4:5], 0, v[6:7]
+; GFX942_DPP-NEXT:    s_nop 0
+; GFX942_DPP-NEXT:    v_readlane_b32 s7, v5, 63
+; GFX942_DPP-NEXT:    v_readlane_b32 s6, v4, 63
+; GFX942_DPP-NEXT:    v_mov_b32_dpp v3, v5 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX942_DPP-NEXT:    v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX942_DPP-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX942_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
+; GFX942_DPP-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; GFX942_DPP-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX942_DPP-NEXT:    s_cbranch_execz .LBB11_2
+; GFX942_DPP-NEXT:  ; %bb.1:
+; GFX942_DPP-NEXT:    s_mov_b32 s11, 0xf000
+; GFX942_DPP-NEXT:    s_mov_b32 s10, -1
+; GFX942_DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942_DPP-NEXT:    s_mov_b32 s8, s2
+; GFX942_DPP-NEXT:    s_mov_b32 s9, s3
+; GFX942_DPP-NEXT:    v_mov_b64_e32 v[0:1], s[6:7]
+; GFX942_DPP-NEXT:    buffer_wbl2 sc1
+; GFX942_DPP-NEXT:    buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 sc0
+; GFX942_DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX942_DPP-NEXT:    buffer_inv sc1
+; GFX942_DPP-NEXT:  .LBB11_2:
+; GFX942_DPP-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX942_DPP-NEXT:    v_readfirstlane_b32 s4, v1
+; GFX942_DPP-NEXT:    v_readfirstlane_b32 s5, v0
+; GFX942_DPP-NEXT:    v_mov_b64_e32 v[0:1], v[2:3]
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v8, s4
+; GFX942_DPP-NEXT:    v_sub_co_u32_e32 v0, vcc, s5, v0
+; GFX942_DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942_DPP-NEXT:    s_mov_b32 s3, 0xf000
+; GFX942_DPP-NEXT:    s_mov_b32 s2, -1
+; GFX942_DPP-NEXT:    v_subb_co_u32_e32 v1, vcc, v8, v1, vcc
+; GFX942_DPP-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX942_DPP-NEXT:    s_endpgm
+;
 ; GFX1064_DPP-LABEL: sub_i64_varying:
 ; GFX1064_DPP:       ; %bb.0: ; %entry
 ; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
@@ -7228,6 +8002,43 @@ define amdgpu_kernel void @uniform_or_i8(ptr addrspace(1) %result, ptr addrspace
 ; GFX9-NEXT:    buffer_store_byte v0, off, s[0:3], 0
 ; GFX9-NEXT:    s_endpgm
 ;
+; GFX942-LABEL: uniform_or_i8:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-NEXT:    s_load_dword s6, s[4:5], 0x34
+; GFX942-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX942-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX942-NEXT:    ; implicit-def: $vgpr0
+; GFX942-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX942-NEXT:    s_cbranch_execz .LBB12_2
+; GFX942-NEXT:  ; %bb.1:
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_and_b32 s8, s2, -4
+; GFX942-NEXT:    s_and_b32 s2, s2, 3
+; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    s_lshl_b32 s2, s2, 3
+; GFX942-NEXT:    s_and_b32 s3, s6, 0xff
+; GFX942-NEXT:    s_lshl_b32 s3, s3, s2
+; GFX942-NEXT:    s_mov_b32 s11, 0xf000
+; GFX942-NEXT:    s_mov_b32 s10, -1
+; GFX942-NEXT:    v_mov_b32_e32 v0, s3
+; GFX942-NEXT:    buffer_atomic_or v0, off, s[8:11], 0 sc0 sc1
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    v_lshrrev_b32_e32 v0, s2, v0
+; GFX942-NEXT:  .LBB12_2:
+; GFX942-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX942-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_mov_b32 s3, 0xf000
+; GFX942-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX942-NEXT:    v_mov_b32_e32 v0, s6
+; GFX942-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
+; GFX942-NEXT:    s_mov_b32 s2, -1
+; GFX942-NEXT:    v_or_b32_e32 v0, s4, v0
+; GFX942-NEXT:    buffer_store_byte v0, off, s[0:3], 0
+; GFX942-NEXT:    s_endpgm
+;
 ; GFX1064-LABEL: uniform_or_i8:
 ; GFX1064:       ; %bb.0:
 ; GFX1064-NEXT:    s_clause 0x1
@@ -7800,6 +8611,64 @@ define amdgpu_kernel void @uniform_add_i8(ptr addrspace(1) %result, ptr addrspac
 ; GFX9-NEXT:    buffer_store_byte v0, off, s[0:3], 0
 ; GFX9-NEXT:    s_endpgm
 ;
+; GFX942-LABEL: uniform_add_i8:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-NEXT:    s_load_dword s10, s[4:5], 0x34
+; GFX942-NEXT:    s_mov_b64 s[6:7], exec
+; GFX942-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
+; GFX942-NEXT:    v_mbcnt_hi_u32_b32 v4, s7, v0
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
+; GFX942-NEXT:    ; implicit-def: $vgpr0
+; GFX942-NEXT:    s_and_saveexec_b64 s[8:9], vcc
+; GFX942-NEXT:    s_cbranch_execz .LBB13_4
+; GFX942-NEXT:  ; %bb.1:
+; GFX942-NEXT:    s_bcnt1_i32_b64 s4, s[6:7]
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_mul_i32 s6, s10, s4
+; GFX942-NEXT:    s_and_b32 s4, s2, -4
+; GFX942-NEXT:    s_mov_b32 s5, s3
+; GFX942-NEXT:    s_load_dword s7, s[4:5], 0x0
+; GFX942-NEXT:    s_and_b32 s2, s2, 3
+; GFX942-NEXT:    s_lshl_b32 s11, s2, 3
+; GFX942-NEXT:    s_lshl_b32 s12, 0xff, s11
+; GFX942-NEXT:    s_and_b32 s2, s6, 0xff
+; GFX942-NEXT:    s_not_b32 s13, s12
+; GFX942-NEXT:    s_lshl_b32 s14, s2, s11
+; GFX942-NEXT:    s_mov_b64 s[2:3], 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b32_e32 v1, s7
+; GFX942-NEXT:    s_mov_b32 s7, 0xf000
+; GFX942-NEXT:    s_mov_b32 s6, -1
+; GFX942-NEXT:  .LBB13_2: ; %atomicrmw.start
+; GFX942-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-NEXT:    v_add_u32_e32 v0, s14, v1
+; GFX942-NEXT:    v_and_b32_e32 v0, s12, v0
+; GFX942-NEXT:    v_and_or_b32 v0, v1, s13, v0
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[0:1]
+; GFX942-NEXT:    buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 sc0 sc1
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX942-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX942-NEXT:    v_mov_b32_e32 v1, v2
+; GFX942-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX942-NEXT:    s_cbranch_execnz .LBB13_2
+; GFX942-NEXT:  ; %bb.3: ; %atomicrmw.end
+; GFX942-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX942-NEXT:    v_lshrrev_b32_e32 v0, s11, v2
+; GFX942-NEXT:  .LBB13_4: ; %Flow
+; GFX942-NEXT:    s_or_b64 exec, exec, s[8:9]
+; GFX942-NEXT:    v_and_b32_e32 v0, 0xff, v0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_mov_b32 s3, 0xf000
+; GFX942-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX942-NEXT:    s_mov_b32 s2, -1
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_mov_b32_e32 v0, s4
+; GFX942-NEXT:    v_mad_legacy_u16 v0, s10, v4, v0
+; GFX942-NEXT:    buffer_store_byte v0, off, s[0:3], 0
+; GFX942-NEXT:    s_endpgm
+;
 ; GFX1064-LABEL: uniform_add_i8:
 ; GFX1064:       ; %bb.0:
 ; GFX1064-NEXT:    s_clause 0x1
@@ -8559,6 +9428,45 @@ define amdgpu_kernel void @uniform_xchg_i8(ptr addrspace(1) %result, ptr addrspa
 ; GFX9-NEXT:    buffer_store_byte v0, off, s[0:3], 0
 ; GFX9-NEXT:    s_endpgm
 ;
+; GFX942-LABEL: uniform_xchg_i8:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-NEXT:    s_load_dword s6, s[4:5], 0x34
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_and_b32 s4, s2, -4
+; GFX942-NEXT:    s_mov_b32 s5, s3
+; GFX942-NEXT:    s_load_dword s7, s[4:5], 0x0
+; GFX942-NEXT:    s_and_b32 s2, s2, 3
+; GFX942-NEXT:    s_lshl_b32 s8, s2, 3
+; GFX942-NEXT:    s_lshl_b32 s2, 0xff, s8
+; GFX942-NEXT:    s_not_b32 s9, s2
+; GFX942-NEXT:    s_and_b32 s2, s6, 0xff
+; GFX942-NEXT:    s_lshl_b32 s10, s2, s8
+; GFX942-NEXT:    s_mov_b64 s[2:3], 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b32_e32 v1, s7
+; GFX942-NEXT:    s_mov_b32 s7, 0xf000
+; GFX942-NEXT:    s_mov_b32 s6, -1
+; GFX942-NEXT:  .LBB14_1: ; %atomicrmw.start
+; GFX942-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-NEXT:    v_and_b32_e32 v0, s9, v1
+; GFX942-NEXT:    v_or_b32_e32 v0, s10, v0
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[0:1]
+; GFX942-NEXT:    buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 sc0 sc1
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX942-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX942-NEXT:    v_mov_b32_e32 v1, v2
+; GFX942-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX942-NEXT:    s_cbranch_execnz .LBB14_1
+; GFX942-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX942-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX942-NEXT:    s_mov_b32 s3, 0xf000
+; GFX942-NEXT:    s_mov_b32 s2, -1
+; GFX942-NEXT:    v_lshrrev_b32_e32 v0, s8, v2
+; GFX942-NEXT:    buffer_store_byte v0, off, s[0:3], 0
+; GFX942-NEXT:    s_endpgm
+;
 ; GFX1064-LABEL: uniform_xchg_i8:
 ; GFX1064:       ; %bb.0:
 ; GFX1064-NEXT:    s_clause 0x1
@@ -8921,6 +9829,43 @@ define amdgpu_kernel void @uniform_or_i16(ptr addrspace(1) %result, ptr addrspac
 ; GFX9-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; GFX9-NEXT:    s_endpgm
 ;
+; GFX942-LABEL: uniform_or_i16:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-NEXT:    s_load_dword s6, s[4:5], 0x34
+; GFX942-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX942-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX942-NEXT:    ; implicit-def: $vgpr0
+; GFX942-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX942-NEXT:    s_cbranch_execz .LBB15_2
+; GFX942-NEXT:  ; %bb.1:
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_and_b32 s8, s2, -4
+; GFX942-NEXT:    s_and_b32 s2, s2, 3
+; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    s_lshl_b32 s2, s2, 3
+; GFX942-NEXT:    s_and_b32 s3, 0xffff, s6
+; GFX942-NEXT:    s_lshl_b32 s3, s3, s2
+; GFX942-NEXT:    s_mov_b32 s11, 0xf000
+; GFX942-NEXT:    s_mov_b32 s10, -1
+; GFX942-NEXT:    v_mov_b32_e32 v0, s3
+; GFX942-NEXT:    buffer_atomic_or v0, off, s[8:11], 0 sc0 sc1
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    v_lshrrev_b32_e32 v0, s2, v0
+; GFX942-NEXT:  .LBB15_2:
+; GFX942-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX942-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_mov_b32 s3, 0xf000
+; GFX942-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX942-NEXT:    v_mov_b32_e32 v0, s6
+; GFX942-NEXT:    v_cndmask_b32_e64 v0, v0, 0, vcc
+; GFX942-NEXT:    s_mov_b32 s2, -1
+; GFX942-NEXT:    v_or_b32_e32 v0, s4, v0
+; GFX942-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; GFX942-NEXT:    s_endpgm
+;
 ; GFX1064-LABEL: uniform_or_i16:
 ; GFX1064:       ; %bb.0:
 ; GFX1064-NEXT:    s_clause 0x1
@@ -9493,6 +10438,64 @@ define amdgpu_kernel void @uniform_add_i16(ptr addrspace(1) %result, ptr addrspa
 ; GFX9-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; GFX9-NEXT:    s_endpgm
 ;
+; GFX942-LABEL: uniform_add_i16:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-NEXT:    s_load_dword s10, s[4:5], 0x34
+; GFX942-NEXT:    s_mov_b64 s[6:7], exec
+; GFX942-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
+; GFX942-NEXT:    v_mbcnt_hi_u32_b32 v4, s7, v0
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
+; GFX942-NEXT:    ; implicit-def: $vgpr0
+; GFX942-NEXT:    s_and_saveexec_b64 s[8:9], vcc
+; GFX942-NEXT:    s_cbranch_execz .LBB16_4
+; GFX942-NEXT:  ; %bb.1:
+; GFX942-NEXT:    s_bcnt1_i32_b64 s4, s[6:7]
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_mul_i32 s6, s10, s4
+; GFX942-NEXT:    s_and_b32 s4, s2, -4
+; GFX942-NEXT:    s_mov_b32 s5, s3
+; GFX942-NEXT:    s_load_dword s7, s[4:5], 0x0
+; GFX942-NEXT:    s_and_b32 s2, s2, 3
+; GFX942-NEXT:    s_lshl_b32 s11, s2, 3
+; GFX942-NEXT:    s_lshl_b32 s12, 0xffff, s11
+; GFX942-NEXT:    s_and_b32 s2, s6, 0xffff
+; GFX942-NEXT:    s_not_b32 s13, s12
+; GFX942-NEXT:    s_lshl_b32 s14, s2, s11
+; GFX942-NEXT:    s_mov_b64 s[2:3], 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b32_e32 v1, s7
+; GFX942-NEXT:    s_mov_b32 s7, 0xf000
+; GFX942-NEXT:    s_mov_b32 s6, -1
+; GFX942-NEXT:  .LBB16_2: ; %atomicrmw.start
+; GFX942-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-NEXT:    v_add_u32_e32 v0, s14, v1
+; GFX942-NEXT:    v_and_b32_e32 v0, s12, v0
+; GFX942-NEXT:    v_and_or_b32 v0, v1, s13, v0
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[0:1]
+; GFX942-NEXT:    buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 sc0 sc1
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX942-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX942-NEXT:    v_mov_b32_e32 v1, v2
+; GFX942-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX942-NEXT:    s_cbranch_execnz .LBB16_2
+; GFX942-NEXT:  ; %bb.3: ; %atomicrmw.end
+; GFX942-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX942-NEXT:    v_lshrrev_b32_e32 v0, s11, v2
+; GFX942-NEXT:  .LBB16_4: ; %Flow
+; GFX942-NEXT:    s_or_b64 exec, exec, s[8:9]
+; GFX942-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_mov_b32 s3, 0xf000
+; GFX942-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX942-NEXT:    s_mov_b32 s2, -1
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_mov_b32_e32 v0, s4
+; GFX942-NEXT:    v_mad_legacy_u16 v0, s10, v4, v0
+; GFX942-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; GFX942-NEXT:    s_endpgm
+;
 ; GFX1064-LABEL: uniform_add_i16:
 ; GFX1064:       ; %bb.0:
 ; GFX1064-NEXT:    s_clause 0x1
@@ -10252,6 +11255,45 @@ define amdgpu_kernel void @uniform_xchg_i16(ptr addrspace(1) %result, ptr addrsp
 ; GFX9-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; GFX9-NEXT:    s_endpgm
 ;
+; GFX942-LABEL: uniform_xchg_i16:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-NEXT:    s_load_dword s6, s[4:5], 0x34
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_and_b32 s4, s2, -4
+; GFX942-NEXT:    s_mov_b32 s5, s3
+; GFX942-NEXT:    s_load_dword s7, s[4:5], 0x0
+; GFX942-NEXT:    s_and_b32 s2, s2, 3
+; GFX942-NEXT:    s_lshl_b32 s8, s2, 3
+; GFX942-NEXT:    s_lshl_b32 s2, 0xffff, s8
+; GFX942-NEXT:    s_not_b32 s9, s2
+; GFX942-NEXT:    s_and_b32 s2, s6, 0xffff
+; GFX942-NEXT:    s_lshl_b32 s10, s2, s8
+; GFX942-NEXT:    s_mov_b64 s[2:3], 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b32_e32 v1, s7
+; GFX942-NEXT:    s_mov_b32 s7, 0xf000
+; GFX942-NEXT:    s_mov_b32 s6, -1
+; GFX942-NEXT:  .LBB17_1: ; %atomicrmw.start
+; GFX942-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-NEXT:    v_and_b32_e32 v0, s9, v1
+; GFX942-NEXT:    v_or_b32_e32 v0, s10, v0
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[0:1]
+; GFX942-NEXT:    buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 sc0 sc1
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX942-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX942-NEXT:    v_mov_b32_e32 v1, v2
+; GFX942-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX942-NEXT:    s_cbranch_execnz .LBB17_1
+; GFX942-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX942-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX942-NEXT:    s_mov_b32 s3, 0xf000
+; GFX942-NEXT:    s_mov_b32 s2, -1
+; GFX942-NEXT:    v_lshrrev_b32_e32 v0, s8, v2
+; GFX942-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; GFX942-NEXT:    s_endpgm
+;
 ; GFX1064-LABEL: uniform_xchg_i16:
 ; GFX1064:       ; %bb.0:
 ; GFX1064-NEXT:    s_clause 0x1
@@ -10631,6 +11673,45 @@ define amdgpu_kernel void @uniform_fadd_f16(ptr addrspace(1) %result, ptr addrsp
 ; GFX9-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; GFX9-NEXT:    s_endpgm
 ;
+; GFX942-LABEL: uniform_fadd_f16:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-NEXT:    s_load_dword s11, s[4:5], 0x34
+; GFX942-NEXT:    s_mov_b64 s[8:9], 0
+; GFX942-NEXT:    s_mov_b32 s7, 0xf000
+; GFX942-NEXT:    s_mov_b32 s6, -1
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_and_b32 s4, s2, -4
+; GFX942-NEXT:    s_mov_b32 s5, s3
+; GFX942-NEXT:    s_load_dword s3, s[4:5], 0x0
+; GFX942-NEXT:    s_and_b32 s2, s2, 3
+; GFX942-NEXT:    s_lshl_b32 s10, s2, 3
+; GFX942-NEXT:    s_lshl_b32 s2, 0xffff, s10
+; GFX942-NEXT:    s_not_b32 s2, s2
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b32_e32 v1, s3
+; GFX942-NEXT:  .LBB18_1: ; %atomicrmw.start
+; GFX942-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-NEXT:    v_lshrrev_b32_e32 v0, s10, v1
+; GFX942-NEXT:    v_add_f16_e32 v0, s11, v0
+; GFX942-NEXT:    v_lshlrev_b32_e32 v0, s10, v0
+; GFX942-NEXT:    v_and_or_b32 v0, v1, s2, v0
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[0:1]
+; GFX942-NEXT:    buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 sc0 sc1
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX942-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX942-NEXT:    v_mov_b32_e32 v1, v2
+; GFX942-NEXT:    s_andn2_b64 exec, exec, s[8:9]
+; GFX942-NEXT:    s_cbranch_execnz .LBB18_1
+; GFX942-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX942-NEXT:    s_or_b64 exec, exec, s[8:9]
+; GFX942-NEXT:    s_mov_b32 s3, 0xf000
+; GFX942-NEXT:    s_mov_b32 s2, -1
+; GFX942-NEXT:    v_lshrrev_b32_e32 v0, s10, v2
+; GFX942-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; GFX942-NEXT:    s_endpgm
+;
 ; GFX1064-LABEL: uniform_fadd_f16:
 ; GFX1064:       ; %bb.0:
 ; GFX1064-NEXT:    s_clause 0x1
@@ -11233,6 +12314,54 @@ define amdgpu_kernel void @uniform_fadd_bf16(ptr addrspace(1) %result, ptr addrs
 ; GFX9-NEXT:    buffer_store_short v0, off, s[0:3], 0
 ; GFX9-NEXT:    s_endpgm
 ;
+; GFX942-LABEL: uniform_fadd_bf16:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-NEXT:    s_load_dword s6, s[4:5], 0x34
+; GFX942-NEXT:    s_mov_b64 s[8:9], 0
+; GFX942-NEXT:    s_movk_i32 s11, 0x7fff
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_and_b32 s4, s2, -4
+; GFX942-NEXT:    s_mov_b32 s5, s3
+; GFX942-NEXT:    s_load_dword s7, s[4:5], 0x0
+; GFX942-NEXT:    s_and_b32 s3, s2, 3
+; GFX942-NEXT:    s_lshl_b32 s10, s3, 3
+; GFX942-NEXT:    s_lshl_b32 s3, 0xffff, s10
+; GFX942-NEXT:    s_lshl_b32 s2, s6, 16
+; GFX942-NEXT:    s_not_b32 s3, s3
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b32_e32 v1, s7
+; GFX942-NEXT:    s_mov_b32 s7, 0xf000
+; GFX942-NEXT:    s_mov_b32 s6, -1
+; GFX942-NEXT:  .LBB19_1: ; %atomicrmw.start
+; GFX942-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-NEXT:    v_lshrrev_b32_sdwa v0, s10, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_add_f32_e32 v0, s2, v0
+; GFX942-NEXT:    v_bfe_u32 v2, v0, 16, 1
+; GFX942-NEXT:    v_or_b32_e32 v3, 0x400000, v0
+; GFX942-NEXT:    v_add3_u32 v2, v2, v0, s11
+; GFX942-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
+; GFX942-NEXT:    v_lshlrev_b32_sdwa v0, s10, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX942-NEXT:    v_and_or_b32 v0, v1, s3, v0
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[0:1]
+; GFX942-NEXT:    buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 sc0 sc1
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX942-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX942-NEXT:    v_mov_b32_e32 v1, v2
+; GFX942-NEXT:    s_andn2_b64 exec, exec, s[8:9]
+; GFX942-NEXT:    s_cbranch_execnz .LBB19_1
+; GFX942-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX942-NEXT:    s_or_b64 exec, exec, s[8:9]
+; GFX942-NEXT:    s_mov_b32 s3, 0xf000
+; GFX942-NEXT:    s_mov_b32 s2, -1
+; GFX942-NEXT:    v_lshrrev_b32_e32 v0, s10, v2
+; GFX942-NEXT:    buffer_store_short v0, off, s[0:3], 0
+; GFX942-NEXT:    s_endpgm
+;
 ; GFX1064-LABEL: uniform_fadd_bf16:
 ; GFX1064:       ; %bb.0:
 ; GFX1064-NEXT:    s_clause 0x1
@@ -11907,6 +13036,38 @@ define amdgpu_kernel void @uniform_fadd_v2f16(ptr addrspace(1) %result, ptr addr
 ; GFX9-NEXT:    buffer_store_dword v2, off, s[0:3], 0
 ; GFX9-NEXT:    s_endpgm
 ;
+; GFX942-LABEL: uniform_fadd_v2f16:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-NEXT:    s_load_dword s10, s[4:5], 0x34
+; GFX942-NEXT:    s_mov_b64 s[8:9], 0
+; GFX942-NEXT:    s_mov_b32 s7, 0xf000
+; GFX942-NEXT:    s_mov_b32 s6, -1
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_load_dword s5, s[2:3], 0x0
+; GFX942-NEXT:    s_mov_b32 s4, s2
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b32_e32 v1, s5
+; GFX942-NEXT:    s_mov_b32 s5, s3
+; GFX942-NEXT:  .LBB20_1: ; %atomicrmw.start
+; GFX942-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-NEXT:    v_pk_add_f16 v0, v1, s10
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[0:1]
+; GFX942-NEXT:    buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 sc0 sc1
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX942-NEXT:    s_or_b64 s[8:9], vcc, s[8:9]
+; GFX942-NEXT:    v_mov_b32_e32 v1, v2
+; GFX942-NEXT:    s_andn2_b64 exec, exec, s[8:9]
+; GFX942-NEXT:    s_cbranch_execnz .LBB20_1
+; GFX942-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX942-NEXT:    s_or_b64 exec, exec, s[8:9]
+; GFX942-NEXT:    s_mov_b32 s3, 0xf000
+; GFX942-NEXT:    s_mov_b32 s2, -1
+; GFX942-NEXT:    buffer_store_dword v2, off, s[0:3], 0
+; GFX942-NEXT:    s_endpgm
+;
 ; GFX1064-LABEL: uniform_fadd_v2f16:
 ; GFX1064:       ; %bb.0:
 ; GFX1064-NEXT:    s_clause 0x1
@@ -12273,6 +13434,56 @@ define amdgpu_kernel void @uniform_fadd_v2bf16(ptr addrspace(1) %result, ptr add
 ; GFX9-NEXT:    buffer_store_dword v2, off, s[8:11], 0
 ; GFX9-NEXT:    s_endpgm
 ;
+; GFX942-LABEL: uniform_fadd_v2bf16:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x24
+; GFX942-NEXT:    s_load_dword s0, s[4:5], 0x34
+; GFX942-NEXT:    s_mov_b64 s[2:3], 0
+; GFX942-NEXT:    s_movk_i32 s12, 0x7fff
+; GFX942-NEXT:    s_mov_b32 s13, 0x7060302
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_load_dword s1, s[10:11], 0x0
+; GFX942-NEXT:    s_mov_b32 s7, 0xf000
+; GFX942-NEXT:    s_mov_b32 s6, -1
+; GFX942-NEXT:    s_lshl_b32 s14, s0, 16
+; GFX942-NEXT:    s_and_b32 s15, s0, 0xffff0000
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b32_e32 v1, s1
+; GFX942-NEXT:    s_mov_b32 s4, s10
+; GFX942-NEXT:    s_mov_b32 s5, s11
+; GFX942-NEXT:  .LBB21_1: ; %atomicrmw.start
+; GFX942-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-NEXT:    v_lshlrev_b32_e32 v0, 16, v1
+; GFX942-NEXT:    v_and_b32_e32 v2, 0xffff0000, v1
+; GFX942-NEXT:    v_add_f32_e32 v0, s14, v0
+; GFX942-NEXT:    v_add_f32_e32 v2, s15, v2
+; GFX942-NEXT:    v_bfe_u32 v3, v0, 16, 1
+; GFX942-NEXT:    v_bfe_u32 v5, v2, 16, 1
+; GFX942-NEXT:    v_or_b32_e32 v4, 0x400000, v0
+; GFX942-NEXT:    v_or_b32_e32 v6, 0x400000, v2
+; GFX942-NEXT:    v_add3_u32 v3, v3, v0, s12
+; GFX942-NEXT:    v_add3_u32 v5, v5, v2, s12
+; GFX942-NEXT:    v_cmp_u_f32_e32 vcc, v2, v2
+; GFX942-NEXT:    v_cmp_u_f32_e64 s[0:1], v0, v0
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_cndmask_b32_e32 v2, v5, v6, vcc
+; GFX942-NEXT:    v_cndmask_b32_e64 v0, v3, v4, s[0:1]
+; GFX942-NEXT:    v_perm_b32 v0, v2, v0, s13
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[0:1]
+; GFX942-NEXT:    buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 sc0 sc1
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v1
+; GFX942-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX942-NEXT:    v_mov_b32_e32 v1, v2
+; GFX942-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX942-NEXT:    s_cbranch_execnz .LBB21_1
+; GFX942-NEXT:  ; %bb.2: ; %atomicrmw.end
+; GFX942-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX942-NEXT:    s_mov_b32 s11, 0xf000
+; GFX942-NEXT:    s_mov_b32 s10, -1
+; GFX942-NEXT:    buffer_store_dword v2, off, s[8:11], 0
+; GFX942-NEXT:    s_endpgm
+;
 ; GFX1064-LABEL: uniform_fadd_v2bf16:
 ; GFX1064:       ; %bb.0:
 ; GFX1064-NEXT:    s_clause 0x1
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
index 17737cccec7c4..f1b328fb55613 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
@@ -2,6 +2,7 @@
 ; RUN: llc -mtriple=amdgcn - -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX7LESS,GFX7LESS_ITERATIVE %s
 ; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX8,GFX8_ITERATIVE %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX9_ITERATIVE %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX942,GFX942_ITERATIVE %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064,GFX1064_ITERATIVE %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032,GFX1032_ITERATIVE %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164,GFX1164_ITERATIVE %s
@@ -9,6 +10,7 @@
 ; RUN: llc -mtriple=amdgcn - -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX7LESS,GFX7LESS_DPP %s
 ; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX8,GFX8_DPP %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX9,GFX9_DPP %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX942,GFX942_DPP %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064,GFX1064_DPP %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032,GFX1032_DPP %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -mattr=-flat-for-global - -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164,GFX1164_DPP %s
@@ -105,6 +107,33 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) {
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX9-NEXT:    s_endpgm
 ;
+; GFX942-LABEL: add_i32_constant:
+; GFX942:       ; %bb.0: ; %entry
+; GFX942-NEXT:    s_mov_b64 s[2:3], exec
+; GFX942-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX942-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX942-NEXT:    ; implicit-def: $vgpr1
+; GFX942-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942-NEXT:    s_cbranch_execz .LBB0_2
+; GFX942-NEXT:  ; %bb.1:
+; GFX942-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX942-NEXT:    s_mul_i32 s2, s2, 5
+; GFX942-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-NEXT:    v_mov_b32_e32 v2, s2
+; GFX942-NEXT:    ds_add_rtn_u32 v1, v1, v2
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:  .LBB0_2:
+; GFX942-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    v_readfirstlane_b32 s4, v1
+; GFX942-NEXT:    s_mov_b32 s3, 0xf000
+; GFX942-NEXT:    s_mov_b32 s2, -1
+; GFX942-NEXT:    v_mad_u32_u24 v0, v0, 5, s4
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX942-NEXT:    s_endpgm
+;
 ; GFX1064-LABEL: add_i32_constant:
 ; GFX1064:       ; %bb.0: ; %entry
 ; GFX1064-NEXT:    s_mov_b64 s[2:3], exec
@@ -318,6 +347,36 @@ define amdgpu_kernel void @add_i32_uniform(ptr addrspace(1) %out, i32 %additive)
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX9-NEXT:    s_endpgm
 ;
+; GFX942-LABEL: add_i32_uniform:
+; GFX942:       ; %bb.0: ; %entry
+; GFX942-NEXT:    s_load_dword s6, s[4:5], 0x2c
+; GFX942-NEXT:    s_mov_b64 s[2:3], exec
+; GFX942-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX942-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX942-NEXT:    ; implicit-def: $vgpr1
+; GFX942-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942-NEXT:    s_cbranch_execz .LBB1_2
+; GFX942-NEXT:  ; %bb.1:
+; GFX942-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_mul_i32 s2, s6, s2
+; GFX942-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-NEXT:    v_mov_b32_e32 v2, s2
+; GFX942-NEXT:    ds_add_rtn_u32 v1, v1, v2
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:  .LBB1_2:
+; GFX942-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    v_readfirstlane_b32 s4, v1
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    v_mul_lo_u32 v0, s6, v0
+; GFX942-NEXT:    s_mov_b32 s3, 0xf000
+; GFX942-NEXT:    s_mov_b32 s2, -1
+; GFX942-NEXT:    v_add_u32_e32 v0, s4, v0
+; GFX942-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX942-NEXT:    s_endpgm
+;
 ; GFX1064-LABEL: add_i32_uniform:
 ; GFX1064:       ; %bb.0: ; %entry
 ; GFX1064-NEXT:    s_load_dword s6, s[4:5], 0x2c
@@ -571,6 +630,47 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) {
 ; GFX9_ITERATIVE-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX9_ITERATIVE-NEXT:    s_endpgm
 ;
+; GFX942_ITERATIVE-LABEL: add_i32_varying:
+; GFX942_ITERATIVE:       ; %bb.0: ; %entry
+; GFX942_ITERATIVE-NEXT:    s_mov_b64 s[0:1], exec
+; GFX942_ITERATIVE-NEXT:    v_and_b32_e32 v1, 0x3ff, v0
+; GFX942_ITERATIVE-NEXT:    s_mov_b32 s2, 0
+; GFX942_ITERATIVE-NEXT:    ; implicit-def: $vgpr0
+; GFX942_ITERATIVE-NEXT:  .LBB2_1: ; %ComputeLoop
+; GFX942_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942_ITERATIVE-NEXT:    s_ff1_i32_b64 s3, s[0:1]
+; GFX942_ITERATIVE-NEXT:    s_mov_b32 m0, s3
+; GFX942_ITERATIVE-NEXT:    v_readlane_b32 s8, v1, s3
+; GFX942_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s3
+; GFX942_ITERATIVE-NEXT:    v_writelane_b32 v0, s2, m0
+; GFX942_ITERATIVE-NEXT:    s_add_i32 s2, s2, s8
+; GFX942_ITERATIVE-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX942_ITERATIVE-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX942_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB2_1
+; GFX942_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX942_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
+; GFX942_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32 v1, exec_hi, v1
+; GFX942_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
+; GFX942_ITERATIVE-NEXT:    ; implicit-def: $vgpr1
+; GFX942_ITERATIVE-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942_ITERATIVE-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
+; GFX942_ITERATIVE-NEXT:    s_cbranch_execz .LBB2_4
+; GFX942_ITERATIVE-NEXT:  ; %bb.3:
+; GFX942_ITERATIVE-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942_ITERATIVE-NEXT:    v_mov_b32_e32 v2, s2
+; GFX942_ITERATIVE-NEXT:    ds_add_rtn_u32 v1, v1, v2
+; GFX942_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942_ITERATIVE-NEXT:  .LBB2_4:
+; GFX942_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX942_ITERATIVE-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942_ITERATIVE-NEXT:    v_readfirstlane_b32 s4, v1
+; GFX942_ITERATIVE-NEXT:    s_mov_b32 s3, 0xf000
+; GFX942_ITERATIVE-NEXT:    s_mov_b32 s2, -1
+; GFX942_ITERATIVE-NEXT:    v_add_u32_e32 v0, s4, v0
+; GFX942_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942_ITERATIVE-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX942_ITERATIVE-NEXT:    s_endpgm
+;
 ; GFX1064_ITERATIVE-LABEL: add_i32_varying:
 ; GFX1064_ITERATIVE:       ; %bb.0: ; %entry
 ; GFX1064_ITERATIVE-NEXT:    s_mov_b64 s[0:1], exec
@@ -839,6 +939,51 @@ define amdgpu_kernel void @add_i32_varying(ptr addrspace(1) %out) {
 ; GFX9_DPP-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX9_DPP-NEXT:    s_endpgm
 ;
+; GFX942_DPP-LABEL: add_i32_varying:
+; GFX942_DPP:       ; %bb.0: ; %entry
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v3, 0
+; GFX942_DPP-NEXT:    v_mbcnt_lo_u32_b32 v4, exec_lo, 0
+; GFX942_DPP-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX942_DPP-NEXT:    v_mbcnt_hi_u32_b32 v4, exec_hi, v4
+; GFX942_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX942_DPP-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s[0:1]
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942_DPP-NEXT:    s_nop 0
+; GFX942_DPP-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX942_DPP-NEXT:    s_nop 1
+; GFX942_DPP-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX942_DPP-NEXT:    s_nop 1
+; GFX942_DPP-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX942_DPP-NEXT:    s_nop 1
+; GFX942_DPP-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX942_DPP-NEXT:    s_nop 1
+; GFX942_DPP-NEXT:    v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX942_DPP-NEXT:    s_nop 1
+; GFX942_DPP-NEXT:    v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX942_DPP-NEXT:    s_nop 0
+; GFX942_DPP-NEXT:    v_readlane_b32 s2, v1, 63
+; GFX942_DPP-NEXT:    v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX942_DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX942_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
+; GFX942_DPP-NEXT:    ; implicit-def: $vgpr0
+; GFX942_DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942_DPP-NEXT:    s_cbranch_execz .LBB2_2
+; GFX942_DPP-NEXT:  ; %bb.1:
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942_DPP-NEXT:    ds_add_rtn_u32 v0, v3, v0
+; GFX942_DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942_DPP-NEXT:  .LBB2_2:
+; GFX942_DPP-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX942_DPP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942_DPP-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v0, v2
+; GFX942_DPP-NEXT:    s_mov_b32 s3, 0xf000
+; GFX942_DPP-NEXT:    s_mov_b32 s2, -1
+; GFX942_DPP-NEXT:    v_add_u32_e32 v0, s4, v0
+; GFX942_DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942_DPP-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX942_DPP-NEXT:    s_endpgm
+;
 ; GFX1064_DPP-LABEL: add_i32_varying:
 ; GFX1064_DPP:       ; %bb.0: ; %entry
 ; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
@@ -1138,6 +1283,35 @@ define amdgpu_kernel void @add_i32_varying_nouse() {
 ; GFX9_ITERATIVE-NEXT:  .LBB3_4:
 ; GFX9_ITERATIVE-NEXT:    s_endpgm
 ;
+; GFX942_ITERATIVE-LABEL: add_i32_varying_nouse:
+; GFX942_ITERATIVE:       ; %bb.0: ; %entry
+; GFX942_ITERATIVE-NEXT:    s_mov_b64 s[0:1], exec
+; GFX942_ITERATIVE-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX942_ITERATIVE-NEXT:    s_mov_b32 s2, 0
+; GFX942_ITERATIVE-NEXT:  .LBB3_1: ; %ComputeLoop
+; GFX942_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942_ITERATIVE-NEXT:    s_ff1_i32_b64 s3, s[0:1]
+; GFX942_ITERATIVE-NEXT:    v_readlane_b32 s6, v0, s3
+; GFX942_ITERATIVE-NEXT:    s_lshl_b64 s[4:5], 1, s3
+; GFX942_ITERATIVE-NEXT:    s_add_i32 s2, s2, s6
+; GFX942_ITERATIVE-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[4:5]
+; GFX942_ITERATIVE-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX942_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB3_1
+; GFX942_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX942_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX942_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX942_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX942_ITERATIVE-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942_ITERATIVE-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
+; GFX942_ITERATIVE-NEXT:    s_cbranch_execz .LBB3_4
+; GFX942_ITERATIVE-NEXT:  ; %bb.3:
+; GFX942_ITERATIVE-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942_ITERATIVE-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942_ITERATIVE-NEXT:    ds_add_u32 v0, v1
+; GFX942_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942_ITERATIVE-NEXT:  .LBB3_4:
+; GFX942_ITERATIVE-NEXT:    s_endpgm
+;
 ; GFX1064_ITERATIVE-LABEL: add_i32_varying_nouse:
 ; GFX1064_ITERATIVE:       ; %bb.0: ; %entry
 ; GFX1064_ITERATIVE-NEXT:    s_mov_b64 s[0:1], exec
@@ -1332,6 +1506,40 @@ define amdgpu_kernel void @add_i32_varying_nouse() {
 ; GFX9_DPP-NEXT:  .LBB3_2:
 ; GFX9_DPP-NEXT:    s_endpgm
 ;
+; GFX942_DPP-LABEL: add_i32_varying_nouse:
+; GFX942_DPP:       ; %bb.0: ; %entry
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942_DPP-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
+; GFX942_DPP-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
+; GFX942_DPP-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX942_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX942_DPP-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s[0:1]
+; GFX942_DPP-NEXT:    s_nop 1
+; GFX942_DPP-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX942_DPP-NEXT:    s_nop 1
+; GFX942_DPP-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX942_DPP-NEXT:    s_nop 1
+; GFX942_DPP-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX942_DPP-NEXT:    s_nop 1
+; GFX942_DPP-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX942_DPP-NEXT:    s_nop 1
+; GFX942_DPP-NEXT:    v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX942_DPP-NEXT:    s_nop 1
+; GFX942_DPP-NEXT:    v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX942_DPP-NEXT:    s_nop 0
+; GFX942_DPP-NEXT:    v_readlane_b32 s2, v1, 63
+; GFX942_DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX942_DPP-NEXT:    s_mov_b32 s0, s2
+; GFX942_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
+; GFX942_DPP-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX942_DPP-NEXT:    s_cbranch_execz .LBB3_2
+; GFX942_DPP-NEXT:  ; %bb.1:
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942_DPP-NEXT:    ds_add_u32 v2, v0
+; GFX942_DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942_DPP-NEXT:  .LBB3_2:
+; GFX942_DPP-NEXT:    s_endpgm
+;
 ; GFX1064_DPP-LABEL: add_i32_varying_nouse:
 ; GFX1064_DPP:       ; %bb.0: ; %entry
 ; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
@@ -1556,6 +1764,36 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) {
 ; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GFX9-NEXT:    s_endpgm
 ;
+; GFX942-LABEL: add_i64_constant:
+; GFX942:       ; %bb.0: ; %entry
+; GFX942-NEXT:    s_mov_b64 s[2:3], exec
+; GFX942-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX942-NEXT:    v_mbcnt_hi_u32_b32 v2, s3, v0
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX942-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; GFX942-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942-NEXT:    s_cbranch_execz .LBB4_2
+; GFX942-NEXT:  ; %bb.1:
+; GFX942-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX942-NEXT:    s_mul_i32 s2, s2, 5
+; GFX942-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-NEXT:    ds_add_rtn_u64 v[0:1], v1, v[0:1]
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:  .LBB4_2:
+; GFX942-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    v_readfirstlane_b32 s4, v1
+; GFX942-NEXT:    v_readfirstlane_b32 s5, v0
+; GFX942-NEXT:    s_mov_b32 s3, 0xf000
+; GFX942-NEXT:    v_mov_b32_e32 v1, s4
+; GFX942-NEXT:    v_mov_b32_e32 v0, s5
+; GFX942-NEXT:    s_mov_b32 s2, -1
+; GFX942-NEXT:    v_mad_u64_u32 v[0:1], s[4:5], v2, 5, v[0:1]
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX942-NEXT:    s_endpgm
+;
 ; GFX1064-LABEL: add_i64_constant:
 ; GFX1064:       ; %bb.0: ; %entry
 ; GFX1064-NEXT:    s_mov_b64 s[2:3], exec
@@ -1802,6 +2040,46 @@ define amdgpu_kernel void @add_i64_uniform(ptr addrspace(1) %out, i64 %additive)
 ; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GFX9-NEXT:    s_endpgm
 ;
+; GFX942-LABEL: add_i64_uniform:
+; GFX942:       ; %bb.0: ; %entry
+; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-NEXT:    s_mov_b64 s[6:7], exec
+; GFX942-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
+; GFX942-NEXT:    v_mbcnt_hi_u32_b32 v2, s7, v0
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX942-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; GFX942-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX942-NEXT:    s_cbranch_execz .LBB5_2
+; GFX942-NEXT:  ; %bb.1:
+; GFX942-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_mul_i32 s7, s3, s6
+; GFX942-NEXT:    s_mul_hi_u32 s8, s2, s6
+; GFX942-NEXT:    s_add_i32 s8, s8, s7
+; GFX942-NEXT:    s_mul_i32 s6, s2, s6
+; GFX942-NEXT:    v_mov_b32_e32 v0, s6
+; GFX942-NEXT:    v_mov_b32_e32 v1, s8
+; GFX942-NEXT:    v_mov_b32_e32 v3, 0
+; GFX942-NEXT:    ds_add_rtn_u64 v[0:1], v3, v[0:1]
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:  .LBB5_2:
+; GFX942-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_mov_b32 s4, s0
+; GFX942-NEXT:    s_mov_b32 s5, s1
+; GFX942-NEXT:    v_readfirstlane_b32 s0, v1
+; GFX942-NEXT:    v_readfirstlane_b32 s1, v0
+; GFX942-NEXT:    s_mov_b32 s7, 0xf000
+; GFX942-NEXT:    v_mov_b32_e32 v1, s0
+; GFX942-NEXT:    v_mov_b32_e32 v0, s1
+; GFX942-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], s2, v2, v[0:1]
+; GFX942-NEXT:    v_mov_b32_e32 v4, v1
+; GFX942-NEXT:    v_mad_u64_u32 v[2:3], s[0:1], s3, v2, v[4:5]
+; GFX942-NEXT:    s_mov_b32 s6, -1
+; GFX942-NEXT:    v_mov_b32_e32 v1, v2
+; GFX942-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX942-NEXT:    s_endpgm
+;
 ; GFX1064-LABEL: add_i64_uniform:
 ; GFX1064:       ; %bb.0: ; %entry
 ; GFX1064-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -2101,6 +2379,52 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
 ; GFX9_ITERATIVE-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GFX9_ITERATIVE-NEXT:    s_endpgm
 ;
+; GFX942_ITERATIVE-LABEL: add_i64_varying:
+; GFX942_ITERATIVE:       ; %bb.0: ; %entry
+; GFX942_ITERATIVE-NEXT:    s_mov_b64 s[2:3], exec
+; GFX942_ITERATIVE-NEXT:    v_and_b32_e32 v2, 0x3ff, v0
+; GFX942_ITERATIVE-NEXT:    v_mov_b32_e32 v3, 0
+; GFX942_ITERATIVE-NEXT:    s_mov_b64 s[0:1], 0
+; GFX942_ITERATIVE-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; GFX942_ITERATIVE-NEXT:  .LBB6_1: ; %ComputeLoop
+; GFX942_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942_ITERATIVE-NEXT:    s_ff1_i32_b64 s6, s[2:3]
+; GFX942_ITERATIVE-NEXT:    s_mov_b32 m0, s6
+; GFX942_ITERATIVE-NEXT:    v_readlane_b32 s8, v2, s6
+; GFX942_ITERATIVE-NEXT:    v_readlane_b32 s7, v3, s6
+; GFX942_ITERATIVE-NEXT:    v_writelane_b32 v0, s0, m0
+; GFX942_ITERATIVE-NEXT:    s_add_u32 s0, s0, s8
+; GFX942_ITERATIVE-NEXT:    v_writelane_b32 v1, s1, m0
+; GFX942_ITERATIVE-NEXT:    s_addc_u32 s1, s1, s7
+; GFX942_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s6
+; GFX942_ITERATIVE-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX942_ITERATIVE-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX942_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB6_1
+; GFX942_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX942_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX942_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v2
+; GFX942_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX942_ITERATIVE-NEXT:    ; implicit-def: $vgpr2_vgpr3
+; GFX942_ITERATIVE-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX942_ITERATIVE-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
+; GFX942_ITERATIVE-NEXT:    s_cbranch_execz .LBB6_4
+; GFX942_ITERATIVE-NEXT:  ; %bb.3:
+; GFX942_ITERATIVE-NEXT:    v_mov_b32_e32 v4, 0
+; GFX942_ITERATIVE-NEXT:    v_mov_b64_e32 v[2:3], s[0:1]
+; GFX942_ITERATIVE-NEXT:    ds_add_rtn_u64 v[2:3], v4, v[2:3]
+; GFX942_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942_ITERATIVE-NEXT:  .LBB6_4:
+; GFX942_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX942_ITERATIVE-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942_ITERATIVE-NEXT:    v_readfirstlane_b32 s5, v3
+; GFX942_ITERATIVE-NEXT:    v_readfirstlane_b32 s4, v2
+; GFX942_ITERATIVE-NEXT:    s_mov_b32 s3, 0xf000
+; GFX942_ITERATIVE-NEXT:    s_mov_b32 s2, -1
+; GFX942_ITERATIVE-NEXT:    v_lshl_add_u64 v[0:1], s[4:5], 0, v[0:1]
+; GFX942_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942_ITERATIVE-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX942_ITERATIVE-NEXT:    s_endpgm
+;
 ; GFX1064_ITERATIVE-LABEL: add_i64_varying:
 ; GFX1064_ITERATIVE:       ; %bb.0: ; %entry
 ; GFX1064_ITERATIVE-NEXT:    v_mov_b32_e32 v3, 0
@@ -2464,6 +2788,79 @@ define amdgpu_kernel void @add_i64_varying(ptr addrspace(1) %out) {
 ; GFX9_DPP-NEXT:    buffer_store_dwordx2 v[5:6], off, s[0:3], 0
 ; GFX9_DPP-NEXT:    s_endpgm
 ;
+; GFX942_DPP-LABEL: add_i64_varying:
+; GFX942_DPP:       ; %bb.0: ; %entry
+; GFX942_DPP-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
+; GFX942_DPP-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942_DPP-NEXT:    v_mbcnt_hi_u32_b32 v1, exec_hi, v1
+; GFX942_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX942_DPP-NEXT:    v_cndmask_b32_e64 v3, 0, 0, s[0:1]
+; GFX942_DPP-NEXT:    v_cndmask_b32_e64 v2, 0, v0, s[0:1]
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v5, 0
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v4, 0
+; GFX942_DPP-NEXT:    s_nop 0
+; GFX942_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX942_DPP-NEXT:    v_mov_b32_dpp v4, v2 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX942_DPP-NEXT:    v_lshl_add_u64 v[2:3], v[2:3], 0, v[4:5]
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v5, 0
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v4, 0
+; GFX942_DPP-NEXT:    s_nop 0
+; GFX942_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX942_DPP-NEXT:    v_mov_b32_dpp v4, v2 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX942_DPP-NEXT:    v_lshl_add_u64 v[2:3], v[2:3], 0, v[4:5]
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v5, 0
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v4, 0
+; GFX942_DPP-NEXT:    s_nop 0
+; GFX942_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX942_DPP-NEXT:    v_mov_b32_dpp v4, v2 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX942_DPP-NEXT:    v_lshl_add_u64 v[2:3], v[2:3], 0, v[4:5]
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v5, 0
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v4, 0
+; GFX942_DPP-NEXT:    s_nop 0
+; GFX942_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX942_DPP-NEXT:    v_mov_b32_dpp v4, v2 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX942_DPP-NEXT:    v_lshl_add_u64 v[2:3], v[2:3], 0, v[4:5]
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v5, 0
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v4, 0
+; GFX942_DPP-NEXT:    s_nop 0
+; GFX942_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX942_DPP-NEXT:    v_mov_b32_dpp v4, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX942_DPP-NEXT:    v_lshl_add_u64 v[2:3], v[2:3], 0, v[4:5]
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v5, 0
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v4, 0
+; GFX942_DPP-NEXT:    s_nop 0
+; GFX942_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX942_DPP-NEXT:    v_mov_b32_dpp v4, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX942_DPP-NEXT:    v_lshl_add_u64 v[2:3], v[2:3], 0, v[4:5]
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v5, 0
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v4, 0
+; GFX942_DPP-NEXT:    v_readlane_b32 s3, v3, 63
+; GFX942_DPP-NEXT:    v_readlane_b32 s2, v2, 63
+; GFX942_DPP-NEXT:    v_mov_b32_dpp v5, v3 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX942_DPP-NEXT:    v_mov_b32_dpp v4, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX942_DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX942_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
+; GFX942_DPP-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; GFX942_DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942_DPP-NEXT:    s_cbranch_execz .LBB6_2
+; GFX942_DPP-NEXT:  ; %bb.1:
+; GFX942_DPP-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX942_DPP-NEXT:    ds_add_rtn_u64 v[0:1], v6, v[0:1]
+; GFX942_DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942_DPP-NEXT:  .LBB6_2:
+; GFX942_DPP-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX942_DPP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942_DPP-NEXT:    v_readfirstlane_b32 s5, v1
+; GFX942_DPP-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX942_DPP-NEXT:    v_mov_b64_e32 v[0:1], v[4:5]
+; GFX942_DPP-NEXT:    s_mov_b32 s3, 0xf000
+; GFX942_DPP-NEXT:    s_mov_b32 s2, -1
+; GFX942_DPP-NEXT:    v_lshl_add_u64 v[0:1], s[4:5], 0, v[0:1]
+; GFX942_DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942_DPP-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX942_DPP-NEXT:    s_endpgm
+;
 ; GFX1064_DPP-LABEL: add_i64_varying:
 ; GFX1064_DPP:       ; %bb.0: ; %entry
 ; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
@@ -2932,6 +3329,38 @@ define amdgpu_kernel void @add_i64_varying_nouse() {
 ; GFX9_ITERATIVE-NEXT:  .LBB7_4:
 ; GFX9_ITERATIVE-NEXT:    s_endpgm
 ;
+; GFX942_ITERATIVE-LABEL: add_i64_varying_nouse:
+; GFX942_ITERATIVE:       ; %bb.0: ; %entry
+; GFX942_ITERATIVE-NEXT:    s_mov_b64 s[2:3], exec
+; GFX942_ITERATIVE-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX942_ITERATIVE-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942_ITERATIVE-NEXT:    s_mov_b64 s[0:1], 0
+; GFX942_ITERATIVE-NEXT:  .LBB7_1: ; %ComputeLoop
+; GFX942_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942_ITERATIVE-NEXT:    s_ff1_i32_b64 s4, s[2:3]
+; GFX942_ITERATIVE-NEXT:    v_readlane_b32 s6, v0, s4
+; GFX942_ITERATIVE-NEXT:    v_readlane_b32 s5, v1, s4
+; GFX942_ITERATIVE-NEXT:    s_add_u32 s0, s0, s6
+; GFX942_ITERATIVE-NEXT:    s_addc_u32 s1, s1, s5
+; GFX942_ITERATIVE-NEXT:    s_lshl_b64 s[4:5], 1, s4
+; GFX942_ITERATIVE-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[4:5]
+; GFX942_ITERATIVE-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX942_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB7_1
+; GFX942_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX942_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX942_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX942_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX942_ITERATIVE-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX942_ITERATIVE-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
+; GFX942_ITERATIVE-NEXT:    s_cbranch_execz .LBB7_4
+; GFX942_ITERATIVE-NEXT:  ; %bb.3:
+; GFX942_ITERATIVE-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942_ITERATIVE-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942_ITERATIVE-NEXT:    ds_add_u64 v2, v[0:1]
+; GFX942_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942_ITERATIVE-NEXT:  .LBB7_4:
+; GFX942_ITERATIVE-NEXT:    s_endpgm
+;
 ; GFX1064_ITERATIVE-LABEL: add_i64_varying_nouse:
 ; GFX1064_ITERATIVE:       ; %bb.0: ; %entry
 ; GFX1064_ITERATIVE-NEXT:    v_mov_b32_e32 v1, 0
@@ -3195,6 +3624,66 @@ define amdgpu_kernel void @add_i64_varying_nouse() {
 ; GFX9_DPP-NEXT:  .LBB7_2:
 ; GFX9_DPP-NEXT:    s_endpgm
 ;
+; GFX942_DPP-LABEL: add_i64_varying_nouse:
+; GFX942_DPP:       ; %bb.0: ; %entry
+; GFX942_DPP-NEXT:    v_and_b32_e32 v1, 0x3ff, v0
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942_DPP-NEXT:    v_mbcnt_lo_u32_b32 v6, exec_lo, 0
+; GFX942_DPP-NEXT:    v_mbcnt_hi_u32_b32 v6, exec_hi, v6
+; GFX942_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX942_DPP-NEXT:    v_cndmask_b32_e64 v3, 0, 0, s[0:1]
+; GFX942_DPP-NEXT:    v_cndmask_b32_e64 v2, 0, v1, s[0:1]
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v5, 0
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v4, 0
+; GFX942_DPP-NEXT:    s_nop 0
+; GFX942_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX942_DPP-NEXT:    v_mov_b32_dpp v4, v2 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX942_DPP-NEXT:    v_lshl_add_u64 v[2:3], v[2:3], 0, v[4:5]
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v5, 0
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v4, 0
+; GFX942_DPP-NEXT:    s_nop 0
+; GFX942_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX942_DPP-NEXT:    v_mov_b32_dpp v4, v2 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX942_DPP-NEXT:    v_lshl_add_u64 v[2:3], v[2:3], 0, v[4:5]
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v5, 0
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v4, 0
+; GFX942_DPP-NEXT:    s_nop 0
+; GFX942_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX942_DPP-NEXT:    v_mov_b32_dpp v4, v2 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX942_DPP-NEXT:    v_lshl_add_u64 v[2:3], v[2:3], 0, v[4:5]
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v5, 0
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v4, 0
+; GFX942_DPP-NEXT:    s_nop 0
+; GFX942_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX942_DPP-NEXT:    v_mov_b32_dpp v4, v2 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX942_DPP-NEXT:    v_lshl_add_u64 v[2:3], v[2:3], 0, v[4:5]
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v5, 0
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v4, 0
+; GFX942_DPP-NEXT:    s_nop 0
+; GFX942_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX942_DPP-NEXT:    v_mov_b32_dpp v4, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX942_DPP-NEXT:    v_lshl_add_u64 v[2:3], v[2:3], 0, v[4:5]
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v5, 0
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v4, 0
+; GFX942_DPP-NEXT:    s_nop 0
+; GFX942_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX942_DPP-NEXT:    v_mov_b32_dpp v4, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX942_DPP-NEXT:    v_lshl_add_u64 v[2:3], v[2:3], 0, v[4:5]
+; GFX942_DPP-NEXT:    s_nop 0
+; GFX942_DPP-NEXT:    v_readlane_b32 s3, v3, 63
+; GFX942_DPP-NEXT:    v_readlane_b32 s2, v2, 63
+; GFX942_DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX942_DPP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX942_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v6
+; GFX942_DPP-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX942_DPP-NEXT:    s_cbranch_execz .LBB7_2
+; GFX942_DPP-NEXT:  ; %bb.1:
+; GFX942_DPP-NEXT:    v_mov_b64_e32 v[6:7], s[0:1]
+; GFX942_DPP-NEXT:    ds_add_u64 v0, v[6:7]
+; GFX942_DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942_DPP-NEXT:  .LBB7_2:
+; GFX942_DPP-NEXT:    s_endpgm
+;
 ; GFX1064_DPP-LABEL: add_i64_varying_nouse:
 ; GFX1064_DPP:       ; %bb.0: ; %entry
 ; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
@@ -3503,6 +3992,34 @@ define amdgpu_kernel void @sub_i32_constant(ptr addrspace(1) %out) {
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX9-NEXT:    s_endpgm
 ;
+; GFX942-LABEL: sub_i32_constant:
+; GFX942:       ; %bb.0: ; %entry
+; GFX942-NEXT:    s_mov_b64 s[2:3], exec
+; GFX942-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX942-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX942-NEXT:    ; implicit-def: $vgpr1
+; GFX942-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942-NEXT:    s_cbranch_execz .LBB8_2
+; GFX942-NEXT:  ; %bb.1:
+; GFX942-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX942-NEXT:    s_mul_i32 s2, s2, 5
+; GFX942-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-NEXT:    v_mov_b32_e32 v2, s2
+; GFX942-NEXT:    ds_sub_rtn_u32 v1, v1, v2
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:  .LBB8_2:
+; GFX942-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    v_readfirstlane_b32 s4, v1
+; GFX942-NEXT:    v_mul_u32_u24_e32 v0, 5, v0
+; GFX942-NEXT:    s_mov_b32 s3, 0xf000
+; GFX942-NEXT:    s_mov_b32 s2, -1
+; GFX942-NEXT:    v_sub_u32_e32 v0, s4, v0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX942-NEXT:    s_endpgm
+;
 ; GFX1064-LABEL: sub_i32_constant:
 ; GFX1064:       ; %bb.0: ; %entry
 ; GFX1064-NEXT:    s_mov_b64 s[2:3], exec
@@ -3722,6 +4239,36 @@ define amdgpu_kernel void @sub_i32_uniform(ptr addrspace(1) %out, i32 %subitive)
 ; GFX9-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX9-NEXT:    s_endpgm
 ;
+; GFX942-LABEL: sub_i32_uniform:
+; GFX942:       ; %bb.0: ; %entry
+; GFX942-NEXT:    s_load_dword s6, s[4:5], 0x2c
+; GFX942-NEXT:    s_mov_b64 s[2:3], exec
+; GFX942-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX942-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX942-NEXT:    ; implicit-def: $vgpr1
+; GFX942-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942-NEXT:    s_cbranch_execz .LBB9_2
+; GFX942-NEXT:  ; %bb.1:
+; GFX942-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_mul_i32 s2, s6, s2
+; GFX942-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-NEXT:    v_mov_b32_e32 v2, s2
+; GFX942-NEXT:    ds_sub_rtn_u32 v1, v1, v2
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:  .LBB9_2:
+; GFX942-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    v_readfirstlane_b32 s4, v1
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    v_mul_lo_u32 v0, s6, v0
+; GFX942-NEXT:    s_mov_b32 s3, 0xf000
+; GFX942-NEXT:    s_mov_b32 s2, -1
+; GFX942-NEXT:    v_sub_u32_e32 v0, s4, v0
+; GFX942-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX942-NEXT:    s_endpgm
+;
 ; GFX1064-LABEL: sub_i32_uniform:
 ; GFX1064:       ; %bb.0: ; %entry
 ; GFX1064-NEXT:    s_load_dword s6, s[4:5], 0x2c
@@ -3978,6 +4525,47 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) {
 ; GFX9_ITERATIVE-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX9_ITERATIVE-NEXT:    s_endpgm
 ;
+; GFX942_ITERATIVE-LABEL: sub_i32_varying:
+; GFX942_ITERATIVE:       ; %bb.0: ; %entry
+; GFX942_ITERATIVE-NEXT:    s_mov_b64 s[0:1], exec
+; GFX942_ITERATIVE-NEXT:    v_and_b32_e32 v1, 0x3ff, v0
+; GFX942_ITERATIVE-NEXT:    s_mov_b32 s2, 0
+; GFX942_ITERATIVE-NEXT:    ; implicit-def: $vgpr0
+; GFX942_ITERATIVE-NEXT:  .LBB10_1: ; %ComputeLoop
+; GFX942_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942_ITERATIVE-NEXT:    s_ff1_i32_b64 s3, s[0:1]
+; GFX942_ITERATIVE-NEXT:    s_mov_b32 m0, s3
+; GFX942_ITERATIVE-NEXT:    v_readlane_b32 s8, v1, s3
+; GFX942_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s3
+; GFX942_ITERATIVE-NEXT:    v_writelane_b32 v0, s2, m0
+; GFX942_ITERATIVE-NEXT:    s_add_i32 s2, s2, s8
+; GFX942_ITERATIVE-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX942_ITERATIVE-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX942_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB10_1
+; GFX942_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX942_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
+; GFX942_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32 v1, exec_hi, v1
+; GFX942_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
+; GFX942_ITERATIVE-NEXT:    ; implicit-def: $vgpr1
+; GFX942_ITERATIVE-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942_ITERATIVE-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
+; GFX942_ITERATIVE-NEXT:    s_cbranch_execz .LBB10_4
+; GFX942_ITERATIVE-NEXT:  ; %bb.3:
+; GFX942_ITERATIVE-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942_ITERATIVE-NEXT:    v_mov_b32_e32 v2, s2
+; GFX942_ITERATIVE-NEXT:    ds_sub_rtn_u32 v1, v1, v2
+; GFX942_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942_ITERATIVE-NEXT:  .LBB10_4:
+; GFX942_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX942_ITERATIVE-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942_ITERATIVE-NEXT:    v_readfirstlane_b32 s4, v1
+; GFX942_ITERATIVE-NEXT:    s_mov_b32 s3, 0xf000
+; GFX942_ITERATIVE-NEXT:    s_mov_b32 s2, -1
+; GFX942_ITERATIVE-NEXT:    v_sub_u32_e32 v0, s4, v0
+; GFX942_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942_ITERATIVE-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX942_ITERATIVE-NEXT:    s_endpgm
+;
 ; GFX1064_ITERATIVE-LABEL: sub_i32_varying:
 ; GFX1064_ITERATIVE:       ; %bb.0: ; %entry
 ; GFX1064_ITERATIVE-NEXT:    s_mov_b64 s[0:1], exec
@@ -4246,6 +4834,51 @@ define amdgpu_kernel void @sub_i32_varying(ptr addrspace(1) %out) {
 ; GFX9_DPP-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX9_DPP-NEXT:    s_endpgm
 ;
+; GFX942_DPP-LABEL: sub_i32_varying:
+; GFX942_DPP:       ; %bb.0: ; %entry
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v3, 0
+; GFX942_DPP-NEXT:    v_mbcnt_lo_u32_b32 v4, exec_lo, 0
+; GFX942_DPP-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX942_DPP-NEXT:    v_mbcnt_hi_u32_b32 v4, exec_hi, v4
+; GFX942_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX942_DPP-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s[0:1]
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942_DPP-NEXT:    s_nop 0
+; GFX942_DPP-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX942_DPP-NEXT:    s_nop 1
+; GFX942_DPP-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX942_DPP-NEXT:    s_nop 1
+; GFX942_DPP-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX942_DPP-NEXT:    s_nop 1
+; GFX942_DPP-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX942_DPP-NEXT:    s_nop 1
+; GFX942_DPP-NEXT:    v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX942_DPP-NEXT:    s_nop 1
+; GFX942_DPP-NEXT:    v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX942_DPP-NEXT:    s_nop 0
+; GFX942_DPP-NEXT:    v_readlane_b32 s2, v1, 63
+; GFX942_DPP-NEXT:    v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX942_DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX942_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
+; GFX942_DPP-NEXT:    ; implicit-def: $vgpr0
+; GFX942_DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942_DPP-NEXT:    s_cbranch_execz .LBB10_2
+; GFX942_DPP-NEXT:  ; %bb.1:
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942_DPP-NEXT:    ds_sub_rtn_u32 v0, v3, v0
+; GFX942_DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942_DPP-NEXT:  .LBB10_2:
+; GFX942_DPP-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX942_DPP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942_DPP-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v0, v2
+; GFX942_DPP-NEXT:    s_mov_b32 s3, 0xf000
+; GFX942_DPP-NEXT:    s_mov_b32 s2, -1
+; GFX942_DPP-NEXT:    v_sub_u32_e32 v0, s4, v0
+; GFX942_DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942_DPP-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX942_DPP-NEXT:    s_endpgm
+;
 ; GFX1064_DPP-LABEL: sub_i32_varying:
 ; GFX1064_DPP:       ; %bb.0: ; %entry
 ; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
@@ -4545,6 +5178,35 @@ define amdgpu_kernel void @sub_i32_varying_nouse() {
 ; GFX9_ITERATIVE-NEXT:  .LBB11_4:
 ; GFX9_ITERATIVE-NEXT:    s_endpgm
 ;
+; GFX942_ITERATIVE-LABEL: sub_i32_varying_nouse:
+; GFX942_ITERATIVE:       ; %bb.0: ; %entry
+; GFX942_ITERATIVE-NEXT:    s_mov_b64 s[0:1], exec
+; GFX942_ITERATIVE-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX942_ITERATIVE-NEXT:    s_mov_b32 s2, 0
+; GFX942_ITERATIVE-NEXT:  .LBB11_1: ; %ComputeLoop
+; GFX942_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942_ITERATIVE-NEXT:    s_ff1_i32_b64 s3, s[0:1]
+; GFX942_ITERATIVE-NEXT:    v_readlane_b32 s6, v0, s3
+; GFX942_ITERATIVE-NEXT:    s_lshl_b64 s[4:5], 1, s3
+; GFX942_ITERATIVE-NEXT:    s_add_i32 s2, s2, s6
+; GFX942_ITERATIVE-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[4:5]
+; GFX942_ITERATIVE-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX942_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB11_1
+; GFX942_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX942_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX942_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX942_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX942_ITERATIVE-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942_ITERATIVE-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
+; GFX942_ITERATIVE-NEXT:    s_cbranch_execz .LBB11_4
+; GFX942_ITERATIVE-NEXT:  ; %bb.3:
+; GFX942_ITERATIVE-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942_ITERATIVE-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942_ITERATIVE-NEXT:    ds_sub_u32 v0, v1
+; GFX942_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942_ITERATIVE-NEXT:  .LBB11_4:
+; GFX942_ITERATIVE-NEXT:    s_endpgm
+;
 ; GFX1064_ITERATIVE-LABEL: sub_i32_varying_nouse:
 ; GFX1064_ITERATIVE:       ; %bb.0: ; %entry
 ; GFX1064_ITERATIVE-NEXT:    s_mov_b64 s[0:1], exec
@@ -4739,6 +5401,40 @@ define amdgpu_kernel void @sub_i32_varying_nouse() {
 ; GFX9_DPP-NEXT:  .LBB11_2:
 ; GFX9_DPP-NEXT:    s_endpgm
 ;
+; GFX942_DPP-LABEL: sub_i32_varying_nouse:
+; GFX942_DPP:       ; %bb.0: ; %entry
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942_DPP-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
+; GFX942_DPP-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
+; GFX942_DPP-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX942_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX942_DPP-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s[0:1]
+; GFX942_DPP-NEXT:    s_nop 1
+; GFX942_DPP-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX942_DPP-NEXT:    s_nop 1
+; GFX942_DPP-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX942_DPP-NEXT:    s_nop 1
+; GFX942_DPP-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX942_DPP-NEXT:    s_nop 1
+; GFX942_DPP-NEXT:    v_add_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX942_DPP-NEXT:    s_nop 1
+; GFX942_DPP-NEXT:    v_add_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX942_DPP-NEXT:    s_nop 1
+; GFX942_DPP-NEXT:    v_add_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX942_DPP-NEXT:    s_nop 0
+; GFX942_DPP-NEXT:    v_readlane_b32 s2, v1, 63
+; GFX942_DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX942_DPP-NEXT:    s_mov_b32 s0, s2
+; GFX942_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
+; GFX942_DPP-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX942_DPP-NEXT:    s_cbranch_execz .LBB11_2
+; GFX942_DPP-NEXT:  ; %bb.1:
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942_DPP-NEXT:    ds_sub_u32 v2, v0
+; GFX942_DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942_DPP-NEXT:  .LBB11_2:
+; GFX942_DPP-NEXT:    s_endpgm
+;
 ; GFX1064_DPP-LABEL: sub_i32_varying_nouse:
 ; GFX1064_DPP:       ; %bb.0: ; %entry
 ; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
@@ -4965,6 +5661,38 @@ define amdgpu_kernel void @sub_i64_constant(ptr addrspace(1) %out) {
 ; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GFX9-NEXT:    s_endpgm
 ;
+; GFX942-LABEL: sub_i64_constant:
+; GFX942:       ; %bb.0: ; %entry
+; GFX942-NEXT:    s_mov_b64 s[2:3], exec
+; GFX942-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX942-NEXT:    v_mbcnt_hi_u32_b32 v2, s3, v0
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX942-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; GFX942-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942-NEXT:    s_cbranch_execz .LBB12_2
+; GFX942-NEXT:  ; %bb.1:
+; GFX942-NEXT:    s_bcnt1_i32_b64 s2, s[2:3]
+; GFX942-NEXT:    s_mul_i32 s2, s2, 5
+; GFX942-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-NEXT:    ds_sub_rtn_u64 v[0:1], v1, v[0:1]
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:  .LBB12_2:
+; GFX942-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    v_readfirstlane_b32 s4, v1
+; GFX942-NEXT:    v_readfirstlane_b32 s5, v0
+; GFX942-NEXT:    v_mul_u32_u24_e32 v0, 5, v2
+; GFX942-NEXT:    v_mul_hi_u32_u24_e32 v1, 5, v2
+; GFX942-NEXT:    v_mov_b32_e32 v2, s4
+; GFX942-NEXT:    v_sub_co_u32_e32 v0, vcc, s5, v0
+; GFX942-NEXT:    s_mov_b32 s3, 0xf000
+; GFX942-NEXT:    s_mov_b32 s2, -1
+; GFX942-NEXT:    v_subb_co_u32_e32 v1, vcc, v2, v1, vcc
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX942-NEXT:    s_endpgm
+;
 ; GFX1064-LABEL: sub_i64_constant:
 ; GFX1064:       ; %bb.0: ; %entry
 ; GFX1064-NEXT:    s_mov_b64 s[2:3], exec
@@ -5226,6 +5954,46 @@ define amdgpu_kernel void @sub_i64_uniform(ptr addrspace(1) %out, i64 %subitive)
 ; GFX9-NEXT:    buffer_store_dwordx2 v[1:2], off, s[4:7], 0
 ; GFX9-NEXT:    s_endpgm
 ;
+; GFX942-LABEL: sub_i64_uniform:
+; GFX942:       ; %bb.0: ; %entry
+; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-NEXT:    s_mov_b64 s[6:7], exec
+; GFX942-NEXT:    v_mbcnt_lo_u32_b32 v0, s6, 0
+; GFX942-NEXT:    v_mbcnt_hi_u32_b32 v2, s7, v0
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX942-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; GFX942-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX942-NEXT:    s_cbranch_execz .LBB13_2
+; GFX942-NEXT:  ; %bb.1:
+; GFX942-NEXT:    s_bcnt1_i32_b64 s6, s[6:7]
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_mul_i32 s7, s3, s6
+; GFX942-NEXT:    s_mul_hi_u32 s8, s2, s6
+; GFX942-NEXT:    s_add_i32 s8, s8, s7
+; GFX942-NEXT:    s_mul_i32 s6, s2, s6
+; GFX942-NEXT:    v_mov_b32_e32 v0, s6
+; GFX942-NEXT:    v_mov_b32_e32 v1, s8
+; GFX942-NEXT:    v_mov_b32_e32 v3, 0
+; GFX942-NEXT:    ds_sub_rtn_u64 v[0:1], v3, v[0:1]
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:  .LBB13_2:
+; GFX942-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_mov_b32 s4, s0
+; GFX942-NEXT:    s_mov_b32 s5, s1
+; GFX942-NEXT:    v_readfirstlane_b32 s8, v1
+; GFX942-NEXT:    v_readfirstlane_b32 s9, v0
+; GFX942-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], s2, v2, 0
+; GFX942-NEXT:    v_mov_b32_e32 v4, v1
+; GFX942-NEXT:    v_mad_u64_u32 v[2:3], s[0:1], s3, v2, v[4:5]
+; GFX942-NEXT:    v_mov_b32_e32 v1, s8
+; GFX942-NEXT:    v_sub_co_u32_e32 v0, vcc, s9, v0
+; GFX942-NEXT:    s_mov_b32 s7, 0xf000
+; GFX942-NEXT:    s_mov_b32 s6, -1
+; GFX942-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v2, vcc
+; GFX942-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
+; GFX942-NEXT:    s_endpgm
+;
 ; GFX1064-LABEL: sub_i64_uniform:
 ; GFX1064:       ; %bb.0: ; %entry
 ; GFX1064-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
@@ -5531,6 +6299,55 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
 ; GFX9_ITERATIVE-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GFX9_ITERATIVE-NEXT:    s_endpgm
 ;
+; GFX942_ITERATIVE-LABEL: sub_i64_varying:
+; GFX942_ITERATIVE:       ; %bb.0: ; %entry
+; GFX942_ITERATIVE-NEXT:    s_mov_b64 s[2:3], exec
+; GFX942_ITERATIVE-NEXT:    v_and_b32_e32 v2, 0x3ff, v0
+; GFX942_ITERATIVE-NEXT:    v_mov_b32_e32 v3, 0
+; GFX942_ITERATIVE-NEXT:    s_mov_b64 s[0:1], 0
+; GFX942_ITERATIVE-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; GFX942_ITERATIVE-NEXT:  .LBB14_1: ; %ComputeLoop
+; GFX942_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942_ITERATIVE-NEXT:    s_ff1_i32_b64 s6, s[2:3]
+; GFX942_ITERATIVE-NEXT:    s_mov_b32 m0, s6
+; GFX942_ITERATIVE-NEXT:    v_readlane_b32 s8, v2, s6
+; GFX942_ITERATIVE-NEXT:    v_readlane_b32 s7, v3, s6
+; GFX942_ITERATIVE-NEXT:    v_writelane_b32 v0, s0, m0
+; GFX942_ITERATIVE-NEXT:    s_add_u32 s0, s0, s8
+; GFX942_ITERATIVE-NEXT:    v_writelane_b32 v1, s1, m0
+; GFX942_ITERATIVE-NEXT:    s_addc_u32 s1, s1, s7
+; GFX942_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s6
+; GFX942_ITERATIVE-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX942_ITERATIVE-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX942_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB14_1
+; GFX942_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX942_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX942_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v2
+; GFX942_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX942_ITERATIVE-NEXT:    ; implicit-def: $vgpr2_vgpr3
+; GFX942_ITERATIVE-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX942_ITERATIVE-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
+; GFX942_ITERATIVE-NEXT:    s_cbranch_execz .LBB14_4
+; GFX942_ITERATIVE-NEXT:  ; %bb.3:
+; GFX942_ITERATIVE-NEXT:    v_mov_b32_e32 v4, 0
+; GFX942_ITERATIVE-NEXT:    v_mov_b64_e32 v[2:3], s[0:1]
+; GFX942_ITERATIVE-NEXT:    ds_sub_rtn_u64 v[2:3], v4, v[2:3]
+; GFX942_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942_ITERATIVE-NEXT:  .LBB14_4:
+; GFX942_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX942_ITERATIVE-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942_ITERATIVE-NEXT:    v_readfirstlane_b32 s4, v3
+; GFX942_ITERATIVE-NEXT:    v_readfirstlane_b32 s5, v2
+; GFX942_ITERATIVE-NEXT:    s_mov_b32 s3, 0xf000
+; GFX942_ITERATIVE-NEXT:    v_mov_b32_e32 v2, s4
+; GFX942_ITERATIVE-NEXT:    v_sub_co_u32_e32 v0, vcc, s5, v0
+; GFX942_ITERATIVE-NEXT:    s_mov_b32 s2, -1
+; GFX942_ITERATIVE-NEXT:    s_nop 0
+; GFX942_ITERATIVE-NEXT:    v_subb_co_u32_e32 v1, vcc, v2, v1, vcc
+; GFX942_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942_ITERATIVE-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX942_ITERATIVE-NEXT:    s_endpgm
+;
 ; GFX1064_ITERATIVE-LABEL: sub_i64_varying:
 ; GFX1064_ITERATIVE:       ; %bb.0: ; %entry
 ; GFX1064_ITERATIVE-NEXT:    v_mov_b32_e32 v3, 0
@@ -5894,6 +6711,81 @@ define amdgpu_kernel void @sub_i64_varying(ptr addrspace(1) %out) {
 ; GFX9_DPP-NEXT:    buffer_store_dwordx2 v[5:6], off, s[0:3], 0
 ; GFX9_DPP-NEXT:    s_endpgm
 ;
+; GFX942_DPP-LABEL: sub_i64_varying:
+; GFX942_DPP:       ; %bb.0: ; %entry
+; GFX942_DPP-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
+; GFX942_DPP-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942_DPP-NEXT:    v_mbcnt_hi_u32_b32 v1, exec_hi, v1
+; GFX942_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX942_DPP-NEXT:    v_cndmask_b32_e64 v3, 0, 0, s[0:1]
+; GFX942_DPP-NEXT:    v_cndmask_b32_e64 v2, 0, v0, s[0:1]
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v5, 0
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v4, 0
+; GFX942_DPP-NEXT:    s_nop 0
+; GFX942_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX942_DPP-NEXT:    v_mov_b32_dpp v4, v2 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX942_DPP-NEXT:    v_lshl_add_u64 v[2:3], v[2:3], 0, v[4:5]
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v5, 0
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v4, 0
+; GFX942_DPP-NEXT:    s_nop 0
+; GFX942_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX942_DPP-NEXT:    v_mov_b32_dpp v4, v2 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX942_DPP-NEXT:    v_lshl_add_u64 v[2:3], v[2:3], 0, v[4:5]
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v5, 0
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v4, 0
+; GFX942_DPP-NEXT:    s_nop 0
+; GFX942_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX942_DPP-NEXT:    v_mov_b32_dpp v4, v2 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX942_DPP-NEXT:    v_lshl_add_u64 v[2:3], v[2:3], 0, v[4:5]
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v5, 0
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v4, 0
+; GFX942_DPP-NEXT:    s_nop 0
+; GFX942_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX942_DPP-NEXT:    v_mov_b32_dpp v4, v2 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX942_DPP-NEXT:    v_lshl_add_u64 v[2:3], v[2:3], 0, v[4:5]
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v5, 0
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v4, 0
+; GFX942_DPP-NEXT:    s_nop 0
+; GFX942_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX942_DPP-NEXT:    v_mov_b32_dpp v4, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX942_DPP-NEXT:    v_lshl_add_u64 v[2:3], v[2:3], 0, v[4:5]
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v5, 0
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v4, 0
+; GFX942_DPP-NEXT:    s_nop 0
+; GFX942_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX942_DPP-NEXT:    v_mov_b32_dpp v4, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX942_DPP-NEXT:    v_lshl_add_u64 v[2:3], v[2:3], 0, v[4:5]
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v5, 0
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v4, 0
+; GFX942_DPP-NEXT:    v_readlane_b32 s3, v3, 63
+; GFX942_DPP-NEXT:    v_readlane_b32 s2, v2, 63
+; GFX942_DPP-NEXT:    v_mov_b32_dpp v5, v3 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX942_DPP-NEXT:    v_mov_b32_dpp v4, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX942_DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX942_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
+; GFX942_DPP-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; GFX942_DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942_DPP-NEXT:    s_cbranch_execz .LBB14_2
+; GFX942_DPP-NEXT:  ; %bb.1:
+; GFX942_DPP-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX942_DPP-NEXT:    ds_sub_rtn_u64 v[0:1], v6, v[0:1]
+; GFX942_DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942_DPP-NEXT:  .LBB14_2:
+; GFX942_DPP-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX942_DPP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942_DPP-NEXT:    v_readfirstlane_b32 s4, v1
+; GFX942_DPP-NEXT:    v_readfirstlane_b32 s5, v0
+; GFX942_DPP-NEXT:    v_mov_b64_e32 v[0:1], v[4:5]
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v6, s4
+; GFX942_DPP-NEXT:    v_sub_co_u32_e32 v0, vcc, s5, v0
+; GFX942_DPP-NEXT:    s_mov_b32 s3, 0xf000
+; GFX942_DPP-NEXT:    s_mov_b32 s2, -1
+; GFX942_DPP-NEXT:    v_subb_co_u32_e32 v1, vcc, v6, v1, vcc
+; GFX942_DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942_DPP-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX942_DPP-NEXT:    s_endpgm
+;
 ; GFX1064_DPP-LABEL: sub_i64_varying:
 ; GFX1064_DPP:       ; %bb.0: ; %entry
 ; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
@@ -6385,6 +7277,47 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) {
 ; GFX9_ITERATIVE-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX9_ITERATIVE-NEXT:    s_endpgm
 ;
+; GFX942_ITERATIVE-LABEL: and_i32_varying:
+; GFX942_ITERATIVE:       ; %bb.0: ; %entry
+; GFX942_ITERATIVE-NEXT:    s_mov_b64 s[0:1], exec
+; GFX942_ITERATIVE-NEXT:    v_and_b32_e32 v1, 0x3ff, v0
+; GFX942_ITERATIVE-NEXT:    s_mov_b32 s2, -1
+; GFX942_ITERATIVE-NEXT:    ; implicit-def: $vgpr0
+; GFX942_ITERATIVE-NEXT:  .LBB15_1: ; %ComputeLoop
+; GFX942_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942_ITERATIVE-NEXT:    s_ff1_i32_b64 s3, s[0:1]
+; GFX942_ITERATIVE-NEXT:    s_mov_b32 m0, s3
+; GFX942_ITERATIVE-NEXT:    v_readlane_b32 s8, v1, s3
+; GFX942_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s3
+; GFX942_ITERATIVE-NEXT:    v_writelane_b32 v0, s2, m0
+; GFX942_ITERATIVE-NEXT:    s_and_b32 s2, s2, s8
+; GFX942_ITERATIVE-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX942_ITERATIVE-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX942_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB15_1
+; GFX942_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX942_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
+; GFX942_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32 v1, exec_hi, v1
+; GFX942_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
+; GFX942_ITERATIVE-NEXT:    ; implicit-def: $vgpr1
+; GFX942_ITERATIVE-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942_ITERATIVE-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
+; GFX942_ITERATIVE-NEXT:    s_cbranch_execz .LBB15_4
+; GFX942_ITERATIVE-NEXT:  ; %bb.3:
+; GFX942_ITERATIVE-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942_ITERATIVE-NEXT:    v_mov_b32_e32 v2, s2
+; GFX942_ITERATIVE-NEXT:    ds_and_rtn_b32 v1, v1, v2
+; GFX942_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942_ITERATIVE-NEXT:  .LBB15_4:
+; GFX942_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX942_ITERATIVE-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942_ITERATIVE-NEXT:    v_readfirstlane_b32 s4, v1
+; GFX942_ITERATIVE-NEXT:    s_mov_b32 s3, 0xf000
+; GFX942_ITERATIVE-NEXT:    s_mov_b32 s2, -1
+; GFX942_ITERATIVE-NEXT:    v_and_b32_e32 v0, s4, v0
+; GFX942_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942_ITERATIVE-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX942_ITERATIVE-NEXT:    s_endpgm
+;
 ; GFX1064_ITERATIVE-LABEL: and_i32_varying:
 ; GFX1064_ITERATIVE:       ; %bb.0: ; %entry
 ; GFX1064_ITERATIVE-NEXT:    s_mov_b64 s[0:1], exec
@@ -6653,6 +7586,51 @@ define amdgpu_kernel void @and_i32_varying(ptr addrspace(1) %out) {
 ; GFX9_DPP-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX9_DPP-NEXT:    s_endpgm
 ;
+; GFX942_DPP-LABEL: and_i32_varying:
+; GFX942_DPP:       ; %bb.0: ; %entry
+; GFX942_DPP-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
+; GFX942_DPP-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX942_DPP-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
+; GFX942_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX942_DPP-NEXT:    v_cndmask_b32_e64 v2, -1, v0, s[0:1]
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v1, -1
+; GFX942_DPP-NEXT:    s_nop 0
+; GFX942_DPP-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX942_DPP-NEXT:    s_nop 1
+; GFX942_DPP-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX942_DPP-NEXT:    s_nop 1
+; GFX942_DPP-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX942_DPP-NEXT:    s_nop 1
+; GFX942_DPP-NEXT:    v_and_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX942_DPP-NEXT:    s_nop 1
+; GFX942_DPP-NEXT:    v_and_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX942_DPP-NEXT:    s_nop 1
+; GFX942_DPP-NEXT:    v_and_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX942_DPP-NEXT:    s_nop 0
+; GFX942_DPP-NEXT:    v_readlane_b32 s2, v2, 63
+; GFX942_DPP-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX942_DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX942_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
+; GFX942_DPP-NEXT:    ; implicit-def: $vgpr0
+; GFX942_DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942_DPP-NEXT:    s_cbranch_execz .LBB15_2
+; GFX942_DPP-NEXT:  ; %bb.1:
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX942_DPP-NEXT:    ds_and_rtn_b32 v0, v0, v3
+; GFX942_DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942_DPP-NEXT:  .LBB15_2:
+; GFX942_DPP-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX942_DPP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942_DPP-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v0, v1
+; GFX942_DPP-NEXT:    s_mov_b32 s3, 0xf000
+; GFX942_DPP-NEXT:    s_mov_b32 s2, -1
+; GFX942_DPP-NEXT:    v_and_b32_e32 v0, s4, v0
+; GFX942_DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942_DPP-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX942_DPP-NEXT:    s_endpgm
+;
 ; GFX1064_DPP-LABEL: and_i32_varying:
 ; GFX1064_DPP:       ; %bb.0: ; %entry
 ; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
@@ -7007,6 +7985,52 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) {
 ; GFX9_ITERATIVE-NEXT:    buffer_store_dwordx2 v[1:2], off, s[0:3], 0
 ; GFX9_ITERATIVE-NEXT:    s_endpgm
 ;
+; GFX942_ITERATIVE-LABEL: and_i64_varying:
+; GFX942_ITERATIVE:       ; %bb.0: ; %entry
+; GFX942_ITERATIVE-NEXT:    s_mov_b64 s[2:3], exec
+; GFX942_ITERATIVE-NEXT:    v_and_b32_e32 v2, 0x3ff, v0
+; GFX942_ITERATIVE-NEXT:    v_mov_b32_e32 v3, 0
+; GFX942_ITERATIVE-NEXT:    s_mov_b64 s[0:1], -1
+; GFX942_ITERATIVE-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; GFX942_ITERATIVE-NEXT:  .LBB16_1: ; %ComputeLoop
+; GFX942_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942_ITERATIVE-NEXT:    s_ff1_i32_b64 s8, s[2:3]
+; GFX942_ITERATIVE-NEXT:    s_mov_b32 m0, s8
+; GFX942_ITERATIVE-NEXT:    v_readlane_b32 s7, v3, s8
+; GFX942_ITERATIVE-NEXT:    v_readlane_b32 s6, v2, s8
+; GFX942_ITERATIVE-NEXT:    s_lshl_b64 s[8:9], 1, s8
+; GFX942_ITERATIVE-NEXT:    v_writelane_b32 v1, s1, m0
+; GFX942_ITERATIVE-NEXT:    v_writelane_b32 v0, s0, m0
+; GFX942_ITERATIVE-NEXT:    s_and_b64 s[0:1], s[0:1], s[6:7]
+; GFX942_ITERATIVE-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[8:9]
+; GFX942_ITERATIVE-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX942_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB16_1
+; GFX942_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX942_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX942_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v2
+; GFX942_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX942_ITERATIVE-NEXT:    ; implicit-def: $vgpr2_vgpr3
+; GFX942_ITERATIVE-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX942_ITERATIVE-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
+; GFX942_ITERATIVE-NEXT:    s_cbranch_execz .LBB16_4
+; GFX942_ITERATIVE-NEXT:  ; %bb.3:
+; GFX942_ITERATIVE-NEXT:    v_mov_b32_e32 v4, 0
+; GFX942_ITERATIVE-NEXT:    v_mov_b64_e32 v[2:3], s[0:1]
+; GFX942_ITERATIVE-NEXT:    ds_and_rtn_b64 v[2:3], v4, v[2:3]
+; GFX942_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942_ITERATIVE-NEXT:  .LBB16_4:
+; GFX942_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX942_ITERATIVE-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942_ITERATIVE-NEXT:    v_readfirstlane_b32 s4, v3
+; GFX942_ITERATIVE-NEXT:    v_readfirstlane_b32 s5, v2
+; GFX942_ITERATIVE-NEXT:    s_mov_b32 s3, 0xf000
+; GFX942_ITERATIVE-NEXT:    s_mov_b32 s2, -1
+; GFX942_ITERATIVE-NEXT:    v_and_b32_e32 v1, s4, v1
+; GFX942_ITERATIVE-NEXT:    v_and_b32_e32 v0, s5, v0
+; GFX942_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942_ITERATIVE-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX942_ITERATIVE-NEXT:    s_endpgm
+;
 ; GFX1064_ITERATIVE-LABEL: and_i64_varying:
 ; GFX1064_ITERATIVE:       ; %bb.0: ; %entry
 ; GFX1064_ITERATIVE-NEXT:    v_mov_b32_e32 v3, 0
@@ -7322,6 +8346,60 @@ define amdgpu_kernel void @and_i64_varying(ptr addrspace(1) %out) {
 ; GFX9_DPP-NEXT:    buffer_store_dwordx2 v[5:6], off, s[0:3], 0
 ; GFX9_DPP-NEXT:    s_endpgm
 ;
+; GFX942_DPP-LABEL: and_i64_varying:
+; GFX942_DPP:       ; %bb.0: ; %entry
+; GFX942_DPP-NEXT:    v_mbcnt_lo_u32_b32 v6, exec_lo, 0
+; GFX942_DPP-NEXT:    v_and_b32_e32 v5, 0x3ff, v0
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942_DPP-NEXT:    v_mbcnt_hi_u32_b32 v6, exec_hi, v6
+; GFX942_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX942_DPP-NEXT:    v_cndmask_b32_e64 v1, -1, 0, s[0:1]
+; GFX942_DPP-NEXT:    v_cndmask_b32_e64 v4, -1, v5, s[0:1]
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v2, -1
+; GFX942_DPP-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX942_DPP-NEXT:    v_and_b32_dpp v4, v4, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v3, -1
+; GFX942_DPP-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX942_DPP-NEXT:    v_and_b32_dpp v4, v4, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX942_DPP-NEXT:    s_nop 0
+; GFX942_DPP-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX942_DPP-NEXT:    v_and_b32_dpp v4, v4, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX942_DPP-NEXT:    s_nop 0
+; GFX942_DPP-NEXT:    v_and_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX942_DPP-NEXT:    v_and_b32_dpp v4, v4, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX942_DPP-NEXT:    s_nop 0
+; GFX942_DPP-NEXT:    v_and_b32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX942_DPP-NEXT:    v_and_b32_dpp v4, v4, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX942_DPP-NEXT:    s_nop 0
+; GFX942_DPP-NEXT:    v_and_b32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX942_DPP-NEXT:    v_and_b32_dpp v4, v4, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX942_DPP-NEXT:    v_readlane_b32 s3, v1, 63
+; GFX942_DPP-NEXT:    v_readlane_b32 s2, v4, 63
+; GFX942_DPP-NEXT:    v_mov_b32_dpp v3, v1 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX942_DPP-NEXT:    v_mov_b32_dpp v2, v4 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX942_DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX942_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v6
+; GFX942_DPP-NEXT:    ; implicit-def: $vgpr6_vgpr7
+; GFX942_DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942_DPP-NEXT:    s_cbranch_execz .LBB16_2
+; GFX942_DPP-NEXT:  ; %bb.1:
+; GFX942_DPP-NEXT:    v_mov_b64_e32 v[6:7], s[2:3]
+; GFX942_DPP-NEXT:    ds_and_rtn_b64 v[6:7], v0, v[6:7]
+; GFX942_DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942_DPP-NEXT:  .LBB16_2:
+; GFX942_DPP-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX942_DPP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942_DPP-NEXT:    v_readfirstlane_b32 s4, v7
+; GFX942_DPP-NEXT:    v_readfirstlane_b32 s5, v6
+; GFX942_DPP-NEXT:    v_mov_b64_e32 v[6:7], v[2:3]
+; GFX942_DPP-NEXT:    s_mov_b32 s3, 0xf000
+; GFX942_DPP-NEXT:    s_mov_b32 s2, -1
+; GFX942_DPP-NEXT:    v_and_b32_e32 v7, s4, v7
+; GFX942_DPP-NEXT:    v_and_b32_e32 v6, s5, v6
+; GFX942_DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942_DPP-NEXT:    buffer_store_dwordx2 v[6:7], off, s[0:3], 0
+; GFX942_DPP-NEXT:    s_endpgm
+;
 ; GFX1064_DPP-LABEL: and_i64_varying:
 ; GFX1064_DPP:       ; %bb.0: ; %entry
 ; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
@@ -7744,6 +8822,47 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) {
 ; GFX9_ITERATIVE-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX9_ITERATIVE-NEXT:    s_endpgm
 ;
+; GFX942_ITERATIVE-LABEL: or_i32_varying:
+; GFX942_ITERATIVE:       ; %bb.0: ; %entry
+; GFX942_ITERATIVE-NEXT:    s_mov_b64 s[0:1], exec
+; GFX942_ITERATIVE-NEXT:    v_and_b32_e32 v1, 0x3ff, v0
+; GFX942_ITERATIVE-NEXT:    s_mov_b32 s2, 0
+; GFX942_ITERATIVE-NEXT:    ; implicit-def: $vgpr0
+; GFX942_ITERATIVE-NEXT:  .LBB17_1: ; %ComputeLoop
+; GFX942_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942_ITERATIVE-NEXT:    s_ff1_i32_b64 s3, s[0:1]
+; GFX942_ITERATIVE-NEXT:    s_mov_b32 m0, s3
+; GFX942_ITERATIVE-NEXT:    v_readlane_b32 s8, v1, s3
+; GFX942_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s3
+; GFX942_ITERATIVE-NEXT:    v_writelane_b32 v0, s2, m0
+; GFX942_ITERATIVE-NEXT:    s_or_b32 s2, s2, s8
+; GFX942_ITERATIVE-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX942_ITERATIVE-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX942_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB17_1
+; GFX942_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX942_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
+; GFX942_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32 v1, exec_hi, v1
+; GFX942_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
+; GFX942_ITERATIVE-NEXT:    ; implicit-def: $vgpr1
+; GFX942_ITERATIVE-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942_ITERATIVE-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
+; GFX942_ITERATIVE-NEXT:    s_cbranch_execz .LBB17_4
+; GFX942_ITERATIVE-NEXT:  ; %bb.3:
+; GFX942_ITERATIVE-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942_ITERATIVE-NEXT:    v_mov_b32_e32 v2, s2
+; GFX942_ITERATIVE-NEXT:    ds_or_rtn_b32 v1, v1, v2
+; GFX942_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942_ITERATIVE-NEXT:  .LBB17_4:
+; GFX942_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX942_ITERATIVE-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942_ITERATIVE-NEXT:    v_readfirstlane_b32 s4, v1
+; GFX942_ITERATIVE-NEXT:    s_mov_b32 s3, 0xf000
+; GFX942_ITERATIVE-NEXT:    s_mov_b32 s2, -1
+; GFX942_ITERATIVE-NEXT:    v_or_b32_e32 v0, s4, v0
+; GFX942_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942_ITERATIVE-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX942_ITERATIVE-NEXT:    s_endpgm
+;
 ; GFX1064_ITERATIVE-LABEL: or_i32_varying:
 ; GFX1064_ITERATIVE:       ; %bb.0: ; %entry
 ; GFX1064_ITERATIVE-NEXT:    s_mov_b64 s[0:1], exec
@@ -8012,6 +9131,51 @@ define amdgpu_kernel void @or_i32_varying(ptr addrspace(1) %out) {
 ; GFX9_DPP-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX9_DPP-NEXT:    s_endpgm
 ;
+; GFX942_DPP-LABEL: or_i32_varying:
+; GFX942_DPP:       ; %bb.0: ; %entry
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v3, 0
+; GFX942_DPP-NEXT:    v_mbcnt_lo_u32_b32 v4, exec_lo, 0
+; GFX942_DPP-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX942_DPP-NEXT:    v_mbcnt_hi_u32_b32 v4, exec_hi, v4
+; GFX942_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX942_DPP-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s[0:1]
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942_DPP-NEXT:    s_nop 0
+; GFX942_DPP-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX942_DPP-NEXT:    s_nop 1
+; GFX942_DPP-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX942_DPP-NEXT:    s_nop 1
+; GFX942_DPP-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX942_DPP-NEXT:    s_nop 1
+; GFX942_DPP-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX942_DPP-NEXT:    s_nop 1
+; GFX942_DPP-NEXT:    v_or_b32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX942_DPP-NEXT:    s_nop 1
+; GFX942_DPP-NEXT:    v_or_b32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX942_DPP-NEXT:    s_nop 0
+; GFX942_DPP-NEXT:    v_readlane_b32 s2, v1, 63
+; GFX942_DPP-NEXT:    v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX942_DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX942_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
+; GFX942_DPP-NEXT:    ; implicit-def: $vgpr0
+; GFX942_DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942_DPP-NEXT:    s_cbranch_execz .LBB17_2
+; GFX942_DPP-NEXT:  ; %bb.1:
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942_DPP-NEXT:    ds_or_rtn_b32 v0, v3, v0
+; GFX942_DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942_DPP-NEXT:  .LBB17_2:
+; GFX942_DPP-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX942_DPP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942_DPP-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v0, v2
+; GFX942_DPP-NEXT:    s_mov_b32 s3, 0xf000
+; GFX942_DPP-NEXT:    s_mov_b32 s2, -1
+; GFX942_DPP-NEXT:    v_or_b32_e32 v0, s4, v0
+; GFX942_DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942_DPP-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX942_DPP-NEXT:    s_endpgm
+;
 ; GFX1064_DPP-LABEL: or_i32_varying:
 ; GFX1064_DPP:       ; %bb.0: ; %entry
 ; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
@@ -8365,6 +9529,52 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) {
 ; GFX9_ITERATIVE-NEXT:    buffer_store_dwordx2 v[1:2], off, s[0:3], 0
 ; GFX9_ITERATIVE-NEXT:    s_endpgm
 ;
+; GFX942_ITERATIVE-LABEL: or_i64_varying:
+; GFX942_ITERATIVE:       ; %bb.0: ; %entry
+; GFX942_ITERATIVE-NEXT:    s_mov_b64 s[2:3], exec
+; GFX942_ITERATIVE-NEXT:    v_and_b32_e32 v2, 0x3ff, v0
+; GFX942_ITERATIVE-NEXT:    v_mov_b32_e32 v3, 0
+; GFX942_ITERATIVE-NEXT:    s_mov_b64 s[0:1], 0
+; GFX942_ITERATIVE-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; GFX942_ITERATIVE-NEXT:  .LBB18_1: ; %ComputeLoop
+; GFX942_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942_ITERATIVE-NEXT:    s_ff1_i32_b64 s8, s[2:3]
+; GFX942_ITERATIVE-NEXT:    s_mov_b32 m0, s8
+; GFX942_ITERATIVE-NEXT:    v_readlane_b32 s7, v3, s8
+; GFX942_ITERATIVE-NEXT:    v_readlane_b32 s6, v2, s8
+; GFX942_ITERATIVE-NEXT:    s_lshl_b64 s[8:9], 1, s8
+; GFX942_ITERATIVE-NEXT:    v_writelane_b32 v1, s1, m0
+; GFX942_ITERATIVE-NEXT:    v_writelane_b32 v0, s0, m0
+; GFX942_ITERATIVE-NEXT:    s_or_b64 s[0:1], s[0:1], s[6:7]
+; GFX942_ITERATIVE-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[8:9]
+; GFX942_ITERATIVE-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX942_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB18_1
+; GFX942_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX942_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX942_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v2
+; GFX942_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX942_ITERATIVE-NEXT:    ; implicit-def: $vgpr2_vgpr3
+; GFX942_ITERATIVE-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX942_ITERATIVE-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
+; GFX942_ITERATIVE-NEXT:    s_cbranch_execz .LBB18_4
+; GFX942_ITERATIVE-NEXT:  ; %bb.3:
+; GFX942_ITERATIVE-NEXT:    v_mov_b32_e32 v4, 0
+; GFX942_ITERATIVE-NEXT:    v_mov_b64_e32 v[2:3], s[0:1]
+; GFX942_ITERATIVE-NEXT:    ds_or_rtn_b64 v[2:3], v4, v[2:3]
+; GFX942_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942_ITERATIVE-NEXT:  .LBB18_4:
+; GFX942_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX942_ITERATIVE-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942_ITERATIVE-NEXT:    v_readfirstlane_b32 s4, v3
+; GFX942_ITERATIVE-NEXT:    v_readfirstlane_b32 s5, v2
+; GFX942_ITERATIVE-NEXT:    s_mov_b32 s3, 0xf000
+; GFX942_ITERATIVE-NEXT:    s_mov_b32 s2, -1
+; GFX942_ITERATIVE-NEXT:    v_or_b32_e32 v1, s4, v1
+; GFX942_ITERATIVE-NEXT:    v_or_b32_e32 v0, s5, v0
+; GFX942_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942_ITERATIVE-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX942_ITERATIVE-NEXT:    s_endpgm
+;
 ; GFX1064_ITERATIVE-LABEL: or_i64_varying:
 ; GFX1064_ITERATIVE:       ; %bb.0: ; %entry
 ; GFX1064_ITERATIVE-NEXT:    v_mov_b32_e32 v3, 0
@@ -8680,6 +9890,60 @@ define amdgpu_kernel void @or_i64_varying(ptr addrspace(1) %out) {
 ; GFX9_DPP-NEXT:    buffer_store_dwordx2 v[5:6], off, s[0:3], 0
 ; GFX9_DPP-NEXT:    s_endpgm
 ;
+; GFX942_DPP-LABEL: or_i64_varying:
+; GFX942_DPP:       ; %bb.0: ; %entry
+; GFX942_DPP-NEXT:    v_mbcnt_lo_u32_b32 v6, exec_lo, 0
+; GFX942_DPP-NEXT:    v_and_b32_e32 v3, 0x3ff, v0
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942_DPP-NEXT:    v_mbcnt_hi_u32_b32 v6, exec_hi, v6
+; GFX942_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX942_DPP-NEXT:    v_cndmask_b32_e64 v1, 0, 0, s[0:1]
+; GFX942_DPP-NEXT:    v_cndmask_b32_e64 v2, 0, v3, s[0:1]
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v5, 0
+; GFX942_DPP-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX942_DPP-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v4, 0
+; GFX942_DPP-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX942_DPP-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX942_DPP-NEXT:    s_nop 0
+; GFX942_DPP-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX942_DPP-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX942_DPP-NEXT:    s_nop 0
+; GFX942_DPP-NEXT:    v_or_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX942_DPP-NEXT:    v_or_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX942_DPP-NEXT:    s_nop 0
+; GFX942_DPP-NEXT:    v_or_b32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX942_DPP-NEXT:    v_or_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX942_DPP-NEXT:    s_nop 0
+; GFX942_DPP-NEXT:    v_or_b32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX942_DPP-NEXT:    v_or_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX942_DPP-NEXT:    v_readlane_b32 s3, v1, 63
+; GFX942_DPP-NEXT:    v_readlane_b32 s2, v2, 63
+; GFX942_DPP-NEXT:    v_mov_b32_dpp v5, v1 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX942_DPP-NEXT:    v_mov_b32_dpp v4, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX942_DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX942_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v6
+; GFX942_DPP-NEXT:    ; implicit-def: $vgpr6_vgpr7
+; GFX942_DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942_DPP-NEXT:    s_cbranch_execz .LBB18_2
+; GFX942_DPP-NEXT:  ; %bb.1:
+; GFX942_DPP-NEXT:    v_mov_b64_e32 v[6:7], s[2:3]
+; GFX942_DPP-NEXT:    ds_or_rtn_b64 v[6:7], v0, v[6:7]
+; GFX942_DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942_DPP-NEXT:  .LBB18_2:
+; GFX942_DPP-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX942_DPP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942_DPP-NEXT:    v_readfirstlane_b32 s4, v7
+; GFX942_DPP-NEXT:    v_readfirstlane_b32 s5, v6
+; GFX942_DPP-NEXT:    v_mov_b64_e32 v[6:7], v[4:5]
+; GFX942_DPP-NEXT:    s_mov_b32 s3, 0xf000
+; GFX942_DPP-NEXT:    s_mov_b32 s2, -1
+; GFX942_DPP-NEXT:    v_or_b32_e32 v7, s4, v7
+; GFX942_DPP-NEXT:    v_or_b32_e32 v6, s5, v6
+; GFX942_DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942_DPP-NEXT:    buffer_store_dwordx2 v[6:7], off, s[0:3], 0
+; GFX942_DPP-NEXT:    s_endpgm
+;
 ; GFX1064_DPP-LABEL: or_i64_varying:
 ; GFX1064_DPP:       ; %bb.0: ; %entry
 ; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
@@ -9102,6 +10366,47 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) {
 ; GFX9_ITERATIVE-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX9_ITERATIVE-NEXT:    s_endpgm
 ;
+; GFX942_ITERATIVE-LABEL: xor_i32_varying:
+; GFX942_ITERATIVE:       ; %bb.0: ; %entry
+; GFX942_ITERATIVE-NEXT:    s_mov_b64 s[0:1], exec
+; GFX942_ITERATIVE-NEXT:    v_and_b32_e32 v1, 0x3ff, v0
+; GFX942_ITERATIVE-NEXT:    s_mov_b32 s2, 0
+; GFX942_ITERATIVE-NEXT:    ; implicit-def: $vgpr0
+; GFX942_ITERATIVE-NEXT:  .LBB19_1: ; %ComputeLoop
+; GFX942_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942_ITERATIVE-NEXT:    s_ff1_i32_b64 s3, s[0:1]
+; GFX942_ITERATIVE-NEXT:    s_mov_b32 m0, s3
+; GFX942_ITERATIVE-NEXT:    v_readlane_b32 s8, v1, s3
+; GFX942_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s3
+; GFX942_ITERATIVE-NEXT:    v_writelane_b32 v0, s2, m0
+; GFX942_ITERATIVE-NEXT:    s_xor_b32 s2, s2, s8
+; GFX942_ITERATIVE-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX942_ITERATIVE-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX942_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB19_1
+; GFX942_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX942_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
+; GFX942_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32 v1, exec_hi, v1
+; GFX942_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
+; GFX942_ITERATIVE-NEXT:    ; implicit-def: $vgpr1
+; GFX942_ITERATIVE-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942_ITERATIVE-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
+; GFX942_ITERATIVE-NEXT:    s_cbranch_execz .LBB19_4
+; GFX942_ITERATIVE-NEXT:  ; %bb.3:
+; GFX942_ITERATIVE-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942_ITERATIVE-NEXT:    v_mov_b32_e32 v2, s2
+; GFX942_ITERATIVE-NEXT:    ds_xor_rtn_b32 v1, v1, v2
+; GFX942_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942_ITERATIVE-NEXT:  .LBB19_4:
+; GFX942_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX942_ITERATIVE-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942_ITERATIVE-NEXT:    v_readfirstlane_b32 s4, v1
+; GFX942_ITERATIVE-NEXT:    s_mov_b32 s3, 0xf000
+; GFX942_ITERATIVE-NEXT:    s_mov_b32 s2, -1
+; GFX942_ITERATIVE-NEXT:    v_xor_b32_e32 v0, s4, v0
+; GFX942_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942_ITERATIVE-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX942_ITERATIVE-NEXT:    s_endpgm
+;
 ; GFX1064_ITERATIVE-LABEL: xor_i32_varying:
 ; GFX1064_ITERATIVE:       ; %bb.0: ; %entry
 ; GFX1064_ITERATIVE-NEXT:    s_mov_b64 s[0:1], exec
@@ -9370,6 +10675,51 @@ define amdgpu_kernel void @xor_i32_varying(ptr addrspace(1) %out) {
 ; GFX9_DPP-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX9_DPP-NEXT:    s_endpgm
 ;
+; GFX942_DPP-LABEL: xor_i32_varying:
+; GFX942_DPP:       ; %bb.0: ; %entry
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v3, 0
+; GFX942_DPP-NEXT:    v_mbcnt_lo_u32_b32 v4, exec_lo, 0
+; GFX942_DPP-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX942_DPP-NEXT:    v_mbcnt_hi_u32_b32 v4, exec_hi, v4
+; GFX942_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX942_DPP-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s[0:1]
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942_DPP-NEXT:    s_nop 0
+; GFX942_DPP-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX942_DPP-NEXT:    s_nop 1
+; GFX942_DPP-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX942_DPP-NEXT:    s_nop 1
+; GFX942_DPP-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX942_DPP-NEXT:    s_nop 1
+; GFX942_DPP-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX942_DPP-NEXT:    s_nop 1
+; GFX942_DPP-NEXT:    v_xor_b32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX942_DPP-NEXT:    s_nop 1
+; GFX942_DPP-NEXT:    v_xor_b32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX942_DPP-NEXT:    s_nop 0
+; GFX942_DPP-NEXT:    v_readlane_b32 s2, v1, 63
+; GFX942_DPP-NEXT:    v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX942_DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX942_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
+; GFX942_DPP-NEXT:    ; implicit-def: $vgpr0
+; GFX942_DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942_DPP-NEXT:    s_cbranch_execz .LBB19_2
+; GFX942_DPP-NEXT:  ; %bb.1:
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942_DPP-NEXT:    ds_xor_rtn_b32 v0, v3, v0
+; GFX942_DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942_DPP-NEXT:  .LBB19_2:
+; GFX942_DPP-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX942_DPP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942_DPP-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v0, v2
+; GFX942_DPP-NEXT:    s_mov_b32 s3, 0xf000
+; GFX942_DPP-NEXT:    s_mov_b32 s2, -1
+; GFX942_DPP-NEXT:    v_xor_b32_e32 v0, s4, v0
+; GFX942_DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942_DPP-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX942_DPP-NEXT:    s_endpgm
+;
 ; GFX1064_DPP-LABEL: xor_i32_varying:
 ; GFX1064_DPP:       ; %bb.0: ; %entry
 ; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
@@ -9723,6 +11073,52 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) {
 ; GFX9_ITERATIVE-NEXT:    buffer_store_dwordx2 v[1:2], off, s[0:3], 0
 ; GFX9_ITERATIVE-NEXT:    s_endpgm
 ;
+; GFX942_ITERATIVE-LABEL: xor_i64_varying:
+; GFX942_ITERATIVE:       ; %bb.0: ; %entry
+; GFX942_ITERATIVE-NEXT:    s_mov_b64 s[2:3], exec
+; GFX942_ITERATIVE-NEXT:    v_and_b32_e32 v2, 0x3ff, v0
+; GFX942_ITERATIVE-NEXT:    v_mov_b32_e32 v3, 0
+; GFX942_ITERATIVE-NEXT:    s_mov_b64 s[0:1], 0
+; GFX942_ITERATIVE-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; GFX942_ITERATIVE-NEXT:  .LBB20_1: ; %ComputeLoop
+; GFX942_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942_ITERATIVE-NEXT:    s_ff1_i32_b64 s8, s[2:3]
+; GFX942_ITERATIVE-NEXT:    s_mov_b32 m0, s8
+; GFX942_ITERATIVE-NEXT:    v_readlane_b32 s7, v3, s8
+; GFX942_ITERATIVE-NEXT:    v_readlane_b32 s6, v2, s8
+; GFX942_ITERATIVE-NEXT:    s_lshl_b64 s[8:9], 1, s8
+; GFX942_ITERATIVE-NEXT:    v_writelane_b32 v1, s1, m0
+; GFX942_ITERATIVE-NEXT:    v_writelane_b32 v0, s0, m0
+; GFX942_ITERATIVE-NEXT:    s_xor_b64 s[0:1], s[0:1], s[6:7]
+; GFX942_ITERATIVE-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[8:9]
+; GFX942_ITERATIVE-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX942_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB20_1
+; GFX942_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX942_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX942_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v2
+; GFX942_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX942_ITERATIVE-NEXT:    ; implicit-def: $vgpr2_vgpr3
+; GFX942_ITERATIVE-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX942_ITERATIVE-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
+; GFX942_ITERATIVE-NEXT:    s_cbranch_execz .LBB20_4
+; GFX942_ITERATIVE-NEXT:  ; %bb.3:
+; GFX942_ITERATIVE-NEXT:    v_mov_b32_e32 v4, 0
+; GFX942_ITERATIVE-NEXT:    v_mov_b64_e32 v[2:3], s[0:1]
+; GFX942_ITERATIVE-NEXT:    ds_xor_rtn_b64 v[2:3], v4, v[2:3]
+; GFX942_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942_ITERATIVE-NEXT:  .LBB20_4:
+; GFX942_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX942_ITERATIVE-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942_ITERATIVE-NEXT:    v_readfirstlane_b32 s4, v3
+; GFX942_ITERATIVE-NEXT:    v_readfirstlane_b32 s5, v2
+; GFX942_ITERATIVE-NEXT:    s_mov_b32 s3, 0xf000
+; GFX942_ITERATIVE-NEXT:    s_mov_b32 s2, -1
+; GFX942_ITERATIVE-NEXT:    v_xor_b32_e32 v1, s4, v1
+; GFX942_ITERATIVE-NEXT:    v_xor_b32_e32 v0, s5, v0
+; GFX942_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942_ITERATIVE-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX942_ITERATIVE-NEXT:    s_endpgm
+;
 ; GFX1064_ITERATIVE-LABEL: xor_i64_varying:
 ; GFX1064_ITERATIVE:       ; %bb.0: ; %entry
 ; GFX1064_ITERATIVE-NEXT:    v_mov_b32_e32 v3, 0
@@ -10038,6 +11434,60 @@ define amdgpu_kernel void @xor_i64_varying(ptr addrspace(1) %out) {
 ; GFX9_DPP-NEXT:    buffer_store_dwordx2 v[5:6], off, s[0:3], 0
 ; GFX9_DPP-NEXT:    s_endpgm
 ;
+; GFX942_DPP-LABEL: xor_i64_varying:
+; GFX942_DPP:       ; %bb.0: ; %entry
+; GFX942_DPP-NEXT:    v_mbcnt_lo_u32_b32 v6, exec_lo, 0
+; GFX942_DPP-NEXT:    v_and_b32_e32 v3, 0x3ff, v0
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942_DPP-NEXT:    v_mbcnt_hi_u32_b32 v6, exec_hi, v6
+; GFX942_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX942_DPP-NEXT:    v_cndmask_b32_e64 v1, 0, 0, s[0:1]
+; GFX942_DPP-NEXT:    v_cndmask_b32_e64 v2, 0, v3, s[0:1]
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v5, 0
+; GFX942_DPP-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX942_DPP-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v4, 0
+; GFX942_DPP-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX942_DPP-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX942_DPP-NEXT:    s_nop 0
+; GFX942_DPP-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX942_DPP-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX942_DPP-NEXT:    s_nop 0
+; GFX942_DPP-NEXT:    v_xor_b32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX942_DPP-NEXT:    v_xor_b32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX942_DPP-NEXT:    s_nop 0
+; GFX942_DPP-NEXT:    v_xor_b32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX942_DPP-NEXT:    v_xor_b32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX942_DPP-NEXT:    s_nop 0
+; GFX942_DPP-NEXT:    v_xor_b32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX942_DPP-NEXT:    v_xor_b32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX942_DPP-NEXT:    v_readlane_b32 s3, v1, 63
+; GFX942_DPP-NEXT:    v_readlane_b32 s2, v2, 63
+; GFX942_DPP-NEXT:    v_mov_b32_dpp v5, v1 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX942_DPP-NEXT:    v_mov_b32_dpp v4, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX942_DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX942_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v6
+; GFX942_DPP-NEXT:    ; implicit-def: $vgpr6_vgpr7
+; GFX942_DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942_DPP-NEXT:    s_cbranch_execz .LBB20_2
+; GFX942_DPP-NEXT:  ; %bb.1:
+; GFX942_DPP-NEXT:    v_mov_b64_e32 v[6:7], s[2:3]
+; GFX942_DPP-NEXT:    ds_xor_rtn_b64 v[6:7], v0, v[6:7]
+; GFX942_DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942_DPP-NEXT:  .LBB20_2:
+; GFX942_DPP-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX942_DPP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942_DPP-NEXT:    v_readfirstlane_b32 s4, v7
+; GFX942_DPP-NEXT:    v_readfirstlane_b32 s5, v6
+; GFX942_DPP-NEXT:    v_mov_b64_e32 v[6:7], v[4:5]
+; GFX942_DPP-NEXT:    s_mov_b32 s3, 0xf000
+; GFX942_DPP-NEXT:    s_mov_b32 s2, -1
+; GFX942_DPP-NEXT:    v_xor_b32_e32 v7, s4, v7
+; GFX942_DPP-NEXT:    v_xor_b32_e32 v6, s5, v6
+; GFX942_DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942_DPP-NEXT:    buffer_store_dwordx2 v[6:7], off, s[0:3], 0
+; GFX942_DPP-NEXT:    s_endpgm
+;
 ; GFX1064_DPP-LABEL: xor_i64_varying:
 ; GFX1064_DPP:       ; %bb.0: ; %entry
 ; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
@@ -10460,6 +11910,47 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) {
 ; GFX9_ITERATIVE-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX9_ITERATIVE-NEXT:    s_endpgm
 ;
+; GFX942_ITERATIVE-LABEL: max_i32_varying:
+; GFX942_ITERATIVE:       ; %bb.0: ; %entry
+; GFX942_ITERATIVE-NEXT:    s_mov_b64 s[0:1], exec
+; GFX942_ITERATIVE-NEXT:    v_and_b32_e32 v1, 0x3ff, v0
+; GFX942_ITERATIVE-NEXT:    s_brev_b32 s2, 1
+; GFX942_ITERATIVE-NEXT:    ; implicit-def: $vgpr0
+; GFX942_ITERATIVE-NEXT:  .LBB21_1: ; %ComputeLoop
+; GFX942_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942_ITERATIVE-NEXT:    s_ff1_i32_b64 s3, s[0:1]
+; GFX942_ITERATIVE-NEXT:    s_mov_b32 m0, s3
+; GFX942_ITERATIVE-NEXT:    v_readlane_b32 s8, v1, s3
+; GFX942_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s3
+; GFX942_ITERATIVE-NEXT:    v_writelane_b32 v0, s2, m0
+; GFX942_ITERATIVE-NEXT:    s_max_i32 s2, s2, s8
+; GFX942_ITERATIVE-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX942_ITERATIVE-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX942_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB21_1
+; GFX942_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX942_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
+; GFX942_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32 v1, exec_hi, v1
+; GFX942_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
+; GFX942_ITERATIVE-NEXT:    ; implicit-def: $vgpr1
+; GFX942_ITERATIVE-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942_ITERATIVE-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
+; GFX942_ITERATIVE-NEXT:    s_cbranch_execz .LBB21_4
+; GFX942_ITERATIVE-NEXT:  ; %bb.3:
+; GFX942_ITERATIVE-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942_ITERATIVE-NEXT:    v_mov_b32_e32 v2, s2
+; GFX942_ITERATIVE-NEXT:    ds_max_rtn_i32 v1, v1, v2
+; GFX942_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942_ITERATIVE-NEXT:  .LBB21_4:
+; GFX942_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX942_ITERATIVE-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942_ITERATIVE-NEXT:    v_readfirstlane_b32 s4, v1
+; GFX942_ITERATIVE-NEXT:    s_mov_b32 s3, 0xf000
+; GFX942_ITERATIVE-NEXT:    s_mov_b32 s2, -1
+; GFX942_ITERATIVE-NEXT:    v_max_i32_e32 v0, s4, v0
+; GFX942_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942_ITERATIVE-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX942_ITERATIVE-NEXT:    s_endpgm
+;
 ; GFX1064_ITERATIVE-LABEL: max_i32_varying:
 ; GFX1064_ITERATIVE:       ; %bb.0: ; %entry
 ; GFX1064_ITERATIVE-NEXT:    s_mov_b64 s[0:1], exec
@@ -10728,6 +12219,51 @@ define amdgpu_kernel void @max_i32_varying(ptr addrspace(1) %out) {
 ; GFX9_DPP-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX9_DPP-NEXT:    s_endpgm
 ;
+; GFX942_DPP-LABEL: max_i32_varying:
+; GFX942_DPP:       ; %bb.0: ; %entry
+; GFX942_DPP-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
+; GFX942_DPP-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX942_DPP-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
+; GFX942_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX942_DPP-NEXT:    v_bfrev_b32_e32 v1, 1
+; GFX942_DPP-NEXT:    v_cndmask_b32_e64 v2, v1, v0, s[0:1]
+; GFX942_DPP-NEXT:    s_nop 1
+; GFX942_DPP-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX942_DPP-NEXT:    s_nop 1
+; GFX942_DPP-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX942_DPP-NEXT:    s_nop 1
+; GFX942_DPP-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX942_DPP-NEXT:    s_nop 1
+; GFX942_DPP-NEXT:    v_max_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX942_DPP-NEXT:    s_nop 1
+; GFX942_DPP-NEXT:    v_max_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX942_DPP-NEXT:    s_nop 1
+; GFX942_DPP-NEXT:    v_max_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX942_DPP-NEXT:    s_nop 0
+; GFX942_DPP-NEXT:    v_readlane_b32 s2, v2, 63
+; GFX942_DPP-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX942_DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX942_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
+; GFX942_DPP-NEXT:    ; implicit-def: $vgpr0
+; GFX942_DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942_DPP-NEXT:    s_cbranch_execz .LBB21_2
+; GFX942_DPP-NEXT:  ; %bb.1:
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX942_DPP-NEXT:    ds_max_rtn_i32 v0, v0, v3
+; GFX942_DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942_DPP-NEXT:  .LBB21_2:
+; GFX942_DPP-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX942_DPP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942_DPP-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v0, v1
+; GFX942_DPP-NEXT:    s_mov_b32 s3, 0xf000
+; GFX942_DPP-NEXT:    s_mov_b32 s2, -1
+; GFX942_DPP-NEXT:    v_max_i32_e32 v0, s4, v0
+; GFX942_DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942_DPP-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX942_DPP-NEXT:    s_endpgm
+;
 ; GFX1064_DPP-LABEL: max_i32_varying:
 ; GFX1064_DPP:       ; %bb.0: ; %entry
 ; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
@@ -11042,6 +12578,38 @@ define amdgpu_kernel void @max_i64_constant(ptr addrspace(1) %out) {
 ; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GFX9-NEXT:    s_endpgm
 ;
+; GFX942-LABEL: max_i64_constant:
+; GFX942:       ; %bb.0: ; %entry
+; GFX942-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX942-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX942-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; GFX942-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942-NEXT:    s_cbranch_execz .LBB22_2
+; GFX942-NEXT:  ; %bb.1:
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], 5
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-NEXT:    ds_max_rtn_i64 v[0:1], v2, v[0:1]
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:  .LBB22_2:
+; GFX942-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX942-NEXT:    v_bfrev_b32_e32 v0, 1
+; GFX942-NEXT:    v_readfirstlane_b32 s5, v1
+; GFX942-NEXT:    v_cndmask_b32_e32 v1, 0, v0, vcc
+; GFX942-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
+; GFX942-NEXT:    v_mov_b32_e32 v2, s5
+; GFX942-NEXT:    v_cmp_gt_i64_e32 vcc, s[4:5], v[0:1]
+; GFX942-NEXT:    s_mov_b32 s3, 0xf000
+; GFX942-NEXT:    s_mov_b32 s2, -1
+; GFX942-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX942-NEXT:    v_mov_b32_e32 v2, s4
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX942-NEXT:    s_endpgm
+;
 ; GFX1064-LABEL: max_i64_constant:
 ; GFX1064:       ; %bb.0: ; %entry
 ; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -11343,6 +12911,62 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
 ; GFX9_ITERATIVE-NEXT:    buffer_store_dwordx2 v[1:2], off, s[0:3], 0
 ; GFX9_ITERATIVE-NEXT:    s_endpgm
 ;
+; GFX942_ITERATIVE-LABEL: max_i64_varying:
+; GFX942_ITERATIVE:       ; %bb.0: ; %entry
+; GFX942_ITERATIVE-NEXT:    s_mov_b64 s[2:3], exec
+; GFX942_ITERATIVE-NEXT:    v_and_b32_e32 v2, 0x3ff, v0
+; GFX942_ITERATIVE-NEXT:    v_mov_b32_e32 v3, 0
+; GFX942_ITERATIVE-NEXT:    s_brev_b32 s1, 1
+; GFX942_ITERATIVE-NEXT:    s_mov_b32 s0, 0
+; GFX942_ITERATIVE-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; GFX942_ITERATIVE-NEXT:  .LBB23_1: ; %ComputeLoop
+; GFX942_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942_ITERATIVE-NEXT:    s_ff1_i32_b64 s8, s[2:3]
+; GFX942_ITERATIVE-NEXT:    v_readlane_b32 s9, v3, s8
+; GFX942_ITERATIVE-NEXT:    v_readlane_b32 s10, v2, s8
+; GFX942_ITERATIVE-NEXT:    s_mov_b32 m0, s8
+; GFX942_ITERATIVE-NEXT:    v_mov_b32_e32 v5, s9
+; GFX942_ITERATIVE-NEXT:    v_mov_b32_e32 v4, s10
+; GFX942_ITERATIVE-NEXT:    v_cmp_gt_i64_e32 vcc, s[0:1], v[4:5]
+; GFX942_ITERATIVE-NEXT:    s_and_b64 s[6:7], vcc, exec
+; GFX942_ITERATIVE-NEXT:    v_writelane_b32 v1, s1, m0
+; GFX942_ITERATIVE-NEXT:    v_writelane_b32 v0, s0, m0
+; GFX942_ITERATIVE-NEXT:    s_cselect_b32 s1, s1, s9
+; GFX942_ITERATIVE-NEXT:    s_cselect_b32 s0, s0, s10
+; GFX942_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s8
+; GFX942_ITERATIVE-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX942_ITERATIVE-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX942_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB23_1
+; GFX942_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX942_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX942_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v2
+; GFX942_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX942_ITERATIVE-NEXT:    ; implicit-def: $vgpr2_vgpr3
+; GFX942_ITERATIVE-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX942_ITERATIVE-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
+; GFX942_ITERATIVE-NEXT:    s_cbranch_execz .LBB23_4
+; GFX942_ITERATIVE-NEXT:  ; %bb.3:
+; GFX942_ITERATIVE-NEXT:    v_mov_b32_e32 v4, 0
+; GFX942_ITERATIVE-NEXT:    v_mov_b64_e32 v[2:3], s[0:1]
+; GFX942_ITERATIVE-NEXT:    ds_max_rtn_i64 v[2:3], v4, v[2:3]
+; GFX942_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942_ITERATIVE-NEXT:  .LBB23_4:
+; GFX942_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX942_ITERATIVE-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942_ITERATIVE-NEXT:    v_readfirstlane_b32 s5, v3
+; GFX942_ITERATIVE-NEXT:    v_readfirstlane_b32 s4, v2
+; GFX942_ITERATIVE-NEXT:    s_mov_b32 s3, 0xf000
+; GFX942_ITERATIVE-NEXT:    v_mov_b32_e32 v2, s5
+; GFX942_ITERATIVE-NEXT:    v_cmp_gt_i64_e32 vcc, s[4:5], v[0:1]
+; GFX942_ITERATIVE-NEXT:    s_mov_b32 s2, -1
+; GFX942_ITERATIVE-NEXT:    s_nop 0
+; GFX942_ITERATIVE-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX942_ITERATIVE-NEXT:    v_mov_b32_e32 v2, s4
+; GFX942_ITERATIVE-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX942_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942_ITERATIVE-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX942_ITERATIVE-NEXT:    s_endpgm
+;
 ; GFX1064_ITERATIVE-LABEL: max_i64_varying:
 ; GFX1064_ITERATIVE:       ; %bb.0: ; %entry
 ; GFX1064_ITERATIVE-NEXT:    v_mov_b32_e32 v3, 0
@@ -11750,6 +13374,100 @@ define amdgpu_kernel void @max_i64_varying(ptr addrspace(1) %out) {
 ; GFX9_DPP-NEXT:    buffer_store_dwordx2 v[7:8], off, s[0:3], 0
 ; GFX9_DPP-NEXT:    s_endpgm
 ;
+; GFX942_DPP-LABEL: max_i64_varying:
+; GFX942_DPP:       ; %bb.0: ; %entry
+; GFX942_DPP-NEXT:    v_mbcnt_lo_u32_b32 v9, exec_lo, 0
+; GFX942_DPP-NEXT:    v_and_b32_e32 v8, 0x3ff, v0
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942_DPP-NEXT:    v_mbcnt_hi_u32_b32 v9, exec_hi, v9
+; GFX942_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX942_DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX942_DPP-NEXT:    v_cndmask_b32_e64 v5, v3, 0, s[0:1]
+; GFX942_DPP-NEXT:    v_cndmask_b32_e64 v4, 0, v8, s[0:1]
+; GFX942_DPP-NEXT:    v_bfrev_b32_e32 v7, 1
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942_DPP-NEXT:    v_mov_b32_dpp v7, v5 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX942_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX942_DPP-NEXT:    v_cmp_gt_i64_e32 vcc, v[4:5], v[6:7]
+; GFX942_DPP-NEXT:    s_nop 1
+; GFX942_DPP-NEXT:    v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX942_DPP-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX942_DPP-NEXT:    v_bfrev_b32_e32 v7, 1
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942_DPP-NEXT:    s_nop 0
+; GFX942_DPP-NEXT:    v_mov_b32_dpp v7, v5 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX942_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX942_DPP-NEXT:    v_cmp_gt_i64_e32 vcc, v[4:5], v[6:7]
+; GFX942_DPP-NEXT:    s_nop 1
+; GFX942_DPP-NEXT:    v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX942_DPP-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX942_DPP-NEXT:    v_bfrev_b32_e32 v7, 1
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942_DPP-NEXT:    s_nop 0
+; GFX942_DPP-NEXT:    v_mov_b32_dpp v7, v5 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX942_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX942_DPP-NEXT:    v_cmp_gt_i64_e32 vcc, v[4:5], v[6:7]
+; GFX942_DPP-NEXT:    s_nop 1
+; GFX942_DPP-NEXT:    v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX942_DPP-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX942_DPP-NEXT:    v_bfrev_b32_e32 v7, 1
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942_DPP-NEXT:    s_nop 0
+; GFX942_DPP-NEXT:    v_mov_b32_dpp v7, v5 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX942_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX942_DPP-NEXT:    v_cmp_gt_i64_e32 vcc, v[4:5], v[6:7]
+; GFX942_DPP-NEXT:    s_nop 1
+; GFX942_DPP-NEXT:    v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX942_DPP-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX942_DPP-NEXT:    v_bfrev_b32_e32 v7, 1
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942_DPP-NEXT:    s_nop 0
+; GFX942_DPP-NEXT:    v_mov_b32_dpp v7, v5 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX942_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX942_DPP-NEXT:    v_cmp_gt_i64_e32 vcc, v[4:5], v[6:7]
+; GFX942_DPP-NEXT:    s_nop 1
+; GFX942_DPP-NEXT:    v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX942_DPP-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX942_DPP-NEXT:    v_bfrev_b32_e32 v7, 1
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942_DPP-NEXT:    s_nop 0
+; GFX942_DPP-NEXT:    v_mov_b32_dpp v7, v5 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX942_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX942_DPP-NEXT:    v_cmp_gt_i64_e32 vcc, v[4:5], v[6:7]
+; GFX942_DPP-NEXT:    s_nop 1
+; GFX942_DPP-NEXT:    v_cndmask_b32_e32 v1, v6, v4, vcc
+; GFX942_DPP-NEXT:    v_cndmask_b32_e32 v4, v7, v5, vcc
+; GFX942_DPP-NEXT:    v_readlane_b32 s2, v1, 63
+; GFX942_DPP-NEXT:    v_readlane_b32 s3, v4, 63
+; GFX942_DPP-NEXT:    v_mov_b32_dpp v3, v4 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX942_DPP-NEXT:    v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX942_DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX942_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v9
+; GFX942_DPP-NEXT:    ; implicit-def: $vgpr8_vgpr9
+; GFX942_DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942_DPP-NEXT:    s_cbranch_execz .LBB23_2
+; GFX942_DPP-NEXT:  ; %bb.1:
+; GFX942_DPP-NEXT:    v_mov_b64_e32 v[8:9], s[2:3]
+; GFX942_DPP-NEXT:    ds_max_rtn_i64 v[8:9], v0, v[8:9]
+; GFX942_DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942_DPP-NEXT:  .LBB23_2:
+; GFX942_DPP-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX942_DPP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942_DPP-NEXT:    v_readfirstlane_b32 s5, v9
+; GFX942_DPP-NEXT:    v_readfirstlane_b32 s4, v8
+; GFX942_DPP-NEXT:    v_mov_b64_e32 v[8:9], v[2:3]
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX942_DPP-NEXT:    v_cmp_gt_i64_e32 vcc, s[4:5], v[8:9]
+; GFX942_DPP-NEXT:    s_mov_b32 s3, 0xf000
+; GFX942_DPP-NEXT:    s_mov_b32 s2, -1
+; GFX942_DPP-NEXT:    v_cndmask_b32_e32 v9, v9, v0, vcc
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX942_DPP-NEXT:    v_cndmask_b32_e32 v8, v8, v0, vcc
+; GFX942_DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942_DPP-NEXT:    buffer_store_dwordx2 v[8:9], off, s[0:3], 0
+; GFX942_DPP-NEXT:    s_endpgm
+;
 ; GFX1064_DPP-LABEL: max_i64_varying:
 ; GFX1064_DPP:       ; %bb.0: ; %entry
 ; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
@@ -12286,6 +14004,47 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) {
 ; GFX9_ITERATIVE-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX9_ITERATIVE-NEXT:    s_endpgm
 ;
+; GFX942_ITERATIVE-LABEL: min_i32_varying:
+; GFX942_ITERATIVE:       ; %bb.0: ; %entry
+; GFX942_ITERATIVE-NEXT:    s_mov_b64 s[0:1], exec
+; GFX942_ITERATIVE-NEXT:    v_and_b32_e32 v1, 0x3ff, v0
+; GFX942_ITERATIVE-NEXT:    s_brev_b32 s2, -2
+; GFX942_ITERATIVE-NEXT:    ; implicit-def: $vgpr0
+; GFX942_ITERATIVE-NEXT:  .LBB24_1: ; %ComputeLoop
+; GFX942_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942_ITERATIVE-NEXT:    s_ff1_i32_b64 s3, s[0:1]
+; GFX942_ITERATIVE-NEXT:    s_mov_b32 m0, s3
+; GFX942_ITERATIVE-NEXT:    v_readlane_b32 s8, v1, s3
+; GFX942_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s3
+; GFX942_ITERATIVE-NEXT:    v_writelane_b32 v0, s2, m0
+; GFX942_ITERATIVE-NEXT:    s_min_i32 s2, s2, s8
+; GFX942_ITERATIVE-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX942_ITERATIVE-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX942_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB24_1
+; GFX942_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX942_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
+; GFX942_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32 v1, exec_hi, v1
+; GFX942_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
+; GFX942_ITERATIVE-NEXT:    ; implicit-def: $vgpr1
+; GFX942_ITERATIVE-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942_ITERATIVE-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
+; GFX942_ITERATIVE-NEXT:    s_cbranch_execz .LBB24_4
+; GFX942_ITERATIVE-NEXT:  ; %bb.3:
+; GFX942_ITERATIVE-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942_ITERATIVE-NEXT:    v_mov_b32_e32 v2, s2
+; GFX942_ITERATIVE-NEXT:    ds_min_rtn_i32 v1, v1, v2
+; GFX942_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942_ITERATIVE-NEXT:  .LBB24_4:
+; GFX942_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX942_ITERATIVE-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942_ITERATIVE-NEXT:    v_readfirstlane_b32 s4, v1
+; GFX942_ITERATIVE-NEXT:    s_mov_b32 s3, 0xf000
+; GFX942_ITERATIVE-NEXT:    s_mov_b32 s2, -1
+; GFX942_ITERATIVE-NEXT:    v_min_i32_e32 v0, s4, v0
+; GFX942_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942_ITERATIVE-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX942_ITERATIVE-NEXT:    s_endpgm
+;
 ; GFX1064_ITERATIVE-LABEL: min_i32_varying:
 ; GFX1064_ITERATIVE:       ; %bb.0: ; %entry
 ; GFX1064_ITERATIVE-NEXT:    s_mov_b64 s[0:1], exec
@@ -12554,6 +14313,51 @@ define amdgpu_kernel void @min_i32_varying(ptr addrspace(1) %out) {
 ; GFX9_DPP-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX9_DPP-NEXT:    s_endpgm
 ;
+; GFX942_DPP-LABEL: min_i32_varying:
+; GFX942_DPP:       ; %bb.0: ; %entry
+; GFX942_DPP-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
+; GFX942_DPP-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX942_DPP-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
+; GFX942_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX942_DPP-NEXT:    v_bfrev_b32_e32 v1, -2
+; GFX942_DPP-NEXT:    v_cndmask_b32_e64 v2, v1, v0, s[0:1]
+; GFX942_DPP-NEXT:    s_nop 1
+; GFX942_DPP-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX942_DPP-NEXT:    s_nop 1
+; GFX942_DPP-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX942_DPP-NEXT:    s_nop 1
+; GFX942_DPP-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX942_DPP-NEXT:    s_nop 1
+; GFX942_DPP-NEXT:    v_min_i32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX942_DPP-NEXT:    s_nop 1
+; GFX942_DPP-NEXT:    v_min_i32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX942_DPP-NEXT:    s_nop 1
+; GFX942_DPP-NEXT:    v_min_i32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX942_DPP-NEXT:    s_nop 0
+; GFX942_DPP-NEXT:    v_readlane_b32 s2, v2, 63
+; GFX942_DPP-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX942_DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX942_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
+; GFX942_DPP-NEXT:    ; implicit-def: $vgpr0
+; GFX942_DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942_DPP-NEXT:    s_cbranch_execz .LBB24_2
+; GFX942_DPP-NEXT:  ; %bb.1:
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX942_DPP-NEXT:    ds_min_rtn_i32 v0, v0, v3
+; GFX942_DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942_DPP-NEXT:  .LBB24_2:
+; GFX942_DPP-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX942_DPP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942_DPP-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v0, v1
+; GFX942_DPP-NEXT:    s_mov_b32 s3, 0xf000
+; GFX942_DPP-NEXT:    s_mov_b32 s2, -1
+; GFX942_DPP-NEXT:    v_min_i32_e32 v0, s4, v0
+; GFX942_DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942_DPP-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX942_DPP-NEXT:    s_endpgm
+;
 ; GFX1064_DPP-LABEL: min_i32_varying:
 ; GFX1064_DPP:       ; %bb.0: ; %entry
 ; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
@@ -12868,6 +14672,38 @@ define amdgpu_kernel void @min_i64_constant(ptr addrspace(1) %out) {
 ; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GFX9-NEXT:    s_endpgm
 ;
+; GFX942-LABEL: min_i64_constant:
+; GFX942:       ; %bb.0: ; %entry
+; GFX942-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX942-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX942-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; GFX942-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942-NEXT:    s_cbranch_execz .LBB25_2
+; GFX942-NEXT:  ; %bb.1:
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], 5
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-NEXT:    ds_min_rtn_i64 v[0:1], v2, v[0:1]
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:  .LBB25_2:
+; GFX942-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX942-NEXT:    v_bfrev_b32_e32 v0, -2
+; GFX942-NEXT:    v_readfirstlane_b32 s5, v1
+; GFX942-NEXT:    v_cndmask_b32_e32 v1, 0, v0, vcc
+; GFX942-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
+; GFX942-NEXT:    v_mov_b32_e32 v2, s5
+; GFX942-NEXT:    v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
+; GFX942-NEXT:    s_mov_b32 s3, 0xf000
+; GFX942-NEXT:    s_mov_b32 s2, -1
+; GFX942-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX942-NEXT:    v_mov_b32_e32 v2, s4
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX942-NEXT:    s_endpgm
+;
 ; GFX1064-LABEL: min_i64_constant:
 ; GFX1064:       ; %bb.0: ; %entry
 ; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -13169,6 +15005,62 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) {
 ; GFX9_ITERATIVE-NEXT:    buffer_store_dwordx2 v[1:2], off, s[0:3], 0
 ; GFX9_ITERATIVE-NEXT:    s_endpgm
 ;
+; GFX942_ITERATIVE-LABEL: min_i64_varying:
+; GFX942_ITERATIVE:       ; %bb.0: ; %entry
+; GFX942_ITERATIVE-NEXT:    s_mov_b64 s[2:3], exec
+; GFX942_ITERATIVE-NEXT:    v_and_b32_e32 v2, 0x3ff, v0
+; GFX942_ITERATIVE-NEXT:    v_mov_b32_e32 v3, 0
+; GFX942_ITERATIVE-NEXT:    s_brev_b32 s1, -2
+; GFX942_ITERATIVE-NEXT:    s_mov_b32 s0, -1
+; GFX942_ITERATIVE-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; GFX942_ITERATIVE-NEXT:  .LBB26_1: ; %ComputeLoop
+; GFX942_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942_ITERATIVE-NEXT:    s_ff1_i32_b64 s8, s[2:3]
+; GFX942_ITERATIVE-NEXT:    v_readlane_b32 s9, v3, s8
+; GFX942_ITERATIVE-NEXT:    v_readlane_b32 s10, v2, s8
+; GFX942_ITERATIVE-NEXT:    s_mov_b32 m0, s8
+; GFX942_ITERATIVE-NEXT:    v_mov_b32_e32 v5, s9
+; GFX942_ITERATIVE-NEXT:    v_mov_b32_e32 v4, s10
+; GFX942_ITERATIVE-NEXT:    v_cmp_lt_i64_e32 vcc, s[0:1], v[4:5]
+; GFX942_ITERATIVE-NEXT:    s_and_b64 s[6:7], vcc, exec
+; GFX942_ITERATIVE-NEXT:    v_writelane_b32 v1, s1, m0
+; GFX942_ITERATIVE-NEXT:    v_writelane_b32 v0, s0, m0
+; GFX942_ITERATIVE-NEXT:    s_cselect_b32 s1, s1, s9
+; GFX942_ITERATIVE-NEXT:    s_cselect_b32 s0, s0, s10
+; GFX942_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s8
+; GFX942_ITERATIVE-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX942_ITERATIVE-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX942_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB26_1
+; GFX942_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX942_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX942_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v2
+; GFX942_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX942_ITERATIVE-NEXT:    ; implicit-def: $vgpr2_vgpr3
+; GFX942_ITERATIVE-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX942_ITERATIVE-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
+; GFX942_ITERATIVE-NEXT:    s_cbranch_execz .LBB26_4
+; GFX942_ITERATIVE-NEXT:  ; %bb.3:
+; GFX942_ITERATIVE-NEXT:    v_mov_b32_e32 v4, 0
+; GFX942_ITERATIVE-NEXT:    v_mov_b64_e32 v[2:3], s[0:1]
+; GFX942_ITERATIVE-NEXT:    ds_min_rtn_i64 v[2:3], v4, v[2:3]
+; GFX942_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942_ITERATIVE-NEXT:  .LBB26_4:
+; GFX942_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX942_ITERATIVE-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942_ITERATIVE-NEXT:    v_readfirstlane_b32 s5, v3
+; GFX942_ITERATIVE-NEXT:    v_readfirstlane_b32 s4, v2
+; GFX942_ITERATIVE-NEXT:    s_mov_b32 s3, 0xf000
+; GFX942_ITERATIVE-NEXT:    v_mov_b32_e32 v2, s5
+; GFX942_ITERATIVE-NEXT:    v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
+; GFX942_ITERATIVE-NEXT:    s_mov_b32 s2, -1
+; GFX942_ITERATIVE-NEXT:    s_nop 0
+; GFX942_ITERATIVE-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX942_ITERATIVE-NEXT:    v_mov_b32_e32 v2, s4
+; GFX942_ITERATIVE-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX942_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942_ITERATIVE-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX942_ITERATIVE-NEXT:    s_endpgm
+;
 ; GFX1064_ITERATIVE-LABEL: min_i64_varying:
 ; GFX1064_ITERATIVE:       ; %bb.0: ; %entry
 ; GFX1064_ITERATIVE-NEXT:    v_mov_b32_e32 v3, 0
@@ -13576,6 +15468,100 @@ define amdgpu_kernel void @min_i64_varying(ptr addrspace(1) %out) {
 ; GFX9_DPP-NEXT:    buffer_store_dwordx2 v[7:8], off, s[0:3], 0
 ; GFX9_DPP-NEXT:    s_endpgm
 ;
+; GFX942_DPP-LABEL: min_i64_varying:
+; GFX942_DPP:       ; %bb.0: ; %entry
+; GFX942_DPP-NEXT:    v_mbcnt_lo_u32_b32 v9, exec_lo, 0
+; GFX942_DPP-NEXT:    v_and_b32_e32 v8, 0x3ff, v0
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942_DPP-NEXT:    v_mbcnt_hi_u32_b32 v9, exec_hi, v9
+; GFX942_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX942_DPP-NEXT:    v_bfrev_b32_e32 v3, -2
+; GFX942_DPP-NEXT:    v_cndmask_b32_e64 v5, v3, 0, s[0:1]
+; GFX942_DPP-NEXT:    v_cndmask_b32_e64 v4, -1, v8, s[0:1]
+; GFX942_DPP-NEXT:    v_bfrev_b32_e32 v7, -2
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v6, -1
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v2, -1
+; GFX942_DPP-NEXT:    v_mov_b32_dpp v7, v5 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX942_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX942_DPP-NEXT:    v_cmp_lt_i64_e32 vcc, v[4:5], v[6:7]
+; GFX942_DPP-NEXT:    s_nop 1
+; GFX942_DPP-NEXT:    v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX942_DPP-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX942_DPP-NEXT:    v_bfrev_b32_e32 v7, -2
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v6, -1
+; GFX942_DPP-NEXT:    s_nop 0
+; GFX942_DPP-NEXT:    v_mov_b32_dpp v7, v5 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX942_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX942_DPP-NEXT:    v_cmp_lt_i64_e32 vcc, v[4:5], v[6:7]
+; GFX942_DPP-NEXT:    s_nop 1
+; GFX942_DPP-NEXT:    v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX942_DPP-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX942_DPP-NEXT:    v_bfrev_b32_e32 v7, -2
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v6, -1
+; GFX942_DPP-NEXT:    s_nop 0
+; GFX942_DPP-NEXT:    v_mov_b32_dpp v7, v5 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX942_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX942_DPP-NEXT:    v_cmp_lt_i64_e32 vcc, v[4:5], v[6:7]
+; GFX942_DPP-NEXT:    s_nop 1
+; GFX942_DPP-NEXT:    v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX942_DPP-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX942_DPP-NEXT:    v_bfrev_b32_e32 v7, -2
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v6, -1
+; GFX942_DPP-NEXT:    s_nop 0
+; GFX942_DPP-NEXT:    v_mov_b32_dpp v7, v5 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX942_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX942_DPP-NEXT:    v_cmp_lt_i64_e32 vcc, v[4:5], v[6:7]
+; GFX942_DPP-NEXT:    s_nop 1
+; GFX942_DPP-NEXT:    v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX942_DPP-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX942_DPP-NEXT:    v_bfrev_b32_e32 v7, -2
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v6, -1
+; GFX942_DPP-NEXT:    s_nop 0
+; GFX942_DPP-NEXT:    v_mov_b32_dpp v7, v5 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX942_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX942_DPP-NEXT:    v_cmp_lt_i64_e32 vcc, v[4:5], v[6:7]
+; GFX942_DPP-NEXT:    s_nop 1
+; GFX942_DPP-NEXT:    v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX942_DPP-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX942_DPP-NEXT:    v_bfrev_b32_e32 v7, -2
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v6, -1
+; GFX942_DPP-NEXT:    s_nop 0
+; GFX942_DPP-NEXT:    v_mov_b32_dpp v7, v5 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX942_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX942_DPP-NEXT:    v_cmp_lt_i64_e32 vcc, v[4:5], v[6:7]
+; GFX942_DPP-NEXT:    s_nop 1
+; GFX942_DPP-NEXT:    v_cndmask_b32_e32 v1, v6, v4, vcc
+; GFX942_DPP-NEXT:    v_cndmask_b32_e32 v4, v7, v5, vcc
+; GFX942_DPP-NEXT:    v_readlane_b32 s2, v1, 63
+; GFX942_DPP-NEXT:    v_readlane_b32 s3, v4, 63
+; GFX942_DPP-NEXT:    v_mov_b32_dpp v3, v4 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX942_DPP-NEXT:    v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX942_DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX942_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v9
+; GFX942_DPP-NEXT:    ; implicit-def: $vgpr8_vgpr9
+; GFX942_DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942_DPP-NEXT:    s_cbranch_execz .LBB26_2
+; GFX942_DPP-NEXT:  ; %bb.1:
+; GFX942_DPP-NEXT:    v_mov_b64_e32 v[8:9], s[2:3]
+; GFX942_DPP-NEXT:    ds_min_rtn_i64 v[8:9], v0, v[8:9]
+; GFX942_DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942_DPP-NEXT:  .LBB26_2:
+; GFX942_DPP-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX942_DPP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942_DPP-NEXT:    v_readfirstlane_b32 s5, v9
+; GFX942_DPP-NEXT:    v_readfirstlane_b32 s4, v8
+; GFX942_DPP-NEXT:    v_mov_b64_e32 v[8:9], v[2:3]
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX942_DPP-NEXT:    v_cmp_lt_i64_e32 vcc, s[4:5], v[8:9]
+; GFX942_DPP-NEXT:    s_mov_b32 s3, 0xf000
+; GFX942_DPP-NEXT:    s_mov_b32 s2, -1
+; GFX942_DPP-NEXT:    v_cndmask_b32_e32 v9, v9, v0, vcc
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX942_DPP-NEXT:    v_cndmask_b32_e32 v8, v8, v0, vcc
+; GFX942_DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942_DPP-NEXT:    buffer_store_dwordx2 v[8:9], off, s[0:3], 0
+; GFX942_DPP-NEXT:    s_endpgm
+;
 ; GFX1064_DPP-LABEL: min_i64_varying:
 ; GFX1064_DPP:       ; %bb.0: ; %entry
 ; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
@@ -14112,6 +16098,47 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) {
 ; GFX9_ITERATIVE-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX9_ITERATIVE-NEXT:    s_endpgm
 ;
+; GFX942_ITERATIVE-LABEL: umax_i32_varying:
+; GFX942_ITERATIVE:       ; %bb.0: ; %entry
+; GFX942_ITERATIVE-NEXT:    s_mov_b64 s[0:1], exec
+; GFX942_ITERATIVE-NEXT:    v_and_b32_e32 v1, 0x3ff, v0
+; GFX942_ITERATIVE-NEXT:    s_mov_b32 s2, 0
+; GFX942_ITERATIVE-NEXT:    ; implicit-def: $vgpr0
+; GFX942_ITERATIVE-NEXT:  .LBB27_1: ; %ComputeLoop
+; GFX942_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942_ITERATIVE-NEXT:    s_ff1_i32_b64 s3, s[0:1]
+; GFX942_ITERATIVE-NEXT:    s_mov_b32 m0, s3
+; GFX942_ITERATIVE-NEXT:    v_readlane_b32 s8, v1, s3
+; GFX942_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s3
+; GFX942_ITERATIVE-NEXT:    v_writelane_b32 v0, s2, m0
+; GFX942_ITERATIVE-NEXT:    s_max_u32 s2, s2, s8
+; GFX942_ITERATIVE-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX942_ITERATIVE-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX942_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB27_1
+; GFX942_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX942_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
+; GFX942_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32 v1, exec_hi, v1
+; GFX942_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
+; GFX942_ITERATIVE-NEXT:    ; implicit-def: $vgpr1
+; GFX942_ITERATIVE-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942_ITERATIVE-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
+; GFX942_ITERATIVE-NEXT:    s_cbranch_execz .LBB27_4
+; GFX942_ITERATIVE-NEXT:  ; %bb.3:
+; GFX942_ITERATIVE-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942_ITERATIVE-NEXT:    v_mov_b32_e32 v2, s2
+; GFX942_ITERATIVE-NEXT:    ds_max_rtn_u32 v1, v1, v2
+; GFX942_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942_ITERATIVE-NEXT:  .LBB27_4:
+; GFX942_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX942_ITERATIVE-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942_ITERATIVE-NEXT:    v_readfirstlane_b32 s4, v1
+; GFX942_ITERATIVE-NEXT:    s_mov_b32 s3, 0xf000
+; GFX942_ITERATIVE-NEXT:    s_mov_b32 s2, -1
+; GFX942_ITERATIVE-NEXT:    v_max_u32_e32 v0, s4, v0
+; GFX942_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942_ITERATIVE-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX942_ITERATIVE-NEXT:    s_endpgm
+;
 ; GFX1064_ITERATIVE-LABEL: umax_i32_varying:
 ; GFX1064_ITERATIVE:       ; %bb.0: ; %entry
 ; GFX1064_ITERATIVE-NEXT:    s_mov_b64 s[0:1], exec
@@ -14380,6 +16407,51 @@ define amdgpu_kernel void @umax_i32_varying(ptr addrspace(1) %out) {
 ; GFX9_DPP-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX9_DPP-NEXT:    s_endpgm
 ;
+; GFX942_DPP-LABEL: umax_i32_varying:
+; GFX942_DPP:       ; %bb.0: ; %entry
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v3, 0
+; GFX942_DPP-NEXT:    v_mbcnt_lo_u32_b32 v4, exec_lo, 0
+; GFX942_DPP-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX942_DPP-NEXT:    v_mbcnt_hi_u32_b32 v4, exec_hi, v4
+; GFX942_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX942_DPP-NEXT:    v_cndmask_b32_e64 v1, 0, v0, s[0:1]
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942_DPP-NEXT:    s_nop 0
+; GFX942_DPP-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX942_DPP-NEXT:    s_nop 1
+; GFX942_DPP-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX942_DPP-NEXT:    s_nop 1
+; GFX942_DPP-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:4 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX942_DPP-NEXT:    s_nop 1
+; GFX942_DPP-NEXT:    v_max_u32_dpp v1, v1, v1 row_shr:8 row_mask:0xf bank_mask:0xf bound_ctrl:1
+; GFX942_DPP-NEXT:    s_nop 1
+; GFX942_DPP-NEXT:    v_max_u32_dpp v1, v1, v1 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX942_DPP-NEXT:    s_nop 1
+; GFX942_DPP-NEXT:    v_max_u32_dpp v1, v1, v1 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX942_DPP-NEXT:    s_nop 0
+; GFX942_DPP-NEXT:    v_readlane_b32 s2, v1, 63
+; GFX942_DPP-NEXT:    v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX942_DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX942_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
+; GFX942_DPP-NEXT:    ; implicit-def: $vgpr0
+; GFX942_DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942_DPP-NEXT:    s_cbranch_execz .LBB27_2
+; GFX942_DPP-NEXT:  ; %bb.1:
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942_DPP-NEXT:    ds_max_rtn_u32 v0, v3, v0
+; GFX942_DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942_DPP-NEXT:  .LBB27_2:
+; GFX942_DPP-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX942_DPP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942_DPP-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v0, v2
+; GFX942_DPP-NEXT:    s_mov_b32 s3, 0xf000
+; GFX942_DPP-NEXT:    s_mov_b32 s2, -1
+; GFX942_DPP-NEXT:    v_max_u32_e32 v0, s4, v0
+; GFX942_DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942_DPP-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX942_DPP-NEXT:    s_endpgm
+;
 ; GFX1064_DPP-LABEL: umax_i32_varying:
 ; GFX1064_DPP:       ; %bb.0: ; %entry
 ; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
@@ -14690,6 +16762,37 @@ define amdgpu_kernel void @umax_i64_constant(ptr addrspace(1) %out) {
 ; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GFX9-NEXT:    s_endpgm
 ;
+; GFX942-LABEL: umax_i64_constant:
+; GFX942:       ; %bb.0: ; %entry
+; GFX942-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX942-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX942-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; GFX942-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942-NEXT:    s_cbranch_execz .LBB28_2
+; GFX942-NEXT:  ; %bb.1:
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], 5
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-NEXT:    ds_max_rtn_u64 v[0:1], v2, v[0:1]
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:  .LBB28_2:
+; GFX942-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    v_readfirstlane_b32 s5, v1
+; GFX942-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX942-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-NEXT:    v_cndmask_b32_e64 v0, 5, 0, vcc
+; GFX942-NEXT:    v_mov_b32_e32 v2, s4
+; GFX942-NEXT:    v_cmp_gt_u64_e32 vcc, s[4:5], v[0:1]
+; GFX942-NEXT:    v_mov_b32_e32 v1, s5
+; GFX942-NEXT:    s_mov_b32 s3, 0xf000
+; GFX942-NEXT:    s_mov_b32 s2, -1
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX942-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX942-NEXT:    s_endpgm
+;
 ; GFX1064-LABEL: umax_i64_constant:
 ; GFX1064:       ; %bb.0: ; %entry
 ; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -14988,6 +17091,61 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
 ; GFX9_ITERATIVE-NEXT:    buffer_store_dwordx2 v[1:2], off, s[0:3], 0
 ; GFX9_ITERATIVE-NEXT:    s_endpgm
 ;
+; GFX942_ITERATIVE-LABEL: umax_i64_varying:
+; GFX942_ITERATIVE:       ; %bb.0: ; %entry
+; GFX942_ITERATIVE-NEXT:    s_mov_b64 s[2:3], exec
+; GFX942_ITERATIVE-NEXT:    v_and_b32_e32 v2, 0x3ff, v0
+; GFX942_ITERATIVE-NEXT:    v_mov_b32_e32 v3, 0
+; GFX942_ITERATIVE-NEXT:    s_mov_b64 s[0:1], 0
+; GFX942_ITERATIVE-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; GFX942_ITERATIVE-NEXT:  .LBB29_1: ; %ComputeLoop
+; GFX942_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942_ITERATIVE-NEXT:    s_ff1_i32_b64 s8, s[2:3]
+; GFX942_ITERATIVE-NEXT:    v_readlane_b32 s9, v3, s8
+; GFX942_ITERATIVE-NEXT:    v_readlane_b32 s10, v2, s8
+; GFX942_ITERATIVE-NEXT:    s_mov_b32 m0, s8
+; GFX942_ITERATIVE-NEXT:    v_mov_b32_e32 v5, s9
+; GFX942_ITERATIVE-NEXT:    v_mov_b32_e32 v4, s10
+; GFX942_ITERATIVE-NEXT:    v_cmp_gt_u64_e32 vcc, s[0:1], v[4:5]
+; GFX942_ITERATIVE-NEXT:    s_and_b64 s[6:7], vcc, exec
+; GFX942_ITERATIVE-NEXT:    v_writelane_b32 v1, s1, m0
+; GFX942_ITERATIVE-NEXT:    v_writelane_b32 v0, s0, m0
+; GFX942_ITERATIVE-NEXT:    s_cselect_b32 s1, s1, s9
+; GFX942_ITERATIVE-NEXT:    s_cselect_b32 s0, s0, s10
+; GFX942_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s8
+; GFX942_ITERATIVE-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX942_ITERATIVE-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX942_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB29_1
+; GFX942_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX942_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX942_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v2
+; GFX942_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX942_ITERATIVE-NEXT:    ; implicit-def: $vgpr2_vgpr3
+; GFX942_ITERATIVE-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX942_ITERATIVE-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
+; GFX942_ITERATIVE-NEXT:    s_cbranch_execz .LBB29_4
+; GFX942_ITERATIVE-NEXT:  ; %bb.3:
+; GFX942_ITERATIVE-NEXT:    v_mov_b32_e32 v4, 0
+; GFX942_ITERATIVE-NEXT:    v_mov_b64_e32 v[2:3], s[0:1]
+; GFX942_ITERATIVE-NEXT:    ds_max_rtn_u64 v[2:3], v4, v[2:3]
+; GFX942_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942_ITERATIVE-NEXT:  .LBB29_4:
+; GFX942_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX942_ITERATIVE-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942_ITERATIVE-NEXT:    v_readfirstlane_b32 s5, v3
+; GFX942_ITERATIVE-NEXT:    v_readfirstlane_b32 s4, v2
+; GFX942_ITERATIVE-NEXT:    s_mov_b32 s3, 0xf000
+; GFX942_ITERATIVE-NEXT:    v_mov_b32_e32 v2, s5
+; GFX942_ITERATIVE-NEXT:    v_cmp_gt_u64_e32 vcc, s[4:5], v[0:1]
+; GFX942_ITERATIVE-NEXT:    s_mov_b32 s2, -1
+; GFX942_ITERATIVE-NEXT:    s_nop 0
+; GFX942_ITERATIVE-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX942_ITERATIVE-NEXT:    v_mov_b32_e32 v2, s4
+; GFX942_ITERATIVE-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX942_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942_ITERATIVE-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX942_ITERATIVE-NEXT:    s_endpgm
+;
 ; GFX1064_ITERATIVE-LABEL: umax_i64_varying:
 ; GFX1064_ITERATIVE:       ; %bb.0: ; %entry
 ; GFX1064_ITERATIVE-NEXT:    v_mov_b32_e32 v3, 0
@@ -15395,6 +17553,101 @@ define amdgpu_kernel void @umax_i64_varying(ptr addrspace(1) %out) {
 ; GFX9_DPP-NEXT:    buffer_store_dwordx2 v[5:6], off, s[0:3], 0
 ; GFX9_DPP-NEXT:    s_endpgm
 ;
+; GFX942_DPP-LABEL: umax_i64_varying:
+; GFX942_DPP:       ; %bb.0: ; %entry
+; GFX942_DPP-NEXT:    v_mbcnt_lo_u32_b32 v7, exec_lo, 0
+; GFX942_DPP-NEXT:    v_and_b32_e32 v6, 0x3ff, v0
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942_DPP-NEXT:    v_mbcnt_hi_u32_b32 v7, exec_hi, v7
+; GFX942_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX942_DPP-NEXT:    v_cndmask_b32_e64 v3, 0, 0, s[0:1]
+; GFX942_DPP-NEXT:    v_cndmask_b32_e64 v2, 0, v6, s[0:1]
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v5, 0
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v4, 0
+; GFX942_DPP-NEXT:    s_nop 0
+; GFX942_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX942_DPP-NEXT:    v_mov_b32_dpp v4, v2 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX942_DPP-NEXT:    v_cmp_gt_u64_e32 vcc, v[2:3], v[4:5]
+; GFX942_DPP-NEXT:    s_nop 1
+; GFX942_DPP-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX942_DPP-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v5, 0
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v4, 0
+; GFX942_DPP-NEXT:    s_nop 0
+; GFX942_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX942_DPP-NEXT:    v_mov_b32_dpp v4, v2 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX942_DPP-NEXT:    v_cmp_gt_u64_e32 vcc, v[2:3], v[4:5]
+; GFX942_DPP-NEXT:    s_nop 1
+; GFX942_DPP-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX942_DPP-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v5, 0
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v4, 0
+; GFX942_DPP-NEXT:    s_nop 0
+; GFX942_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX942_DPP-NEXT:    v_mov_b32_dpp v4, v2 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX942_DPP-NEXT:    v_cmp_gt_u64_e32 vcc, v[2:3], v[4:5]
+; GFX942_DPP-NEXT:    s_nop 1
+; GFX942_DPP-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX942_DPP-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v5, 0
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v4, 0
+; GFX942_DPP-NEXT:    s_nop 0
+; GFX942_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX942_DPP-NEXT:    v_mov_b32_dpp v4, v2 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX942_DPP-NEXT:    v_cmp_gt_u64_e32 vcc, v[2:3], v[4:5]
+; GFX942_DPP-NEXT:    s_nop 1
+; GFX942_DPP-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX942_DPP-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v5, 0
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v4, 0
+; GFX942_DPP-NEXT:    s_nop 0
+; GFX942_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX942_DPP-NEXT:    v_mov_b32_dpp v4, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX942_DPP-NEXT:    v_cmp_gt_u64_e32 vcc, v[2:3], v[4:5]
+; GFX942_DPP-NEXT:    s_nop 1
+; GFX942_DPP-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX942_DPP-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v5, 0
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v4, 0
+; GFX942_DPP-NEXT:    s_nop 0
+; GFX942_DPP-NEXT:    v_mov_b32_dpp v5, v3 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX942_DPP-NEXT:    v_mov_b32_dpp v4, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX942_DPP-NEXT:    v_cmp_gt_u64_e32 vcc, v[2:3], v[4:5]
+; GFX942_DPP-NEXT:    s_nop 1
+; GFX942_DPP-NEXT:    v_cndmask_b32_e32 v1, v4, v2, vcc
+; GFX942_DPP-NEXT:    v_cndmask_b32_e32 v2, v5, v3, vcc
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v5, 0
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v4, 0
+; GFX942_DPP-NEXT:    v_readlane_b32 s3, v2, 63
+; GFX942_DPP-NEXT:    v_readlane_b32 s2, v1, 63
+; GFX942_DPP-NEXT:    v_mov_b32_dpp v5, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX942_DPP-NEXT:    v_mov_b32_dpp v4, v1 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX942_DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX942_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v7
+; GFX942_DPP-NEXT:    ; implicit-def: $vgpr6_vgpr7
+; GFX942_DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942_DPP-NEXT:    s_cbranch_execz .LBB29_2
+; GFX942_DPP-NEXT:  ; %bb.1:
+; GFX942_DPP-NEXT:    v_mov_b64_e32 v[6:7], s[2:3]
+; GFX942_DPP-NEXT:    ds_max_rtn_u64 v[6:7], v0, v[6:7]
+; GFX942_DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942_DPP-NEXT:  .LBB29_2:
+; GFX942_DPP-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX942_DPP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942_DPP-NEXT:    v_readfirstlane_b32 s5, v7
+; GFX942_DPP-NEXT:    v_readfirstlane_b32 s4, v6
+; GFX942_DPP-NEXT:    v_mov_b64_e32 v[6:7], v[4:5]
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX942_DPP-NEXT:    v_cmp_gt_u64_e32 vcc, s[4:5], v[6:7]
+; GFX942_DPP-NEXT:    s_mov_b32 s3, 0xf000
+; GFX942_DPP-NEXT:    s_mov_b32 s2, -1
+; GFX942_DPP-NEXT:    v_cndmask_b32_e32 v7, v7, v0, vcc
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX942_DPP-NEXT:    v_cndmask_b32_e32 v6, v6, v0, vcc
+; GFX942_DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942_DPP-NEXT:    buffer_store_dwordx2 v[6:7], off, s[0:3], 0
+; GFX942_DPP-NEXT:    s_endpgm
+;
 ; GFX1064_DPP-LABEL: umax_i64_varying:
 ; GFX1064_DPP:       ; %bb.0: ; %entry
 ; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
@@ -15925,6 +18178,47 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) {
 ; GFX9_ITERATIVE-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX9_ITERATIVE-NEXT:    s_endpgm
 ;
+; GFX942_ITERATIVE-LABEL: umin_i32_varying:
+; GFX942_ITERATIVE:       ; %bb.0: ; %entry
+; GFX942_ITERATIVE-NEXT:    s_mov_b64 s[0:1], exec
+; GFX942_ITERATIVE-NEXT:    v_and_b32_e32 v1, 0x3ff, v0
+; GFX942_ITERATIVE-NEXT:    s_mov_b32 s2, -1
+; GFX942_ITERATIVE-NEXT:    ; implicit-def: $vgpr0
+; GFX942_ITERATIVE-NEXT:  .LBB30_1: ; %ComputeLoop
+; GFX942_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942_ITERATIVE-NEXT:    s_ff1_i32_b64 s3, s[0:1]
+; GFX942_ITERATIVE-NEXT:    s_mov_b32 m0, s3
+; GFX942_ITERATIVE-NEXT:    v_readlane_b32 s8, v1, s3
+; GFX942_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s3
+; GFX942_ITERATIVE-NEXT:    v_writelane_b32 v0, s2, m0
+; GFX942_ITERATIVE-NEXT:    s_min_u32 s2, s2, s8
+; GFX942_ITERATIVE-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[6:7]
+; GFX942_ITERATIVE-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX942_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB30_1
+; GFX942_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX942_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v1, exec_lo, 0
+; GFX942_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32 v1, exec_hi, v1
+; GFX942_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v1
+; GFX942_ITERATIVE-NEXT:    ; implicit-def: $vgpr1
+; GFX942_ITERATIVE-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942_ITERATIVE-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
+; GFX942_ITERATIVE-NEXT:    s_cbranch_execz .LBB30_4
+; GFX942_ITERATIVE-NEXT:  ; %bb.3:
+; GFX942_ITERATIVE-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942_ITERATIVE-NEXT:    v_mov_b32_e32 v2, s2
+; GFX942_ITERATIVE-NEXT:    ds_min_rtn_u32 v1, v1, v2
+; GFX942_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942_ITERATIVE-NEXT:  .LBB30_4:
+; GFX942_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX942_ITERATIVE-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942_ITERATIVE-NEXT:    v_readfirstlane_b32 s4, v1
+; GFX942_ITERATIVE-NEXT:    s_mov_b32 s3, 0xf000
+; GFX942_ITERATIVE-NEXT:    s_mov_b32 s2, -1
+; GFX942_ITERATIVE-NEXT:    v_min_u32_e32 v0, s4, v0
+; GFX942_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942_ITERATIVE-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX942_ITERATIVE-NEXT:    s_endpgm
+;
 ; GFX1064_ITERATIVE-LABEL: umin_i32_varying:
 ; GFX1064_ITERATIVE:       ; %bb.0: ; %entry
 ; GFX1064_ITERATIVE-NEXT:    s_mov_b64 s[0:1], exec
@@ -16193,6 +18487,51 @@ define amdgpu_kernel void @umin_i32_varying(ptr addrspace(1) %out) {
 ; GFX9_DPP-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; GFX9_DPP-NEXT:    s_endpgm
 ;
+; GFX942_DPP-LABEL: umin_i32_varying:
+; GFX942_DPP:       ; %bb.0: ; %entry
+; GFX942_DPP-NEXT:    v_mbcnt_lo_u32_b32 v3, exec_lo, 0
+; GFX942_DPP-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX942_DPP-NEXT:    v_mbcnt_hi_u32_b32 v3, exec_hi, v3
+; GFX942_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX942_DPP-NEXT:    v_cndmask_b32_e64 v2, -1, v0, s[0:1]
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v1, -1
+; GFX942_DPP-NEXT:    s_nop 0
+; GFX942_DPP-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX942_DPP-NEXT:    s_nop 1
+; GFX942_DPP-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX942_DPP-NEXT:    s_nop 1
+; GFX942_DPP-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX942_DPP-NEXT:    s_nop 1
+; GFX942_DPP-NEXT:    v_min_u32_dpp v2, v2, v2 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX942_DPP-NEXT:    s_nop 1
+; GFX942_DPP-NEXT:    v_min_u32_dpp v2, v2, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX942_DPP-NEXT:    s_nop 1
+; GFX942_DPP-NEXT:    v_min_u32_dpp v2, v2, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX942_DPP-NEXT:    s_nop 0
+; GFX942_DPP-NEXT:    v_readlane_b32 s2, v2, 63
+; GFX942_DPP-NEXT:    v_mov_b32_dpp v1, v2 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX942_DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX942_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v3
+; GFX942_DPP-NEXT:    ; implicit-def: $vgpr0
+; GFX942_DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942_DPP-NEXT:    s_cbranch_execz .LBB30_2
+; GFX942_DPP-NEXT:  ; %bb.1:
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v3, s2
+; GFX942_DPP-NEXT:    ds_min_rtn_u32 v0, v0, v3
+; GFX942_DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942_DPP-NEXT:  .LBB30_2:
+; GFX942_DPP-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX942_DPP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942_DPP-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v0, v1
+; GFX942_DPP-NEXT:    s_mov_b32 s3, 0xf000
+; GFX942_DPP-NEXT:    s_mov_b32 s2, -1
+; GFX942_DPP-NEXT:    v_min_u32_e32 v0, s4, v0
+; GFX942_DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942_DPP-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX942_DPP-NEXT:    s_endpgm
+;
 ; GFX1064_DPP-LABEL: umin_i32_varying:
 ; GFX1064_DPP:       ; %bb.0: ; %entry
 ; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
@@ -16504,6 +18843,37 @@ define amdgpu_kernel void @umin_i64_constant(ptr addrspace(1) %out) {
 ; GFX9-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GFX9-NEXT:    s_endpgm
 ;
+; GFX942-LABEL: umin_i64_constant:
+; GFX942:       ; %bb.0: ; %entry
+; GFX942-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX942-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX942-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; GFX942-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942-NEXT:    s_cbranch_execz .LBB31_2
+; GFX942-NEXT:  ; %bb.1:
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], 5
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-NEXT:    ds_min_rtn_u64 v[0:1], v2, v[0:1]
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:  .LBB31_2:
+; GFX942-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    v_readfirstlane_b32 s5, v1
+; GFX942-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX942-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
+; GFX942-NEXT:    v_cndmask_b32_e64 v0, 5, -1, vcc
+; GFX942-NEXT:    v_mov_b32_e32 v2, s5
+; GFX942-NEXT:    v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1]
+; GFX942-NEXT:    s_mov_b32 s3, 0xf000
+; GFX942-NEXT:    s_mov_b32 s2, -1
+; GFX942-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX942-NEXT:    v_mov_b32_e32 v2, s4
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX942-NEXT:    s_endpgm
+;
 ; GFX1064-LABEL: umin_i64_constant:
 ; GFX1064:       ; %bb.0: ; %entry
 ; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -16802,6 +19172,61 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
 ; GFX9_ITERATIVE-NEXT:    buffer_store_dwordx2 v[1:2], off, s[0:3], 0
 ; GFX9_ITERATIVE-NEXT:    s_endpgm
 ;
+; GFX942_ITERATIVE-LABEL: umin_i64_varying:
+; GFX942_ITERATIVE:       ; %bb.0: ; %entry
+; GFX942_ITERATIVE-NEXT:    s_mov_b64 s[2:3], exec
+; GFX942_ITERATIVE-NEXT:    v_and_b32_e32 v2, 0x3ff, v0
+; GFX942_ITERATIVE-NEXT:    v_mov_b32_e32 v3, 0
+; GFX942_ITERATIVE-NEXT:    s_mov_b64 s[0:1], -1
+; GFX942_ITERATIVE-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; GFX942_ITERATIVE-NEXT:  .LBB32_1: ; %ComputeLoop
+; GFX942_ITERATIVE-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942_ITERATIVE-NEXT:    s_ff1_i32_b64 s8, s[2:3]
+; GFX942_ITERATIVE-NEXT:    v_readlane_b32 s9, v3, s8
+; GFX942_ITERATIVE-NEXT:    v_readlane_b32 s10, v2, s8
+; GFX942_ITERATIVE-NEXT:    s_mov_b32 m0, s8
+; GFX942_ITERATIVE-NEXT:    v_mov_b32_e32 v5, s9
+; GFX942_ITERATIVE-NEXT:    v_mov_b32_e32 v4, s10
+; GFX942_ITERATIVE-NEXT:    v_cmp_lt_u64_e32 vcc, s[0:1], v[4:5]
+; GFX942_ITERATIVE-NEXT:    s_and_b64 s[6:7], vcc, exec
+; GFX942_ITERATIVE-NEXT:    v_writelane_b32 v1, s1, m0
+; GFX942_ITERATIVE-NEXT:    v_writelane_b32 v0, s0, m0
+; GFX942_ITERATIVE-NEXT:    s_cselect_b32 s1, s1, s9
+; GFX942_ITERATIVE-NEXT:    s_cselect_b32 s0, s0, s10
+; GFX942_ITERATIVE-NEXT:    s_lshl_b64 s[6:7], 1, s8
+; GFX942_ITERATIVE-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[6:7]
+; GFX942_ITERATIVE-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX942_ITERATIVE-NEXT:    s_cbranch_scc1 .LBB32_1
+; GFX942_ITERATIVE-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX942_ITERATIVE-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX942_ITERATIVE-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v2
+; GFX942_ITERATIVE-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX942_ITERATIVE-NEXT:    ; implicit-def: $vgpr2_vgpr3
+; GFX942_ITERATIVE-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX942_ITERATIVE-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
+; GFX942_ITERATIVE-NEXT:    s_cbranch_execz .LBB32_4
+; GFX942_ITERATIVE-NEXT:  ; %bb.3:
+; GFX942_ITERATIVE-NEXT:    v_mov_b32_e32 v4, 0
+; GFX942_ITERATIVE-NEXT:    v_mov_b64_e32 v[2:3], s[0:1]
+; GFX942_ITERATIVE-NEXT:    ds_min_rtn_u64 v[2:3], v4, v[2:3]
+; GFX942_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942_ITERATIVE-NEXT:  .LBB32_4:
+; GFX942_ITERATIVE-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX942_ITERATIVE-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942_ITERATIVE-NEXT:    v_readfirstlane_b32 s5, v3
+; GFX942_ITERATIVE-NEXT:    v_readfirstlane_b32 s4, v2
+; GFX942_ITERATIVE-NEXT:    s_mov_b32 s3, 0xf000
+; GFX942_ITERATIVE-NEXT:    v_mov_b32_e32 v2, s5
+; GFX942_ITERATIVE-NEXT:    v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1]
+; GFX942_ITERATIVE-NEXT:    s_mov_b32 s2, -1
+; GFX942_ITERATIVE-NEXT:    s_nop 0
+; GFX942_ITERATIVE-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX942_ITERATIVE-NEXT:    v_mov_b32_e32 v2, s4
+; GFX942_ITERATIVE-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX942_ITERATIVE-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942_ITERATIVE-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX942_ITERATIVE-NEXT:    s_endpgm
+;
 ; GFX1064_ITERATIVE-LABEL: umin_i64_varying:
 ; GFX1064_ITERATIVE:       ; %bb.0: ; %entry
 ; GFX1064_ITERATIVE-NEXT:    v_mov_b32_e32 v3, 0
@@ -17207,6 +19632,100 @@ define amdgpu_kernel void @umin_i64_varying(ptr addrspace(1) %out) {
 ; GFX9_DPP-NEXT:    buffer_store_dwordx2 v[6:7], off, s[0:3], 0
 ; GFX9_DPP-NEXT:    s_endpgm
 ;
+; GFX942_DPP-LABEL: umin_i64_varying:
+; GFX942_DPP:       ; %bb.0: ; %entry
+; GFX942_DPP-NEXT:    v_mbcnt_lo_u32_b32 v9, exec_lo, 0
+; GFX942_DPP-NEXT:    v_and_b32_e32 v8, 0x3ff, v0
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942_DPP-NEXT:    v_mbcnt_hi_u32_b32 v9, exec_hi, v9
+; GFX942_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX942_DPP-NEXT:    v_cndmask_b32_e64 v5, -1, 0, s[0:1]
+; GFX942_DPP-NEXT:    v_cndmask_b32_e64 v4, -1, v8, s[0:1]
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v7, -1
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v6, -1
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v2, -1
+; GFX942_DPP-NEXT:    v_mov_b32_dpp v7, v5 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX942_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX942_DPP-NEXT:    v_cmp_lt_u64_e32 vcc, v[4:5], v[6:7]
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v3, -1
+; GFX942_DPP-NEXT:    s_nop 0
+; GFX942_DPP-NEXT:    v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX942_DPP-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v7, -1
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v6, -1
+; GFX942_DPP-NEXT:    s_nop 0
+; GFX942_DPP-NEXT:    v_mov_b32_dpp v7, v5 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX942_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX942_DPP-NEXT:    v_cmp_lt_u64_e32 vcc, v[4:5], v[6:7]
+; GFX942_DPP-NEXT:    s_nop 1
+; GFX942_DPP-NEXT:    v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX942_DPP-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v7, -1
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v6, -1
+; GFX942_DPP-NEXT:    s_nop 0
+; GFX942_DPP-NEXT:    v_mov_b32_dpp v7, v5 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX942_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX942_DPP-NEXT:    v_cmp_lt_u64_e32 vcc, v[4:5], v[6:7]
+; GFX942_DPP-NEXT:    s_nop 1
+; GFX942_DPP-NEXT:    v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX942_DPP-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v7, -1
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v6, -1
+; GFX942_DPP-NEXT:    s_nop 0
+; GFX942_DPP-NEXT:    v_mov_b32_dpp v7, v5 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX942_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX942_DPP-NEXT:    v_cmp_lt_u64_e32 vcc, v[4:5], v[6:7]
+; GFX942_DPP-NEXT:    s_nop 1
+; GFX942_DPP-NEXT:    v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX942_DPP-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v7, -1
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v6, -1
+; GFX942_DPP-NEXT:    s_nop 0
+; GFX942_DPP-NEXT:    v_mov_b32_dpp v7, v5 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX942_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX942_DPP-NEXT:    v_cmp_lt_u64_e32 vcc, v[4:5], v[6:7]
+; GFX942_DPP-NEXT:    s_nop 1
+; GFX942_DPP-NEXT:    v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX942_DPP-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v7, -1
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v6, -1
+; GFX942_DPP-NEXT:    s_nop 0
+; GFX942_DPP-NEXT:    v_mov_b32_dpp v7, v5 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX942_DPP-NEXT:    v_mov_b32_dpp v6, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX942_DPP-NEXT:    v_cmp_lt_u64_e32 vcc, v[4:5], v[6:7]
+; GFX942_DPP-NEXT:    s_nop 1
+; GFX942_DPP-NEXT:    v_cndmask_b32_e32 v1, v6, v4, vcc
+; GFX942_DPP-NEXT:    v_cndmask_b32_e32 v4, v7, v5, vcc
+; GFX942_DPP-NEXT:    v_readlane_b32 s2, v1, 63
+; GFX942_DPP-NEXT:    v_readlane_b32 s3, v4, 63
+; GFX942_DPP-NEXT:    v_mov_b32_dpp v3, v4 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX942_DPP-NEXT:    v_mov_b32_dpp v2, v1 wave_shr:1 row_mask:0xf bank_mask:0xf
+; GFX942_DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX942_DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v9
+; GFX942_DPP-NEXT:    ; implicit-def: $vgpr8_vgpr9
+; GFX942_DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942_DPP-NEXT:    s_cbranch_execz .LBB32_2
+; GFX942_DPP-NEXT:  ; %bb.1:
+; GFX942_DPP-NEXT:    v_mov_b64_e32 v[8:9], s[2:3]
+; GFX942_DPP-NEXT:    ds_min_rtn_u64 v[8:9], v0, v[8:9]
+; GFX942_DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942_DPP-NEXT:  .LBB32_2:
+; GFX942_DPP-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX942_DPP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942_DPP-NEXT:    v_readfirstlane_b32 s5, v9
+; GFX942_DPP-NEXT:    v_readfirstlane_b32 s4, v8
+; GFX942_DPP-NEXT:    v_mov_b64_e32 v[8:9], v[2:3]
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v0, s5
+; GFX942_DPP-NEXT:    v_cmp_lt_u64_e32 vcc, s[4:5], v[8:9]
+; GFX942_DPP-NEXT:    s_mov_b32 s3, 0xf000
+; GFX942_DPP-NEXT:    s_mov_b32 s2, -1
+; GFX942_DPP-NEXT:    v_cndmask_b32_e32 v9, v9, v0, vcc
+; GFX942_DPP-NEXT:    v_mov_b32_e32 v0, s4
+; GFX942_DPP-NEXT:    v_cndmask_b32_e32 v8, v8, v0, vcc
+; GFX942_DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942_DPP-NEXT:    buffer_store_dwordx2 v[8:9], off, s[0:3], 0
+; GFX942_DPP-NEXT:    s_endpgm
+;
 ; GFX1064_DPP-LABEL: umin_i64_varying:
 ; GFX1064_DPP:       ; %bb.0: ; %entry
 ; GFX1064_DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
diff --git a/llvm/test/CodeGen/AMDGPU/bypass-div.ll b/llvm/test/CodeGen/AMDGPU/bypass-div.ll
index 3cf70c42390c2..a615f32ea436c 100644
--- a/llvm/test/CodeGen/AMDGPU/bypass-div.ll
+++ b/llvm/test/CodeGen/AMDGPU/bypass-div.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 < %s | FileCheck -check-prefix=GFX942 %s
 
 ; 64-bit divides and rems should be split into a fast and slow path
 ; where the fast path uses a 32-bit operation.
@@ -151,6 +152,172 @@ define i64 @sdiv64(i64 %a, i64 %b) {
 ; GFX9-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX9-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: sdiv64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_or_b32_e32 v7, v1, v3
+; GFX942-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[6:7]
+; GFX942-NEXT:    ; implicit-def: $vgpr4_vgpr5
+; GFX942-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942-NEXT:    s_xor_b64 s[2:3], exec, s[0:1]
+; GFX942-NEXT:    s_cbranch_execz .LBB0_2
+; GFX942-NEXT:  ; %bb.1:
+; GFX942-NEXT:    v_ashrrev_i32_e32 v4, 31, v3
+; GFX942-NEXT:    v_mov_b32_e32 v5, v4
+; GFX942-NEXT:    v_lshl_add_u64 v[2:3], v[2:3], 0, v[4:5]
+; GFX942-NEXT:    v_xor_b32_e32 v5, v3, v4
+; GFX942-NEXT:    v_xor_b32_e32 v14, v2, v4
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v2, v14
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v3, v5
+; GFX942-NEXT:    v_sub_co_u32_e32 v12, vcc, 0, v14
+; GFX942-NEXT:    v_mov_b32_e32 v11, v6
+; GFX942-NEXT:    v_fmamk_f32 v2, v3, 0x4f800000, v2
+; GFX942-NEXT:    v_rcp_f32_e32 v2, v2
+; GFX942-NEXT:    v_subb_co_u32_e32 v13, vcc, 0, v5, vcc
+; GFX942-NEXT:    v_mul_f32_e32 v2, 0x5f7ffffc, v2
+; GFX942-NEXT:    v_mul_f32_e32 v3, 0x2f800000, v2
+; GFX942-NEXT:    v_trunc_f32_e32 v3, v3
+; GFX942-NEXT:    v_fmamk_f32 v2, v3, 0xcf800000, v2
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v7, v2
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v15, v3
+; GFX942-NEXT:    v_mul_lo_u32 v8, v13, v7
+; GFX942-NEXT:    v_mad_u64_u32 v[2:3], s[0:1], v12, v7, 0
+; GFX942-NEXT:    v_mul_lo_u32 v9, v12, v15
+; GFX942-NEXT:    v_add3_u32 v3, v3, v9, v8
+; GFX942-NEXT:    v_mad_u64_u32 v[8:9], s[0:1], v7, v3, 0
+; GFX942-NEXT:    v_mul_hi_u32 v10, v7, v2
+; GFX942-NEXT:    v_lshl_add_u64 v[8:9], v[10:11], 0, v[8:9]
+; GFX942-NEXT:    v_mad_u64_u32 v[10:11], s[0:1], v15, v3, 0
+; GFX942-NEXT:    v_mad_u64_u32 v[2:3], s[0:1], v15, v2, 0
+; GFX942-NEXT:    v_add_co_u32_e32 v2, vcc, v8, v2
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_addc_co_u32_e32 v2, vcc, v9, v3, vcc
+; GFX942-NEXT:    v_mov_b32_e32 v3, v6
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_addc_co_u32_e32 v11, vcc, 0, v11, vcc
+; GFX942-NEXT:    v_lshl_add_u64 v[2:3], v[2:3], 0, v[10:11]
+; GFX942-NEXT:    v_add_co_u32_e32 v7, vcc, v7, v2
+; GFX942-NEXT:    v_mul_lo_u32 v9, v13, v7
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_addc_co_u32_e32 v15, vcc, v15, v3, vcc
+; GFX942-NEXT:    v_mul_lo_u32 v8, v12, v15
+; GFX942-NEXT:    v_mad_u64_u32 v[2:3], s[0:1], v12, v7, 0
+; GFX942-NEXT:    v_add3_u32 v3, v3, v8, v9
+; GFX942-NEXT:    v_mad_u64_u32 v[8:9], s[0:1], v15, v3, 0
+; GFX942-NEXT:    v_mad_u64_u32 v[10:11], s[0:1], v15, v2, 0
+; GFX942-NEXT:    v_mad_u64_u32 v[12:13], s[0:1], v7, v3, 0
+; GFX942-NEXT:    v_mul_hi_u32 v2, v7, v2
+; GFX942-NEXT:    v_mov_b32_e32 v3, v6
+; GFX942-NEXT:    v_lshl_add_u64 v[2:3], v[2:3], 0, v[12:13]
+; GFX942-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v10
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_addc_co_u32_e32 v2, vcc, v3, v11, vcc
+; GFX942-NEXT:    v_mov_b32_e32 v3, v6
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_addc_co_u32_e32 v9, vcc, 0, v9, vcc
+; GFX942-NEXT:    v_lshl_add_u64 v[2:3], v[2:3], 0, v[8:9]
+; GFX942-NEXT:    v_add_co_u32_e32 v7, vcc, v7, v2
+; GFX942-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_addc_co_u32_e32 v10, vcc, v15, v3, vcc
+; GFX942-NEXT:    v_mov_b32_e32 v3, v2
+; GFX942-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
+; GFX942-NEXT:    v_xor_b32_e32 v12, v0, v2
+; GFX942-NEXT:    v_xor_b32_e32 v3, v1, v2
+; GFX942-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], v12, v10, 0
+; GFX942-NEXT:    v_mul_hi_u32 v8, v12, v7
+; GFX942-NEXT:    v_mov_b32_e32 v9, v6
+; GFX942-NEXT:    v_lshl_add_u64 v[0:1], v[8:9], 0, v[0:1]
+; GFX942-NEXT:    v_mad_u64_u32 v[8:9], s[0:1], v3, v10, 0
+; GFX942-NEXT:    v_mad_u64_u32 v[10:11], s[0:1], v3, v7, 0
+; GFX942-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v10
+; GFX942-NEXT:    v_xor_b32_e32 v2, v2, v4
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_addc_co_u32_e32 v0, vcc, v1, v11, vcc
+; GFX942-NEXT:    v_mov_b32_e32 v1, v6
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_addc_co_u32_e32 v9, vcc, 0, v9, vcc
+; GFX942-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, v[8:9]
+; GFX942-NEXT:    v_mul_lo_u32 v8, v5, v0
+; GFX942-NEXT:    v_mul_lo_u32 v9, v14, v1
+; GFX942-NEXT:    v_mad_u64_u32 v[6:7], s[0:1], v14, v0, 0
+; GFX942-NEXT:    v_add3_u32 v10, v7, v9, v8
+; GFX942-NEXT:    v_sub_u32_e32 v7, v3, v10
+; GFX942-NEXT:    v_sub_co_u32_e32 v11, vcc, v12, v6
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_subb_co_u32_e64 v6, s[0:1], v7, v5, vcc
+; GFX942-NEXT:    v_sub_co_u32_e64 v7, s[0:1], v11, v14
+; GFX942-NEXT:    v_subb_co_u32_e32 v3, vcc, v3, v10, vcc
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_subbrev_co_u32_e64 v6, s[0:1], 0, v6, s[0:1]
+; GFX942-NEXT:    v_cmp_ge_u32_e64 s[0:1], v6, v5
+; GFX942-NEXT:    v_cmp_ge_u32_e32 vcc, v3, v5
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[0:1]
+; GFX942-NEXT:    v_cmp_ge_u32_e64 s[0:1], v7, v14
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[0:1]
+; GFX942-NEXT:    v_cmp_eq_u32_e64 s[0:1], v6, v5
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e64 v12, v8, v7, s[0:1]
+; GFX942-NEXT:    v_lshl_add_u64 v[6:7], v[0:1], 0, 2
+; GFX942-NEXT:    v_lshl_add_u64 v[8:9], v[0:1], 0, 1
+; GFX942-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v12
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e64 v7, v9, v7, s[0:1]
+; GFX942-NEXT:    v_cndmask_b32_e64 v9, 0, -1, vcc
+; GFX942-NEXT:    v_cmp_ge_u32_e32 vcc, v11, v14
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e64 v10, 0, -1, vcc
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v3, v9, v10, vcc
+; GFX942-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
+; GFX942-NEXT:    v_cndmask_b32_e64 v3, v8, v6, s[0:1]
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
+; GFX942-NEXT:    v_cndmask_b32_e32 v1, v1, v7, vcc
+; GFX942-NEXT:    v_xor_b32_e32 v0, v0, v2
+; GFX942-NEXT:    v_xor_b32_e32 v1, v1, v2
+; GFX942-NEXT:    v_sub_co_u32_e32 v4, vcc, v0, v2
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_subb_co_u32_e32 v5, vcc, v1, v2, vcc
+; GFX942-NEXT:    ; implicit-def: $vgpr2_vgpr3
+; GFX942-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; GFX942-NEXT:  .LBB0_2: ; %Flow
+; GFX942-NEXT:    s_andn2_saveexec_b64 s[0:1], s[2:3]
+; GFX942-NEXT:    s_cbranch_execz .LBB0_4
+; GFX942-NEXT:  ; %bb.3:
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v1, v2
+; GFX942-NEXT:    v_sub_u32_e32 v3, 0, v2
+; GFX942-NEXT:    v_mov_b32_e32 v5, 0
+; GFX942-NEXT:    v_rcp_iflag_f32_e32 v1, v1
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX942-NEXT:    v_mul_lo_u32 v3, v3, v1
+; GFX942-NEXT:    v_mul_hi_u32 v3, v1, v3
+; GFX942-NEXT:    v_add_u32_e32 v1, v1, v3
+; GFX942-NEXT:    v_mul_hi_u32 v1, v0, v1
+; GFX942-NEXT:    v_mul_lo_u32 v3, v1, v2
+; GFX942-NEXT:    v_sub_u32_e32 v0, v0, v3
+; GFX942-NEXT:    v_add_u32_e32 v4, 1, v1
+; GFX942-NEXT:    v_sub_u32_e32 v3, v0, v2
+; GFX942-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
+; GFX942-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
+; GFX942-NEXT:    v_add_u32_e32 v3, 1, v1
+; GFX942-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v4, v1, v3, vcc
+; GFX942-NEXT:  .LBB0_4:
+; GFX942-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX942-NEXT:    v_mov_b32_e32 v0, v4
+; GFX942-NEXT:    v_mov_b32_e32 v1, v5
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %d = sdiv i64 %a, %b
   ret i64 %d
 }
@@ -287,6 +454,156 @@ define i64 @udiv64(i64 %a, i64 %b) {
 ; GFX9-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX9-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: udiv64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_or_b32_e32 v7, v1, v3
+; GFX942-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[6:7]
+; GFX942-NEXT:    ; implicit-def: $vgpr4_vgpr5
+; GFX942-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942-NEXT:    s_xor_b64 s[2:3], exec, s[0:1]
+; GFX942-NEXT:    s_cbranch_execz .LBB1_2
+; GFX942-NEXT:  ; %bb.1:
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v4, v2
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v5, v3
+; GFX942-NEXT:    v_sub_co_u32_e32 v13, vcc, 0, v2
+; GFX942-NEXT:    v_mov_b32_e32 v11, v6
+; GFX942-NEXT:    v_fmamk_f32 v4, v5, 0x4f800000, v4
+; GFX942-NEXT:    v_rcp_f32_e32 v4, v4
+; GFX942-NEXT:    v_subb_co_u32_e32 v14, vcc, 0, v3, vcc
+; GFX942-NEXT:    v_mul_f32_e32 v4, 0x5f7ffffc, v4
+; GFX942-NEXT:    v_mul_f32_e32 v5, 0x2f800000, v4
+; GFX942-NEXT:    v_trunc_f32_e32 v5, v5
+; GFX942-NEXT:    v_fmamk_f32 v4, v5, 0xcf800000, v4
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v7, v5
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v12, v4
+; GFX942-NEXT:    v_mul_lo_u32 v8, v13, v7
+; GFX942-NEXT:    v_mul_lo_u32 v9, v14, v12
+; GFX942-NEXT:    v_mad_u64_u32 v[4:5], s[0:1], v13, v12, 0
+; GFX942-NEXT:    v_add3_u32 v5, v5, v8, v9
+; GFX942-NEXT:    v_mad_u64_u32 v[8:9], s[0:1], v12, v5, 0
+; GFX942-NEXT:    v_mul_hi_u32 v10, v12, v4
+; GFX942-NEXT:    v_lshl_add_u64 v[8:9], v[10:11], 0, v[8:9]
+; GFX942-NEXT:    v_mad_u64_u32 v[10:11], s[0:1], v7, v5, 0
+; GFX942-NEXT:    v_mad_u64_u32 v[4:5], s[0:1], v7, v4, 0
+; GFX942-NEXT:    v_add_co_u32_e32 v4, vcc, v8, v4
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_addc_co_u32_e32 v4, vcc, v9, v5, vcc
+; GFX942-NEXT:    v_mov_b32_e32 v5, v6
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_addc_co_u32_e32 v11, vcc, 0, v11, vcc
+; GFX942-NEXT:    v_lshl_add_u64 v[4:5], v[4:5], 0, v[10:11]
+; GFX942-NEXT:    v_add_co_u32_e32 v15, vcc, v12, v4
+; GFX942-NEXT:    v_mul_lo_u32 v9, v14, v15
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_addc_co_u32_e32 v7, vcc, v7, v5, vcc
+; GFX942-NEXT:    v_mul_lo_u32 v8, v13, v7
+; GFX942-NEXT:    v_mad_u64_u32 v[4:5], s[0:1], v13, v15, 0
+; GFX942-NEXT:    v_add3_u32 v5, v5, v8, v9
+; GFX942-NEXT:    v_mad_u64_u32 v[8:9], s[0:1], v7, v5, 0
+; GFX942-NEXT:    v_mad_u64_u32 v[10:11], s[0:1], v7, v4, 0
+; GFX942-NEXT:    v_mad_u64_u32 v[12:13], s[0:1], v15, v5, 0
+; GFX942-NEXT:    v_mul_hi_u32 v4, v15, v4
+; GFX942-NEXT:    v_mov_b32_e32 v5, v6
+; GFX942-NEXT:    v_lshl_add_u64 v[4:5], v[4:5], 0, v[12:13]
+; GFX942-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v10
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_addc_co_u32_e32 v4, vcc, v5, v11, vcc
+; GFX942-NEXT:    v_mov_b32_e32 v5, v6
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_addc_co_u32_e32 v9, vcc, 0, v9, vcc
+; GFX942-NEXT:    v_lshl_add_u64 v[4:5], v[4:5], 0, v[8:9]
+; GFX942-NEXT:    v_add_co_u32_e32 v10, vcc, v15, v4
+; GFX942-NEXT:    v_mul_hi_u32 v8, v0, v10
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_addc_co_u32_e32 v7, vcc, v7, v5, vcc
+; GFX942-NEXT:    v_mad_u64_u32 v[4:5], s[0:1], v0, v7, 0
+; GFX942-NEXT:    v_mov_b32_e32 v9, v6
+; GFX942-NEXT:    v_lshl_add_u64 v[4:5], v[8:9], 0, v[4:5]
+; GFX942-NEXT:    v_mad_u64_u32 v[10:11], s[0:1], v1, v10, 0
+; GFX942-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v10
+; GFX942-NEXT:    v_mad_u64_u32 v[8:9], s[0:1], v1, v7, 0
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_addc_co_u32_e32 v4, vcc, v5, v11, vcc
+; GFX942-NEXT:    v_mov_b32_e32 v5, v6
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_addc_co_u32_e32 v9, vcc, 0, v9, vcc
+; GFX942-NEXT:    v_lshl_add_u64 v[4:5], v[4:5], 0, v[8:9]
+; GFX942-NEXT:    v_mul_lo_u32 v8, v3, v4
+; GFX942-NEXT:    v_mul_lo_u32 v9, v2, v5
+; GFX942-NEXT:    v_mad_u64_u32 v[6:7], s[0:1], v2, v4, 0
+; GFX942-NEXT:    v_add3_u32 v10, v7, v9, v8
+; GFX942-NEXT:    v_sub_u32_e32 v7, v1, v10
+; GFX942-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v6
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_subb_co_u32_e64 v6, s[0:1], v7, v3, vcc
+; GFX942-NEXT:    v_sub_co_u32_e64 v7, s[0:1], v0, v2
+; GFX942-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v10, vcc
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_subbrev_co_u32_e64 v6, s[0:1], 0, v6, s[0:1]
+; GFX942-NEXT:    v_cmp_ge_u32_e64 s[0:1], v6, v3
+; GFX942-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v3
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[0:1]
+; GFX942-NEXT:    v_cmp_ge_u32_e64 s[0:1], v7, v2
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[0:1]
+; GFX942-NEXT:    v_cmp_eq_u32_e64 s[0:1], v6, v3
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e64 v11, v8, v7, s[0:1]
+; GFX942-NEXT:    v_lshl_add_u64 v[6:7], v[4:5], 0, 2
+; GFX942-NEXT:    v_lshl_add_u64 v[8:9], v[4:5], 0, 1
+; GFX942-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v11
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e64 v7, v9, v7, s[0:1]
+; GFX942-NEXT:    v_cndmask_b32_e64 v9, 0, -1, vcc
+; GFX942-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v3
+; GFX942-NEXT:    ; implicit-def: $vgpr2_vgpr3
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, v9, v0, vcc
+; GFX942-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
+; GFX942-NEXT:    v_cndmask_b32_e64 v0, v8, v6, s[0:1]
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc
+; GFX942-NEXT:    v_cndmask_b32_e32 v4, v4, v0, vcc
+; GFX942-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; GFX942-NEXT:  .LBB1_2: ; %Flow
+; GFX942-NEXT:    s_andn2_saveexec_b64 s[0:1], s[2:3]
+; GFX942-NEXT:    s_cbranch_execz .LBB1_4
+; GFX942-NEXT:  ; %bb.3:
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v1, v2
+; GFX942-NEXT:    v_sub_u32_e32 v3, 0, v2
+; GFX942-NEXT:    v_mov_b32_e32 v5, 0
+; GFX942-NEXT:    v_rcp_iflag_f32_e32 v1, v1
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX942-NEXT:    v_mul_lo_u32 v3, v3, v1
+; GFX942-NEXT:    v_mul_hi_u32 v3, v1, v3
+; GFX942-NEXT:    v_add_u32_e32 v1, v1, v3
+; GFX942-NEXT:    v_mul_hi_u32 v1, v0, v1
+; GFX942-NEXT:    v_mul_lo_u32 v3, v1, v2
+; GFX942-NEXT:    v_sub_u32_e32 v0, v0, v3
+; GFX942-NEXT:    v_add_u32_e32 v4, 1, v1
+; GFX942-NEXT:    v_sub_u32_e32 v3, v0, v2
+; GFX942-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
+; GFX942-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
+; GFX942-NEXT:    v_add_u32_e32 v3, 1, v1
+; GFX942-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v4, v1, v3, vcc
+; GFX942-NEXT:  .LBB1_4:
+; GFX942-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX942-NEXT:    v_mov_b32_e32 v0, v4
+; GFX942-NEXT:    v_mov_b32_e32 v1, v5
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %d = udiv i64 %a, %b
   ret i64 %d
 }
@@ -434,6 +751,169 @@ define i64 @srem64(i64 %a, i64 %b) {
 ; GFX9-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX9-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: srem64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_or_b32_e32 v7, v1, v3
+; GFX942-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[6:7]
+; GFX942-NEXT:    ; implicit-def: $vgpr4_vgpr5
+; GFX942-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942-NEXT:    s_xor_b64 s[4:5], exec, s[0:1]
+; GFX942-NEXT:    s_cbranch_execz .LBB2_2
+; GFX942-NEXT:  ; %bb.1:
+; GFX942-NEXT:    v_ashrrev_i32_e32 v4, 31, v3
+; GFX942-NEXT:    v_mov_b32_e32 v5, v4
+; GFX942-NEXT:    v_lshl_add_u64 v[2:3], v[2:3], 0, v[4:5]
+; GFX942-NEXT:    v_xor_b32_e32 v7, v3, v4
+; GFX942-NEXT:    v_xor_b32_e32 v12, v2, v4
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v2, v12
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v3, v7
+; GFX942-NEXT:    v_sub_co_u32_e32 v11, vcc, 0, v12
+; GFX942-NEXT:    v_mov_b32_e32 v9, v6
+; GFX942-NEXT:    v_fmamk_f32 v2, v3, 0x4f800000, v2
+; GFX942-NEXT:    v_rcp_f32_e32 v2, v2
+; GFX942-NEXT:    v_subb_co_u32_e32 v13, vcc, 0, v7, vcc
+; GFX942-NEXT:    v_mul_f32_e32 v2, 0x5f7ffffc, v2
+; GFX942-NEXT:    v_mul_f32_e32 v3, 0x2f800000, v2
+; GFX942-NEXT:    v_trunc_f32_e32 v3, v3
+; GFX942-NEXT:    v_fmamk_f32 v2, v3, 0xcf800000, v2
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v10, v2
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v14, v3
+; GFX942-NEXT:    v_mul_lo_u32 v4, v13, v10
+; GFX942-NEXT:    v_mad_u64_u32 v[2:3], s[0:1], v11, v10, 0
+; GFX942-NEXT:    v_mul_lo_u32 v5, v11, v14
+; GFX942-NEXT:    v_add3_u32 v3, v3, v5, v4
+; GFX942-NEXT:    v_mad_u64_u32 v[4:5], s[0:1], v10, v3, 0
+; GFX942-NEXT:    v_mul_hi_u32 v8, v10, v2
+; GFX942-NEXT:    v_lshl_add_u64 v[4:5], v[8:9], 0, v[4:5]
+; GFX942-NEXT:    v_mad_u64_u32 v[8:9], s[0:1], v14, v3, 0
+; GFX942-NEXT:    v_mad_u64_u32 v[2:3], s[0:1], v14, v2, 0
+; GFX942-NEXT:    v_add_co_u32_e32 v2, vcc, v4, v2
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_addc_co_u32_e32 v2, vcc, v5, v3, vcc
+; GFX942-NEXT:    v_mov_b32_e32 v3, v6
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_addc_co_u32_e32 v9, vcc, 0, v9, vcc
+; GFX942-NEXT:    v_lshl_add_u64 v[2:3], v[2:3], 0, v[8:9]
+; GFX942-NEXT:    v_add_co_u32_e32 v15, vcc, v10, v2
+; GFX942-NEXT:    v_mul_lo_u32 v5, v13, v15
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_addc_co_u32_e32 v14, vcc, v14, v3, vcc
+; GFX942-NEXT:    v_mul_lo_u32 v4, v11, v14
+; GFX942-NEXT:    v_mad_u64_u32 v[2:3], s[0:1], v11, v15, 0
+; GFX942-NEXT:    v_add3_u32 v3, v3, v4, v5
+; GFX942-NEXT:    v_mad_u64_u32 v[4:5], s[0:1], v14, v3, 0
+; GFX942-NEXT:    v_mad_u64_u32 v[8:9], s[0:1], v14, v2, 0
+; GFX942-NEXT:    v_mad_u64_u32 v[10:11], s[0:1], v15, v3, 0
+; GFX942-NEXT:    v_mul_hi_u32 v2, v15, v2
+; GFX942-NEXT:    v_mov_b32_e32 v3, v6
+; GFX942-NEXT:    v_lshl_add_u64 v[2:3], v[2:3], 0, v[10:11]
+; GFX942-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v8
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_addc_co_u32_e32 v2, vcc, v3, v9, vcc
+; GFX942-NEXT:    v_mov_b32_e32 v3, v6
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
+; GFX942-NEXT:    v_lshl_add_u64 v[2:3], v[2:3], 0, v[4:5]
+; GFX942-NEXT:    v_add_co_u32_e32 v8, vcc, v15, v2
+; GFX942-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_addc_co_u32_e32 v9, vcc, v14, v3, vcc
+; GFX942-NEXT:    v_mov_b32_e32 v3, v2
+; GFX942-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
+; GFX942-NEXT:    v_xor_b32_e32 v10, v0, v2
+; GFX942-NEXT:    v_xor_b32_e32 v3, v1, v2
+; GFX942-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], v10, v9, 0
+; GFX942-NEXT:    v_mul_hi_u32 v4, v10, v8
+; GFX942-NEXT:    v_mov_b32_e32 v5, v6
+; GFX942-NEXT:    v_lshl_add_u64 v[0:1], v[4:5], 0, v[0:1]
+; GFX942-NEXT:    v_mad_u64_u32 v[4:5], s[0:1], v3, v9, 0
+; GFX942-NEXT:    v_mad_u64_u32 v[8:9], s[0:1], v3, v8, 0
+; GFX942-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v8
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_addc_co_u32_e32 v0, vcc, v1, v9, vcc
+; GFX942-NEXT:    v_mov_b32_e32 v1, v6
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
+; GFX942-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, v[4:5]
+; GFX942-NEXT:    v_mul_lo_u32 v4, v7, v0
+; GFX942-NEXT:    v_mul_lo_u32 v5, v12, v1
+; GFX942-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], v12, v0, 0
+; GFX942-NEXT:    v_add3_u32 v1, v1, v5, v4
+; GFX942-NEXT:    v_sub_u32_e32 v4, v3, v1
+; GFX942-NEXT:    v_sub_co_u32_e32 v0, vcc, v10, v0
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_subb_co_u32_e64 v4, s[0:1], v4, v7, vcc
+; GFX942-NEXT:    v_sub_co_u32_e64 v5, s[0:1], v0, v12
+; GFX942-NEXT:    v_subb_co_u32_e32 v1, vcc, v3, v1, vcc
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_subbrev_co_u32_e64 v6, s[2:3], 0, v4, s[0:1]
+; GFX942-NEXT:    v_cmp_ge_u32_e64 s[2:3], v6, v7
+; GFX942-NEXT:    v_subb_co_u32_e64 v4, s[0:1], v4, v7, s[0:1]
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[2:3]
+; GFX942-NEXT:    v_cmp_ge_u32_e64 s[2:3], v5, v12
+; GFX942-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v7
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s[2:3]
+; GFX942-NEXT:    v_cmp_eq_u32_e64 s[2:3], v6, v7
+; GFX942-NEXT:    v_cndmask_b32_e64 v3, 0, -1, vcc
+; GFX942-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v12
+; GFX942-NEXT:    v_cndmask_b32_e64 v8, v8, v9, s[2:3]
+; GFX942-NEXT:    v_sub_co_u32_e64 v9, s[0:1], v5, v12
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_subbrev_co_u32_e64 v4, s[0:1], 0, v4, s[0:1]
+; GFX942-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v8
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e64 v4, v6, v4, s[0:1]
+; GFX942-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v7
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc
+; GFX942-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
+; GFX942-NEXT:    v_cndmask_b32_e64 v3, v5, v9, s[0:1]
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
+; GFX942-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
+; GFX942-NEXT:    v_xor_b32_e32 v0, v0, v2
+; GFX942-NEXT:    v_xor_b32_e32 v1, v1, v2
+; GFX942-NEXT:    v_sub_co_u32_e32 v4, vcc, v0, v2
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_subb_co_u32_e32 v5, vcc, v1, v2, vcc
+; GFX942-NEXT:    ; implicit-def: $vgpr2_vgpr3
+; GFX942-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; GFX942-NEXT:  .LBB2_2: ; %Flow
+; GFX942-NEXT:    s_andn2_saveexec_b64 s[0:1], s[4:5]
+; GFX942-NEXT:    s_cbranch_execz .LBB2_4
+; GFX942-NEXT:  ; %bb.3:
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v1, v2
+; GFX942-NEXT:    v_sub_u32_e32 v3, 0, v2
+; GFX942-NEXT:    v_mov_b32_e32 v5, 0
+; GFX942-NEXT:    v_rcp_iflag_f32_e32 v1, v1
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX942-NEXT:    v_mul_lo_u32 v3, v3, v1
+; GFX942-NEXT:    v_mul_hi_u32 v3, v1, v3
+; GFX942-NEXT:    v_add_u32_e32 v1, v1, v3
+; GFX942-NEXT:    v_mul_hi_u32 v1, v0, v1
+; GFX942-NEXT:    v_mul_lo_u32 v1, v1, v2
+; GFX942-NEXT:    v_sub_u32_e32 v0, v0, v1
+; GFX942-NEXT:    v_sub_u32_e32 v1, v0, v2
+; GFX942-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX942-NEXT:    v_sub_u32_e32 v1, v0, v2
+; GFX942-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v4, v0, v1, vcc
+; GFX942-NEXT:  .LBB2_4:
+; GFX942-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX942-NEXT:    v_mov_b32_e32 v0, v4
+; GFX942-NEXT:    v_mov_b32_e32 v1, v5
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %d = srem i64 %a, %b
   ret i64 %d
 }
@@ -567,6 +1047,153 @@ define i64 @urem64(i64 %a, i64 %b) {
 ; GFX9-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX9-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: urem64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_or_b32_e32 v7, v1, v3
+; GFX942-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[6:7]
+; GFX942-NEXT:    ; implicit-def: $vgpr4_vgpr5
+; GFX942-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942-NEXT:    s_xor_b64 s[4:5], exec, s[0:1]
+; GFX942-NEXT:    s_cbranch_execz .LBB3_2
+; GFX942-NEXT:  ; %bb.1:
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v4, v2
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v5, v3
+; GFX942-NEXT:    v_sub_co_u32_e32 v13, vcc, 0, v2
+; GFX942-NEXT:    v_mov_b32_e32 v11, v6
+; GFX942-NEXT:    v_fmamk_f32 v4, v5, 0x4f800000, v4
+; GFX942-NEXT:    v_rcp_f32_e32 v4, v4
+; GFX942-NEXT:    v_subb_co_u32_e32 v14, vcc, 0, v3, vcc
+; GFX942-NEXT:    v_mul_f32_e32 v4, 0x5f7ffffc, v4
+; GFX942-NEXT:    v_mul_f32_e32 v5, 0x2f800000, v4
+; GFX942-NEXT:    v_trunc_f32_e32 v5, v5
+; GFX942-NEXT:    v_fmamk_f32 v4, v5, 0xcf800000, v4
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v7, v5
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v12, v4
+; GFX942-NEXT:    v_mul_lo_u32 v8, v13, v7
+; GFX942-NEXT:    v_mul_lo_u32 v9, v14, v12
+; GFX942-NEXT:    v_mad_u64_u32 v[4:5], s[0:1], v13, v12, 0
+; GFX942-NEXT:    v_add3_u32 v5, v5, v8, v9
+; GFX942-NEXT:    v_mad_u64_u32 v[8:9], s[0:1], v12, v5, 0
+; GFX942-NEXT:    v_mul_hi_u32 v10, v12, v4
+; GFX942-NEXT:    v_lshl_add_u64 v[8:9], v[10:11], 0, v[8:9]
+; GFX942-NEXT:    v_mad_u64_u32 v[10:11], s[0:1], v7, v5, 0
+; GFX942-NEXT:    v_mad_u64_u32 v[4:5], s[0:1], v7, v4, 0
+; GFX942-NEXT:    v_add_co_u32_e32 v4, vcc, v8, v4
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_addc_co_u32_e32 v4, vcc, v9, v5, vcc
+; GFX942-NEXT:    v_mov_b32_e32 v5, v6
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_addc_co_u32_e32 v11, vcc, 0, v11, vcc
+; GFX942-NEXT:    v_lshl_add_u64 v[4:5], v[4:5], 0, v[10:11]
+; GFX942-NEXT:    v_add_co_u32_e32 v15, vcc, v12, v4
+; GFX942-NEXT:    v_mul_lo_u32 v9, v14, v15
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_addc_co_u32_e32 v7, vcc, v7, v5, vcc
+; GFX942-NEXT:    v_mul_lo_u32 v8, v13, v7
+; GFX942-NEXT:    v_mad_u64_u32 v[4:5], s[0:1], v13, v15, 0
+; GFX942-NEXT:    v_add3_u32 v5, v5, v8, v9
+; GFX942-NEXT:    v_mad_u64_u32 v[8:9], s[0:1], v7, v5, 0
+; GFX942-NEXT:    v_mad_u64_u32 v[10:11], s[0:1], v7, v4, 0
+; GFX942-NEXT:    v_mad_u64_u32 v[12:13], s[0:1], v15, v5, 0
+; GFX942-NEXT:    v_mul_hi_u32 v4, v15, v4
+; GFX942-NEXT:    v_mov_b32_e32 v5, v6
+; GFX942-NEXT:    v_lshl_add_u64 v[4:5], v[4:5], 0, v[12:13]
+; GFX942-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v10
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_addc_co_u32_e32 v4, vcc, v5, v11, vcc
+; GFX942-NEXT:    v_mov_b32_e32 v5, v6
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_addc_co_u32_e32 v9, vcc, 0, v9, vcc
+; GFX942-NEXT:    v_lshl_add_u64 v[4:5], v[4:5], 0, v[8:9]
+; GFX942-NEXT:    v_add_co_u32_e32 v10, vcc, v15, v4
+; GFX942-NEXT:    v_mul_hi_u32 v8, v0, v10
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_addc_co_u32_e32 v7, vcc, v7, v5, vcc
+; GFX942-NEXT:    v_mad_u64_u32 v[4:5], s[0:1], v0, v7, 0
+; GFX942-NEXT:    v_mov_b32_e32 v9, v6
+; GFX942-NEXT:    v_lshl_add_u64 v[4:5], v[8:9], 0, v[4:5]
+; GFX942-NEXT:    v_mad_u64_u32 v[10:11], s[0:1], v1, v10, 0
+; GFX942-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v10
+; GFX942-NEXT:    v_mad_u64_u32 v[8:9], s[0:1], v1, v7, 0
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_addc_co_u32_e32 v4, vcc, v5, v11, vcc
+; GFX942-NEXT:    v_mov_b32_e32 v5, v6
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_addc_co_u32_e32 v9, vcc, 0, v9, vcc
+; GFX942-NEXT:    v_lshl_add_u64 v[4:5], v[4:5], 0, v[8:9]
+; GFX942-NEXT:    v_mul_lo_u32 v6, v3, v4
+; GFX942-NEXT:    v_mul_lo_u32 v7, v2, v5
+; GFX942-NEXT:    v_mad_u64_u32 v[4:5], s[0:1], v2, v4, 0
+; GFX942-NEXT:    v_add3_u32 v5, v5, v7, v6
+; GFX942-NEXT:    v_sub_u32_e32 v6, v1, v5
+; GFX942-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v4
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_subb_co_u32_e64 v4, s[0:1], v6, v3, vcc
+; GFX942-NEXT:    v_sub_co_u32_e64 v6, s[0:1], v0, v2
+; GFX942-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v5, vcc
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_subbrev_co_u32_e64 v7, s[2:3], 0, v4, s[0:1]
+; GFX942-NEXT:    v_cmp_ge_u32_e64 s[2:3], v7, v3
+; GFX942-NEXT:    v_subb_co_u32_e64 v4, s[0:1], v4, v3, s[0:1]
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[2:3]
+; GFX942-NEXT:    v_cmp_ge_u32_e64 s[2:3], v6, v2
+; GFX942-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v3
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s[2:3]
+; GFX942-NEXT:    v_cmp_eq_u32_e64 s[2:3], v7, v3
+; GFX942-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
+; GFX942-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
+; GFX942-NEXT:    v_cndmask_b32_e64 v8, v8, v9, s[2:3]
+; GFX942-NEXT:    v_sub_co_u32_e64 v9, s[0:1], v6, v2
+; GFX942-NEXT:    v_cndmask_b32_e64 v2, 0, -1, vcc
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_subbrev_co_u32_e64 v4, s[0:1], 0, v4, s[0:1]
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v3
+; GFX942-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v8
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_cndmask_b32_e32 v2, v5, v2, vcc
+; GFX942-NEXT:    v_cndmask_b32_e64 v4, v7, v4, s[0:1]
+; GFX942-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
+; GFX942-NEXT:    ; implicit-def: $vgpr2_vgpr3
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v5, v1, v4, vcc
+; GFX942-NEXT:    v_cndmask_b32_e64 v1, v6, v9, s[0:1]
+; GFX942-NEXT:    v_cndmask_b32_e32 v4, v0, v1, vcc
+; GFX942-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; GFX942-NEXT:  .LBB3_2: ; %Flow
+; GFX942-NEXT:    s_andn2_saveexec_b64 s[0:1], s[4:5]
+; GFX942-NEXT:    s_cbranch_execz .LBB3_4
+; GFX942-NEXT:  ; %bb.3:
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v1, v2
+; GFX942-NEXT:    v_sub_u32_e32 v3, 0, v2
+; GFX942-NEXT:    v_mov_b32_e32 v5, 0
+; GFX942-NEXT:    v_rcp_iflag_f32_e32 v1, v1
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX942-NEXT:    v_mul_lo_u32 v3, v3, v1
+; GFX942-NEXT:    v_mul_hi_u32 v3, v1, v3
+; GFX942-NEXT:    v_add_u32_e32 v1, v1, v3
+; GFX942-NEXT:    v_mul_hi_u32 v1, v0, v1
+; GFX942-NEXT:    v_mul_lo_u32 v1, v1, v2
+; GFX942-NEXT:    v_sub_u32_e32 v0, v0, v1
+; GFX942-NEXT:    v_sub_u32_e32 v1, v0, v2
+; GFX942-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX942-NEXT:    v_sub_u32_e32 v1, v0, v2
+; GFX942-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v4, v0, v1, vcc
+; GFX942-NEXT:  .LBB3_4:
+; GFX942-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX942-NEXT:    v_mov_b32_e32 v0, v4
+; GFX942-NEXT:    v_mov_b32_e32 v1, v5
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %d = urem i64 %a, %b
   ret i64 %d
 }
@@ -603,6 +1230,40 @@ define i32 @sdiv32(i32 %a, i32 %b) {
 ; GFX9-NEXT:    v_xor_b32_e32 v1, v1, v0
 ; GFX9-NEXT:    v_sub_u32_e32 v0, v1, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: sdiv32:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_sub_u32_e32 v2, 0, v1
+; GFX942-NEXT:    v_max_i32_e32 v2, v1, v2
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v3, v2
+; GFX942-NEXT:    v_sub_u32_e32 v4, 0, v0
+; GFX942-NEXT:    v_xor_b32_e32 v1, v0, v1
+; GFX942-NEXT:    v_max_i32_e32 v0, v0, v4
+; GFX942-NEXT:    v_rcp_iflag_f32_e32 v3, v3
+; GFX942-NEXT:    v_sub_u32_e32 v4, 0, v2
+; GFX942-NEXT:    v_ashrrev_i32_e32 v1, 31, v1
+; GFX942-NEXT:    v_mul_f32_e32 v3, 0x4f7ffffe, v3
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v3, v3
+; GFX942-NEXT:    v_mul_lo_u32 v4, v4, v3
+; GFX942-NEXT:    v_mul_hi_u32 v4, v3, v4
+; GFX942-NEXT:    v_add_u32_e32 v3, v3, v4
+; GFX942-NEXT:    v_mul_hi_u32 v3, v0, v3
+; GFX942-NEXT:    v_mul_lo_u32 v4, v3, v2
+; GFX942-NEXT:    v_sub_u32_e32 v0, v0, v4
+; GFX942-NEXT:    v_add_u32_e32 v5, 1, v3
+; GFX942-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
+; GFX942-NEXT:    v_sub_u32_e32 v4, v0, v2
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
+; GFX942-NEXT:    v_add_u32_e32 v4, 1, v3
+; GFX942-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, v3, v4, vcc
+; GFX942-NEXT:    v_xor_b32_e32 v0, v0, v1
+; GFX942-NEXT:    v_sub_u32_e32 v0, v0, v1
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %d = sdiv i32 %a, %b
   ret i32 %d
 }
@@ -631,6 +1292,33 @@ define i32 @udiv32(i32 %a, i32 %b) {
 ; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: udiv32:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v2, v1
+; GFX942-NEXT:    v_sub_u32_e32 v3, 0, v1
+; GFX942-NEXT:    v_rcp_iflag_f32_e32 v2, v2
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v2
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; GFX942-NEXT:    v_mul_lo_u32 v3, v3, v2
+; GFX942-NEXT:    v_mul_hi_u32 v3, v2, v3
+; GFX942-NEXT:    v_add_u32_e32 v2, v2, v3
+; GFX942-NEXT:    v_mul_hi_u32 v2, v0, v2
+; GFX942-NEXT:    v_mul_lo_u32 v3, v2, v1
+; GFX942-NEXT:    v_sub_u32_e32 v0, v0, v3
+; GFX942-NEXT:    v_add_u32_e32 v4, 1, v2
+; GFX942-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v1
+; GFX942-NEXT:    v_sub_u32_e32 v3, v0, v1
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
+; GFX942-NEXT:    v_add_u32_e32 v3, 1, v2
+; GFX942-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v1
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %d = udiv i32 %a, %b
   ret i32 %d
 }
@@ -664,6 +1352,37 @@ define i32 @srem32(i32 %a, i32 %b) {
 ; GFX9-NEXT:    v_xor_b32_e32 v1, v1, v0
 ; GFX9-NEXT:    v_sub_u32_e32 v0, v1, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: srem32:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_sub_u32_e32 v2, 0, v1
+; GFX942-NEXT:    v_max_i32_e32 v1, v1, v2
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v2, v1
+; GFX942-NEXT:    v_sub_u32_e32 v4, 0, v0
+; GFX942-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
+; GFX942-NEXT:    v_max_i32_e32 v0, v0, v4
+; GFX942-NEXT:    v_rcp_iflag_f32_e32 v2, v2
+; GFX942-NEXT:    v_sub_u32_e32 v4, 0, v1
+; GFX942-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v2
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; GFX942-NEXT:    v_mul_lo_u32 v4, v4, v2
+; GFX942-NEXT:    v_mul_hi_u32 v4, v2, v4
+; GFX942-NEXT:    v_add_u32_e32 v2, v2, v4
+; GFX942-NEXT:    v_mul_hi_u32 v2, v0, v2
+; GFX942-NEXT:    v_mul_lo_u32 v2, v2, v1
+; GFX942-NEXT:    v_sub_u32_e32 v0, v0, v2
+; GFX942-NEXT:    v_sub_u32_e32 v2, v0, v1
+; GFX942-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v1
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX942-NEXT:    v_sub_u32_e32 v2, v0, v1
+; GFX942-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v1
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX942-NEXT:    v_xor_b32_e32 v0, v0, v3
+; GFX942-NEXT:    v_sub_u32_e32 v0, v0, v3
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %d = srem i32 %a, %b
   ret i32 %d
 }
@@ -690,6 +1409,31 @@ define i32 @urem32(i32 %a, i32 %b) {
 ; GFX9-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: urem32:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v2, v1
+; GFX942-NEXT:    v_sub_u32_e32 v3, 0, v1
+; GFX942-NEXT:    v_rcp_iflag_f32_e32 v2, v2
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v2
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; GFX942-NEXT:    v_mul_lo_u32 v3, v3, v2
+; GFX942-NEXT:    v_mul_hi_u32 v3, v2, v3
+; GFX942-NEXT:    v_add_u32_e32 v2, v2, v3
+; GFX942-NEXT:    v_mul_hi_u32 v2, v0, v2
+; GFX942-NEXT:    v_mul_lo_u32 v2, v2, v1
+; GFX942-NEXT:    v_sub_u32_e32 v0, v0, v2
+; GFX942-NEXT:    v_sub_u32_e32 v2, v0, v1
+; GFX942-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v1
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX942-NEXT:    v_sub_u32_e32 v2, v0, v1
+; GFX942-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v1
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %d = urem i32 %a, %b
   ret i32 %d
 }
@@ -858,6 +1602,189 @@ define <2 x i64> @sdivrem64(i64 %a, i64 %b) {
 ; GFX9-NEXT:    v_mov_b32_e32 v2, v6
 ; GFX9-NEXT:    v_mov_b32_e32 v3, v7
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: sdivrem64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_or_b32_e32 v9, v1, v3
+; GFX942-NEXT:    v_mov_b32_e32 v8, 0
+; GFX942-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[8:9]
+; GFX942-NEXT:    ; implicit-def: $vgpr6_vgpr7
+; GFX942-NEXT:    ; implicit-def: $vgpr4_vgpr5
+; GFX942-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942-NEXT:    s_xor_b64 s[6:7], exec, s[0:1]
+; GFX942-NEXT:    s_cbranch_execz .LBB8_2
+; GFX942-NEXT:  ; %bb.1:
+; GFX942-NEXT:    v_ashrrev_i32_e32 v4, 31, v3
+; GFX942-NEXT:    v_mov_b32_e32 v5, v4
+; GFX942-NEXT:    v_lshl_add_u64 v[2:3], v[2:3], 0, v[4:5]
+; GFX942-NEXT:    v_xor_b32_e32 v14, v3, v4
+; GFX942-NEXT:    v_xor_b32_e32 v15, v2, v4
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v2, v15
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v3, v14
+; GFX942-NEXT:    v_sub_co_u32_e32 v9, vcc, 0, v15
+; GFX942-NEXT:    v_mov_b32_e32 v11, v8
+; GFX942-NEXT:    v_fmamk_f32 v2, v3, 0x4f800000, v2
+; GFX942-NEXT:    v_rcp_f32_e32 v2, v2
+; GFX942-NEXT:    v_subb_co_u32_e32 v12, vcc, 0, v14, vcc
+; GFX942-NEXT:    v_mul_f32_e32 v2, 0x5f7ffffc, v2
+; GFX942-NEXT:    v_mul_f32_e32 v3, 0x2f800000, v2
+; GFX942-NEXT:    v_trunc_f32_e32 v3, v3
+; GFX942-NEXT:    v_fmamk_f32 v2, v3, 0xcf800000, v2
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v5, v2
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v13, v3
+; GFX942-NEXT:    v_mul_lo_u32 v6, v12, v5
+; GFX942-NEXT:    v_mad_u64_u32 v[2:3], s[0:1], v9, v5, 0
+; GFX942-NEXT:    v_mul_lo_u32 v7, v9, v13
+; GFX942-NEXT:    v_add3_u32 v3, v3, v7, v6
+; GFX942-NEXT:    v_mad_u64_u32 v[6:7], s[0:1], v5, v3, 0
+; GFX942-NEXT:    v_mul_hi_u32 v10, v5, v2
+; GFX942-NEXT:    v_lshl_add_u64 v[6:7], v[10:11], 0, v[6:7]
+; GFX942-NEXT:    v_mad_u64_u32 v[10:11], s[0:1], v13, v3, 0
+; GFX942-NEXT:    v_mad_u64_u32 v[2:3], s[0:1], v13, v2, 0
+; GFX942-NEXT:    v_add_co_u32_e32 v2, vcc, v6, v2
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_addc_co_u32_e32 v2, vcc, v7, v3, vcc
+; GFX942-NEXT:    v_mov_b32_e32 v3, v8
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_addc_co_u32_e32 v11, vcc, 0, v11, vcc
+; GFX942-NEXT:    v_lshl_add_u64 v[2:3], v[2:3], 0, v[10:11]
+; GFX942-NEXT:    v_add_co_u32_e32 v5, vcc, v5, v2
+; GFX942-NEXT:    v_mul_lo_u32 v7, v12, v5
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_addc_co_u32_e32 v16, vcc, v13, v3, vcc
+; GFX942-NEXT:    v_mul_lo_u32 v6, v9, v16
+; GFX942-NEXT:    v_mad_u64_u32 v[2:3], s[0:1], v9, v5, 0
+; GFX942-NEXT:    v_add3_u32 v3, v3, v6, v7
+; GFX942-NEXT:    v_mad_u64_u32 v[6:7], s[0:1], v16, v3, 0
+; GFX942-NEXT:    v_mad_u64_u32 v[10:11], s[0:1], v16, v2, 0
+; GFX942-NEXT:    v_mad_u64_u32 v[12:13], s[0:1], v5, v3, 0
+; GFX942-NEXT:    v_mul_hi_u32 v2, v5, v2
+; GFX942-NEXT:    v_mov_b32_e32 v3, v8
+; GFX942-NEXT:    v_lshl_add_u64 v[2:3], v[2:3], 0, v[12:13]
+; GFX942-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v10
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_addc_co_u32_e32 v2, vcc, v3, v11, vcc
+; GFX942-NEXT:    v_mov_b32_e32 v3, v8
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v7, vcc
+; GFX942-NEXT:    v_lshl_add_u64 v[2:3], v[2:3], 0, v[6:7]
+; GFX942-NEXT:    v_add_co_u32_e32 v5, vcc, v5, v2
+; GFX942-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_addc_co_u32_e32 v9, vcc, v16, v3, vcc
+; GFX942-NEXT:    v_mov_b32_e32 v3, v2
+; GFX942-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
+; GFX942-NEXT:    v_xor_b32_e32 v12, v0, v2
+; GFX942-NEXT:    v_xor_b32_e32 v3, v1, v2
+; GFX942-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], v12, v9, 0
+; GFX942-NEXT:    v_mul_hi_u32 v6, v12, v5
+; GFX942-NEXT:    v_mov_b32_e32 v7, v8
+; GFX942-NEXT:    v_lshl_add_u64 v[0:1], v[6:7], 0, v[0:1]
+; GFX942-NEXT:    v_mad_u64_u32 v[10:11], s[0:1], v3, v5, 0
+; GFX942-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v10
+; GFX942-NEXT:    v_mad_u64_u32 v[6:7], s[0:1], v3, v9, 0
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_addc_co_u32_e32 v0, vcc, v1, v11, vcc
+; GFX942-NEXT:    v_mov_b32_e32 v1, v8
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v7, vcc
+; GFX942-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, v[6:7]
+; GFX942-NEXT:    v_mul_lo_u32 v5, v14, v0
+; GFX942-NEXT:    v_mul_lo_u32 v8, v15, v1
+; GFX942-NEXT:    v_mad_u64_u32 v[6:7], s[0:1], v15, v0, 0
+; GFX942-NEXT:    v_add3_u32 v5, v7, v8, v5
+; GFX942-NEXT:    v_sub_u32_e32 v7, v3, v5
+; GFX942-NEXT:    v_sub_co_u32_e32 v10, vcc, v12, v6
+; GFX942-NEXT:    v_lshl_add_u64 v[8:9], v[0:1], 0, 1
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_subb_co_u32_e64 v11, s[0:1], v7, v14, vcc
+; GFX942-NEXT:    v_sub_co_u32_e64 v12, s[0:1], v10, v15
+; GFX942-NEXT:    v_subb_co_u32_e32 v3, vcc, v3, v5, vcc
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_subbrev_co_u32_e64 v13, s[2:3], 0, v11, s[0:1]
+; GFX942-NEXT:    v_cmp_ge_u32_e64 s[2:3], v13, v14
+; GFX942-NEXT:    v_cmp_ge_u32_e32 vcc, v3, v14
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[2:3]
+; GFX942-NEXT:    v_cmp_ge_u32_e64 s[2:3], v12, v15
+; GFX942-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
+; GFX942-NEXT:    v_cmp_ge_u32_e32 vcc, v10, v15
+; GFX942-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[2:3]
+; GFX942-NEXT:    v_cmp_eq_u32_e64 s[2:3], v13, v14
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e64 v16, v6, v7, s[2:3]
+; GFX942-NEXT:    v_lshl_add_u64 v[6:7], v[0:1], 0, 2
+; GFX942-NEXT:    v_cmp_ne_u32_e64 s[2:3], 0, v16
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e64 v7, v9, v7, s[2:3]
+; GFX942-NEXT:    v_cndmask_b32_e64 v9, 0, -1, vcc
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v14
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v5, v5, v9, vcc
+; GFX942-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
+; GFX942-NEXT:    v_cndmask_b32_e64 v5, v8, v6, s[2:3]
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, v0, v5, vcc
+; GFX942-NEXT:    v_xor_b32_e32 v5, v2, v4
+; GFX942-NEXT:    v_cndmask_b32_e32 v1, v1, v7, vcc
+; GFX942-NEXT:    v_xor_b32_e32 v0, v0, v5
+; GFX942-NEXT:    v_xor_b32_e32 v1, v1, v5
+; GFX942-NEXT:    v_sub_co_u32_e64 v4, s[4:5], v0, v5
+; GFX942-NEXT:    v_subb_co_u32_e64 v0, s[0:1], v11, v14, s[0:1]
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_subb_co_u32_e64 v5, s[4:5], v1, v5, s[4:5]
+; GFX942-NEXT:    v_sub_co_u32_e64 v1, s[0:1], v12, v15
+; GFX942-NEXT:    v_cndmask_b32_e64 v1, v12, v1, s[2:3]
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_subbrev_co_u32_e64 v0, s[0:1], 0, v0, s[0:1]
+; GFX942-NEXT:    v_cndmask_b32_e64 v0, v13, v0, s[2:3]
+; GFX942-NEXT:    v_cndmask_b32_e32 v1, v10, v1, vcc
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
+; GFX942-NEXT:    v_xor_b32_e32 v1, v1, v2
+; GFX942-NEXT:    v_xor_b32_e32 v0, v0, v2
+; GFX942-NEXT:    v_sub_co_u32_e32 v6, vcc, v1, v2
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_subb_co_u32_e32 v7, vcc, v0, v2, vcc
+; GFX942-NEXT:    ; implicit-def: $vgpr2_vgpr3
+; GFX942-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; GFX942-NEXT:  .LBB8_2: ; %Flow
+; GFX942-NEXT:    s_andn2_saveexec_b64 s[0:1], s[6:7]
+; GFX942-NEXT:    s_cbranch_execz .LBB8_4
+; GFX942-NEXT:  ; %bb.3:
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v1, v2
+; GFX942-NEXT:    v_sub_u32_e32 v3, 0, v2
+; GFX942-NEXT:    v_mov_b32_e32 v5, 0
+; GFX942-NEXT:    v_mov_b32_e32 v7, v5
+; GFX942-NEXT:    v_rcp_iflag_f32_e32 v1, v1
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX942-NEXT:    v_mul_lo_u32 v3, v3, v1
+; GFX942-NEXT:    v_mul_hi_u32 v3, v1, v3
+; GFX942-NEXT:    v_add_u32_e32 v1, v1, v3
+; GFX942-NEXT:    v_mul_hi_u32 v1, v0, v1
+; GFX942-NEXT:    v_mul_lo_u32 v3, v1, v2
+; GFX942-NEXT:    v_sub_u32_e32 v0, v0, v3
+; GFX942-NEXT:    v_add_u32_e32 v4, 1, v1
+; GFX942-NEXT:    v_sub_u32_e32 v3, v0, v2
+; GFX942-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
+; GFX942-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
+; GFX942-NEXT:    v_sub_u32_e32 v3, v0, v2
+; GFX942-NEXT:    v_add_u32_e32 v4, 1, v1
+; GFX942-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v6, v0, v3, vcc
+; GFX942-NEXT:    v_cndmask_b32_e32 v4, v1, v4, vcc
+; GFX942-NEXT:  .LBB8_4:
+; GFX942-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX942-NEXT:    v_mov_b32_e32 v0, v4
+; GFX942-NEXT:    v_mov_b32_e32 v1, v5
+; GFX942-NEXT:    v_mov_b32_e32 v2, v6
+; GFX942-NEXT:    v_mov_b32_e32 v3, v7
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %d = sdiv i64 %a, %b
   %r = srem i64 %a, %b
   %ins.0 = insertelement <2 x i64> poison, i64 %d, i32 0
@@ -1010,6 +1937,169 @@ define <2 x i64> @udivrem64(i64 %a, i64 %b) {
 ; GFX9-NEXT:    v_mov_b32_e32 v2, v6
 ; GFX9-NEXT:    v_mov_b32_e32 v3, v7
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: udivrem64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_or_b32_e32 v9, v1, v3
+; GFX942-NEXT:    v_mov_b32_e32 v8, 0
+; GFX942-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[8:9]
+; GFX942-NEXT:    ; implicit-def: $vgpr6_vgpr7
+; GFX942-NEXT:    ; implicit-def: $vgpr4_vgpr5
+; GFX942-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942-NEXT:    s_xor_b64 s[4:5], exec, s[0:1]
+; GFX942-NEXT:    s_cbranch_execz .LBB9_2
+; GFX942-NEXT:  ; %bb.1:
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v4, v2
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v5, v3
+; GFX942-NEXT:    v_sub_co_u32_e32 v13, vcc, 0, v2
+; GFX942-NEXT:    v_mov_b32_e32 v11, v8
+; GFX942-NEXT:    v_fmamk_f32 v4, v5, 0x4f800000, v4
+; GFX942-NEXT:    v_rcp_f32_e32 v4, v4
+; GFX942-NEXT:    v_subb_co_u32_e32 v14, vcc, 0, v3, vcc
+; GFX942-NEXT:    v_mul_f32_e32 v4, 0x5f7ffffc, v4
+; GFX942-NEXT:    v_mul_f32_e32 v5, 0x2f800000, v4
+; GFX942-NEXT:    v_trunc_f32_e32 v5, v5
+; GFX942-NEXT:    v_fmamk_f32 v4, v5, 0xcf800000, v4
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v9, v5
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v12, v4
+; GFX942-NEXT:    v_mul_lo_u32 v6, v13, v9
+; GFX942-NEXT:    v_mul_lo_u32 v7, v14, v12
+; GFX942-NEXT:    v_mad_u64_u32 v[4:5], s[0:1], v13, v12, 0
+; GFX942-NEXT:    v_add3_u32 v5, v5, v6, v7
+; GFX942-NEXT:    v_mad_u64_u32 v[6:7], s[0:1], v12, v5, 0
+; GFX942-NEXT:    v_mul_hi_u32 v10, v12, v4
+; GFX942-NEXT:    v_lshl_add_u64 v[6:7], v[10:11], 0, v[6:7]
+; GFX942-NEXT:    v_mad_u64_u32 v[10:11], s[0:1], v9, v5, 0
+; GFX942-NEXT:    v_mad_u64_u32 v[4:5], s[0:1], v9, v4, 0
+; GFX942-NEXT:    v_add_co_u32_e32 v4, vcc, v6, v4
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_addc_co_u32_e32 v4, vcc, v7, v5, vcc
+; GFX942-NEXT:    v_mov_b32_e32 v5, v8
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_addc_co_u32_e32 v11, vcc, 0, v11, vcc
+; GFX942-NEXT:    v_lshl_add_u64 v[4:5], v[4:5], 0, v[10:11]
+; GFX942-NEXT:    v_add_co_u32_e32 v15, vcc, v12, v4
+; GFX942-NEXT:    v_mul_lo_u32 v7, v14, v15
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_addc_co_u32_e32 v9, vcc, v9, v5, vcc
+; GFX942-NEXT:    v_mul_lo_u32 v6, v13, v9
+; GFX942-NEXT:    v_mad_u64_u32 v[4:5], s[0:1], v13, v15, 0
+; GFX942-NEXT:    v_add3_u32 v5, v5, v6, v7
+; GFX942-NEXT:    v_mad_u64_u32 v[6:7], s[0:1], v9, v5, 0
+; GFX942-NEXT:    v_mad_u64_u32 v[10:11], s[0:1], v9, v4, 0
+; GFX942-NEXT:    v_mad_u64_u32 v[12:13], s[0:1], v15, v5, 0
+; GFX942-NEXT:    v_mul_hi_u32 v4, v15, v4
+; GFX942-NEXT:    v_mov_b32_e32 v5, v8
+; GFX942-NEXT:    v_lshl_add_u64 v[4:5], v[4:5], 0, v[12:13]
+; GFX942-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v10
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_addc_co_u32_e32 v4, vcc, v5, v11, vcc
+; GFX942-NEXT:    v_mov_b32_e32 v5, v8
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v7, vcc
+; GFX942-NEXT:    v_lshl_add_u64 v[4:5], v[4:5], 0, v[6:7]
+; GFX942-NEXT:    v_add_co_u32_e32 v10, vcc, v15, v4
+; GFX942-NEXT:    v_mul_hi_u32 v6, v0, v10
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_addc_co_u32_e32 v9, vcc, v9, v5, vcc
+; GFX942-NEXT:    v_mad_u64_u32 v[4:5], s[0:1], v0, v9, 0
+; GFX942-NEXT:    v_mov_b32_e32 v7, v8
+; GFX942-NEXT:    v_lshl_add_u64 v[4:5], v[6:7], 0, v[4:5]
+; GFX942-NEXT:    v_mad_u64_u32 v[10:11], s[0:1], v1, v10, 0
+; GFX942-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v10
+; GFX942-NEXT:    v_mad_u64_u32 v[6:7], s[0:1], v1, v9, 0
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_addc_co_u32_e32 v4, vcc, v5, v11, vcc
+; GFX942-NEXT:    v_mov_b32_e32 v5, v8
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v7, vcc
+; GFX942-NEXT:    v_lshl_add_u64 v[4:5], v[4:5], 0, v[6:7]
+; GFX942-NEXT:    v_mul_lo_u32 v8, v3, v4
+; GFX942-NEXT:    v_mul_lo_u32 v9, v2, v5
+; GFX942-NEXT:    v_mad_u64_u32 v[6:7], s[0:1], v2, v4, 0
+; GFX942-NEXT:    v_add3_u32 v10, v7, v9, v8
+; GFX942-NEXT:    v_sub_u32_e32 v7, v1, v10
+; GFX942-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v6
+; GFX942-NEXT:    v_lshl_add_u64 v[8:9], v[4:5], 0, 1
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_subb_co_u32_e64 v11, s[0:1], v7, v3, vcc
+; GFX942-NEXT:    v_sub_co_u32_e64 v12, s[0:1], v0, v2
+; GFX942-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v10, vcc
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_subbrev_co_u32_e64 v13, s[2:3], 0, v11, s[0:1]
+; GFX942-NEXT:    v_cmp_ge_u32_e64 s[2:3], v13, v3
+; GFX942-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v3
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[2:3]
+; GFX942-NEXT:    v_cmp_ge_u32_e64 s[2:3], v12, v2
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[2:3]
+; GFX942-NEXT:    v_cmp_eq_u32_e64 s[2:3], v13, v3
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e64 v14, v6, v7, s[2:3]
+; GFX942-NEXT:    v_lshl_add_u64 v[6:7], v[4:5], 0, 2
+; GFX942-NEXT:    v_cmp_ne_u32_e64 s[2:3], 0, v14
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e64 v7, v9, v7, s[2:3]
+; GFX942-NEXT:    v_cndmask_b32_e64 v9, 0, -1, vcc
+; GFX942-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
+; GFX942-NEXT:    v_cndmask_b32_e64 v6, v8, v6, s[2:3]
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_cndmask_b32_e64 v10, 0, -1, vcc
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v3
+; GFX942-NEXT:    v_subb_co_u32_e64 v3, s[0:1], v11, v3, s[0:1]
+; GFX942-NEXT:    v_sub_co_u32_e64 v2, s[0:1], v12, v2
+; GFX942-NEXT:    v_cndmask_b32_e32 v9, v9, v10, vcc
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_subbrev_co_u32_e64 v3, s[0:1], 0, v3, s[0:1]
+; GFX942-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v9
+; GFX942-NEXT:    v_cndmask_b32_e64 v3, v13, v3, s[2:3]
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_cndmask_b32_e32 v5, v5, v7, vcc
+; GFX942-NEXT:    v_cndmask_b32_e32 v7, v1, v3, vcc
+; GFX942-NEXT:    v_cndmask_b32_e64 v1, v12, v2, s[2:3]
+; GFX942-NEXT:    v_cndmask_b32_e32 v4, v4, v6, vcc
+; GFX942-NEXT:    v_cndmask_b32_e32 v6, v0, v1, vcc
+; GFX942-NEXT:    ; implicit-def: $vgpr2_vgpr3
+; GFX942-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; GFX942-NEXT:  .LBB9_2: ; %Flow
+; GFX942-NEXT:    s_andn2_saveexec_b64 s[0:1], s[4:5]
+; GFX942-NEXT:    s_cbranch_execz .LBB9_4
+; GFX942-NEXT:  ; %bb.3:
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v1, v2
+; GFX942-NEXT:    v_sub_u32_e32 v3, 0, v2
+; GFX942-NEXT:    v_mov_b32_e32 v5, 0
+; GFX942-NEXT:    v_mov_b32_e32 v7, v5
+; GFX942-NEXT:    v_rcp_iflag_f32_e32 v1, v1
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX942-NEXT:    v_mul_lo_u32 v3, v3, v1
+; GFX942-NEXT:    v_mul_hi_u32 v3, v1, v3
+; GFX942-NEXT:    v_add_u32_e32 v1, v1, v3
+; GFX942-NEXT:    v_mul_hi_u32 v1, v0, v1
+; GFX942-NEXT:    v_mul_lo_u32 v3, v1, v2
+; GFX942-NEXT:    v_sub_u32_e32 v0, v0, v3
+; GFX942-NEXT:    v_add_u32_e32 v4, 1, v1
+; GFX942-NEXT:    v_sub_u32_e32 v3, v0, v2
+; GFX942-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
+; GFX942-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
+; GFX942-NEXT:    v_sub_u32_e32 v3, v0, v2
+; GFX942-NEXT:    v_add_u32_e32 v4, 1, v1
+; GFX942-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v6, v0, v3, vcc
+; GFX942-NEXT:    v_cndmask_b32_e32 v4, v1, v4, vcc
+; GFX942-NEXT:  .LBB9_4:
+; GFX942-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX942-NEXT:    v_mov_b32_e32 v0, v4
+; GFX942-NEXT:    v_mov_b32_e32 v1, v5
+; GFX942-NEXT:    v_mov_b32_e32 v2, v6
+; GFX942-NEXT:    v_mov_b32_e32 v3, v7
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %d = udiv i64 %a, %b
   %r = urem i64 %a, %b
   %ins.0 = insertelement <2 x i64> poison, i64 %d, i32 0
@@ -1153,6 +2243,158 @@ define i64 @sdiv64_known32(i64 %a, i64 %b) {
 ; GFX9-NEXT:    v_mov_b32_e32 v0, v4
 ; GFX9-NEXT:    v_mov_b32_e32 v1, v5
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: sdiv64_known32:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_ashrrev_i32_e32 v9, 31, v1
+; GFX942-NEXT:    v_ashrrev_i32_e32 v7, 31, v3
+; GFX942-NEXT:    v_or_b32_e32 v11, v9, v7
+; GFX942-NEXT:    v_mov_b32_e32 v10, 0
+; GFX942-NEXT:    v_mov_b32_e32 v2, v1
+; GFX942-NEXT:    v_mov_b32_e32 v0, v3
+; GFX942-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[10:11]
+; GFX942-NEXT:    ; implicit-def: $vgpr4_vgpr5
+; GFX942-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942-NEXT:    s_xor_b64 s[2:3], exec, s[0:1]
+; GFX942-NEXT:    s_cbranch_execz .LBB10_2
+; GFX942-NEXT:  ; %bb.1:
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v1, v0
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v3, v7
+; GFX942-NEXT:    v_sub_co_u32_e32 v6, vcc, 0, v0
+; GFX942-NEXT:    v_mov_b32_e32 v15, v10
+; GFX942-NEXT:    v_fmamk_f32 v1, v3, 0x4f800000, v1
+; GFX942-NEXT:    v_rcp_f32_e32 v1, v1
+; GFX942-NEXT:    v_subb_co_u32_e32 v8, vcc, 0, v7, vcc
+; GFX942-NEXT:    v_mul_f32_e32 v1, 0x5f7ffffc, v1
+; GFX942-NEXT:    v_mul_f32_e32 v3, 0x2f800000, v1
+; GFX942-NEXT:    v_trunc_f32_e32 v3, v3
+; GFX942-NEXT:    v_fmamk_f32 v1, v3, 0xcf800000, v1
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v3, v3
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX942-NEXT:    v_mul_lo_u32 v11, v6, v3
+; GFX942-NEXT:    v_mul_lo_u32 v12, v8, v1
+; GFX942-NEXT:    v_mad_u64_u32 v[4:5], s[0:1], v6, v1, 0
+; GFX942-NEXT:    v_add3_u32 v5, v5, v11, v12
+; GFX942-NEXT:    v_mad_u64_u32 v[12:13], s[0:1], v1, v5, 0
+; GFX942-NEXT:    v_mul_hi_u32 v14, v1, v4
+; GFX942-NEXT:    v_lshl_add_u64 v[12:13], v[14:15], 0, v[12:13]
+; GFX942-NEXT:    v_mad_u64_u32 v[14:15], s[0:1], v3, v5, 0
+; GFX942-NEXT:    v_mad_u64_u32 v[4:5], s[0:1], v3, v4, 0
+; GFX942-NEXT:    v_add_co_u32_e32 v4, vcc, v12, v4
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_addc_co_u32_e32 v4, vcc, v13, v5, vcc
+; GFX942-NEXT:    v_mov_b32_e32 v5, v10
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_addc_co_u32_e32 v15, vcc, 0, v15, vcc
+; GFX942-NEXT:    v_lshl_add_u64 v[4:5], v[4:5], 0, v[14:15]
+; GFX942-NEXT:    v_add_co_u32_e32 v1, vcc, v1, v4
+; GFX942-NEXT:    v_mul_lo_u32 v8, v8, v1
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_addc_co_u32_e32 v3, vcc, v3, v5, vcc
+; GFX942-NEXT:    v_mul_lo_u32 v11, v6, v3
+; GFX942-NEXT:    v_mad_u64_u32 v[4:5], s[0:1], v6, v1, 0
+; GFX942-NEXT:    v_add3_u32 v5, v5, v11, v8
+; GFX942-NEXT:    v_mad_u64_u32 v[12:13], s[0:1], v3, v5, 0
+; GFX942-NEXT:    v_mad_u64_u32 v[14:15], s[0:1], v3, v4, 0
+; GFX942-NEXT:    v_mad_u64_u32 v[16:17], s[0:1], v1, v5, 0
+; GFX942-NEXT:    v_mul_hi_u32 v4, v1, v4
+; GFX942-NEXT:    v_mov_b32_e32 v5, v10
+; GFX942-NEXT:    v_lshl_add_u64 v[4:5], v[4:5], 0, v[16:17]
+; GFX942-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v14
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_addc_co_u32_e32 v4, vcc, v5, v15, vcc
+; GFX942-NEXT:    v_mov_b32_e32 v5, v10
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_addc_co_u32_e32 v13, vcc, 0, v13, vcc
+; GFX942-NEXT:    v_lshl_add_u64 v[4:5], v[4:5], 0, v[12:13]
+; GFX942-NEXT:    v_add_co_u32_e32 v1, vcc, v1, v4
+; GFX942-NEXT:    v_mul_hi_u32 v12, v2, v1
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_addc_co_u32_e32 v3, vcc, v3, v5, vcc
+; GFX942-NEXT:    v_mad_u64_u32 v[4:5], s[0:1], v2, v3, 0
+; GFX942-NEXT:    v_mov_b32_e32 v13, v10
+; GFX942-NEXT:    v_lshl_add_u64 v[4:5], v[12:13], 0, v[4:5]
+; GFX942-NEXT:    v_mad_u64_u32 v[14:15], s[0:1], v9, v1, 0
+; GFX942-NEXT:    v_add_co_u32_e32 v1, vcc, v4, v14
+; GFX942-NEXT:    v_mad_u64_u32 v[12:13], s[0:1], v9, v3, 0
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_addc_co_u32_e32 v4, vcc, v5, v15, vcc
+; GFX942-NEXT:    v_mov_b32_e32 v5, v10
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_addc_co_u32_e32 v13, vcc, 0, v13, vcc
+; GFX942-NEXT:    v_lshl_add_u64 v[4:5], v[4:5], 0, v[12:13]
+; GFX942-NEXT:    v_mul_lo_u32 v1, v7, v4
+; GFX942-NEXT:    v_mul_lo_u32 v3, v0, v5
+; GFX942-NEXT:    v_mad_u64_u32 v[10:11], s[0:1], v0, v4, 0
+; GFX942-NEXT:    v_add3_u32 v1, v11, v3, v1
+; GFX942-NEXT:    v_sub_u32_e32 v3, v9, v1
+; GFX942-NEXT:    v_sub_co_u32_e32 v6, vcc, v2, v10
+; GFX942-NEXT:    v_lshl_add_u64 v[10:11], v[4:5], 0, 1
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_subb_co_u32_e64 v2, s[0:1], v3, v7, vcc
+; GFX942-NEXT:    v_sub_co_u32_e64 v3, s[0:1], v6, v0
+; GFX942-NEXT:    v_subb_co_u32_e32 v1, vcc, v9, v1, vcc
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_subbrev_co_u32_e64 v2, s[0:1], 0, v2, s[0:1]
+; GFX942-NEXT:    v_cmp_ge_u32_e64 s[0:1], v2, v7
+; GFX942-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v7
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[0:1]
+; GFX942-NEXT:    v_cmp_ge_u32_e64 s[0:1], v3, v0
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e64 v3, 0, -1, s[0:1]
+; GFX942-NEXT:    v_cmp_eq_u32_e64 s[0:1], v2, v7
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e64 v8, v8, v3, s[0:1]
+; GFX942-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v8
+; GFX942-NEXT:    v_cndmask_b32_e64 v8, 0, -1, vcc
+; GFX942-NEXT:    v_cmp_ge_u32_e32 vcc, v6, v0
+; GFX942-NEXT:    v_lshl_add_u64 v[2:3], v[4:5], 0, 2
+; GFX942-NEXT:    v_cndmask_b32_e64 v3, v11, v3, s[0:1]
+; GFX942-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v7
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, v8, v0, vcc
+; GFX942-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
+; GFX942-NEXT:    v_cndmask_b32_e64 v0, v10, v2, s[0:1]
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_cndmask_b32_e32 v5, v5, v3, vcc
+; GFX942-NEXT:    v_cndmask_b32_e32 v4, v4, v0, vcc
+; GFX942-NEXT:    ; implicit-def: $vgpr2_vgpr3
+; GFX942-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; GFX942-NEXT:  .LBB10_2: ; %Flow
+; GFX942-NEXT:    s_andn2_saveexec_b64 s[0:1], s[2:3]
+; GFX942-NEXT:    s_cbranch_execz .LBB10_4
+; GFX942-NEXT:  ; %bb.3:
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v0, v3
+; GFX942-NEXT:    v_sub_u32_e32 v2, 0, v3
+; GFX942-NEXT:    v_mov_b32_e32 v5, 0
+; GFX942-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX942-NEXT:    v_mul_lo_u32 v2, v2, v0
+; GFX942-NEXT:    v_mul_hi_u32 v2, v0, v2
+; GFX942-NEXT:    v_add_u32_e32 v0, v0, v2
+; GFX942-NEXT:    v_mul_hi_u32 v0, v1, v0
+; GFX942-NEXT:    v_mul_lo_u32 v2, v0, v3
+; GFX942-NEXT:    v_sub_u32_e32 v1, v1, v2
+; GFX942-NEXT:    v_add_u32_e32 v4, 1, v0
+; GFX942-NEXT:    v_sub_u32_e32 v2, v1, v3
+; GFX942-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v3
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
+; GFX942-NEXT:    v_add_u32_e32 v2, 1, v0
+; GFX942-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v3
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v4, v0, v2, vcc
+; GFX942-NEXT:  .LBB10_4:
+; GFX942-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX942-NEXT:    v_mov_b32_e32 v0, v4
+; GFX942-NEXT:    v_mov_b32_e32 v1, v5
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %a.ext = ashr i64 %a, 32
   %b.ext = ashr i64 %b, 32
   %d = udiv i64 %a.ext, %b.ext
@@ -1184,6 +2426,34 @@ define i64 @udiv64_known32(i64 %a, i64 %b) {
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v3, vcc
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: udiv64_known32:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v1, v2
+; GFX942-NEXT:    v_sub_u32_e32 v3, 0, v2
+; GFX942-NEXT:    v_rcp_iflag_f32_e32 v1, v1
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX942-NEXT:    v_mul_lo_u32 v3, v3, v1
+; GFX942-NEXT:    v_mul_hi_u32 v3, v1, v3
+; GFX942-NEXT:    v_add_u32_e32 v1, v1, v3
+; GFX942-NEXT:    v_mul_hi_u32 v1, v0, v1
+; GFX942-NEXT:    v_mul_lo_u32 v3, v1, v2
+; GFX942-NEXT:    v_sub_u32_e32 v0, v0, v3
+; GFX942-NEXT:    v_add_u32_e32 v4, 1, v1
+; GFX942-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
+; GFX942-NEXT:    v_sub_u32_e32 v3, v0, v2
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
+; GFX942-NEXT:    v_add_u32_e32 v3, 1, v1
+; GFX942-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, v1, v3, vcc
+; GFX942-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %a.mask = and i64 %a, 4294967295
   %b.mask = and i64 %b, 4294967295
   %d = udiv i64 %a.mask, %b.mask
diff --git a/llvm/test/CodeGen/AMDGPU/dagcombine-fmul-sel.ll b/llvm/test/CodeGen/AMDGPU/dagcombine-fmul-sel.ll
index f26b72027a784..c749799b28fbc 100644
--- a/llvm/test/CodeGen/AMDGPU/dagcombine-fmul-sel.ll
+++ b/llvm/test/CodeGen/AMDGPU/dagcombine-fmul-sel.ll
@@ -3,6 +3,8 @@
 ; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 < %s | FileCheck -check-prefixes=GFX7,GFX7-GISEL %s
 ; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX9-SDAG %s
 ; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9,GFX9-GISEL %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 < %s | FileCheck -check-prefixes=GFX942,GFX942-SDAG %s
+; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 < %s | FileCheck -check-prefixes=GFX942,GFX942-GISEL %s
 ; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 < %s | FileCheck -check-prefixes=GFX10,GFX10-SDAG %s
 ; RUN: llc -global-isel=1 -global-isel-abort=2 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 < %s | FileCheck -check-prefixes=GFX10,GFX10-GISEL %s
 ; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-SDAG,GFX11-SDAG-TRUE16 %s
@@ -27,6 +29,15 @@ define float @fmul_select_f32_test1(float %x, i32 %bool.arg1, i32 %bool.arg2) {
 ; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX942-LABEL: fmul_select_f32_test1:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e64 v1, 1.0, 2.0, vcc
+; GFX942-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX10-LABEL: fmul_select_f32_test1:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -66,6 +77,15 @@ define float @fmul_select_f32_test2(float %x, i32 %bool.arg1, i32 %bool.arg2) {
 ; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX942-LABEL: fmul_select_f32_test2:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e64 v1, 1.0, 0.5, vcc
+; GFX942-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX10-LABEL: fmul_select_f32_test2:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -133,6 +153,30 @@ define <2 x float> @fmul_select_v2f32_test3(<2 x float> %x, <2 x i32> %bool.arg1
 ; GFX9-GISEL-NEXT:    v_mul_f32_e32 v1, v1, v3
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX942-SDAG-LABEL: fmul_select_v2f32_test3:
+; GFX942-SDAG:       ; %bb.0:
+; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v3, 1.0, 2.0, vcc
+; GFX942-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v4
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v2, 1.0, 2.0, vcc
+; GFX942-SDAG-NEXT:    v_pk_mul_f32 v[0:1], v[0:1], v[2:3]
+; GFX942-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-GISEL-LABEL: fmul_select_v2f32_test3:
+; GFX942-GISEL:       ; %bb.0:
+; GFX942-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v4
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v2, 1.0, 2.0, vcc
+; GFX942-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v3, 1.0, 2.0, vcc
+; GFX942-GISEL-NEXT:    v_pk_mul_f32 v[0:1], v[0:1], v[2:3]
+; GFX942-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX10-LABEL: fmul_select_v2f32_test3:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -205,6 +249,30 @@ define <2 x float> @fmul_select_v2f32_test4(<2 x float> %x, <2 x i32> %bool.arg1
 ; GFX9-GISEL-NEXT:    v_mul_f32_e32 v1, v1, v3
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX942-SDAG-LABEL: fmul_select_v2f32_test4:
+; GFX942-SDAG:       ; %bb.0:
+; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v3, 1.0, 0.5, vcc
+; GFX942-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v4
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v2, 1.0, 0.5, vcc
+; GFX942-SDAG-NEXT:    v_pk_mul_f32 v[0:1], v[0:1], v[2:3]
+; GFX942-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-GISEL-LABEL: fmul_select_v2f32_test4:
+; GFX942-GISEL:       ; %bb.0:
+; GFX942-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v4
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v2, 1.0, 0.5, vcc
+; GFX942-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v3, 1.0, 0.5, vcc
+; GFX942-GISEL-NEXT:    v_pk_mul_f32 v[0:1], v[0:1], v[2:3]
+; GFX942-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX10-LABEL: fmul_select_v2f32_test4:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -249,6 +317,15 @@ define float @fmul_select_f32_test5(float %x, i32 %bool.arg1, i32 %bool.arg2) {
 ; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX942-LABEL: fmul_select_f32_test5:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e64 v1, -1.0, -2.0, vcc
+; GFX942-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX10-LABEL: fmul_select_f32_test5:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -312,6 +389,28 @@ define float @fmul_select_f32_test6(float %x, i32 %bool.arg1, i32 %bool.arg2) {
 ; GFX9-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX942-SDAG-LABEL: fmul_select_f32_test6:
+; GFX942-SDAG:       ; %bb.0:
+; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v3, 0x41000000
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v4, 0xc0400000
+; GFX942-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX942-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX942-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-GISEL-LABEL: fmul_select_f32_test6:
+; GFX942-GISEL:       ; %bb.0:
+; GFX942-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v3, 0xc0400000
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v4, 0x41000000
+; GFX942-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
+; GFX942-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX942-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX10-SDAG-LABEL: fmul_select_f32_test6:
 ; GFX10-SDAG:       ; %bb.0:
 ; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -374,6 +473,16 @@ define float @fmul_select_f32_test7_sel_log2val_pos59_pos92(float %x, i32 %bool.
 ; GFX9-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX942-LABEL: fmul_select_f32_test7_sel_log2val_pos59_pos92:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b32_e32 v3, 0x5c
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e64 v1, v3, 59, vcc
+; GFX942-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX10-LABEL: fmul_select_f32_test7_sel_log2val_pos59_pos92:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -437,6 +546,28 @@ define float @fmul_select_f32_test8(float %x, i32 %bool.arg1, i32 %bool.arg2) {
 ; GFX9-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX942-SDAG-LABEL: fmul_select_f32_test8:
+; GFX942-SDAG:       ; %bb.0:
+; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v3, 0xc1000000
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v4, 0x41800000
+; GFX942-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX942-SDAG-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX942-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-GISEL-LABEL: fmul_select_f32_test8:
+; GFX942-GISEL:       ; %bb.0:
+; GFX942-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v3, 0x41800000
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v4, 0xc1000000
+; GFX942-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
+; GFX942-GISEL-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX942-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX10-SDAG-LABEL: fmul_select_f32_test8:
 ; GFX10-SDAG:       ; %bb.0:
 ; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -497,6 +628,15 @@ define float @fmul_select_f32_test9(float %x, i32 %bool.arg1, i32 %bool.arg2) {
 ; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX942-LABEL: fmul_select_f32_test9:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e64 v1, 2.0, 0, vcc
+; GFX942-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX10-LABEL: fmul_select_f32_test9:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -538,6 +678,16 @@ define float @fmul_select_f32_test10(float %x, i32 %bool.arg1, i32 %bool.arg2) {
 ; GFX9-NEXT:    v_mul_f32_e32 v0, v0, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX942-LABEL: fmul_select_f32_test10:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v1, 0, v3, vcc
+; GFX942-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX10-LABEL: fmul_select_f32_test10:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -579,6 +729,16 @@ define float @fmul_select_f32_test11_sel_log2val_pos78_pos56(float %x, i32 %bool
 ; GFX9-NEXT:    v_ldexp_f32 v0, -v0, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX942-LABEL: fmul_select_f32_test11_sel_log2val_pos78_pos56:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b32_e32 v3, 0x4e
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v1, 56, v3, vcc
+; GFX942-NEXT:    v_ldexp_f32 v0, -v0, v1
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX10-LABEL: fmul_select_f32_test11_sel_log2val_pos78_pos56:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -642,6 +802,28 @@ define float @fmul_select_f32_test12_sel_log2val_neg48_pos68(float %x, i32 %bool
 ; GFX9-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX942-SDAG-LABEL: fmul_select_f32_test12_sel_log2val_neg48_pos68:
+; GFX942-SDAG:       ; %bb.0:
+; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v3, 0x44
+; GFX942-SDAG-NEXT:    v_not_b32_e32 v4, 47
+; GFX942-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX942-SDAG-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX942-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-GISEL-LABEL: fmul_select_f32_test12_sel_log2val_neg48_pos68:
+; GFX942-GISEL:       ; %bb.0:
+; GFX942-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-GISEL-NEXT:    v_not_b32_e32 v3, 47
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v4, 0x44
+; GFX942-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
+; GFX942-GISEL-NEXT:    v_ldexp_f32 v0, v0, v1
+; GFX942-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX10-SDAG-LABEL: fmul_select_f32_test12_sel_log2val_neg48_pos68:
 ; GFX10-SDAG:       ; %bb.0:
 ; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -702,6 +884,15 @@ define double @fmul_select_f64_test1(double %x, i32 %bool.arg1, i32 %bool.arg2)
 ; GFX9-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX942-LABEL: fmul_select_f64_test1:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX942-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX10-LABEL: fmul_select_f64_test1:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -741,6 +932,15 @@ define double @fmul_select_f64_test2(double %x, i32 %bool.arg1, i32 %bool.arg2)
 ; GFX9-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX942-LABEL: fmul_select_f64_test2:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e64 v2, 0, -1, vcc
+; GFX942-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX10-LABEL: fmul_select_f64_test2:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -786,6 +986,32 @@ define <2 x double> @fmul_select_v2f64_test3(<2 x double> %x, <2 x i32> %bool.ar
 ; GFX9-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX942-SDAG-LABEL: fmul_select_v2f64_test3:
+; GFX942-SDAG:       ; %bb.0:
+; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v6
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
+; GFX942-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v7
+; GFX942-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v4
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
+; GFX942-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; GFX942-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-GISEL-LABEL: fmul_select_v2f64_test3:
+; GFX942-GISEL:       ; %bb.0:
+; GFX942-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v6
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
+; GFX942-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v7
+; GFX942-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v4
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
+; GFX942-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v5
+; GFX942-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX10-LABEL: fmul_select_v2f64_test3:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -837,6 +1063,32 @@ define <2 x double> @fmul_select_v2f64_test4(<2 x double> %x, <2 x i32> %bool.ar
 ; GFX9-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX942-SDAG-LABEL: fmul_select_v2f64_test4:
+; GFX942-SDAG:       ; %bb.0:
+; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v6
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v4, 0, -1, vcc
+; GFX942-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v7
+; GFX942-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v4
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v4, 0, -1, vcc
+; GFX942-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; GFX942-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-GISEL-LABEL: fmul_select_v2f64_test4:
+; GFX942-GISEL:       ; %bb.0:
+; GFX942-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v6
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, -1, vcc
+; GFX942-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v7
+; GFX942-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v4
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
+; GFX942-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v5
+; GFX942-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX10-LABEL: fmul_select_v2f64_test4:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -882,6 +1134,15 @@ define double @fmul_select_f64_test5(double %x, i32 %bool.arg1, i32 %bool.arg2)
 ; GFX9-NEXT:    v_ldexp_f64 v[0:1], -v[0:1], v2
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX942-LABEL: fmul_select_f64_test5:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e64 v2, 0, -1, vcc
+; GFX942-NEXT:    v_ldexp_f64 v[0:1], -v[0:1], v2
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX10-LABEL: fmul_select_f64_test5:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -921,6 +1182,15 @@ define double @fmul_select_f64_test6(double %x, i32 %bool.arg1, i32 %bool.arg2)
 ; GFX9-NEXT:    v_ldexp_f64 v[0:1], -v[0:1], v2
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX942-LABEL: fmul_select_f64_test6:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX942-NEXT:    v_ldexp_f64 v[0:1], -v[0:1], v2
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX10-LABEL: fmul_select_f64_test6:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -984,6 +1254,28 @@ define double @fmul_select_f64_test7(double %x, i32 %bool.arg1, i32 %bool.arg2)
 ; GFX9-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], v[4:5]
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX942-SDAG-LABEL: fmul_select_f64_test7:
+; GFX942-SDAG:       ; %bb.0:
+; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v4, 0xbff00000
+; GFX942-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v3, v4, 2.0, vcc
+; GFX942-SDAG-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
+; GFX942-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-GISEL-LABEL: fmul_select_f64_test7:
+; GFX942-GISEL:       ; %bb.0:
+; GFX942-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v5, 0xbff00000
+; GFX942-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v4, 0
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v5, v5, 2.0, vcc
+; GFX942-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], v[4:5]
+; GFX942-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX10-LABEL: fmul_select_f64_test7:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1025,6 +1317,15 @@ define double @fmul_select_f64_test8(double %x, i32 %bool.arg1, i32 %bool.arg2)
 ; GFX9-NEXT:    v_ldexp_f64 v[0:1], -v[0:1], v2
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX942-LABEL: fmul_select_f64_test8:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e64 v2, 5, 2, vcc
+; GFX942-NEXT:    v_ldexp_f64 v[0:1], -v[0:1], v2
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX10-LABEL: fmul_select_f64_test8:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1070,6 +1371,32 @@ define <2 x double> @fmul_select_v2f64_test9(<2 x double> %x, <2 x i32> %bool.ar
 ; GFX9-NEXT:    v_ldexp_f64 v[2:3], -v[2:3], v4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX942-SDAG-LABEL: fmul_select_v2f64_test9:
+; GFX942-SDAG:       ; %bb.0:
+; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v6
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
+; GFX942-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v7
+; GFX942-SDAG-NEXT:    v_ldexp_f64 v[0:1], -v[0:1], v4
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
+; GFX942-SDAG-NEXT:    v_ldexp_f64 v[2:3], -v[2:3], v4
+; GFX942-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-GISEL-LABEL: fmul_select_v2f64_test9:
+; GFX942-GISEL:       ; %bb.0:
+; GFX942-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v6
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
+; GFX942-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v7
+; GFX942-GISEL-NEXT:    v_ldexp_f64 v[0:1], -v[0:1], v4
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
+; GFX942-GISEL-NEXT:    v_ldexp_f64 v[2:3], -v[2:3], v5
+; GFX942-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX10-LABEL: fmul_select_v2f64_test9:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1155,6 +1482,37 @@ define <2 x double> @fmul_select_v2f64_test10(<2 x double> %x, <2 x i32> %bool.a
 ; GFX9-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX942-SDAG-LABEL: fmul_select_v2f64_test10:
+; GFX942-SDAG:       ; %bb.0:
+; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v8, 0xbff00000
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v9, 0x3fe00000
+; GFX942-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v6
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e32 v9, v8, v9, vcc
+; GFX942-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v7
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v8, 0
+; GFX942-SDAG-NEXT:    v_mul_f64 v[0:1], v[0:1], v[8:9]
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
+; GFX942-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; GFX942-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-GISEL-LABEL: fmul_select_v2f64_test10:
+; GFX942-GISEL:       ; %bb.0:
+; GFX942-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v9, 0x3fe00000
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v10, 0xbff00000
+; GFX942-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v6
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v8, 0
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e32 v9, v10, v9, vcc
+; GFX942-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v7
+; GFX942-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], v[8:9]
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
+; GFX942-GISEL-NEXT:    v_ldexp_f64 v[2:3], v[2:3], v4
+; GFX942-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX10-SDAG-LABEL: fmul_select_v2f64_test10:
 ; GFX10-SDAG:       ; %bb.0:
 ; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1255,6 +1613,28 @@ define double @fmul_select_f64_test11(double %x, i32 %bool.arg1, i32 %bool.arg2)
 ; GFX9-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], v[4:5]
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX942-SDAG-LABEL: fmul_select_f64_test11:
+; GFX942-SDAG:       ; %bb.0:
+; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-SDAG-NEXT:    v_bfrev_b32_e32 v4, 1
+; GFX942-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v3, v4, -2.0, vcc
+; GFX942-SDAG-NEXT:    v_mul_f64 v[0:1], v[0:1], v[2:3]
+; GFX942-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-GISEL-LABEL: fmul_select_f64_test11:
+; GFX942-GISEL:       ; %bb.0:
+; GFX942-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-GISEL-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX942-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v4, 0
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v5, v5, -2.0, vcc
+; GFX942-GISEL-NEXT:    v_mul_f64 v[0:1], v[0:1], v[4:5]
+; GFX942-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX10-LABEL: fmul_select_f64_test11:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1300,6 +1680,17 @@ define double @fmul_select_f64_test12(double %x, i32 %bool.arg1, i32 %bool.arg2)
 ; GFX9-NEXT:    v_mul_f64 v[0:1], v[0:1], v[4:5]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX942-LABEL: fmul_select_f64_test12:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX942-NEXT:    v_mov_b32_e32 v4, 0
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_cndmask_b32_e64 v5, v5, 0, vcc
+; GFX942-NEXT:    v_mul_f64 v[0:1], v[0:1], v[4:5]
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX10-LABEL: fmul_select_f64_test12:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1345,6 +1736,17 @@ define double @fmul_select_f64_test13(double %x, i32 %bool.arg1, i32 %bool.arg2)
 ; GFX9-NEXT:    v_mul_f64 v[0:1], v[0:1], v[4:5]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX942-LABEL: fmul_select_f64_test13:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b32_e32 v5, 0x40300000
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX942-NEXT:    v_mov_b32_e32 v4, 0
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_cndmask_b32_e64 v5, v5, 0, vcc
+; GFX942-NEXT:    v_mul_f64 v[0:1], v[0:1], v[4:5]
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX10-LABEL: fmul_select_f64_test13:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1410,6 +1812,28 @@ define double @fmul_select_f64_test14_sel_log2val_pos92_neg27(double %x, i32 %bo
 ; GFX9-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX942-SDAG-LABEL: fmul_select_f64_test14_sel_log2val_pos92_neg27:
+; GFX942-SDAG:       ; %bb.0:
+; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-SDAG-NEXT:    v_not_b32_e32 v4, 26
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v5, 0x5c
+; GFX942-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e32 v2, v4, v5, vcc
+; GFX942-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; GFX942-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-GISEL-LABEL: fmul_select_f64_test14_sel_log2val_pos92_neg27:
+; GFX942-GISEL:       ; %bb.0:
+; GFX942-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v4, 0x5c
+; GFX942-GISEL-NEXT:    v_not_b32_e32 v5, 26
+; GFX942-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e32 v2, v5, v4, vcc
+; GFX942-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; GFX942-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX10-SDAG-LABEL: fmul_select_f64_test14_sel_log2val_pos92_neg27:
 ; GFX10-SDAG:       ; %bb.0:
 ; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1494,6 +1918,28 @@ define double @fmul_select_f64_test15_sel_log2val_neg42_neg33(double %x, i32 %bo
 ; GFX9-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX942-SDAG-LABEL: fmul_select_f64_test15_sel_log2val_neg42_neg33:
+; GFX942-SDAG:       ; %bb.0:
+; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-SDAG-NEXT:    v_not_b32_e32 v4, 32
+; GFX942-SDAG-NEXT:    v_not_b32_e32 v5, 41
+; GFX942-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e32 v2, v4, v5, vcc
+; GFX942-SDAG-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; GFX942-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-GISEL-LABEL: fmul_select_f64_test15_sel_log2val_neg42_neg33:
+; GFX942-GISEL:       ; %bb.0:
+; GFX942-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-GISEL-NEXT:    v_not_b32_e32 v4, 41
+; GFX942-GISEL-NEXT:    v_not_b32_e32 v5, 32
+; GFX942-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v3
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e32 v2, v5, v4, vcc
+; GFX942-GISEL-NEXT:    v_ldexp_f64 v[0:1], v[0:1], v2
+; GFX942-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX10-SDAG-LABEL: fmul_select_f64_test15_sel_log2val_neg42_neg33:
 ; GFX10-SDAG:       ; %bb.0:
 ; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1578,6 +2024,26 @@ define half @fmul_select_f16_test1(half %x, i32 %bool.arg1, i32 %bool.arg2) {
 ; GFX9-GISEL-NEXT:    v_ldexp_f16_e32 v0, v0, v1
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX942-SDAG-LABEL: fmul_select_f16_test1:
+; GFX942-SDAG:       ; %bb.0:
+; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX942-SDAG-NEXT:    v_ldexp_f16_e32 v0, v0, v1
+; GFX942-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-GISEL-LABEL: fmul_select_f16_test1:
+; GFX942-GISEL:       ; %bb.0:
+; GFX942-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v2, 0xffff8000
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v3, 0x7fff
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX942-GISEL-NEXT:    v_med3_i32 v1, v1, v2, v3
+; GFX942-GISEL-NEXT:    v_ldexp_f16_e32 v0, v0, v1
+; GFX942-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX10-SDAG-LABEL: fmul_select_f16_test1:
 ; GFX10-SDAG:       ; %bb.0:
 ; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1684,6 +2150,28 @@ define half @fmul_select_f16_test2(half %x, i32 %bool.arg1, i32 %bool.arg2) {
 ; GFX9-GISEL-NEXT:    v_ldexp_f16_e32 v0, v0, v1
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX942-SDAG-LABEL: fmul_select_f16_test2:
+; GFX942-SDAG:       ; %bb.0:
+; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX942-SDAG-NEXT:    s_movk_i32 s0, 0x8000
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v2, 0x7fff
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
+; GFX942-SDAG-NEXT:    v_med3_i32 v1, v1, s0, v2
+; GFX942-SDAG-NEXT:    v_ldexp_f16_e32 v0, v0, v1
+; GFX942-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-GISEL-LABEL: fmul_select_f16_test2:
+; GFX942-GISEL:       ; %bb.0:
+; GFX942-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v2, 0xffff8000
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v3, 0x7fff
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
+; GFX942-GISEL-NEXT:    v_med3_i32 v1, v1, v2, v3
+; GFX942-GISEL-NEXT:    v_ldexp_f16_e32 v0, v0, v1
+; GFX942-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX10-SDAG-LABEL: fmul_select_f16_test2:
 ; GFX10-SDAG:       ; %bb.0:
 ; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1813,6 +2301,38 @@ define <2 x half> @fmul_select_v2f16_test3(<2 x half> %x, <2 x i32> %bool.arg1,
 ; GFX9-GISEL-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX942-SDAG-LABEL: fmul_select_v2f16_test3:
+; GFX942-SDAG:       ; %bb.0:
+; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v5, 0x3c00
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v6, 0x4000
+; GFX942-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v4
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e32 v2, v5, v6, vcc
+; GFX942-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v3
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e32 v1, v5, v6, vcc
+; GFX942-SDAG-NEXT:    v_pack_b32_f16 v1, v1, v2
+; GFX942-SDAG-NEXT:    v_pk_mul_f16 v0, v0, v1
+; GFX942-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-GISEL-LABEL: fmul_select_v2f16_test3:
+; GFX942-GISEL:       ; %bb.0:
+; GFX942-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v3
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v3, 0xffff8000
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX942-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v4
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v4, 0x7fff
+; GFX942-GISEL-NEXT:    v_med3_i32 v1, v1, v3, v4
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX942-GISEL-NEXT:    v_med3_i32 v2, v2, v3, v4
+; GFX942-GISEL-NEXT:    v_ldexp_f16_e32 v1, v0, v1
+; GFX942-GISEL-NEXT:    v_ldexp_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX942-GISEL-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
+; GFX942-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX10-SDAG-LABEL: fmul_select_v2f16_test3:
 ; GFX10-SDAG:       ; %bb.0:
 ; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1970,6 +2490,38 @@ define <2 x half> @fmul_select_v2f16_test4(<2 x half> %x, <2 x i32> %bool.arg1,
 ; GFX9-GISEL-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX942-SDAG-LABEL: fmul_select_v2f16_test4:
+; GFX942-SDAG:       ; %bb.0:
+; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v5, 0x3c00
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v6, 0x3800
+; GFX942-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v4
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e32 v2, v5, v6, vcc
+; GFX942-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v3
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e32 v1, v5, v6, vcc
+; GFX942-SDAG-NEXT:    v_pack_b32_f16 v1, v1, v2
+; GFX942-SDAG-NEXT:    v_pk_mul_f16 v0, v0, v1
+; GFX942-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-GISEL-LABEL: fmul_select_v2f16_test4:
+; GFX942-GISEL:       ; %bb.0:
+; GFX942-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v3
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v3, 0xffff8000
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
+; GFX942-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v4
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v4, 0x7fff
+; GFX942-GISEL-NEXT:    v_med3_i32 v1, v1, v3, v4
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, -1, vcc
+; GFX942-GISEL-NEXT:    v_med3_i32 v2, v2, v3, v4
+; GFX942-GISEL-NEXT:    v_ldexp_f16_e32 v1, v0, v1
+; GFX942-GISEL-NEXT:    v_ldexp_f16_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX942-GISEL-NEXT:    v_lshl_or_b32 v0, v0, 16, v1
+; GFX942-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX10-SDAG-LABEL: fmul_select_v2f16_test4:
 ; GFX10-SDAG:       ; %bb.0:
 ; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2096,6 +2648,15 @@ define half @fmul_select_f16_test5(half %x, i32 %bool.arg1, i32 %bool.arg2) {
 ; GFX9-NEXT:    v_ldexp_f16_e32 v0, v0, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX942-LABEL: fmul_select_f16_test5:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e64 v1, 3, 1, vcc
+; GFX942-NEXT:    v_ldexp_f16_e32 v0, v0, v1
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX10-LABEL: fmul_select_f16_test5:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2191,6 +2752,28 @@ define half @fmul_select_f16_test6(half %x, i32 %bool.arg1, i32 %bool.arg2) {
 ; GFX9-GISEL-NEXT:    v_mul_f16_e32 v0, v0, v1
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX942-SDAG-LABEL: fmul_select_f16_test6:
+; GFX942-SDAG:       ; %bb.0:
+; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v3, 0x4200
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v4, 0xc800
+; GFX942-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX942-SDAG-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX942-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-GISEL-LABEL: fmul_select_f16_test6:
+; GFX942-GISEL:       ; %bb.0:
+; GFX942-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v3, 0xc800
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v4, 0x4200
+; GFX942-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
+; GFX942-GISEL-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX942-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX10-SDAG-LABEL: fmul_select_f16_test6:
 ; GFX10-SDAG:       ; %bb.0:
 ; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2299,6 +2882,28 @@ define half @fmul_select_f16_test7(half %x, i32 %bool.arg1, i32 %bool.arg2) {
 ; GFX9-GISEL-NEXT:    v_mul_f16_e32 v0, v0, v1
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX942-SDAG-LABEL: fmul_select_f16_test7:
+; GFX942-SDAG:       ; %bb.0:
+; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v3, 0xc400
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v4, 0x4800
+; GFX942-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v4, vcc
+; GFX942-SDAG-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX942-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-GISEL-LABEL: fmul_select_f16_test7:
+; GFX942-GISEL:       ; %bb.0:
+; GFX942-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v3, 0x4800
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v4, 0xc400
+; GFX942-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
+; GFX942-GISEL-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX942-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX10-SDAG-LABEL: fmul_select_f16_test7:
 ; GFX10-SDAG:       ; %bb.0:
 ; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2395,6 +3000,16 @@ define half @fmul_select_f16_test8(half %x, i32 %bool.arg1, i32 %bool.arg2) {
 ; GFX9-NEXT:    v_mul_f16_e32 v0, v0, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX942-LABEL: fmul_select_f16_test8:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b32_e32 v3, 0x8000
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v1, 0, v3, vcc
+; GFX942-NEXT:    v_mul_f16_e32 v0, v0, v1
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX10-LABEL: fmul_select_f16_test8:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2486,6 +3101,27 @@ define half @fmul_select_f16_test9(half %x, i32 %bool.arg1, i32 %bool.arg2) {
 ; GFX9-GISEL-NEXT:    v_ldexp_f16_e64 v0, -v0, v1
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX942-SDAG-LABEL: fmul_select_f16_test9:
+; GFX942-SDAG:       ; %bb.0:
+; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v1, 5, 4, vcc
+; GFX942-SDAG-NEXT:    v_ldexp_f16_e64 v0, -v0, v1
+; GFX942-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-GISEL-LABEL: fmul_select_f16_test9:
+; GFX942-GISEL:       ; %bb.0:
+; GFX942-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v2, 0xffff8000
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v3, 0x7fff
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
+; GFX942-GISEL-NEXT:    v_add_u32_e32 v1, 5, v1
+; GFX942-GISEL-NEXT:    v_med3_i32 v1, v1, v2, v3
+; GFX942-GISEL-NEXT:    v_ldexp_f16_e64 v0, -v0, v1
+; GFX942-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX10-SDAG-LABEL: fmul_select_f16_test9:
 ; GFX10-SDAG:       ; %bb.0:
 ; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2594,6 +3230,26 @@ define half @fmul_select_f16_test10_sel_log2val_neg11_pos11(half %x, i32 %bool.a
 ; GFX9-GISEL-NEXT:    v_ldexp_f16_e32 v0, v0, v1
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX942-SDAG-LABEL: fmul_select_f16_test10_sel_log2val_neg11_pos11:
+; GFX942-SDAG:       ; %bb.0:
+; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX942-SDAG-NEXT:    s_movk_i32 s0, 0x8000
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v2, 0x7fff
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v1, 11, -11, vcc
+; GFX942-SDAG-NEXT:    v_med3_i32 v1, v1, s0, v2
+; GFX942-SDAG-NEXT:    v_ldexp_f16_e32 v0, v0, v1
+; GFX942-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-GISEL-LABEL: fmul_select_f16_test10_sel_log2val_neg11_pos11:
+; GFX942-GISEL:       ; %bb.0:
+; GFX942-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v1, 11, -11, vcc
+; GFX942-GISEL-NEXT:    v_ldexp_f16_e32 v0, v0, v1
+; GFX942-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX10-SDAG-LABEL: fmul_select_f16_test10_sel_log2val_neg11_pos11:
 ; GFX10-SDAG:       ; %bb.0:
 ; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2697,6 +3353,26 @@ define half @fmul_select_f16_test11_sel_log2val_pos7_neg14(half %x, i32 %bool.ar
 ; GFX9-GISEL-NEXT:    v_ldexp_f16_e32 v0, v0, v1
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX942-SDAG-LABEL: fmul_select_f16_test11_sel_log2val_pos7_neg14:
+; GFX942-SDAG:       ; %bb.0:
+; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX942-SDAG-NEXT:    s_movk_i32 s0, 0x8000
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v2, 0x7fff
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v1, -14, 7, vcc
+; GFX942-SDAG-NEXT:    v_med3_i32 v1, v1, s0, v2
+; GFX942-SDAG-NEXT:    v_ldexp_f16_e32 v0, v0, v1
+; GFX942-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-GISEL-LABEL: fmul_select_f16_test11_sel_log2val_pos7_neg14:
+; GFX942-GISEL:       ; %bb.0:
+; GFX942-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v1, -14, 7, vcc
+; GFX942-GISEL-NEXT:    v_ldexp_f16_e32 v0, v0, v1
+; GFX942-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX10-SDAG-LABEL: fmul_select_f16_test11_sel_log2val_pos7_neg14:
 ; GFX10-SDAG:       ; %bb.0:
 ; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2790,6 +3466,25 @@ define bfloat @fmul_select_bf16_test1(bfloat %x, i32 %bool.arg1, i32 %bool.arg2)
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX942-LABEL: fmul_select_bf16_test1:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX942-NEXT:    v_mov_b32_e32 v1, 0x3f80
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0x4000
+; GFX942-NEXT:    v_cndmask_b32_sdwa v1, v1, v2, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX942-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX942-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX942-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX942-NEXT:    s_movk_i32 s0, 0x7fff
+; GFX942-NEXT:    v_add3_u32 v1, v1, v0, s0
+; GFX942-NEXT:    v_or_b32_e32 v2, 0x400000, v0
+; GFX942-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
+; GFX942-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX10-LABEL: fmul_select_bf16_test1:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2924,6 +3619,25 @@ define bfloat @fmul_select_bf16_test2(bfloat %x, i32 %bool.arg1, i32 %bool.arg2)
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX942-LABEL: fmul_select_bf16_test2:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX942-NEXT:    v_mov_b32_e32 v1, 0x3f80
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0x3f00
+; GFX942-NEXT:    v_cndmask_b32_sdwa v1, v1, v2, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX942-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX942-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX942-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX942-NEXT:    s_movk_i32 s0, 0x7fff
+; GFX942-NEXT:    v_add3_u32 v1, v1, v0, s0
+; GFX942-NEXT:    v_or_b32_e32 v2, 0x400000, v0
+; GFX942-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
+; GFX942-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX10-LABEL: fmul_select_bf16_test2:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3075,6 +3789,36 @@ define <2 x bfloat> @fmul_select_v2bf16_test3(<2 x bfloat> %x, <2 x i32> %bool.a
 ; GFX9-NEXT:    v_perm_b32 v0, v0, v1, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX942-LABEL: fmul_select_v2bf16_test3:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_cmp_eq_u32_e64 s[0:1], v1, v3
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v4
+; GFX942-NEXT:    v_mov_b32_e32 v1, 0x3f80
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0x4000
+; GFX942-NEXT:    v_cndmask_b32_sdwa v3, v1, v2, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX942-NEXT:    s_mov_b64 vcc, s[0:1]
+; GFX942-NEXT:    v_cndmask_b32_sdwa v1, v1, v2, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX942-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
+; GFX942-NEXT:    v_mul_f32_e32 v1, v2, v1
+; GFX942-NEXT:    v_bfe_u32 v2, v1, 16, 1
+; GFX942-NEXT:    s_movk_i32 s0, 0x7fff
+; GFX942-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX942-NEXT:    v_add3_u32 v2, v2, v1, s0
+; GFX942-NEXT:    v_or_b32_e32 v4, 0x400000, v1
+; GFX942-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
+; GFX942-NEXT:    v_mul_f32_e32 v0, v0, v3
+; GFX942-NEXT:    v_or_b32_e32 v3, 0x400000, v0
+; GFX942-NEXT:    v_cndmask_b32_e32 v1, v2, v4, vcc
+; GFX942-NEXT:    v_bfe_u32 v2, v0, 16, 1
+; GFX942-NEXT:    v_add3_u32 v2, v2, v0, s0
+; GFX942-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
+; GFX942-NEXT:    s_mov_b32 s0, 0x7060302
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
+; GFX942-NEXT:    v_perm_b32 v0, v0, v1, s0
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX10-LABEL: fmul_select_v2bf16_test3:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3284,6 +4028,36 @@ define <2 x bfloat> @fmul_select_v2bf16_test4(<2 x bfloat> %x, <2 x i32> %bool.a
 ; GFX9-NEXT:    v_perm_b32 v0, v0, v1, s4
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX942-LABEL: fmul_select_v2bf16_test4:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_cmp_eq_u32_e64 s[0:1], v1, v3
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v4
+; GFX942-NEXT:    v_mov_b32_e32 v1, 0x3f80
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0x3f00
+; GFX942-NEXT:    v_cndmask_b32_sdwa v3, v1, v2, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX942-NEXT:    s_mov_b64 vcc, s[0:1]
+; GFX942-NEXT:    v_cndmask_b32_sdwa v1, v1, v2, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX942-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
+; GFX942-NEXT:    v_mul_f32_e32 v1, v2, v1
+; GFX942-NEXT:    v_bfe_u32 v2, v1, 16, 1
+; GFX942-NEXT:    s_movk_i32 s0, 0x7fff
+; GFX942-NEXT:    v_and_b32_e32 v0, 0xffff0000, v0
+; GFX942-NEXT:    v_add3_u32 v2, v2, v1, s0
+; GFX942-NEXT:    v_or_b32_e32 v4, 0x400000, v1
+; GFX942-NEXT:    v_cmp_u_f32_e32 vcc, v1, v1
+; GFX942-NEXT:    v_mul_f32_e32 v0, v0, v3
+; GFX942-NEXT:    v_or_b32_e32 v3, 0x400000, v0
+; GFX942-NEXT:    v_cndmask_b32_e32 v1, v2, v4, vcc
+; GFX942-NEXT:    v_bfe_u32 v2, v0, 16, 1
+; GFX942-NEXT:    v_add3_u32 v2, v2, v0, s0
+; GFX942-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
+; GFX942-NEXT:    s_mov_b32 s0, 0x7060302
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
+; GFX942-NEXT:    v_perm_b32 v0, v0, v1, s0
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX10-LABEL: fmul_select_v2bf16_test4:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3477,6 +4251,25 @@ define bfloat @fmul_select_bf16_test5(bfloat %x, i32 %bool.arg1, i32 %bool.arg2)
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX942-LABEL: fmul_select_bf16_test5:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX942-NEXT:    v_mov_b32_e32 v1, 0x4100
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0x4000
+; GFX942-NEXT:    v_cndmask_b32_sdwa v1, v1, v2, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX942-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX942-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX942-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX942-NEXT:    s_movk_i32 s0, 0x7fff
+; GFX942-NEXT:    v_add3_u32 v1, v1, v0, s0
+; GFX942-NEXT:    v_or_b32_e32 v2, 0x400000, v0
+; GFX942-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
+; GFX942-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX10-LABEL: fmul_select_bf16_test5:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3613,6 +4406,25 @@ define bfloat @fmul_select_bf16_test6(bfloat %x, i32 %bool.arg1, i32 %bool.arg2)
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX942-LABEL: fmul_select_bf16_test6:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX942-NEXT:    v_mov_b32_e32 v1, 0x4040
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0xffffc100
+; GFX942-NEXT:    v_cndmask_b32_sdwa v1, v1, v2, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX942-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX942-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX942-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX942-NEXT:    s_movk_i32 s0, 0x7fff
+; GFX942-NEXT:    v_add3_u32 v1, v1, v0, s0
+; GFX942-NEXT:    v_or_b32_e32 v2, 0x400000, v0
+; GFX942-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
+; GFX942-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX10-LABEL: fmul_select_bf16_test6:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3748,6 +4560,25 @@ define bfloat @fmul_select_bf16_test7(bfloat %x, i32 %bool.arg1, i32 %bool.arg2)
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX942-LABEL: fmul_select_bf16_test7:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX942-NEXT:    v_mov_b32_e32 v1, 0xffffc080
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0x4100
+; GFX942-NEXT:    v_cndmask_b32_sdwa v1, v1, v2, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX942-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX942-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX942-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX942-NEXT:    s_movk_i32 s0, 0x7fff
+; GFX942-NEXT:    v_add3_u32 v1, v1, v0, s0
+; GFX942-NEXT:    v_or_b32_e32 v2, 0x400000, v0
+; GFX942-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
+; GFX942-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX10-LABEL: fmul_select_bf16_test7:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3883,6 +4714,25 @@ define bfloat @fmul_select_bf16_test8(bfloat %x, i32 %bool.arg1, i32 %bool.arg2)
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX942-LABEL: fmul_select_bf16_test8:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX942-NEXT:    v_mov_b32_e32 v1, 0xffff8000
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-NEXT:    v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX942-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX942-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX942-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX942-NEXT:    s_movk_i32 s0, 0x7fff
+; GFX942-NEXT:    v_add3_u32 v1, v1, v0, s0
+; GFX942-NEXT:    v_or_b32_e32 v2, 0x400000, v0
+; GFX942-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
+; GFX942-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX10-LABEL: fmul_select_bf16_test8:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4014,6 +4864,25 @@ define bfloat @fmul_select_bf16_test9(bfloat %x, i32 %bool.arg1, i32 %bool.arg2)
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX942-LABEL: fmul_select_bf16_test9:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX942-NEXT:    v_mov_b32_e32 v1, 0xffffc200
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0xffffc180
+; GFX942-NEXT:    v_cndmask_b32_sdwa v1, v1, v2, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX942-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX942-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX942-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX942-NEXT:    s_movk_i32 s0, 0x7fff
+; GFX942-NEXT:    v_add3_u32 v1, v1, v0, s0
+; GFX942-NEXT:    v_or_b32_e32 v2, 0x400000, v0
+; GFX942-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
+; GFX942-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX10-LABEL: fmul_select_bf16_test9:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4150,6 +5019,25 @@ define bfloat @fmul_select_bf16_test10_sel_log2val_pos65_pos56(bfloat %x, i32 %b
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX942-LABEL: fmul_select_bf16_test10_sel_log2val_pos65_pos56:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX942-NEXT:    v_mov_b32_e32 v1, 0xffffdb80
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0xffffe000
+; GFX942-NEXT:    v_cndmask_b32_sdwa v1, v1, v2, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX942-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX942-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX942-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX942-NEXT:    s_movk_i32 s0, 0x7fff
+; GFX942-NEXT:    v_add3_u32 v1, v1, v0, s0
+; GFX942-NEXT:    v_or_b32_e32 v2, 0x400000, v0
+; GFX942-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
+; GFX942-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX10-LABEL: fmul_select_bf16_test10_sel_log2val_pos65_pos56:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -4286,6 +5174,25 @@ define bfloat @fmul_select_bf16_test11_sel_log2val_neg22_pos25(bfloat %x, i32 %b
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX942-LABEL: fmul_select_bf16_test11_sel_log2val_neg22_pos25:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v2
+; GFX942-NEXT:    v_mov_b32_e32 v1, 0x4c00
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0x3480
+; GFX942-NEXT:    v_cndmask_b32_sdwa v1, v1, v2, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX942-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX942-NEXT:    v_mul_f32_e32 v0, v0, v1
+; GFX942-NEXT:    v_bfe_u32 v1, v0, 16, 1
+; GFX942-NEXT:    s_movk_i32 s0, 0x7fff
+; GFX942-NEXT:    v_add3_u32 v1, v1, v0, s0
+; GFX942-NEXT:    v_or_b32_e32 v2, 0x400000, v0
+; GFX942-NEXT:    v_cmp_u_f32_e32 vcc, v0, v0
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
+; GFX942-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX10-LABEL: fmul_select_bf16_test11_sel_log2val_neg22_pos25:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/dagcombine-select.ll b/llvm/test/CodeGen/AMDGPU/dagcombine-select.ll
index 85180a2dc6348..c429b1a32bde6 100644
--- a/llvm/test/CodeGen/AMDGPU/dagcombine-select.ll
+++ b/llvm/test/CodeGen/AMDGPU/dagcombine-select.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX942 %s
 
 define amdgpu_kernel void @select_and1(ptr addrspace(1) %p, i32 %x, i32 %y) {
 ; GCN-LABEL: select_and1:
@@ -56,24 +57,43 @@ define amdgpu_kernel void @select_and3(ptr addrspace(1) %p, i32 %x, i32 %y) {
 }
 
 define amdgpu_kernel void @select_and_v4(ptr addrspace(1) %p, i32 %x, <4 x i32> %y) {
-; GCN-LABEL: select_and_v4:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dword s8, s[4:5], 0x2c
-; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x34
-; GCN-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GCN-NEXT:    v_mov_b32_e32 v4, 0
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_cmp_gt_i32 s8, 10
-; GCN-NEXT:    s_cselect_b32 s3, s3, 0
-; GCN-NEXT:    s_cselect_b32 s2, s2, 0
-; GCN-NEXT:    s_cselect_b32 s1, s1, 0
-; GCN-NEXT:    s_cselect_b32 s0, s0, 0
-; GCN-NEXT:    v_mov_b32_e32 v0, s0
-; GCN-NEXT:    v_mov_b32_e32 v1, s1
-; GCN-NEXT:    v_mov_b32_e32 v2, s2
-; GCN-NEXT:    v_mov_b32_e32 v3, s3
-; GCN-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7]
-; GCN-NEXT:    s_endpgm
+; GFX9-LABEL: select_and_v4:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dword s8, s[4:5], 0x2c
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x34
+; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_cmp_gt_i32 s8, 10
+; GFX9-NEXT:    s_cselect_b32 s3, s3, 0
+; GFX9-NEXT:    s_cselect_b32 s2, s2, 0
+; GFX9-NEXT:    s_cselect_b32 s1, s1, 0
+; GFX9-NEXT:    s_cselect_b32 s0, s0, 0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    v_mov_b32_e32 v2, s2
+; GFX9-NEXT:    v_mov_b32_e32 v3, s3
+; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7]
+; GFX9-NEXT:    s_endpgm
+;
+; GFX942-LABEL: select_and_v4:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dword s8, s[4:5], 0x2c
+; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x34
+; GFX942-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_cmp_gt_i32 s8, 10
+; GFX942-NEXT:    s_cselect_b32 s3, s3, 0
+; GFX942-NEXT:    s_cselect_b32 s2, s2, 0
+; GFX942-NEXT:    s_cselect_b32 s1, s1, 0
+; GFX942-NEXT:    s_cselect_b32 s0, s0, 0
+; GFX942-NEXT:    v_mov_b32_e32 v2, s0
+; GFX942-NEXT:    v_mov_b32_e32 v3, s1
+; GFX942-NEXT:    v_mov_b32_e32 v4, s2
+; GFX942-NEXT:    v_mov_b32_e32 v5, s3
+; GFX942-NEXT:    global_store_dwordx4 v0, v[2:5], s[6:7]
+; GFX942-NEXT:    s_endpgm
   %c = icmp slt i32 %x, 11
   %s = select i1 %c, <4 x i32> zeroinitializer, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>
   %a = and <4 x i32> %s, %y
@@ -136,24 +156,43 @@ define amdgpu_kernel void @select_or3(ptr addrspace(1) %p, i32 %x, i32 %y) {
 }
 
 define amdgpu_kernel void @select_or_v4(ptr addrspace(1) %p, i32 %x, <4 x i32> %y) {
-; GCN-LABEL: select_or_v4:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dword s8, s[4:5], 0x2c
-; GCN-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x34
-; GCN-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
-; GCN-NEXT:    v_mov_b32_e32 v4, 0
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_cmp_lt_i32 s8, 11
-; GCN-NEXT:    s_cselect_b32 s3, s3, -1
-; GCN-NEXT:    s_cselect_b32 s2, s2, -1
-; GCN-NEXT:    s_cselect_b32 s1, s1, -1
-; GCN-NEXT:    s_cselect_b32 s0, s0, -1
-; GCN-NEXT:    v_mov_b32_e32 v0, s0
-; GCN-NEXT:    v_mov_b32_e32 v1, s1
-; GCN-NEXT:    v_mov_b32_e32 v2, s2
-; GCN-NEXT:    v_mov_b32_e32 v3, s3
-; GCN-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7]
-; GCN-NEXT:    s_endpgm
+; GFX9-LABEL: select_or_v4:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dword s8, s[4:5], 0x2c
+; GFX9-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x34
+; GFX9-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_cmp_lt_i32 s8, 11
+; GFX9-NEXT:    s_cselect_b32 s3, s3, -1
+; GFX9-NEXT:    s_cselect_b32 s2, s2, -1
+; GFX9-NEXT:    s_cselect_b32 s1, s1, -1
+; GFX9-NEXT:    s_cselect_b32 s0, s0, -1
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    v_mov_b32_e32 v2, s2
+; GFX9-NEXT:    v_mov_b32_e32 v3, s3
+; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7]
+; GFX9-NEXT:    s_endpgm
+;
+; GFX942-LABEL: select_or_v4:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dword s8, s[4:5], 0x2c
+; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x34
+; GFX942-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_cmp_lt_i32 s8, 11
+; GFX942-NEXT:    s_cselect_b32 s3, s3, -1
+; GFX942-NEXT:    s_cselect_b32 s2, s2, -1
+; GFX942-NEXT:    s_cselect_b32 s1, s1, -1
+; GFX942-NEXT:    s_cselect_b32 s0, s0, -1
+; GFX942-NEXT:    v_mov_b32_e32 v2, s0
+; GFX942-NEXT:    v_mov_b32_e32 v3, s1
+; GFX942-NEXT:    v_mov_b32_e32 v4, s2
+; GFX942-NEXT:    v_mov_b32_e32 v5, s3
+; GFX942-NEXT:    global_store_dwordx4 v0, v[2:5], s[6:7]
+; GFX942-NEXT:    s_endpgm
   %c = icmp slt i32 %x, 11
   %s = select i1 %c, <4 x i32> zeroinitializer, <4 x i32> <i32 -1, i32 -1, i32 -1, i32 -1>
   %a = or <4 x i32> %s, %y
@@ -236,23 +275,41 @@ define amdgpu_kernel void @sel_constants_sub_constant_sel_constants_v2i16(ptr ad
 }
 
 define amdgpu_kernel void @sel_constants_sub_constant_sel_constants_v4i32(ptr addrspace(1) %p, i1 %cond) {
-; GCN-LABEL: sel_constants_sub_constant_sel_constants_v4i32:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dword s2, s[4:5], 0x2c
-; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GCN-NEXT:    v_mov_b32_e32 v4, 0
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_bitcmp1_b32 s2, 0
-; GCN-NEXT:    s_cselect_b32 s2, 7, 14
-; GCN-NEXT:    s_cselect_b32 s3, 6, 10
-; GCN-NEXT:    s_cselect_b32 s4, 5, 6
-; GCN-NEXT:    s_cselect_b32 s5, 9, 2
-; GCN-NEXT:    v_mov_b32_e32 v0, s5
-; GCN-NEXT:    v_mov_b32_e32 v1, s4
-; GCN-NEXT:    v_mov_b32_e32 v2, s3
-; GCN-NEXT:    v_mov_b32_e32 v3, s2
-; GCN-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
-; GCN-NEXT:    s_endpgm
+; GFX9-LABEL: sel_constants_sub_constant_sel_constants_v4i32:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x2c
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_bitcmp1_b32 s2, 0
+; GFX9-NEXT:    s_cselect_b32 s2, 7, 14
+; GFX9-NEXT:    s_cselect_b32 s3, 6, 10
+; GFX9-NEXT:    s_cselect_b32 s4, 5, 6
+; GFX9-NEXT:    s_cselect_b32 s5, 9, 2
+; GFX9-NEXT:    v_mov_b32_e32 v0, s5
+; GFX9-NEXT:    v_mov_b32_e32 v1, s4
+; GFX9-NEXT:    v_mov_b32_e32 v2, s3
+; GFX9-NEXT:    v_mov_b32_e32 v3, s2
+; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX9-NEXT:    s_endpgm
+;
+; GFX942-LABEL: sel_constants_sub_constant_sel_constants_v4i32:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dword s2, s[4:5], 0x2c
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_bitcmp1_b32 s2, 0
+; GFX942-NEXT:    s_cselect_b32 s2, 7, 14
+; GFX942-NEXT:    s_cselect_b32 s3, 6, 10
+; GFX942-NEXT:    s_cselect_b32 s4, 5, 6
+; GFX942-NEXT:    s_cselect_b32 s5, 9, 2
+; GFX942-NEXT:    v_mov_b32_e32 v2, s5
+; GFX942-NEXT:    v_mov_b32_e32 v3, s4
+; GFX942-NEXT:    v_mov_b32_e32 v4, s3
+; GFX942-NEXT:    v_mov_b32_e32 v5, s2
+; GFX942-NEXT:    global_store_dwordx4 v0, v[2:5], s[0:1]
+; GFX942-NEXT:    s_endpgm
   %sel = select i1 %cond, <4 x i32> <i32 -4, i32 2, i32 3, i32 4>, <4 x i32> <i32 3, i32 1, i32 -1, i32 -3>
   %bo = sub <4 x i32> <i32 5, i32 7, i32 9, i32 11>, %sel
   store <4 x i32> %bo, ptr addrspace(1) %p, align 32
@@ -461,24 +518,43 @@ define amdgpu_kernel void @fsub_constant_sel_constants_v2f16(ptr addrspace(1) %p
 }
 
 define amdgpu_kernel void @fsub_constant_sel_constants_v4f32(ptr addrspace(1) %p, i1 %cond) {
-; GCN-LABEL: fsub_constant_sel_constants_v4f32:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_load_dword s2, s[4:5], 0x2c
-; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GCN-NEXT:    s_mov_b32 s3, 0x41500000
-; GCN-NEXT:    v_mov_b32_e32 v4, 0
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_bitcmp1_b32 s2, 0
-; GCN-NEXT:    s_cselect_b32 s2, s3, 0x40c00000
-; GCN-NEXT:    s_cselect_b32 s3, 0x41100000, 4.0
-; GCN-NEXT:    s_cselect_b32 s4, 0x40a00000, 2.0
-; GCN-NEXT:    s_cselect_b32 s5, 1.0, 0
-; GCN-NEXT:    v_mov_b32_e32 v0, s5
-; GCN-NEXT:    v_mov_b32_e32 v1, s4
-; GCN-NEXT:    v_mov_b32_e32 v2, s3
-; GCN-NEXT:    v_mov_b32_e32 v3, s2
-; GCN-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
-; GCN-NEXT:    s_endpgm
+; GFX9-LABEL: fsub_constant_sel_constants_v4f32:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_load_dword s2, s[4:5], 0x2c
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-NEXT:    s_mov_b32 s3, 0x41500000
+; GFX9-NEXT:    v_mov_b32_e32 v4, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_bitcmp1_b32 s2, 0
+; GFX9-NEXT:    s_cselect_b32 s2, s3, 0x40c00000
+; GFX9-NEXT:    s_cselect_b32 s3, 0x41100000, 4.0
+; GFX9-NEXT:    s_cselect_b32 s4, 0x40a00000, 2.0
+; GFX9-NEXT:    s_cselect_b32 s5, 1.0, 0
+; GFX9-NEXT:    v_mov_b32_e32 v0, s5
+; GFX9-NEXT:    v_mov_b32_e32 v1, s4
+; GFX9-NEXT:    v_mov_b32_e32 v2, s3
+; GFX9-NEXT:    v_mov_b32_e32 v3, s2
+; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX9-NEXT:    s_endpgm
+;
+; GFX942-LABEL: fsub_constant_sel_constants_v4f32:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dword s2, s[4:5], 0x2c
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    s_mov_b32 s3, 0x41500000
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_bitcmp1_b32 s2, 0
+; GFX942-NEXT:    s_cselect_b32 s2, s3, 0x40c00000
+; GFX942-NEXT:    s_cselect_b32 s3, 0x41100000, 4.0
+; GFX942-NEXT:    s_cselect_b32 s4, 0x40a00000, 2.0
+; GFX942-NEXT:    s_cselect_b32 s5, 1.0, 0
+; GFX942-NEXT:    v_mov_b32_e32 v2, s5
+; GFX942-NEXT:    v_mov_b32_e32 v3, s4
+; GFX942-NEXT:    v_mov_b32_e32 v4, s3
+; GFX942-NEXT:    v_mov_b32_e32 v5, s2
+; GFX942-NEXT:    global_store_dwordx4 v0, v[2:5], s[0:1]
+; GFX942-NEXT:    s_endpgm
   %sel = select i1 %cond, <4 x float> <float -2.0, float -3.0, float -4.0, float -5.0>, <4 x float> <float -1.0, float 0.0, float 1.0, float 2.0>
   %bo = fsub <4 x float> <float -1.0, float 2.0, float 5.0, float 8.0>, %sel
   store <4 x float> %bo, ptr addrspace(1) %p, align 32
diff --git a/llvm/test/CodeGen/AMDGPU/div_i128.ll b/llvm/test/CodeGen/AMDGPU/div_i128.ll
index 747affa928601..0b6ef4cbed259 100644
--- a/llvm/test/CodeGen/AMDGPU/div_i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/div_i128.ll
@@ -5,6 +5,12 @@
 ; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - %s | FileCheck -check-prefixes=GFX9-G %s
 ; RUN: llc -O0 -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - %s | FileCheck -check-prefixes=GFX9-G-O0 %s
 
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -o - %s | FileCheck -check-prefixes=GFX942 %s
+; RUN: llc -O0 -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -o - %s | FileCheck -check-prefixes=GFX942-O0 %s
+
+; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -o - %s | FileCheck -check-prefixes=GFX942-G %s
+; RUN: llc -O0 -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -o - %s | FileCheck -check-prefixes=GFX942-G-O0 %s
+
 define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-LABEL: v_sdiv_i128_vv:
 ; GFX9:       ; %bb.0: ; %_udiv-special-cases
@@ -2303,6 +2309,2093 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-G-O0-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-G-O0-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: v_sdiv_i128_vv:
+; GFX942:       ; %bb.0: ; %_udiv-special-cases
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_sub_co_u32_e32 v12, vcc, 0, v0
+; GFX942-NEXT:    v_ashrrev_i32_e32 v8, 31, v3
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_subb_co_u32_e32 v13, vcc, 0, v1, vcc
+; GFX942-NEXT:    v_mov_b32_e32 v17, 0
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_subb_co_u32_e32 v14, vcc, 0, v2, vcc
+; GFX942-NEXT:    v_ashrrev_i32_e32 v10, 31, v7
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_subb_co_u32_e32 v15, vcc, 0, v3, vcc
+; GFX942-NEXT:    v_cmp_gt_i64_e32 vcc, 0, v[2:3]
+; GFX942-NEXT:    s_mov_b64 s[2:3], 0x7f
+; GFX942-NEXT:    v_mov_b32_e32 v9, v8
+; GFX942-NEXT:    v_cndmask_b32_e32 v13, v1, v13, vcc
+; GFX942-NEXT:    v_cndmask_b32_e32 v12, v0, v12, vcc
+; GFX942-NEXT:    v_cndmask_b32_e32 v15, v3, v15, vcc
+; GFX942-NEXT:    v_cndmask_b32_e32 v14, v2, v14, vcc
+; GFX942-NEXT:    v_sub_co_u32_e32 v0, vcc, 0, v4
+; GFX942-NEXT:    v_mov_b32_e32 v11, v10
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_subb_co_u32_e32 v1, vcc, 0, v5, vcc
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_subb_co_u32_e32 v2, vcc, 0, v6, vcc
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_subb_co_u32_e32 v3, vcc, 0, v7, vcc
+; GFX942-NEXT:    v_cmp_gt_i64_e32 vcc, 0, v[6:7]
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
+; GFX942-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc
+; GFX942-NEXT:    v_cndmask_b32_e32 v2, v6, v2, vcc
+; GFX942-NEXT:    v_or_b32_e32 v5, v1, v3
+; GFX942-NEXT:    v_or_b32_e32 v4, v0, v2
+; GFX942-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[4:5]
+; GFX942-NEXT:    v_or_b32_e32 v5, v13, v15
+; GFX942-NEXT:    v_or_b32_e32 v4, v12, v14
+; GFX942-NEXT:    v_cmp_eq_u64_e64 s[0:1], 0, v[4:5]
+; GFX942-NEXT:    v_ffbh_u32_e32 v4, v2
+; GFX942-NEXT:    v_add_u32_e32 v4, 32, v4
+; GFX942-NEXT:    v_ffbh_u32_e32 v5, v3
+; GFX942-NEXT:    v_min_u32_e32 v6, v4, v5
+; GFX942-NEXT:    v_ffbh_u32_e32 v4, v0
+; GFX942-NEXT:    v_add_u32_e32 v4, 32, v4
+; GFX942-NEXT:    v_ffbh_u32_e32 v5, v1
+; GFX942-NEXT:    v_min_u32_e32 v16, v4, v5
+; GFX942-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT:    v_lshl_add_u64 v[4:5], v[16:17], 0, 64
+; GFX942-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[2:3]
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v6, v4, v6, vcc
+; GFX942-NEXT:    v_ffbh_u32_e32 v4, v14
+; GFX942-NEXT:    v_cndmask_b32_e64 v7, v5, 0, vcc
+; GFX942-NEXT:    v_add_u32_e32 v4, 32, v4
+; GFX942-NEXT:    v_ffbh_u32_e32 v5, v15
+; GFX942-NEXT:    v_min_u32_e32 v18, v4, v5
+; GFX942-NEXT:    v_ffbh_u32_e32 v4, v12
+; GFX942-NEXT:    v_add_u32_e32 v4, 32, v4
+; GFX942-NEXT:    v_ffbh_u32_e32 v5, v13
+; GFX942-NEXT:    v_min_u32_e32 v16, v4, v5
+; GFX942-NEXT:    v_lshl_add_u64 v[4:5], v[16:17], 0, 64
+; GFX942-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[14:15]
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v4, v4, v18, vcc
+; GFX942-NEXT:    v_cndmask_b32_e64 v5, v5, 0, vcc
+; GFX942-NEXT:    v_sub_co_u32_e32 v18, vcc, v6, v4
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_subb_co_u32_e32 v19, vcc, v7, v5, vcc
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_subb_co_u32_e32 v6, vcc, 0, v17, vcc
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_subb_co_u32_e32 v7, vcc, 0, v17, vcc
+; GFX942-NEXT:    v_cmp_lt_u64_e32 vcc, s[2:3], v[18:19]
+; GFX942-NEXT:    v_or_b32_e32 v17, v19, v7
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_cndmask_b32_e64 v4, 0, 1, vcc
+; GFX942-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[6:7]
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e64 v5, 0, 1, vcc
+; GFX942-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[6:7]
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
+; GFX942-NEXT:    v_and_b32_e32 v4, 1, v4
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v4
+; GFX942-NEXT:    v_xor_b32_e32 v4, 0x7f, v18
+; GFX942-NEXT:    s_or_b64 s[0:1], s[0:1], vcc
+; GFX942-NEXT:    v_or_b32_e32 v16, v4, v6
+; GFX942-NEXT:    s_xor_b64 s[2:3], s[0:1], -1
+; GFX942-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[16:17]
+; GFX942-NEXT:    v_cndmask_b32_e64 v5, v15, 0, s[0:1]
+; GFX942-NEXT:    v_cndmask_b32_e64 v4, v14, 0, s[0:1]
+; GFX942-NEXT:    v_cndmask_b32_e64 v17, v13, 0, s[0:1]
+; GFX942-NEXT:    v_cndmask_b32_e64 v16, v12, 0, s[0:1]
+; GFX942-NEXT:    s_and_b64 s[0:1], s[2:3], vcc
+; GFX942-NEXT:    s_and_saveexec_b64 s[4:5], s[0:1]
+; GFX942-NEXT:    s_cbranch_execz .LBB0_6
+; GFX942-NEXT:  ; %bb.1: ; %udiv-bb1
+; GFX942-NEXT:    v_add_co_u32_e32 v4, vcc, 1, v18
+; GFX942-NEXT:    v_mov_b64_e32 v[22:23], 0
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v19, vcc
+; GFX942-NEXT:    v_sub_u32_e32 v19, 0x7f, v18
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
+; GFX942-NEXT:    v_or_b32_e32 v16, v4, v6
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v7, vcc
+; GFX942-NEXT:    v_or_b32_e32 v17, v5, v7
+; GFX942-NEXT:    v_sub_u32_e32 v20, 64, v19
+; GFX942-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[16:17]
+; GFX942-NEXT:    v_lshlrev_b64 v[16:17], v19, v[14:15]
+; GFX942-NEXT:    v_lshrrev_b64 v[20:21], v20, v[12:13]
+; GFX942-NEXT:    v_or_b32_e32 v20, v16, v20
+; GFX942-NEXT:    v_sub_u32_e32 v16, 63, v18
+; GFX942-NEXT:    v_or_b32_e32 v21, v17, v21
+; GFX942-NEXT:    v_lshlrev_b64 v[16:17], v16, v[12:13]
+; GFX942-NEXT:    v_cmp_gt_u32_e64 s[0:1], 64, v19
+; GFX942-NEXT:    v_cmp_eq_u32_e64 s[2:3], 0, v19
+; GFX942-NEXT:    v_lshlrev_b64 v[18:19], v19, v[12:13]
+; GFX942-NEXT:    v_cndmask_b32_e64 v17, v17, v21, s[0:1]
+; GFX942-NEXT:    v_cndmask_b32_e64 v16, v16, v20, s[0:1]
+; GFX942-NEXT:    v_cndmask_b32_e64 v17, v17, v15, s[2:3]
+; GFX942-NEXT:    v_cndmask_b32_e64 v16, v16, v14, s[2:3]
+; GFX942-NEXT:    v_cndmask_b32_e64 v19, 0, v19, s[0:1]
+; GFX942-NEXT:    v_cndmask_b32_e64 v18, 0, v18, s[0:1]
+; GFX942-NEXT:    v_mov_b64_e32 v[20:21], 0
+; GFX942-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942-NEXT:    s_xor_b64 s[2:3], exec, s[0:1]
+; GFX942-NEXT:    s_cbranch_execz .LBB0_5
+; GFX942-NEXT:  ; %bb.2: ; %udiv-preheader
+; GFX942-NEXT:    v_sub_u32_e32 v22, 64, v4
+; GFX942-NEXT:    v_lshrrev_b64 v[20:21], v4, v[12:13]
+; GFX942-NEXT:    v_lshlrev_b64 v[22:23], v22, v[14:15]
+; GFX942-NEXT:    v_or_b32_e32 v22, v20, v22
+; GFX942-NEXT:    v_subrev_u32_e32 v20, 64, v4
+; GFX942-NEXT:    v_or_b32_e32 v23, v21, v23
+; GFX942-NEXT:    v_lshrrev_b64 v[20:21], v20, v[14:15]
+; GFX942-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v4
+; GFX942-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v4
+; GFX942-NEXT:    v_mov_b64_e32 v[28:29], 0
+; GFX942-NEXT:    v_cndmask_b32_e32 v21, v21, v23, vcc
+; GFX942-NEXT:    v_cndmask_b32_e64 v25, v21, v13, s[0:1]
+; GFX942-NEXT:    v_cndmask_b32_e32 v13, v20, v22, vcc
+; GFX942-NEXT:    v_cndmask_b32_e64 v24, v13, v12, s[0:1]
+; GFX942-NEXT:    v_lshrrev_b64 v[12:13], v4, v[14:15]
+; GFX942-NEXT:    v_cndmask_b32_e32 v27, 0, v13, vcc
+; GFX942-NEXT:    v_cndmask_b32_e32 v26, 0, v12, vcc
+; GFX942-NEXT:    v_add_co_u32_e32 v12, vcc, -1, v0
+; GFX942-NEXT:    v_mov_b64_e32 v[22:23], 0
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_addc_co_u32_e32 v13, vcc, -1, v1, vcc
+; GFX942-NEXT:    s_mov_b64 s[0:1], 0
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_addc_co_u32_e32 v14, vcc, -1, v2, vcc
+; GFX942-NEXT:    v_mov_b32_e32 v21, 0
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_addc_co_u32_e32 v15, vcc, -1, v3, vcc
+; GFX942-NEXT:  .LBB0_3: ; %udiv-do-while
+; GFX942-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-NEXT:    v_lshrrev_b32_e32 v20, 31, v25
+; GFX942-NEXT:    v_lshlrev_b64 v[24:25], 1, v[24:25]
+; GFX942-NEXT:    v_lshrrev_b32_e32 v32, 31, v17
+; GFX942-NEXT:    v_lshlrev_b64 v[26:27], 1, v[26:27]
+; GFX942-NEXT:    v_or_b32_e32 v24, v24, v32
+; GFX942-NEXT:    v_or_b32_e32 v26, v26, v20
+; GFX942-NEXT:    v_sub_co_u32_e32 v20, vcc, v12, v24
+; GFX942-NEXT:    v_lshlrev_b64 v[30:31], 1, v[18:19]
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_subb_co_u32_e32 v20, vcc, v13, v25, vcc
+; GFX942-NEXT:    v_lshrrev_b32_e32 v18, 31, v19
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_subb_co_u32_e32 v20, vcc, v14, v26, vcc
+; GFX942-NEXT:    v_lshlrev_b64 v[16:17], 1, v[16:17]
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_subb_co_u32_e32 v20, vcc, v15, v27, vcc
+; GFX942-NEXT:    v_or3_b32 v16, v16, v18, v22
+; GFX942-NEXT:    v_or_b32_e32 v18, v28, v30
+; GFX942-NEXT:    v_ashrrev_i32_e32 v28, 31, v20
+; GFX942-NEXT:    v_or_b32_e32 v19, v29, v31
+; GFX942-NEXT:    v_and_b32_e32 v20, 1, v28
+; GFX942-NEXT:    v_and_b32_e32 v29, v28, v3
+; GFX942-NEXT:    v_and_b32_e32 v30, v28, v2
+; GFX942-NEXT:    v_and_b32_e32 v31, v28, v1
+; GFX942-NEXT:    v_and_b32_e32 v28, v28, v0
+; GFX942-NEXT:    v_sub_co_u32_e32 v24, vcc, v24, v28
+; GFX942-NEXT:    v_or3_b32 v17, v17, 0, v23
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_subb_co_u32_e32 v25, vcc, v25, v31, vcc
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_subb_co_u32_e32 v26, vcc, v26, v30, vcc
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_subb_co_u32_e32 v27, vcc, v27, v29, vcc
+; GFX942-NEXT:    v_add_co_u32_e32 v4, vcc, -1, v4
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_addc_co_u32_e32 v5, vcc, -1, v5, vcc
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_addc_co_u32_e32 v6, vcc, -1, v6, vcc
+; GFX942-NEXT:    v_or_b32_e32 v28, v4, v6
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_addc_co_u32_e32 v7, vcc, -1, v7, vcc
+; GFX942-NEXT:    v_or_b32_e32 v29, v5, v7
+; GFX942-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[28:29]
+; GFX942-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT:    v_mov_b64_e32 v[28:29], v[20:21]
+; GFX942-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; GFX942-NEXT:    s_cbranch_execnz .LBB0_3
+; GFX942-NEXT:  ; %bb.4: ; %Flow
+; GFX942-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX942-NEXT:  .LBB0_5: ; %Flow2
+; GFX942-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX942-NEXT:    v_lshlrev_b64 v[0:1], 1, v[18:19]
+; GFX942-NEXT:    v_lshrrev_b32_e32 v4, 31, v19
+; GFX942-NEXT:    v_lshlrev_b64 v[2:3], 1, v[16:17]
+; GFX942-NEXT:    v_or3_b32 v5, v3, 0, v23
+; GFX942-NEXT:    v_or3_b32 v4, v2, v4, v22
+; GFX942-NEXT:    v_or_b32_e32 v17, v21, v1
+; GFX942-NEXT:    v_or_b32_e32 v16, v20, v0
+; GFX942-NEXT:  .LBB0_6: ; %Flow3
+; GFX942-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX942-NEXT:    v_xor_b32_e32 v2, v10, v8
+; GFX942-NEXT:    v_xor_b32_e32 v3, v11, v9
+; GFX942-NEXT:    v_xor_b32_e32 v0, v16, v2
+; GFX942-NEXT:    v_xor_b32_e32 v1, v17, v3
+; GFX942-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v2
+; GFX942-NEXT:    v_xor_b32_e32 v4, v4, v2
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX942-NEXT:    v_xor_b32_e32 v5, v5, v3
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_subb_co_u32_e32 v2, vcc, v4, v2, vcc
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_subb_co_u32_e32 v3, vcc, v5, v3, vcc
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-O0-LABEL: v_sdiv_i128_vv:
+; GFX942-O0:       ; %bb.0: ; %_udiv-special-cases
+; GFX942-O0-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-O0-NEXT:    s_xor_saveexec_b64 s[0:1], -1
+; GFX942-O0-NEXT:    scratch_store_dword off, v29, s32 offset:184 ; 4-byte Folded Spill
+; GFX942-O0-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX942-O0-NEXT:    v_mov_b32_e32 v20, v6
+; GFX942-O0-NEXT:    v_mov_b32_e32 v8, v4
+; GFX942-O0-NEXT:    v_mov_b32_e32 v10, v2
+; GFX942-O0-NEXT:    v_mov_b32_e32 v2, v0
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0
+; GFX942-O0-NEXT:    ; kill: def $vgpr20 killed $vgpr20 def $vgpr20_vgpr21 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v21, v7
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0
+; GFX942-O0-NEXT:    ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v9, v5
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0
+; GFX942-O0-NEXT:    ; kill: def $vgpr10 killed $vgpr10 def $vgpr10_vgpr11 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v11, v3
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0
+; GFX942-O0-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0_sgpr1
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0_sgpr1
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0_sgpr1
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0_sgpr1
+; GFX942-O0-NEXT:    v_mov_b32_e32 v1, v2
+; GFX942-O0-NEXT:    v_mov_b32_e32 v2, v3
+; GFX942-O0-NEXT:    v_mov_b32_e32 v14, v10
+; GFX942-O0-NEXT:    v_mov_b32_e32 v15, v11
+; GFX942-O0-NEXT:    s_mov_b64 s[2:3], 0
+; GFX942-O0-NEXT:    ; implicit-def: $vgpr29 : SGPR spill to VGPR lane
+; GFX942-O0-NEXT:    v_writelane_b32 v29, s2, 0
+; GFX942-O0-NEXT:    s_nop 1
+; GFX942-O0-NEXT:    v_writelane_b32 v29, s3, 1
+; GFX942-O0-NEXT:    s_mov_b32 s6, s2
+; GFX942-O0-NEXT:    v_writelane_b32 v29, s6, 2
+; GFX942-O0-NEXT:    s_mov_b32 s7, s3
+; GFX942-O0-NEXT:    v_writelane_b32 v29, s7, 3
+; GFX942-O0-NEXT:    v_sub_co_u32_e32 v6, vcc, s6, v1
+; GFX942-O0-NEXT:    v_mov_b32_e32 v0, s7
+; GFX942-O0-NEXT:    s_nop 0
+; GFX942-O0-NEXT:    v_subb_co_u32_e32 v3, vcc, v0, v2, vcc
+; GFX942-O0-NEXT:    v_mov_b32_e32 v0, s6
+; GFX942-O0-NEXT:    s_nop 0
+; GFX942-O0-NEXT:    v_subb_co_u32_e32 v4, vcc, v0, v14, vcc
+; GFX942-O0-NEXT:    v_mov_b32_e32 v0, s7
+; GFX942-O0-NEXT:    s_nop 0
+; GFX942-O0-NEXT:    v_subb_co_u32_e32 v0, vcc, v0, v15, vcc
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0
+; GFX942-O0-NEXT:    ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v7, v3
+; GFX942-O0-NEXT:    v_mov_b32_e32 v3, v7
+; GFX942-O0-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX942-O0-NEXT:    v_cmp_lt_i64_e64 s[0:1], v[10:11], s[0:1]
+; GFX942-O0-NEXT:    s_nop 1
+; GFX942-O0-NEXT:    v_cndmask_b32_e64 v3, v2, v3, s[0:1]
+; GFX942-O0-NEXT:    v_mov_b32_e32 v2, v6
+; GFX942-O0-NEXT:    v_cndmask_b32_e64 v2, v1, v2, s[0:1]
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr4
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr4
+; GFX942-O0-NEXT:    v_mov_b32_e32 v16, v2
+; GFX942-O0-NEXT:    v_mov_b32_e32 v17, v3
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr4
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr4
+; GFX942-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v5, v0
+; GFX942-O0-NEXT:    v_mov_b32_e32 v0, v5
+; GFX942-O0-NEXT:    v_cndmask_b32_e64 v1, v15, v0, s[0:1]
+; GFX942-O0-NEXT:    v_mov_b32_e32 v0, v4
+; GFX942-O0-NEXT:    v_cndmask_b32_e64 v0, v14, v0, s[0:1]
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0
+; GFX942-O0-NEXT:    v_mov_b32_e32 v4, v0
+; GFX942-O0-NEXT:    v_mov_b32_e32 v5, v1
+; GFX942-O0-NEXT:    v_mov_b32_e32 v6, v8
+; GFX942-O0-NEXT:    v_mov_b32_e32 v8, v9
+; GFX942-O0-NEXT:    v_mov_b32_e32 v11, v20
+; GFX942-O0-NEXT:    v_mov_b32_e32 v9, v21
+; GFX942-O0-NEXT:    v_sub_co_u32_e32 v18, vcc, s6, v6
+; GFX942-O0-NEXT:    v_mov_b32_e32 v7, s7
+; GFX942-O0-NEXT:    s_nop 0
+; GFX942-O0-NEXT:    v_subb_co_u32_e32 v10, vcc, v7, v8, vcc
+; GFX942-O0-NEXT:    v_mov_b32_e32 v7, s6
+; GFX942-O0-NEXT:    s_nop 0
+; GFX942-O0-NEXT:    v_subb_co_u32_e32 v12, vcc, v7, v11, vcc
+; GFX942-O0-NEXT:    v_mov_b32_e32 v7, s7
+; GFX942-O0-NEXT:    s_nop 0
+; GFX942-O0-NEXT:    v_subb_co_u32_e32 v7, vcc, v7, v9, vcc
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0
+; GFX942-O0-NEXT:    ; kill: def $vgpr18 killed $vgpr18 def $vgpr18_vgpr19 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v19, v10
+; GFX942-O0-NEXT:    v_mov_b32_e32 v10, v19
+; GFX942-O0-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX942-O0-NEXT:    v_cmp_lt_i64_e64 s[0:1], v[20:21], s[0:1]
+; GFX942-O0-NEXT:    s_nop 1
+; GFX942-O0-NEXT:    v_cndmask_b32_e64 v10, v8, v10, s[0:1]
+; GFX942-O0-NEXT:    v_mov_b32_e32 v8, v18
+; GFX942-O0-NEXT:    v_cndmask_b32_e64 v6, v6, v8, s[0:1]
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr4
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr4
+; GFX942-O0-NEXT:    v_mov_b32_e32 v18, v6
+; GFX942-O0-NEXT:    v_mov_b32_e32 v19, v10
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr4
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr4
+; GFX942-O0-NEXT:    ; kill: def $vgpr12 killed $vgpr12 def $vgpr12_vgpr13 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v13, v7
+; GFX942-O0-NEXT:    v_mov_b32_e32 v7, v13
+; GFX942-O0-NEXT:    v_cndmask_b32_e64 v8, v9, v7, s[0:1]
+; GFX942-O0-NEXT:    v_mov_b32_e32 v7, v12
+; GFX942-O0-NEXT:    v_cndmask_b32_e64 v7, v11, v7, s[0:1]
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0
+; GFX942-O0-NEXT:    v_mov_b32_e32 v12, v7
+; GFX942-O0-NEXT:    v_mov_b32_e32 v13, v8
+; GFX942-O0-NEXT:    v_xor_b32_e64 v9, v9, v15
+; GFX942-O0-NEXT:    v_xor_b32_e64 v14, v11, v14
+; GFX942-O0-NEXT:    ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v15, v9
+; GFX942-O0-NEXT:    s_mov_b32 s0, 63
+; GFX942-O0-NEXT:    v_ashrrev_i64 v[14:15], s0, v[14:15]
+; GFX942-O0-NEXT:    v_accvgpr_write_b32 a0, v15 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_write_b32 a1, v14 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_write_b32 a2, v15 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_write_b32 a3, v14 ; Reload Reuse
+; GFX942-O0-NEXT:    v_mov_b64_e32 v[14:15], v[12:13]
+; GFX942-O0-NEXT:    v_accvgpr_write_b32 a4, v15 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_write_b32 a5, v14 ; Reload Reuse
+; GFX942-O0-NEXT:    v_mov_b64_e32 v[14:15], v[18:19]
+; GFX942-O0-NEXT:    v_accvgpr_write_b32 a6, v15 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_write_b32 a7, v14 ; Reload Reuse
+; GFX942-O0-NEXT:    v_mov_b64_e32 v[14:15], v[4:5]
+; GFX942-O0-NEXT:    v_accvgpr_write_b32 a8, v15 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_write_b32 a9, v14 ; Reload Reuse
+; GFX942-O0-NEXT:    v_mov_b64_e32 v[14:15], v[16:17]
+; GFX942-O0-NEXT:    v_accvgpr_write_b32 a10, v15 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_write_b32 a11, v14 ; Reload Reuse
+; GFX942-O0-NEXT:    v_mov_b32_e32 v11, v13
+; GFX942-O0-NEXT:    v_mov_b32_e32 v9, v19
+; GFX942-O0-NEXT:    v_or_b32_e64 v9, v9, v11
+; GFX942-O0-NEXT:    v_mov_b32_e32 v14, v12
+; GFX942-O0-NEXT:    v_mov_b32_e32 v11, v18
+; GFX942-O0-NEXT:    v_or_b32_e64 v14, v11, v14
+; GFX942-O0-NEXT:    ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v15, v9
+; GFX942-O0-NEXT:    v_cmp_eq_u64_e64 s[0:1], v[14:15], s[2:3]
+; GFX942-O0-NEXT:    v_mov_b32_e32 v11, v5
+; GFX942-O0-NEXT:    v_mov_b32_e32 v9, v17
+; GFX942-O0-NEXT:    v_or_b32_e64 v9, v9, v11
+; GFX942-O0-NEXT:    v_mov_b32_e32 v14, v4
+; GFX942-O0-NEXT:    v_mov_b32_e32 v11, v16
+; GFX942-O0-NEXT:    v_or_b32_e64 v14, v11, v14
+; GFX942-O0-NEXT:    ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v15, v9
+; GFX942-O0-NEXT:    v_cmp_eq_u64_e64 s[4:5], v[14:15], s[2:3]
+; GFX942-O0-NEXT:    s_or_b64 s[0:1], s[0:1], s[4:5]
+; GFX942-O0-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX942-O0-NEXT:    v_cmp_ne_u64_e64 s[4:5], v[12:13], s[4:5]
+; GFX942-O0-NEXT:    v_ffbh_u32_e64 v7, v7
+; GFX942-O0-NEXT:    s_mov_b32 s11, 32
+; GFX942-O0-NEXT:    v_add_u32_e64 v7, v7, s11
+; GFX942-O0-NEXT:    v_ffbh_u32_e64 v8, v8
+; GFX942-O0-NEXT:    v_min_u32_e64 v8, v7, v8
+; GFX942-O0-NEXT:    s_mov_b32 s10, 0
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr8
+; GFX942-O0-NEXT:    v_mov_b32_e32 v7, s10
+; GFX942-O0-NEXT:    ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v9, v7
+; GFX942-O0-NEXT:    v_mov_b32_e32 v7, v9
+; GFX942-O0-NEXT:    v_ffbh_u32_e64 v6, v6
+; GFX942-O0-NEXT:    v_add_u32_e64 v6, v6, s11
+; GFX942-O0-NEXT:    v_ffbh_u32_e64 v10, v10
+; GFX942-O0-NEXT:    v_min_u32_e64 v10, v6, v10
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr8
+; GFX942-O0-NEXT:    v_mov_b32_e32 v6, s10
+; GFX942-O0-NEXT:    ; kill: def $vgpr10 killed $vgpr10 def $vgpr10_vgpr11 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v11, v6
+; GFX942-O0-NEXT:    s_mov_b64 s[8:9], 64
+; GFX942-O0-NEXT:    v_lshl_add_u64 v[10:11], v[10:11], 0, s[8:9]
+; GFX942-O0-NEXT:    v_mov_b32_e32 v6, v11
+; GFX942-O0-NEXT:    v_cndmask_b32_e64 v6, v6, v7, s[4:5]
+; GFX942-O0-NEXT:    ; kill: def $vgpr8 killed $vgpr8 killed $vgpr8_vgpr9 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v7, v10
+; GFX942-O0-NEXT:    v_cndmask_b32_e64 v8, v7, v8, s[4:5]
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr4
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr4
+; GFX942-O0-NEXT:    ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v9, v6
+; GFX942-O0-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX942-O0-NEXT:    v_cmp_ne_u64_e64 s[4:5], v[4:5], s[4:5]
+; GFX942-O0-NEXT:    v_ffbh_u32_e64 v4, v0
+; GFX942-O0-NEXT:    v_add_u32_e64 v4, v4, s11
+; GFX942-O0-NEXT:    v_ffbh_u32_e64 v5, v1
+; GFX942-O0-NEXT:    v_min_u32_e64 v6, v4, v5
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr12
+; GFX942-O0-NEXT:    v_mov_b32_e32 v4, s10
+; GFX942-O0-NEXT:    ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v7, v4
+; GFX942-O0-NEXT:    v_mov_b32_e32 v5, v7
+; GFX942-O0-NEXT:    v_ffbh_u32_e64 v4, v2
+; GFX942-O0-NEXT:    v_add_u32_e64 v4, v4, s11
+; GFX942-O0-NEXT:    v_ffbh_u32_e64 v10, v3
+; GFX942-O0-NEXT:    v_min_u32_e64 v10, v4, v10
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr11
+; GFX942-O0-NEXT:    v_mov_b32_e32 v4, s10
+; GFX942-O0-NEXT:    ; kill: def $vgpr10 killed $vgpr10 def $vgpr10_vgpr11 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v11, v4
+; GFX942-O0-NEXT:    v_lshl_add_u64 v[10:11], v[10:11], 0, s[8:9]
+; GFX942-O0-NEXT:    v_mov_b32_e32 v4, v11
+; GFX942-O0-NEXT:    v_cndmask_b32_e64 v4, v4, v5, s[4:5]
+; GFX942-O0-NEXT:    ; kill: def $vgpr6 killed $vgpr6 killed $vgpr6_vgpr7 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v5, v10
+; GFX942-O0-NEXT:    v_cndmask_b32_e64 v10, v5, v6, s[4:5]
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr4
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr4
+; GFX942-O0-NEXT:    ; kill: def $vgpr10 killed $vgpr10 def $vgpr10_vgpr11 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v11, v4
+; GFX942-O0-NEXT:    v_mov_b32_e32 v7, v10
+; GFX942-O0-NEXT:    v_mov_b32_e32 v4, v8
+; GFX942-O0-NEXT:    v_mov_b32_e32 v6, v11
+; GFX942-O0-NEXT:    v_mov_b32_e32 v5, v9
+; GFX942-O0-NEXT:    v_sub_co_u32_e32 v4, vcc, v4, v7
+; GFX942-O0-NEXT:    s_nop 1
+; GFX942-O0-NEXT:    v_subb_co_u32_e32 v7, vcc, v5, v6, vcc
+; GFX942-O0-NEXT:    v_mov_b32_e32 v6, s6
+; GFX942-O0-NEXT:    v_mov_b32_e32 v5, s6
+; GFX942-O0-NEXT:    v_subb_co_u32_e32 v8, vcc, v5, v6, vcc
+; GFX942-O0-NEXT:    v_mov_b32_e32 v6, s7
+; GFX942-O0-NEXT:    v_mov_b32_e32 v5, s7
+; GFX942-O0-NEXT:    v_subb_co_u32_e32 v6, vcc, v5, v6, vcc
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr4
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr4
+; GFX942-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v5, v7
+; GFX942-O0-NEXT:    v_accvgpr_write_b32 a12, v5 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_write_b32 a13, v4 ; Reload Reuse
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr4
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr4
+; GFX942-O0-NEXT:    ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v9, v6
+; GFX942-O0-NEXT:    v_accvgpr_write_b32 a14, v9 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_write_b32 a15, v8 ; Reload Reuse
+; GFX942-O0-NEXT:    v_cmp_eq_u64_e64 s[4:5], v[8:9], s[2:3]
+; GFX942-O0-NEXT:    s_mov_b64 s[8:9], 0x7f
+; GFX942-O0-NEXT:    v_cmp_gt_u64_e64 s[10:11], v[4:5], s[8:9]
+; GFX942-O0-NEXT:    s_nop 1
+; GFX942-O0-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s[10:11]
+; GFX942-O0-NEXT:    v_cmp_ne_u64_e64 s[10:11], v[8:9], s[2:3]
+; GFX942-O0-NEXT:    s_nop 1
+; GFX942-O0-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s[10:11]
+; GFX942-O0-NEXT:    v_cndmask_b32_e64 v6, v6, v7, s[4:5]
+; GFX942-O0-NEXT:    v_and_b32_e64 v6, 1, v6
+; GFX942-O0-NEXT:    v_cmp_eq_u32_e64 s[4:5], v6, 1
+; GFX942-O0-NEXT:    s_or_b64 s[4:5], s[0:1], s[4:5]
+; GFX942-O0-NEXT:    s_mov_b64 s[10:11], -1
+; GFX942-O0-NEXT:    s_mov_b64 s[0:1], s[4:5]
+; GFX942-O0-NEXT:    s_xor_b64 s[0:1], s[0:1], s[10:11]
+; GFX942-O0-NEXT:    v_mov_b32_e32 v6, v5
+; GFX942-O0-NEXT:    s_mov_b32 s10, s9
+; GFX942-O0-NEXT:    v_xor_b32_e64 v6, v6, s10
+; GFX942-O0-NEXT:    ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9
+; GFX942-O0-NEXT:    v_xor_b32_e64 v4, v4, s8
+; GFX942-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v5, v6
+; GFX942-O0-NEXT:    v_mov_b32_e32 v6, v5
+; GFX942-O0-NEXT:    v_mov_b32_e32 v7, v9
+; GFX942-O0-NEXT:    v_or_b32_e64 v6, v6, v7
+; GFX942-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v5, v8
+; GFX942-O0-NEXT:    v_or_b32_e64 v4, v4, v5
+; GFX942-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v5, v6
+; GFX942-O0-NEXT:    v_cmp_ne_u64_e64 s[2:3], v[4:5], s[2:3]
+; GFX942-O0-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-O0-NEXT:    v_mov_b32_e32 v4, s7
+; GFX942-O0-NEXT:    v_cndmask_b32_e64 v4, v1, v4, s[8:9]
+; GFX942-O0-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-O0-NEXT:    v_mov_b32_e32 v1, s6
+; GFX942-O0-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[8:9]
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr8
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr8
+; GFX942-O0-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v1, v4
+; GFX942-O0-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-O0-NEXT:    v_mov_b32_e32 v4, s7
+; GFX942-O0-NEXT:    v_cndmask_b32_e64 v4, v3, v4, s[8:9]
+; GFX942-O0-NEXT:    v_mov_b32_e32 v3, s6
+; GFX942-O0-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[4:5]
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr4
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr4
+; GFX942-O0-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v3, v4
+; GFX942-O0-NEXT:    s_and_b64 s[2:3], s[0:1], s[2:3]
+; GFX942-O0-NEXT:    v_accvgpr_write_b32 a16, v3 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_write_b32 a17, v2 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_write_b32 a18, v1 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_write_b32 a19, v0 ; Reload Reuse
+; GFX942-O0-NEXT:    s_mov_b64 s[0:1], exec
+; GFX942-O0-NEXT:    v_writelane_b32 v29, s0, 4
+; GFX942-O0-NEXT:    s_nop 1
+; GFX942-O0-NEXT:    v_writelane_b32 v29, s1, 5
+; GFX942-O0-NEXT:    s_or_saveexec_b64 s[16:17], -1
+; GFX942-O0-NEXT:    v_accvgpr_write_b32 a20, v29 ; Reload Reuse
+; GFX942-O0-NEXT:    s_mov_b64 exec, s[16:17]
+; GFX942-O0-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
+; GFX942-O0-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX942-O0-NEXT:    s_cbranch_execz .LBB0_3
+; GFX942-O0-NEXT:    s_branch .LBB0_8
+; GFX942-O0-NEXT:  .LBB0_1: ; %Flow
+; GFX942-O0-NEXT:    s_or_saveexec_b64 s[16:17], -1
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v29, a20 ; Reload Reuse
+; GFX942-O0-NEXT:    s_mov_b64 exec, s[16:17]
+; GFX942-O0-NEXT:    v_readlane_b32 s0, v29, 6
+; GFX942-O0-NEXT:    v_readlane_b32 s1, v29, 7
+; GFX942-O0-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX942-O0-NEXT:  ; %bb.2: ; %Flow
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v7, a21 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v6, a22 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v5, a23 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v4, a24 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v3, a25 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v2, a26 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v1, a27 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v0, a28 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_write_b32 a29, v7 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_write_b32 a30, v6 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_write_b32 a31, v5 ; Reload Reuse
+; GFX942-O0-NEXT:    scratch_store_dword off, v4, s32 offset:16 ; 4-byte Folded Spill
+; GFX942-O0-NEXT:    scratch_store_dwordx2 off, v[2:3], s32 offset:8 ; 8-byte Folded Spill
+; GFX942-O0-NEXT:    scratch_store_dwordx2 off, v[0:1], s32 ; 8-byte Folded Spill
+; GFX942-O0-NEXT:    s_branch .LBB0_5
+; GFX942-O0-NEXT:  .LBB0_3: ; %Flow2
+; GFX942-O0-NEXT:    s_or_saveexec_b64 s[16:17], -1
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v29, a20 ; Reload Reuse
+; GFX942-O0-NEXT:    s_mov_b64 exec, s[16:17]
+; GFX942-O0-NEXT:    v_readlane_b32 s0, v29, 4
+; GFX942-O0-NEXT:    v_readlane_b32 s1, v29, 5
+; GFX942-O0-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v1, a16 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v0, a17 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v3, a18 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v2, a19 ; Reload Reuse
+; GFX942-O0-NEXT:    scratch_store_dwordx2 off, v[2:3], s32 offset:32 ; 8-byte Folded Spill
+; GFX942-O0-NEXT:    scratch_store_dwordx2 off, v[0:1], s32 offset:24 ; 8-byte Folded Spill
+; GFX942-O0-NEXT:    s_branch .LBB0_9
+; GFX942-O0-NEXT:  .LBB0_4: ; %udiv-loop-exit
+; GFX942-O0-NEXT:    scratch_load_dwordx2 v[6:7], off, s32 offset:40 ; 8-byte Folded Reload
+; GFX942-O0-NEXT:    scratch_load_dwordx2 v[8:9], off, s32 offset:48 ; 8-byte Folded Reload
+; GFX942-O0-NEXT:    scratch_load_dwordx2 v[0:1], off, s32 offset:56 ; 8-byte Folded Reload
+; GFX942-O0-NEXT:    scratch_load_dwordx2 v[4:5], off, s32 offset:64 ; 8-byte Folded Reload
+; GFX942-O0-NEXT:    s_mov_b32 s0, 1
+; GFX942-O0-NEXT:    s_waitcnt vmcnt(1)
+; GFX942-O0-NEXT:    v_lshlrev_b64 v[2:3], s0, v[0:1]
+; GFX942-O0-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-O0-NEXT:    v_lshlrev_b64 v[10:11], s0, v[4:5]
+; GFX942-O0-NEXT:    s_mov_b32 s0, 63
+; GFX942-O0-NEXT:    v_lshrrev_b64 v[0:1], s0, v[0:1]
+; GFX942-O0-NEXT:    v_mov_b32_e32 v5, v1
+; GFX942-O0-NEXT:    v_mov_b32_e32 v4, v11
+; GFX942-O0-NEXT:    v_mov_b32_e32 v12, v9
+; GFX942-O0-NEXT:    v_or3_b32 v4, v4, v5, v12
+; GFX942-O0-NEXT:    v_mov_b32_e32 v1, v0
+; GFX942-O0-NEXT:    v_mov_b32_e32 v0, v10
+; GFX942-O0-NEXT:    v_mov_b32_e32 v5, v8
+; GFX942-O0-NEXT:    v_or3_b32 v0, v0, v1, v5
+; GFX942-O0-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v1, v4
+; GFX942-O0-NEXT:    v_mov_b32_e32 v5, v3
+; GFX942-O0-NEXT:    v_mov_b32_e32 v4, v7
+; GFX942-O0-NEXT:    v_or_b32_e64 v4, v4, v5
+; GFX942-O0-NEXT:    v_mov_b32_e32 v3, v2
+; GFX942-O0-NEXT:    v_mov_b32_e32 v2, v6
+; GFX942-O0-NEXT:    v_or_b32_e64 v2, v2, v3
+; GFX942-O0-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v3, v4
+; GFX942-O0-NEXT:    v_accvgpr_write_b32 a16, v3 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_write_b32 a17, v2 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_write_b32 a18, v1 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_write_b32 a19, v0 ; Reload Reuse
+; GFX942-O0-NEXT:    s_branch .LBB0_3
+; GFX942-O0-NEXT:  .LBB0_5: ; %Flow1
+; GFX942-O0-NEXT:    s_or_saveexec_b64 s[16:17], -1
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v29, a20 ; Reload Reuse
+; GFX942-O0-NEXT:    s_mov_b64 exec, s[16:17]
+; GFX942-O0-NEXT:    v_readlane_b32 s0, v29, 8
+; GFX942-O0-NEXT:    v_readlane_b32 s1, v29, 9
+; GFX942-O0-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v1, a29 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v0, a30 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v3, a31 ; Reload Reuse
+; GFX942-O0-NEXT:    scratch_load_dword v2, off, s32 offset:16 ; 4-byte Folded Reload
+; GFX942-O0-NEXT:    scratch_load_dwordx2 v[4:5], off, s32 offset:8 ; 8-byte Folded Reload
+; GFX942-O0-NEXT:    scratch_load_dwordx2 v[6:7], off, s32 ; 8-byte Folded Reload
+; GFX942-O0-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-O0-NEXT:    scratch_store_dwordx2 off, v[6:7], s32 offset:48 ; 8-byte Folded Spill
+; GFX942-O0-NEXT:    scratch_store_dwordx2 off, v[4:5], s32 offset:40 ; 8-byte Folded Spill
+; GFX942-O0-NEXT:    scratch_store_dwordx2 off, v[2:3], s32 offset:64 ; 8-byte Folded Spill
+; GFX942-O0-NEXT:    scratch_store_dwordx2 off, v[0:1], s32 offset:56 ; 8-byte Folded Spill
+; GFX942-O0-NEXT:    s_branch .LBB0_4
+; GFX942-O0-NEXT:  .LBB0_6: ; %udiv-do-while
+; GFX942-O0-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-O0-NEXT:    s_or_saveexec_b64 s[16:17], -1
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v29, a20 ; Reload Reuse
+; GFX942-O0-NEXT:    s_mov_b64 exec, s[16:17]
+; GFX942-O0-NEXT:    v_readlane_b32 s2, v29, 10
+; GFX942-O0-NEXT:    v_readlane_b32 s3, v29, 11
+; GFX942-O0-NEXT:    scratch_load_dwordx2 v[6:7], off, s32 offset:72 ; 8-byte Folded Reload
+; GFX942-O0-NEXT:    scratch_load_dwordx2 v[0:1], off, s32 offset:80 ; 8-byte Folded Reload
+; GFX942-O0-NEXT:    scratch_load_dwordx2 v[26:27], off, s32 offset:88 ; 8-byte Folded Reload
+; GFX942-O0-NEXT:    scratch_load_dwordx2 v[2:3], off, s32 offset:96 ; 8-byte Folded Reload
+; GFX942-O0-NEXT:    scratch_load_dwordx2 v[16:17], off, s32 offset:104 ; 8-byte Folded Reload
+; GFX942-O0-NEXT:    scratch_load_dwordx2 v[8:9], off, s32 offset:112 ; 8-byte Folded Reload
+; GFX942-O0-NEXT:    scratch_load_dwordx2 v[24:25], off, s32 offset:120 ; 8-byte Folded Reload
+; GFX942-O0-NEXT:    scratch_load_dwordx2 v[10:11], off, s32 offset:128 ; 8-byte Folded Reload
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v19, a6 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v18, a7 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v21, a4 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v20, a5 ; Reload Reuse
+; GFX942-O0-NEXT:    scratch_load_dwordx2 v[14:15], off, s32 offset:136 ; 8-byte Folded Reload
+; GFX942-O0-NEXT:    scratch_load_dwordx2 v[22:23], off, s32 offset:144 ; 8-byte Folded Reload
+; GFX942-O0-NEXT:    s_mov_b32 s0, 63
+; GFX942-O0-NEXT:    s_waitcnt vmcnt(6)
+; GFX942-O0-NEXT:    v_lshrrev_b64 v[12:13], s0, v[2:3]
+; GFX942-O0-NEXT:    v_mov_b32_e32 v5, v13
+; GFX942-O0-NEXT:    s_mov_b32 s1, 1
+; GFX942-O0-NEXT:    v_lshlrev_b64 v[26:27], s1, v[26:27]
+; GFX942-O0-NEXT:    v_mov_b32_e32 v4, v27
+; GFX942-O0-NEXT:    v_or_b32_e64 v4, v4, v5
+; GFX942-O0-NEXT:    ; kill: def $vgpr12 killed $vgpr12 killed $vgpr12_vgpr13 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v5, v26
+; GFX942-O0-NEXT:    v_or_b32_e64 v12, v5, v12
+; GFX942-O0-NEXT:    ; kill: def $vgpr12 killed $vgpr12 def $vgpr12_vgpr13 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v13, v4
+; GFX942-O0-NEXT:    v_lshlrev_b64 v[26:27], s1, v[2:3]
+; GFX942-O0-NEXT:    v_lshrrev_b64 v[4:5], s0, v[6:7]
+; GFX942-O0-NEXT:    v_mov_b32_e32 v2, v27
+; GFX942-O0-NEXT:    v_mov_b32_e32 v3, v5
+; GFX942-O0-NEXT:    v_or_b32_e64 v2, v2, v3
+; GFX942-O0-NEXT:    v_mov_b32_e32 v3, v26
+; GFX942-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec
+; GFX942-O0-NEXT:    v_or_b32_e64 v4, v3, v4
+; GFX942-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v5, v2
+; GFX942-O0-NEXT:    v_lshlrev_b64 v[2:3], s1, v[0:1]
+; GFX942-O0-NEXT:    v_lshlrev_b64 v[26:27], s1, v[6:7]
+; GFX942-O0-NEXT:    v_lshrrev_b64 v[0:1], s0, v[0:1]
+; GFX942-O0-NEXT:    v_mov_b32_e32 v7, v1
+; GFX942-O0-NEXT:    v_mov_b32_e32 v6, v27
+; GFX942-O0-NEXT:    s_waitcnt vmcnt(3)
+; GFX942-O0-NEXT:    v_mov_b32_e32 v28, v25
+; GFX942-O0-NEXT:    v_or3_b32 v6, v6, v7, v28
+; GFX942-O0-NEXT:    v_mov_b32_e32 v1, v0
+; GFX942-O0-NEXT:    v_mov_b32_e32 v0, v26
+; GFX942-O0-NEXT:    v_mov_b32_e32 v7, v24
+; GFX942-O0-NEXT:    v_or3_b32 v0, v0, v1, v7
+; GFX942-O0-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v1, v6
+; GFX942-O0-NEXT:    v_mov_b32_e32 v7, v3
+; GFX942-O0-NEXT:    s_waitcnt vmcnt(2)
+; GFX942-O0-NEXT:    v_mov_b32_e32 v6, v11
+; GFX942-O0-NEXT:    v_or_b32_e64 v6, v6, v7
+; GFX942-O0-NEXT:    v_mov_b32_e32 v3, v2
+; GFX942-O0-NEXT:    v_mov_b32_e32 v2, v10
+; GFX942-O0-NEXT:    v_or_b32_e64 v2, v2, v3
+; GFX942-O0-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v3, v6
+; GFX942-O0-NEXT:    v_mov_b32_e32 v6, v4
+; GFX942-O0-NEXT:    v_mov_b32_e32 v10, v5
+; GFX942-O0-NEXT:    v_mov_b32_e32 v4, v12
+; GFX942-O0-NEXT:    v_mov_b32_e32 v5, v13
+; GFX942-O0-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-O0-NEXT:    v_mov_b32_e32 v13, v22
+; GFX942-O0-NEXT:    v_mov_b32_e32 v12, v23
+; GFX942-O0-NEXT:    v_mov_b32_e32 v11, v14
+; GFX942-O0-NEXT:    v_mov_b32_e32 v7, v15
+; GFX942-O0-NEXT:    v_sub_co_u32_e32 v13, vcc, v13, v6
+; GFX942-O0-NEXT:    s_nop 1
+; GFX942-O0-NEXT:    v_subb_co_u32_e32 v12, vcc, v12, v10, vcc
+; GFX942-O0-NEXT:    s_nop 1
+; GFX942-O0-NEXT:    v_subb_co_u32_e32 v12, vcc, v11, v4, vcc
+; GFX942-O0-NEXT:    s_nop 1
+; GFX942-O0-NEXT:    v_subb_co_u32_e32 v7, vcc, v7, v5, vcc
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr1
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr1
+; GFX942-O0-NEXT:    ; kill: def $vgpr12 killed $vgpr12 def $vgpr12_vgpr13 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v13, v7
+; GFX942-O0-NEXT:    v_ashrrev_i64 v[14:15], s0, v[12:13]
+; GFX942-O0-NEXT:    v_mov_b32_e32 v7, v15
+; GFX942-O0-NEXT:    s_mov_b64 s[0:1], 1
+; GFX942-O0-NEXT:    s_mov_b32 s4, s1
+; GFX942-O0-NEXT:    v_and_b32_e64 v12, v7, s4
+; GFX942-O0-NEXT:    v_mov_b32_e32 v11, v14
+; GFX942-O0-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; GFX942-O0-NEXT:    v_and_b32_e64 v14, v11, s0
+; GFX942-O0-NEXT:    ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v15, v12
+; GFX942-O0-NEXT:    v_mov_b64_e32 v[12:13], 0
+; GFX942-O0-NEXT:    v_mov_b32_e32 v22, v21
+; GFX942-O0-NEXT:    v_and_b32_e64 v22, v7, v22
+; GFX942-O0-NEXT:    v_and_b32_e64 v20, v11, v20
+; GFX942-O0-NEXT:    ; kill: def $vgpr20 killed $vgpr20 def $vgpr20_vgpr21 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v21, v22
+; GFX942-O0-NEXT:    v_mov_b32_e32 v22, v19
+; GFX942-O0-NEXT:    v_and_b32_e64 v7, v7, v22
+; GFX942-O0-NEXT:    v_and_b32_e64 v22, v11, v18
+; GFX942-O0-NEXT:    ; kill: def $vgpr22 killed $vgpr22 def $vgpr22_vgpr23 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v23, v7
+; GFX942-O0-NEXT:    v_mov_b32_e32 v19, v22
+; GFX942-O0-NEXT:    v_mov_b32_e32 v18, v23
+; GFX942-O0-NEXT:    v_mov_b32_e32 v11, v20
+; GFX942-O0-NEXT:    v_mov_b32_e32 v7, v21
+; GFX942-O0-NEXT:    v_sub_co_u32_e32 v6, vcc, v6, v19
+; GFX942-O0-NEXT:    s_nop 1
+; GFX942-O0-NEXT:    v_subb_co_u32_e32 v10, vcc, v10, v18, vcc
+; GFX942-O0-NEXT:    s_nop 1
+; GFX942-O0-NEXT:    v_subb_co_u32_e32 v4, vcc, v4, v11, vcc
+; GFX942-O0-NEXT:    s_nop 1
+; GFX942-O0-NEXT:    v_subb_co_u32_e32 v7, vcc, v5, v7, vcc
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0
+; GFX942-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v5, v7
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0
+; GFX942-O0-NEXT:    ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v7, v10
+; GFX942-O0-NEXT:    v_mov_b32_e32 v11, v8
+; GFX942-O0-NEXT:    ; kill: def $vgpr9 killed $vgpr9 killed $vgpr8_vgpr9 killed $exec
+; GFX942-O0-NEXT:    s_mov_b64 s[4:5], -1
+; GFX942-O0-NEXT:    s_mov_b32 s1, s4
+; GFX942-O0-NEXT:    s_mov_b32 s0, s5
+; GFX942-O0-NEXT:    v_mov_b32_e32 v10, v16
+; GFX942-O0-NEXT:    v_mov_b32_e32 v8, v17
+; GFX942-O0-NEXT:    v_mov_b32_e32 v16, s1
+; GFX942-O0-NEXT:    v_add_co_u32_e32 v20, vcc, v11, v16
+; GFX942-O0-NEXT:    v_mov_b32_e32 v11, s0
+; GFX942-O0-NEXT:    s_nop 0
+; GFX942-O0-NEXT:    v_addc_co_u32_e32 v9, vcc, v9, v11, vcc
+; GFX942-O0-NEXT:    v_mov_b32_e32 v11, s1
+; GFX942-O0-NEXT:    s_nop 0
+; GFX942-O0-NEXT:    v_addc_co_u32_e32 v16, vcc, v10, v11, vcc
+; GFX942-O0-NEXT:    v_mov_b32_e32 v10, s0
+; GFX942-O0-NEXT:    s_nop 0
+; GFX942-O0-NEXT:    v_addc_co_u32_e32 v8, vcc, v8, v10, vcc
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0
+; GFX942-O0-NEXT:    ; kill: def $vgpr20 killed $vgpr20 def $vgpr20_vgpr21 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v21, v9
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0
+; GFX942-O0-NEXT:    ; kill: def $vgpr16 killed $vgpr16 def $vgpr16_vgpr17 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v17, v8
+; GFX942-O0-NEXT:    v_mov_b64_e32 v[8:9], v[16:17]
+; GFX942-O0-NEXT:    v_mov_b64_e32 v[10:11], v[20:21]
+; GFX942-O0-NEXT:    v_mov_b32_e32 v19, v17
+; GFX942-O0-NEXT:    v_mov_b32_e32 v18, v21
+; GFX942-O0-NEXT:    v_or_b32_e64 v18, v18, v19
+; GFX942-O0-NEXT:    v_mov_b32_e32 v17, v16
+; GFX942-O0-NEXT:    v_mov_b32_e32 v16, v20
+; GFX942-O0-NEXT:    v_or_b32_e64 v16, v16, v17
+; GFX942-O0-NEXT:    ; kill: def $vgpr16 killed $vgpr16 def $vgpr16_vgpr17 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v17, v18
+; GFX942-O0-NEXT:    v_cmp_eq_u64_e64 s[0:1], v[16:17], v[12:13]
+; GFX942-O0-NEXT:    s_or_b64 s[0:1], s[0:1], s[2:3]
+; GFX942-O0-NEXT:    v_mov_b64_e32 v[16:17], v[2:3]
+; GFX942-O0-NEXT:    v_accvgpr_write_b32 a21, v17 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_write_b32 a22, v16 ; Reload Reuse
+; GFX942-O0-NEXT:    v_mov_b64_e32 v[16:17], v[0:1]
+; GFX942-O0-NEXT:    v_accvgpr_write_b32 a23, v17 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_write_b32 a24, v16 ; Reload Reuse
+; GFX942-O0-NEXT:    v_mov_b64_e32 v[16:17], v[14:15]
+; GFX942-O0-NEXT:    v_accvgpr_write_b32 a25, v17 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_write_b32 a26, v16 ; Reload Reuse
+; GFX942-O0-NEXT:    v_mov_b64_e32 v[16:17], v[12:13]
+; GFX942-O0-NEXT:    v_accvgpr_write_b32 a27, v17 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_write_b32 a28, v16 ; Reload Reuse
+; GFX942-O0-NEXT:    s_mov_b64 s[2:3], s[0:1]
+; GFX942-O0-NEXT:    v_writelane_b32 v29, s2, 6
+; GFX942-O0-NEXT:    s_nop 1
+; GFX942-O0-NEXT:    v_writelane_b32 v29, s3, 7
+; GFX942-O0-NEXT:    s_mov_b64 s[2:3], s[0:1]
+; GFX942-O0-NEXT:    v_writelane_b32 v29, s2, 10
+; GFX942-O0-NEXT:    s_nop 1
+; GFX942-O0-NEXT:    v_writelane_b32 v29, s3, 11
+; GFX942-O0-NEXT:    s_or_saveexec_b64 s[16:17], -1
+; GFX942-O0-NEXT:    v_accvgpr_write_b32 a20, v29 ; Reload Reuse
+; GFX942-O0-NEXT:    s_mov_b64 exec, s[16:17]
+; GFX942-O0-NEXT:    scratch_store_dwordx2 off, v[14:15], s32 offset:128 ; 8-byte Folded Spill
+; GFX942-O0-NEXT:    scratch_store_dwordx2 off, v[12:13], s32 offset:120 ; 8-byte Folded Spill
+; GFX942-O0-NEXT:    scratch_store_dwordx2 off, v[10:11], s32 offset:112 ; 8-byte Folded Spill
+; GFX942-O0-NEXT:    scratch_store_dwordx2 off, v[8:9], s32 offset:104 ; 8-byte Folded Spill
+; GFX942-O0-NEXT:    scratch_store_dwordx2 off, v[6:7], s32 offset:96 ; 8-byte Folded Spill
+; GFX942-O0-NEXT:    scratch_store_dwordx2 off, v[4:5], s32 offset:88 ; 8-byte Folded Spill
+; GFX942-O0-NEXT:    scratch_store_dwordx2 off, v[2:3], s32 offset:80 ; 8-byte Folded Spill
+; GFX942-O0-NEXT:    scratch_store_dwordx2 off, v[0:1], s32 offset:72 ; 8-byte Folded Spill
+; GFX942-O0-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; GFX942-O0-NEXT:    s_cbranch_execnz .LBB0_6
+; GFX942-O0-NEXT:    s_branch .LBB0_1
+; GFX942-O0-NEXT:  .LBB0_7: ; %udiv-preheader
+; GFX942-O0-NEXT:    s_or_saveexec_b64 s[16:17], -1
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v29, a20 ; Reload Reuse
+; GFX942-O0-NEXT:    s_mov_b64 exec, s[16:17]
+; GFX942-O0-NEXT:    scratch_load_dwordx2 v[0:1], off, s32 offset:152 ; 8-byte Folded Reload
+; GFX942-O0-NEXT:    scratch_load_dwordx2 v[2:3], off, s32 offset:160 ; 8-byte Folded Reload
+; GFX942-O0-NEXT:    scratch_load_dwordx2 v[8:9], off, s32 offset:168 ; 8-byte Folded Reload
+; GFX942-O0-NEXT:    scratch_load_dwordx2 v[10:11], off, s32 offset:176 ; 8-byte Folded Reload
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v17, a4 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v16, a5 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v13, a6 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v12, a7 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v15, a8 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v14, a9 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v19, a10 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v18, a11 ; Reload Reuse
+; GFX942-O0-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-O0-NEXT:    v_mov_b32_e32 v4, v10
+; GFX942-O0-NEXT:    v_lshrrev_b64 v[6:7], v4, v[18:19]
+; GFX942-O0-NEXT:    v_mov_b32_e32 v5, v7
+; GFX942-O0-NEXT:    s_mov_b32 s0, 64
+; GFX942-O0-NEXT:    v_sub_u32_e64 v20, s0, v4
+; GFX942-O0-NEXT:    v_lshlrev_b64 v[20:21], v20, v[14:15]
+; GFX942-O0-NEXT:    v_mov_b32_e32 v22, v21
+; GFX942-O0-NEXT:    v_or_b32_e64 v5, v5, v22
+; GFX942-O0-NEXT:    ; kill: def $vgpr6 killed $vgpr6 killed $vgpr6_vgpr7 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v7, v20
+; GFX942-O0-NEXT:    v_or_b32_e64 v6, v6, v7
+; GFX942-O0-NEXT:    ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v7, v5
+; GFX942-O0-NEXT:    v_mov_b32_e32 v22, v7
+; GFX942-O0-NEXT:    v_cmp_lt_u32_e64 s[2:3], v4, s0
+; GFX942-O0-NEXT:    v_sub_u32_e64 v5, v4, s0
+; GFX942-O0-NEXT:    v_lshrrev_b64 v[20:21], v5, v[14:15]
+; GFX942-O0-NEXT:    v_mov_b32_e32 v5, v21
+; GFX942-O0-NEXT:    v_cndmask_b32_e64 v5, v5, v22, s[2:3]
+; GFX942-O0-NEXT:    s_mov_b32 s0, 0
+; GFX942-O0-NEXT:    v_cmp_eq_u32_e64 s[0:1], v4, s0
+; GFX942-O0-NEXT:    v_mov_b32_e32 v22, v19
+; GFX942-O0-NEXT:    s_nop 0
+; GFX942-O0-NEXT:    v_cndmask_b32_e64 v5, v5, v22, s[0:1]
+; GFX942-O0-NEXT:    v_mov_b32_e32 v7, v6
+; GFX942-O0-NEXT:    v_mov_b32_e32 v6, v20
+; GFX942-O0-NEXT:    v_cndmask_b32_e64 v6, v6, v7, s[2:3]
+; GFX942-O0-NEXT:    v_mov_b32_e32 v7, v18
+; GFX942-O0-NEXT:    v_cndmask_b32_e64 v6, v6, v7, s[0:1]
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0
+; GFX942-O0-NEXT:    ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v7, v5
+; GFX942-O0-NEXT:    v_lshrrev_b64 v[4:5], v4, v[14:15]
+; GFX942-O0-NEXT:    v_mov_b32_e32 v15, v5
+; GFX942-O0-NEXT:    s_mov_b64 s[0:1], 0
+; GFX942-O0-NEXT:    s_mov_b32 s4, s1
+; GFX942-O0-NEXT:    v_mov_b32_e32 v14, s4
+; GFX942-O0-NEXT:    v_cndmask_b32_e64 v14, v14, v15, s[2:3]
+; GFX942-O0-NEXT:    v_mov_b32_e32 v5, v4
+; GFX942-O0-NEXT:    s_mov_b32 s4, s0
+; GFX942-O0-NEXT:    v_mov_b32_e32 v4, s4
+; GFX942-O0-NEXT:    v_cndmask_b32_e64 v4, v4, v5, s[2:3]
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr2
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr2
+; GFX942-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v5, v14
+; GFX942-O0-NEXT:    v_mov_b32_e32 v15, v12
+; GFX942-O0-NEXT:    v_mov_b32_e32 v12, v13
+; GFX942-O0-NEXT:    s_mov_b64 s[4:5], -1
+; GFX942-O0-NEXT:    s_mov_b32 s3, s4
+; GFX942-O0-NEXT:    s_mov_b32 s2, s5
+; GFX942-O0-NEXT:    v_mov_b32_e32 v14, v16
+; GFX942-O0-NEXT:    v_mov_b32_e32 v13, v17
+; GFX942-O0-NEXT:    v_mov_b32_e32 v16, s3
+; GFX942-O0-NEXT:    v_add_co_u32_e32 v16, vcc, v15, v16
+; GFX942-O0-NEXT:    v_mov_b32_e32 v15, s2
+; GFX942-O0-NEXT:    s_nop 0
+; GFX942-O0-NEXT:    v_addc_co_u32_e32 v12, vcc, v12, v15, vcc
+; GFX942-O0-NEXT:    v_mov_b32_e32 v15, s3
+; GFX942-O0-NEXT:    s_nop 0
+; GFX942-O0-NEXT:    v_addc_co_u32_e32 v18, vcc, v14, v15, vcc
+; GFX942-O0-NEXT:    v_mov_b32_e32 v14, s2
+; GFX942-O0-NEXT:    s_nop 0
+; GFX942-O0-NEXT:    v_addc_co_u32_e32 v13, vcc, v13, v14, vcc
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr2
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr2
+; GFX942-O0-NEXT:    ; kill: def $vgpr18 killed $vgpr18 def $vgpr18_vgpr19 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v19, v13
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr2
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr2
+; GFX942-O0-NEXT:    ; kill: def $vgpr16 killed $vgpr16 def $vgpr16_vgpr17 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v17, v12
+; GFX942-O0-NEXT:    v_mov_b64_e32 v[12:13], s[0:1]
+; GFX942-O0-NEXT:    v_mov_b64_e32 v[14:15], s[0:1]
+; GFX942-O0-NEXT:    scratch_store_dwordx2 off, v[18:19], s32 offset:136 ; 8-byte Folded Spill
+; GFX942-O0-NEXT:    scratch_store_dwordx2 off, v[16:17], s32 offset:144 ; 8-byte Folded Spill
+; GFX942-O0-NEXT:    v_writelane_b32 v29, s0, 10
+; GFX942-O0-NEXT:    s_nop 1
+; GFX942-O0-NEXT:    v_writelane_b32 v29, s1, 11
+; GFX942-O0-NEXT:    s_or_saveexec_b64 s[16:17], -1
+; GFX942-O0-NEXT:    v_accvgpr_write_b32 a20, v29 ; Reload Reuse
+; GFX942-O0-NEXT:    s_mov_b64 exec, s[16:17]
+; GFX942-O0-NEXT:    scratch_store_dwordx2 off, v[14:15], s32 offset:128 ; 8-byte Folded Spill
+; GFX942-O0-NEXT:    scratch_store_dwordx2 off, v[12:13], s32 offset:120 ; 8-byte Folded Spill
+; GFX942-O0-NEXT:    scratch_store_dwordx2 off, v[10:11], s32 offset:112 ; 8-byte Folded Spill
+; GFX942-O0-NEXT:    scratch_store_dwordx2 off, v[8:9], s32 offset:104 ; 8-byte Folded Spill
+; GFX942-O0-NEXT:    scratch_store_dwordx2 off, v[6:7], s32 offset:96 ; 8-byte Folded Spill
+; GFX942-O0-NEXT:    scratch_store_dwordx2 off, v[4:5], s32 offset:88 ; 8-byte Folded Spill
+; GFX942-O0-NEXT:    scratch_store_dwordx2 off, v[2:3], s32 offset:80 ; 8-byte Folded Spill
+; GFX942-O0-NEXT:    scratch_store_dwordx2 off, v[0:1], s32 offset:72 ; 8-byte Folded Spill
+; GFX942-O0-NEXT:    s_branch .LBB0_6
+; GFX942-O0-NEXT:  .LBB0_8: ; %udiv-bb1
+; GFX942-O0-NEXT:    s_or_saveexec_b64 s[16:17], -1
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v29, a20 ; Reload Reuse
+; GFX942-O0-NEXT:    s_mov_b64 exec, s[16:17]
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v7, a10 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v6, a11 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v11, a8 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v10, a9 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v5, a14 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v4, a15 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v1, a12 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v0, a13 ; Reload Reuse
+; GFX942-O0-NEXT:    s_mov_b64 s[2:3], 1
+; GFX942-O0-NEXT:    s_mov_b32 s1, s2
+; GFX942-O0-NEXT:    v_mov_b32_e32 v3, v0
+; GFX942-O0-NEXT:    s_mov_b32 s0, s3
+; GFX942-O0-NEXT:    s_mov_b64 s[2:3], 0
+; GFX942-O0-NEXT:    s_mov_b32 s4, s2
+; GFX942-O0-NEXT:    s_mov_b32 s5, s3
+; GFX942-O0-NEXT:    v_mov_b32_e32 v0, v4
+; GFX942-O0-NEXT:    v_mov_b32_e32 v2, v5
+; GFX942-O0-NEXT:    v_mov_b32_e32 v4, s1
+; GFX942-O0-NEXT:    v_add_co_u32_e32 v8, vcc, v3, v4
+; GFX942-O0-NEXT:    v_mov_b32_e32 v4, s0
+; GFX942-O0-NEXT:    s_nop 0
+; GFX942-O0-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v4, vcc
+; GFX942-O0-NEXT:    v_mov_b32_e32 v4, s4
+; GFX942-O0-NEXT:    s_nop 0
+; GFX942-O0-NEXT:    v_addc_co_u32_e32 v0, vcc, v0, v4, vcc
+; GFX942-O0-NEXT:    v_mov_b32_e32 v4, s5
+; GFX942-O0-NEXT:    s_nop 0
+; GFX942-O0-NEXT:    v_addc_co_u32_e32 v2, vcc, v2, v4, vcc
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0
+; GFX942-O0-NEXT:    ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v9, v1
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0
+; GFX942-O0-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v1, v2
+; GFX942-O0-NEXT:    v_mov_b64_e32 v[4:5], v[0:1]
+; GFX942-O0-NEXT:    scratch_store_dwordx2 off, v[4:5], s32 offset:168 ; 8-byte Folded Spill
+; GFX942-O0-NEXT:    v_mov_b64_e32 v[4:5], v[8:9]
+; GFX942-O0-NEXT:    scratch_store_dwordx2 off, v[4:5], s32 offset:176 ; 8-byte Folded Spill
+; GFX942-O0-NEXT:    s_mov_b32 s0, 0x7f
+; GFX942-O0-NEXT:    v_sub_u32_e64 v2, s0, v3
+; GFX942-O0-NEXT:    v_lshlrev_b64 v[4:5], v2, v[10:11]
+; GFX942-O0-NEXT:    v_mov_b32_e32 v12, v5
+; GFX942-O0-NEXT:    s_mov_b32 s0, 64
+; GFX942-O0-NEXT:    v_sub_u32_e64 v13, s0, v2
+; GFX942-O0-NEXT:    v_lshrrev_b64 v[14:15], v13, v[6:7]
+; GFX942-O0-NEXT:    v_mov_b32_e32 v13, v15
+; GFX942-O0-NEXT:    v_or_b32_e64 v12, v12, v13
+; GFX942-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v5, v14
+; GFX942-O0-NEXT:    v_or_b32_e64 v4, v4, v5
+; GFX942-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v5, v12
+; GFX942-O0-NEXT:    v_mov_b32_e32 v14, v5
+; GFX942-O0-NEXT:    v_cmp_lt_u32_e64 s[0:1], v2, s0
+; GFX942-O0-NEXT:    s_mov_b32 s6, 63
+; GFX942-O0-NEXT:    v_sub_u32_e64 v3, s6, v3
+; GFX942-O0-NEXT:    v_lshlrev_b64 v[12:13], v3, v[6:7]
+; GFX942-O0-NEXT:    v_mov_b32_e32 v3, v13
+; GFX942-O0-NEXT:    v_cndmask_b32_e64 v3, v3, v14, s[0:1]
+; GFX942-O0-NEXT:    s_mov_b32 s6, 0
+; GFX942-O0-NEXT:    v_cmp_eq_u32_e64 s[6:7], v2, s6
+; GFX942-O0-NEXT:    v_mov_b32_e32 v14, v11
+; GFX942-O0-NEXT:    s_nop 0
+; GFX942-O0-NEXT:    v_cndmask_b32_e64 v3, v3, v14, s[6:7]
+; GFX942-O0-NEXT:    v_mov_b32_e32 v5, v4
+; GFX942-O0-NEXT:    v_mov_b32_e32 v4, v12
+; GFX942-O0-NEXT:    v_cndmask_b32_e64 v4, v4, v5, s[0:1]
+; GFX942-O0-NEXT:    v_mov_b32_e32 v5, v10
+; GFX942-O0-NEXT:    v_cndmask_b32_e64 v4, v4, v5, s[6:7]
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr6
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr6
+; GFX942-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v5, v3
+; GFX942-O0-NEXT:    v_lshlrev_b64 v[6:7], v2, v[6:7]
+; GFX942-O0-NEXT:    v_mov_b32_e32 v3, v7
+; GFX942-O0-NEXT:    v_mov_b32_e32 v2, s5
+; GFX942-O0-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[0:1]
+; GFX942-O0-NEXT:    ; kill: def $vgpr6 killed $vgpr6 killed $vgpr6_vgpr7 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v3, s4
+; GFX942-O0-NEXT:    v_cndmask_b32_e64 v6, v3, v6, s[0:1]
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0
+; GFX942-O0-NEXT:    ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v7, v2
+; GFX942-O0-NEXT:    scratch_store_dwordx2 off, v[6:7], s32 offset:160 ; 8-byte Folded Spill
+; GFX942-O0-NEXT:    scratch_store_dwordx2 off, v[4:5], s32 offset:152 ; 8-byte Folded Spill
+; GFX942-O0-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-O0-NEXT:    v_mov_b32_e32 v2, v9
+; GFX942-O0-NEXT:    v_or_b32_e64 v2, v2, v3
+; GFX942-O0-NEXT:    v_mov_b32_e32 v1, v0
+; GFX942-O0-NEXT:    v_mov_b32_e32 v0, v8
+; GFX942-O0-NEXT:    v_or_b32_e64 v0, v0, v1
+; GFX942-O0-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v1, v2
+; GFX942-O0-NEXT:    v_cmp_ne_u64_e64 s[0:1], v[0:1], s[2:3]
+; GFX942-O0-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX942-O0-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
+; GFX942-O0-NEXT:    v_accvgpr_write_b32 a29, v7 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_write_b32 a30, v6 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_write_b32 a31, v5 ; Reload Reuse
+; GFX942-O0-NEXT:    scratch_store_dword off, v4, s32 offset:16 ; 4-byte Folded Spill
+; GFX942-O0-NEXT:    scratch_store_dwordx2 off, v[2:3], s32 offset:8 ; 8-byte Folded Spill
+; GFX942-O0-NEXT:    scratch_store_dwordx2 off, v[0:1], s32 ; 8-byte Folded Spill
+; GFX942-O0-NEXT:    s_mov_b64 s[2:3], exec
+; GFX942-O0-NEXT:    s_and_b64 s[0:1], s[2:3], s[0:1]
+; GFX942-O0-NEXT:    s_xor_b64 s[2:3], s[0:1], s[2:3]
+; GFX942-O0-NEXT:    v_writelane_b32 v29, s2, 8
+; GFX942-O0-NEXT:    s_nop 1
+; GFX942-O0-NEXT:    v_writelane_b32 v29, s3, 9
+; GFX942-O0-NEXT:    s_or_saveexec_b64 s[16:17], -1
+; GFX942-O0-NEXT:    v_accvgpr_write_b32 a20, v29 ; Reload Reuse
+; GFX942-O0-NEXT:    s_mov_b64 exec, s[16:17]
+; GFX942-O0-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX942-O0-NEXT:    s_cbranch_execz .LBB0_5
+; GFX942-O0-NEXT:    s_branch .LBB0_7
+; GFX942-O0-NEXT:  .LBB0_9: ; %udiv-end
+; GFX942-O0-NEXT:    scratch_load_dwordx2 v[0:1], off, s32 offset:24 ; 8-byte Folded Reload
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v7, a2 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v6, a3 ; Reload Reuse
+; GFX942-O0-NEXT:    scratch_load_dwordx2 v[8:9], off, s32 offset:32 ; 8-byte Folded Reload
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v5, a0 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v4, a1 ; Reload Reuse
+; GFX942-O0-NEXT:    v_mov_b32_e32 v2, v5
+; GFX942-O0-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-O0-NEXT:    v_mov_b32_e32 v3, v9
+; GFX942-O0-NEXT:    v_xor_b32_e64 v3, v3, v2
+; GFX942-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v5, v8
+; GFX942-O0-NEXT:    v_xor_b32_e64 v8, v5, v4
+; GFX942-O0-NEXT:    ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v9, v3
+; GFX942-O0-NEXT:    v_mov_b32_e32 v5, v7
+; GFX942-O0-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-O0-NEXT:    v_xor_b32_e64 v3, v3, v5
+; GFX942-O0-NEXT:    v_mov_b32_e32 v7, v6
+; GFX942-O0-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX942-O0-NEXT:    v_xor_b32_e64 v0, v0, v7
+; GFX942-O0-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v1, v3
+; GFX942-O0-NEXT:    v_mov_b32_e32 v6, v0
+; GFX942-O0-NEXT:    v_mov_b32_e32 v0, v1
+; GFX942-O0-NEXT:    v_mov_b32_e32 v3, v8
+; GFX942-O0-NEXT:    v_mov_b32_e32 v1, v9
+; GFX942-O0-NEXT:    v_sub_co_u32_e32 v6, vcc, v6, v7
+; GFX942-O0-NEXT:    s_nop 1
+; GFX942-O0-NEXT:    v_subb_co_u32_e32 v0, vcc, v0, v5, vcc
+; GFX942-O0-NEXT:    s_nop 1
+; GFX942-O0-NEXT:    v_subb_co_u32_e32 v4, vcc, v3, v4, vcc
+; GFX942-O0-NEXT:    s_nop 1
+; GFX942-O0-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v2, vcc
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0
+; GFX942-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v5, v1
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0
+; GFX942-O0-NEXT:    ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v7, v0
+; GFX942-O0-NEXT:    v_mov_b32_e32 v0, v6
+; GFX942-O0-NEXT:    v_mov_b32_e32 v2, v4
+; GFX942-O0-NEXT:    s_mov_b32 s0, 32
+; GFX942-O0-NEXT:    v_lshrrev_b64 v[6:7], s0, v[6:7]
+; GFX942-O0-NEXT:    v_mov_b32_e32 v1, v6
+; GFX942-O0-NEXT:    v_lshrrev_b64 v[4:5], s0, v[4:5]
+; GFX942-O0-NEXT:    v_mov_b32_e32 v3, v4
+; GFX942-O0-NEXT:    s_xor_saveexec_b64 s[0:1], -1
+; GFX942-O0-NEXT:    scratch_load_dword v29, off, s32 offset:184 ; 4-byte Folded Reload
+; GFX942-O0-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX942-O0-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-O0-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-G-LABEL: v_sdiv_i128_vv:
+; GFX942-G:       ; %bb.0: ; %_udiv-special-cases
+; GFX942-G-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-G-NEXT:    v_ashrrev_i32_e32 v20, 31, v3
+; GFX942-G-NEXT:    v_xor_b32_e32 v0, v20, v0
+; GFX942-G-NEXT:    v_xor_b32_e32 v1, v20, v1
+; GFX942-G-NEXT:    v_sub_co_u32_e32 v10, vcc, v0, v20
+; GFX942-G-NEXT:    v_xor_b32_e32 v2, v20, v2
+; GFX942-G-NEXT:    s_nop 0
+; GFX942-G-NEXT:    v_subb_co_u32_e32 v11, vcc, v1, v20, vcc
+; GFX942-G-NEXT:    v_ashrrev_i32_e32 v21, 31, v7
+; GFX942-G-NEXT:    v_xor_b32_e32 v3, v20, v3
+; GFX942-G-NEXT:    v_subb_co_u32_e32 v12, vcc, v2, v20, vcc
+; GFX942-G-NEXT:    v_xor_b32_e32 v0, v21, v4
+; GFX942-G-NEXT:    s_nop 0
+; GFX942-G-NEXT:    v_subb_co_u32_e32 v13, vcc, v3, v20, vcc
+; GFX942-G-NEXT:    v_xor_b32_e32 v1, v21, v5
+; GFX942-G-NEXT:    v_sub_co_u32_e32 v22, vcc, v0, v21
+; GFX942-G-NEXT:    v_xor_b32_e32 v2, v21, v6
+; GFX942-G-NEXT:    s_nop 0
+; GFX942-G-NEXT:    v_subb_co_u32_e32 v23, vcc, v1, v21, vcc
+; GFX942-G-NEXT:    v_xor_b32_e32 v3, v21, v7
+; GFX942-G-NEXT:    s_nop 0
+; GFX942-G-NEXT:    v_subb_co_u32_e32 v8, vcc, v2, v21, vcc
+; GFX942-G-NEXT:    v_or_b32_e32 v0, v22, v8
+; GFX942-G-NEXT:    s_nop 0
+; GFX942-G-NEXT:    v_subb_co_u32_e32 v9, vcc, v3, v21, vcc
+; GFX942-G-NEXT:    v_or_b32_e32 v1, v23, v9
+; GFX942-G-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[0:1]
+; GFX942-G-NEXT:    v_or_b32_e32 v0, v10, v12
+; GFX942-G-NEXT:    v_or_b32_e32 v1, v11, v13
+; GFX942-G-NEXT:    v_cmp_eq_u64_e64 s[0:1], 0, v[0:1]
+; GFX942-G-NEXT:    v_ffbh_u32_e32 v1, v22
+; GFX942-G-NEXT:    v_ffbh_u32_e32 v0, v23
+; GFX942-G-NEXT:    v_add_u32_e32 v1, 32, v1
+; GFX942-G-NEXT:    v_ffbh_u32_e32 v2, v8
+; GFX942-G-NEXT:    v_min_u32_e32 v0, v0, v1
+; GFX942-G-NEXT:    v_ffbh_u32_e32 v1, v9
+; GFX942-G-NEXT:    v_add_u32_e32 v2, 32, v2
+; GFX942-G-NEXT:    v_add_u32_e32 v0, 64, v0
+; GFX942-G-NEXT:    v_min_u32_e32 v1, v1, v2
+; GFX942-G-NEXT:    v_cmp_eq_u64_e64 s[2:3], 0, v[8:9]
+; GFX942-G-NEXT:    v_ffbh_u32_e32 v2, v10
+; GFX942-G-NEXT:    v_add_u32_e32 v2, 32, v2
+; GFX942-G-NEXT:    v_cndmask_b32_e64 v0, v1, v0, s[2:3]
+; GFX942-G-NEXT:    v_ffbh_u32_e32 v1, v11
+; GFX942-G-NEXT:    v_ffbh_u32_e32 v3, v12
+; GFX942-G-NEXT:    v_min_u32_e32 v1, v1, v2
+; GFX942-G-NEXT:    v_ffbh_u32_e32 v2, v13
+; GFX942-G-NEXT:    v_add_u32_e32 v3, 32, v3
+; GFX942-G-NEXT:    v_add_u32_e32 v1, 64, v1
+; GFX942-G-NEXT:    v_min_u32_e32 v2, v2, v3
+; GFX942-G-NEXT:    v_cmp_eq_u64_e64 s[2:3], 0, v[12:13]
+; GFX942-G-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-G-NEXT:    s_mov_b64 s[4:5], 0
+; GFX942-G-NEXT:    v_cndmask_b32_e64 v1, v2, v1, s[2:3]
+; GFX942-G-NEXT:    v_sub_co_u32_e64 v4, s[2:3], v0, v1
+; GFX942-G-NEXT:    v_mov_b64_e32 v[0:1], 0x7f
+; GFX942-G-NEXT:    s_nop 0
+; GFX942-G-NEXT:    v_subb_co_u32_e64 v5, s[2:3], 0, 0, s[2:3]
+; GFX942-G-NEXT:    s_nop 1
+; GFX942-G-NEXT:    v_subb_co_u32_e64 v2, s[2:3], 0, 0, s[2:3]
+; GFX942-G-NEXT:    s_nop 1
+; GFX942-G-NEXT:    v_subb_co_u32_e64 v3, s[2:3], 0, 0, s[2:3]
+; GFX942-G-NEXT:    v_cmp_gt_u64_e64 s[2:3], v[4:5], v[0:1]
+; GFX942-G-NEXT:    v_or_b32_e32 v15, v5, v3
+; GFX942-G-NEXT:    s_nop 0
+; GFX942-G-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[2:3]
+; GFX942-G-NEXT:    v_cmp_lt_u64_e64 s[2:3], 0, v[2:3]
+; GFX942-G-NEXT:    s_nop 1
+; GFX942-G-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[2:3]
+; GFX942-G-NEXT:    v_cmp_eq_u64_e64 s[2:3], 0, v[2:3]
+; GFX942-G-NEXT:    s_nop 1
+; GFX942-G-NEXT:    v_cndmask_b32_e64 v0, v1, v0, s[2:3]
+; GFX942-G-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[0:1]
+; GFX942-G-NEXT:    v_or_b32_e32 v16, v1, v0
+; GFX942-G-NEXT:    v_xor_b32_e32 v0, 0x7f, v4
+; GFX942-G-NEXT:    v_or_b32_e32 v14, v0, v2
+; GFX942-G-NEXT:    v_and_b32_e32 v0, 1, v16
+; GFX942-G-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
+; GFX942-G-NEXT:    s_nop 1
+; GFX942-G-NEXT:    v_cndmask_b32_e64 v0, v10, 0, vcc
+; GFX942-G-NEXT:    v_cndmask_b32_e64 v1, v11, 0, vcc
+; GFX942-G-NEXT:    v_cndmask_b32_e64 v6, v12, 0, vcc
+; GFX942-G-NEXT:    v_cndmask_b32_e64 v7, v13, 0, vcc
+; GFX942-G-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[14:15]
+; GFX942-G-NEXT:    s_nop 1
+; GFX942-G-NEXT:    v_cndmask_b32_e64 v14, 0, 1, vcc
+; GFX942-G-NEXT:    v_or_b32_e32 v14, v16, v14
+; GFX942-G-NEXT:    v_and_b32_e32 v14, 1, v14
+; GFX942-G-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v14
+; GFX942-G-NEXT:    s_xor_b64 s[0:1], vcc, -1
+; GFX942-G-NEXT:    s_and_saveexec_b64 s[2:3], s[0:1]
+; GFX942-G-NEXT:    s_cbranch_execz .LBB0_6
+; GFX942-G-NEXT:  ; %bb.1: ; %udiv-bb1
+; GFX942-G-NEXT:    v_add_co_u32_e32 v0, vcc, 1, v4
+; GFX942-G-NEXT:    s_mov_b64 s[6:7], s[4:5]
+; GFX942-G-NEXT:    s_nop 0
+; GFX942-G-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v5, vcc
+; GFX942-G-NEXT:    s_nop 1
+; GFX942-G-NEXT:    v_addc_co_u32_e32 v2, vcc, 0, v2, vcc
+; GFX942-G-NEXT:    s_nop 1
+; GFX942-G-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX942-G-NEXT:    s_xor_b64 s[0:1], vcc, -1
+; GFX942-G-NEXT:    v_sub_co_u32_e32 v16, vcc, 0x7f, v4
+; GFX942-G-NEXT:    v_sub_u32_e32 v6, 64, v16
+; GFX942-G-NEXT:    v_add_u32_e32 v17, 0xffffffc0, v16
+; GFX942-G-NEXT:    v_lshrrev_b64 v[6:7], v6, v[10:11]
+; GFX942-G-NEXT:    v_lshlrev_b64 v[14:15], v16, v[12:13]
+; GFX942-G-NEXT:    v_lshlrev_b64 v[4:5], v16, v[10:11]
+; GFX942-G-NEXT:    v_or_b32_e32 v18, v6, v14
+; GFX942-G-NEXT:    v_or_b32_e32 v19, v7, v15
+; GFX942-G-NEXT:    v_lshlrev_b64 v[6:7], v17, v[10:11]
+; GFX942-G-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v16
+; GFX942-G-NEXT:    s_nop 1
+; GFX942-G-NEXT:    v_cndmask_b32_e32 v14, 0, v4, vcc
+; GFX942-G-NEXT:    v_cndmask_b32_e32 v15, 0, v5, vcc
+; GFX942-G-NEXT:    v_cndmask_b32_e32 v4, v6, v18, vcc
+; GFX942-G-NEXT:    v_cndmask_b32_e32 v5, v7, v19, vcc
+; GFX942-G-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v16
+; GFX942-G-NEXT:    s_nop 1
+; GFX942-G-NEXT:    v_cndmask_b32_e32 v16, v4, v12, vcc
+; GFX942-G-NEXT:    v_cndmask_b32_e32 v17, v5, v13, vcc
+; GFX942-G-NEXT:    v_mov_b64_e32 v[4:5], s[4:5]
+; GFX942-G-NEXT:    v_mov_b64_e32 v[6:7], s[6:7]
+; GFX942-G-NEXT:    s_and_saveexec_b64 s[4:5], s[0:1]
+; GFX942-G-NEXT:    s_xor_b64 s[8:9], exec, s[4:5]
+; GFX942-G-NEXT:    s_cbranch_execz .LBB0_5
+; GFX942-G-NEXT:  ; %bb.2: ; %udiv-preheader
+; GFX942-G-NEXT:    v_sub_u32_e32 v18, 64, v0
+; GFX942-G-NEXT:    v_add_u32_e32 v24, 0xffffffc0, v0
+; GFX942-G-NEXT:    v_lshrrev_b64 v[6:7], v0, v[10:11]
+; GFX942-G-NEXT:    v_lshlrev_b64 v[18:19], v18, v[12:13]
+; GFX942-G-NEXT:    v_lshrrev_b64 v[4:5], v0, v[12:13]
+; GFX942-G-NEXT:    v_or_b32_e32 v18, v6, v18
+; GFX942-G-NEXT:    v_or_b32_e32 v19, v7, v19
+; GFX942-G-NEXT:    v_lshrrev_b64 v[6:7], v24, v[12:13]
+; GFX942-G-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v0
+; GFX942-G-NEXT:    s_mov_b64 s[4:5], 0
+; GFX942-G-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v0
+; GFX942-G-NEXT:    v_cndmask_b32_e32 v6, v6, v18, vcc
+; GFX942-G-NEXT:    v_cndmask_b32_e32 v7, v7, v19, vcc
+; GFX942-G-NEXT:    v_cndmask_b32_e32 v18, 0, v4, vcc
+; GFX942-G-NEXT:    v_cndmask_b32_e32 v19, 0, v5, vcc
+; GFX942-G-NEXT:    v_add_co_u32_e32 v24, vcc, -1, v22
+; GFX942-G-NEXT:    v_cndmask_b32_e64 v12, v6, v10, s[0:1]
+; GFX942-G-NEXT:    s_nop 0
+; GFX942-G-NEXT:    v_addc_co_u32_e32 v25, vcc, -1, v23, vcc
+; GFX942-G-NEXT:    v_cndmask_b32_e64 v13, v7, v11, s[0:1]
+; GFX942-G-NEXT:    s_nop 0
+; GFX942-G-NEXT:    v_addc_co_u32_e32 v26, vcc, -1, v8, vcc
+; GFX942-G-NEXT:    s_mov_b64 s[6:7], s[4:5]
+; GFX942-G-NEXT:    s_nop 0
+; GFX942-G-NEXT:    v_addc_co_u32_e32 v27, vcc, -1, v9, vcc
+; GFX942-G-NEXT:    v_mov_b32_e32 v11, 0
+; GFX942-G-NEXT:    v_mov_b64_e32 v[4:5], s[4:5]
+; GFX942-G-NEXT:    v_mov_b64_e32 v[6:7], s[6:7]
+; GFX942-G-NEXT:  .LBB0_3: ; %udiv-do-while
+; GFX942-G-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-G-NEXT:    v_lshlrev_b64 v[18:19], 1, v[18:19]
+; GFX942-G-NEXT:    v_lshrrev_b32_e32 v10, 31, v13
+; GFX942-G-NEXT:    v_lshlrev_b64 v[6:7], 1, v[12:13]
+; GFX942-G-NEXT:    v_or_b32_e32 v18, v18, v10
+; GFX942-G-NEXT:    v_lshrrev_b32_e32 v10, 31, v17
+; GFX942-G-NEXT:    v_or_b32_e32 v6, v6, v10
+; GFX942-G-NEXT:    v_lshlrev_b64 v[12:13], 1, v[14:15]
+; GFX942-G-NEXT:    v_or_b32_e32 v14, v4, v12
+; GFX942-G-NEXT:    v_sub_co_u32_e32 v4, vcc, v24, v6
+; GFX942-G-NEXT:    v_lshlrev_b64 v[16:17], 1, v[16:17]
+; GFX942-G-NEXT:    s_nop 0
+; GFX942-G-NEXT:    v_subb_co_u32_e32 v4, vcc, v25, v7, vcc
+; GFX942-G-NEXT:    v_lshrrev_b32_e32 v10, 31, v15
+; GFX942-G-NEXT:    s_nop 0
+; GFX942-G-NEXT:    v_subb_co_u32_e32 v4, vcc, v26, v18, vcc
+; GFX942-G-NEXT:    v_add_co_u32_e64 v0, s[0:1], -1, v0
+; GFX942-G-NEXT:    s_nop 0
+; GFX942-G-NEXT:    v_subb_co_u32_e32 v4, vcc, v27, v19, vcc
+; GFX942-G-NEXT:    v_ashrrev_i32_e32 v12, 31, v4
+; GFX942-G-NEXT:    v_or_b32_e32 v16, v16, v10
+; GFX942-G-NEXT:    v_and_b32_e32 v10, 1, v12
+; GFX942-G-NEXT:    v_addc_co_u32_e64 v1, s[0:1], -1, v1, s[0:1]
+; GFX942-G-NEXT:    v_or_b32_e32 v15, v5, v13
+; GFX942-G-NEXT:    v_mov_b64_e32 v[4:5], v[10:11]
+; GFX942-G-NEXT:    v_and_b32_e32 v10, v12, v22
+; GFX942-G-NEXT:    v_addc_co_u32_e64 v2, s[0:1], -1, v2, s[0:1]
+; GFX942-G-NEXT:    v_and_b32_e32 v13, v12, v23
+; GFX942-G-NEXT:    v_and_b32_e32 v28, v12, v8
+; GFX942-G-NEXT:    v_and_b32_e32 v29, v12, v9
+; GFX942-G-NEXT:    v_sub_co_u32_e32 v12, vcc, v6, v10
+; GFX942-G-NEXT:    v_addc_co_u32_e64 v3, s[0:1], -1, v3, s[0:1]
+; GFX942-G-NEXT:    s_nop 0
+; GFX942-G-NEXT:    v_subb_co_u32_e32 v13, vcc, v7, v13, vcc
+; GFX942-G-NEXT:    v_or_b32_e32 v6, v0, v2
+; GFX942-G-NEXT:    v_or_b32_e32 v7, v1, v3
+; GFX942-G-NEXT:    v_cmp_eq_u64_e64 s[0:1], 0, v[6:7]
+; GFX942-G-NEXT:    v_subb_co_u32_e32 v18, vcc, v18, v28, vcc
+; GFX942-G-NEXT:    s_or_b64 s[4:5], s[0:1], s[4:5]
+; GFX942-G-NEXT:    s_nop 0
+; GFX942-G-NEXT:    v_subb_co_u32_e32 v19, vcc, v19, v29, vcc
+; GFX942-G-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX942-G-NEXT:    s_cbranch_execnz .LBB0_3
+; GFX942-G-NEXT:  ; %bb.4: ; %Flow
+; GFX942-G-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX942-G-NEXT:  .LBB0_5: ; %Flow2
+; GFX942-G-NEXT:    s_or_b64 exec, exec, s[8:9]
+; GFX942-G-NEXT:    v_lshlrev_b64 v[0:1], 1, v[14:15]
+; GFX942-G-NEXT:    v_lshlrev_b64 v[6:7], 1, v[16:17]
+; GFX942-G-NEXT:    v_lshrrev_b32_e32 v2, 31, v15
+; GFX942-G-NEXT:    v_or_b32_e32 v6, v6, v2
+; GFX942-G-NEXT:    v_or_b32_e32 v0, v4, v0
+; GFX942-G-NEXT:    v_or_b32_e32 v1, v5, v1
+; GFX942-G-NEXT:  .LBB0_6: ; %Flow3
+; GFX942-G-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX942-G-NEXT:    v_xor_b32_e32 v3, v21, v20
+; GFX942-G-NEXT:    v_xor_b32_e32 v0, v0, v3
+; GFX942-G-NEXT:    v_xor_b32_e32 v1, v1, v3
+; GFX942-G-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v3
+; GFX942-G-NEXT:    v_xor_b32_e32 v2, v6, v3
+; GFX942-G-NEXT:    s_nop 0
+; GFX942-G-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX942-G-NEXT:    v_xor_b32_e32 v4, v7, v3
+; GFX942-G-NEXT:    s_nop 0
+; GFX942-G-NEXT:    v_subb_co_u32_e32 v2, vcc, v2, v3, vcc
+; GFX942-G-NEXT:    s_nop 1
+; GFX942-G-NEXT:    v_subb_co_u32_e32 v3, vcc, v4, v3, vcc
+; GFX942-G-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-G-O0-LABEL: v_sdiv_i128_vv:
+; GFX942-G-O0:       ; %bb.0: ; %_udiv-special-cases
+; GFX942-G-O0-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-G-O0-NEXT:    s_xor_saveexec_b64 s[0:1], -1
+; GFX942-G-O0-NEXT:    scratch_store_dword off, v32, s32 offset:212 ; 4-byte Folded Spill
+; GFX942-G-O0-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v10, v1
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v9, v2
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v8, v3
+; GFX942-G-O0-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v1, v10
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v2, v9
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v3, v8
+; GFX942-G-O0-NEXT:    v_accvgpr_write_b32 a0, v3 ; Reload Reuse
+; GFX942-G-O0-NEXT:    v_accvgpr_write_b32 a1, v2 ; Reload Reuse
+; GFX942-G-O0-NEXT:    v_accvgpr_write_b32 a2, v1 ; Reload Reuse
+; GFX942-G-O0-NEXT:    v_accvgpr_write_b32 a3, v0 ; Reload Reuse
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v12, v4
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v2, v5
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v1, v6
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v0, v7
+; GFX942-G-O0-NEXT:    v_accvgpr_read_b32 v7, a0 ; Reload Reuse
+; GFX942-G-O0-NEXT:    v_accvgpr_read_b32 v6, a1 ; Reload Reuse
+; GFX942-G-O0-NEXT:    v_accvgpr_read_b32 v5, a2 ; Reload Reuse
+; GFX942-G-O0-NEXT:    v_accvgpr_read_b32 v4, a3 ; Reload Reuse
+; GFX942-G-O0-NEXT:    ; kill: def $vgpr12 killed $vgpr12 def $vgpr12_vgpr13_vgpr14_vgpr15 killed $exec
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v13, v2
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v14, v1
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v15, v0
+; GFX942-G-O0-NEXT:    s_mov_b64 s[0:1], 0
+; GFX942-G-O0-NEXT:    s_mov_b64 s[4:5], 0x7f
+; GFX942-G-O0-NEXT:    ; kill: def $vgpr0_vgpr1 killed $vgpr4_vgpr5 killed $exec
+; GFX942-G-O0-NEXT:    v_mov_b64_e32 v[0:1], v[6:7]
+; GFX942-G-O0-NEXT:    ; kill: def $vgpr2 killed $vgpr0 killed $exec
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-G-O0-NEXT:    s_mov_b32 s2, 31
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v2, s2
+; GFX942-G-O0-NEXT:    v_ashrrev_i32_e64 v11, v2, v3
+; GFX942-G-O0-NEXT:    ; kill: def $vgpr2 killed $vgpr0 killed $exec
+; GFX942-G-O0-NEXT:    ; kill: def $vgpr1 killed $vgpr1 killed $vgpr0_vgpr1 killed $exec
+; GFX942-G-O0-NEXT:    s_mov_b32 s2, 31
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-G-O0-NEXT:    v_ashrrev_i32_e64 v9, v0, v1
+; GFX942-G-O0-NEXT:    ; kill: def $vgpr0_vgpr1 killed $vgpr12_vgpr13 killed $exec
+; GFX942-G-O0-NEXT:    v_mov_b64_e32 v[0:1], v[14:15]
+; GFX942-G-O0-NEXT:    ; kill: def $vgpr2 killed $vgpr0 killed $exec
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-G-O0-NEXT:    s_mov_b32 s2, 31
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v2, s2
+; GFX942-G-O0-NEXT:    v_ashrrev_i32_e64 v10, v2, v3
+; GFX942-G-O0-NEXT:    ; kill: def $vgpr2 killed $vgpr0 killed $exec
+; GFX942-G-O0-NEXT:    ; kill: def $vgpr1 killed $vgpr1 killed $vgpr0_vgpr1 killed $exec
+; GFX942-G-O0-NEXT:    s_mov_b32 s2, 31
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-G-O0-NEXT:    v_ashrrev_i32_e64 v8, v0, v1
+; GFX942-G-O0-NEXT:    v_mov_b64_e32 v[2:3], v[4:5]
+; GFX942-G-O0-NEXT:    v_mov_b64_e32 v[4:5], v[6:7]
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v0, v2
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v1, v3
+; GFX942-G-O0-NEXT:    v_xor_b32_e64 v0, v11, v0
+; GFX942-G-O0-NEXT:    v_xor_b32_e64 v1, v11, v1
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v3, v4
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v2, v5
+; GFX942-G-O0-NEXT:    v_xor_b32_e64 v3, v9, v3
+; GFX942-G-O0-NEXT:    v_xor_b32_e64 v2, v9, v2
+; GFX942-G-O0-NEXT:    v_sub_co_u32_e64 v0, s[2:3], v0, v11
+; GFX942-G-O0-NEXT:    v_accvgpr_write_b32 a4, v0 ; Reload Reuse
+; GFX942-G-O0-NEXT:    s_nop 0
+; GFX942-G-O0-NEXT:    v_subb_co_u32_e64 v1, s[2:3], v1, v11, s[2:3]
+; GFX942-G-O0-NEXT:    v_accvgpr_write_b32 a5, v1 ; Reload Reuse
+; GFX942-G-O0-NEXT:    s_nop 0
+; GFX942-G-O0-NEXT:    v_subb_co_u32_e64 v5, s[2:3], v3, v9, s[2:3]
+; GFX942-G-O0-NEXT:    v_accvgpr_write_b32 a6, v5 ; Reload Reuse
+; GFX942-G-O0-NEXT:    s_nop 0
+; GFX942-G-O0-NEXT:    v_subb_co_u32_e64 v2, s[2:3], v2, v9, s[2:3]
+; GFX942-G-O0-NEXT:    v_accvgpr_write_b32 a7, v2 ; Reload Reuse
+; GFX942-G-O0-NEXT:    v_mov_b64_e32 v[6:7], v[12:13]
+; GFX942-G-O0-NEXT:    v_mov_b64_e32 v[12:13], v[14:15]
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v4, v6
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v3, v7
+; GFX942-G-O0-NEXT:    v_xor_b32_e64 v7, v10, v4
+; GFX942-G-O0-NEXT:    v_xor_b32_e64 v4, v10, v3
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v6, v12
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v3, v13
+; GFX942-G-O0-NEXT:    v_xor_b32_e64 v6, v8, v6
+; GFX942-G-O0-NEXT:    v_xor_b32_e64 v3, v8, v3
+; GFX942-G-O0-NEXT:    v_sub_co_u32_e64 v7, s[2:3], v7, v10
+; GFX942-G-O0-NEXT:    v_accvgpr_write_b32 a8, v7 ; Reload Reuse
+; GFX942-G-O0-NEXT:    s_nop 0
+; GFX942-G-O0-NEXT:    v_subb_co_u32_e64 v4, s[2:3], v4, v10, s[2:3]
+; GFX942-G-O0-NEXT:    v_accvgpr_write_b32 a9, v4 ; Reload Reuse
+; GFX942-G-O0-NEXT:    s_nop 0
+; GFX942-G-O0-NEXT:    v_subb_co_u32_e64 v6, s[2:3], v6, v8, s[2:3]
+; GFX942-G-O0-NEXT:    v_accvgpr_write_b32 a10, v6 ; Reload Reuse
+; GFX942-G-O0-NEXT:    s_nop 0
+; GFX942-G-O0-NEXT:    v_subb_co_u32_e64 v3, s[2:3], v3, v8, s[2:3]
+; GFX942-G-O0-NEXT:    v_accvgpr_write_b32 a11, v3 ; Reload Reuse
+; GFX942-G-O0-NEXT:    v_xor_b32_e64 v12, v10, v11
+; GFX942-G-O0-NEXT:    v_accvgpr_write_b32 a12, v12 ; Reload Reuse
+; GFX942-G-O0-NEXT:    v_xor_b32_e64 v10, v10, v11
+; GFX942-G-O0-NEXT:    v_accvgpr_write_b32 a13, v10 ; Reload Reuse
+; GFX942-G-O0-NEXT:    v_xor_b32_e64 v10, v8, v9
+; GFX942-G-O0-NEXT:    v_accvgpr_write_b32 a14, v10 ; Reload Reuse
+; GFX942-G-O0-NEXT:    v_xor_b32_e64 v8, v8, v9
+; GFX942-G-O0-NEXT:    v_accvgpr_write_b32 a15, v8 ; Reload Reuse
+; GFX942-G-O0-NEXT:    v_or_b32_e64 v8, v7, v6
+; GFX942-G-O0-NEXT:    v_or_b32_e64 v10, v4, v3
+; GFX942-G-O0-NEXT:    ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v9, v10
+; GFX942-G-O0-NEXT:    v_mov_b64_e32 v[10:11], s[0:1]
+; GFX942-G-O0-NEXT:    v_cmp_eq_u64_e64 s[2:3], v[8:9], v[10:11]
+; GFX942-G-O0-NEXT:    v_or_b32_e64 v8, v0, v5
+; GFX942-G-O0-NEXT:    v_or_b32_e64 v10, v1, v2
+; GFX942-G-O0-NEXT:    ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v9, v10
+; GFX942-G-O0-NEXT:    v_mov_b64_e32 v[10:11], s[0:1]
+; GFX942-G-O0-NEXT:    v_cmp_eq_u64_e64 s[6:7], v[8:9], v[10:11]
+; GFX942-G-O0-NEXT:    s_or_b64 s[2:3], s[2:3], s[6:7]
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v8, v6
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v9, v3
+; GFX942-G-O0-NEXT:    v_mov_b64_e32 v[10:11], s[0:1]
+; GFX942-G-O0-NEXT:    v_cmp_eq_u64_e64 s[6:7], v[8:9], v[10:11]
+; GFX942-G-O0-NEXT:    v_ffbh_u32_e64 v4, v4
+; GFX942-G-O0-NEXT:    v_ffbh_u32_e64 v7, v7
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v8, 32
+; GFX942-G-O0-NEXT:    v_add_u32_e64 v7, v7, v8
+; GFX942-G-O0-NEXT:    v_min_u32_e64 v4, v4, v7
+; GFX942-G-O0-NEXT:    s_mov_b32 s8, 64
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v7, s8
+; GFX942-G-O0-NEXT:    v_add_u32_e64 v4, v4, v7
+; GFX942-G-O0-NEXT:    v_ffbh_u32_e64 v3, v3
+; GFX942-G-O0-NEXT:    v_ffbh_u32_e64 v6, v6
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v7, 32
+; GFX942-G-O0-NEXT:    v_add_u32_e64 v6, v6, v7
+; GFX942-G-O0-NEXT:    v_min_u32_e64 v3, v3, v6
+; GFX942-G-O0-NEXT:    v_cndmask_b32_e64 v3, v3, v4, s[6:7]
+; GFX942-G-O0-NEXT:    s_mov_b32 s12, 0
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v6, v5
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v7, v2
+; GFX942-G-O0-NEXT:    v_mov_b64_e32 v[8:9], s[0:1]
+; GFX942-G-O0-NEXT:    v_cmp_eq_u64_e64 s[6:7], v[6:7], v[8:9]
+; GFX942-G-O0-NEXT:    v_ffbh_u32_e64 v4, v1
+; GFX942-G-O0-NEXT:    v_ffbh_u32_e64 v6, v0
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v7, 32
+; GFX942-G-O0-NEXT:    v_add_u32_e64 v6, v6, v7
+; GFX942-G-O0-NEXT:    v_min_u32_e64 v4, v4, v6
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v6, s8
+; GFX942-G-O0-NEXT:    v_add_u32_e64 v6, v4, v6
+; GFX942-G-O0-NEXT:    v_ffbh_u32_e64 v4, v2
+; GFX942-G-O0-NEXT:    v_ffbh_u32_e64 v7, v5
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v8, 32
+; GFX942-G-O0-NEXT:    v_add_u32_e64 v7, v7, v8
+; GFX942-G-O0-NEXT:    v_min_u32_e64 v4, v4, v7
+; GFX942-G-O0-NEXT:    v_cndmask_b32_e64 v4, v4, v6, s[6:7]
+; GFX942-G-O0-NEXT:    s_mov_b32 s11, 0
+; GFX942-G-O0-NEXT:    s_mov_b32 s9, 0
+; GFX942-G-O0-NEXT:    s_mov_b32 s10, 0
+; GFX942-G-O0-NEXT:    s_mov_b32 s8, 0
+; GFX942-G-O0-NEXT:    v_sub_co_u32_e64 v6, s[6:7], v3, v4
+; GFX942-G-O0-NEXT:    v_accvgpr_write_b32 a16, v6 ; Reload Reuse
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v3, s12
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v4, s12
+; GFX942-G-O0-NEXT:    v_subb_co_u32_e64 v3, s[6:7], v3, v4, s[6:7]
+; GFX942-G-O0-NEXT:    v_accvgpr_write_b32 a17, v3 ; Reload Reuse
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v4, s11
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v7, s10
+; GFX942-G-O0-NEXT:    v_subb_co_u32_e64 v8, s[6:7], v4, v7, s[6:7]
+; GFX942-G-O0-NEXT:    v_accvgpr_write_b32 a18, v8 ; Reload Reuse
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v4, s9
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v7, s8
+; GFX942-G-O0-NEXT:    v_subb_co_u32_e64 v7, s[6:7], v4, v7, s[6:7]
+; GFX942-G-O0-NEXT:    v_accvgpr_write_b32 a19, v7 ; Reload Reuse
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v12, v6
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v13, v3
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v10, v8
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v11, v7
+; GFX942-G-O0-NEXT:    v_mov_b64_e32 v[14:15], s[4:5]
+; GFX942-G-O0-NEXT:    v_cmp_gt_u64_e64 s[8:9], v[12:13], v[14:15]
+; GFX942-G-O0-NEXT:    v_mov_b64_e32 v[12:13], s[0:1]
+; GFX942-G-O0-NEXT:    v_cmp_gt_u64_e64 s[6:7], v[10:11], v[12:13]
+; GFX942-G-O0-NEXT:    v_mov_b64_e32 v[12:13], s[0:1]
+; GFX942-G-O0-NEXT:    v_cmp_eq_u64_e64 s[4:5], v[10:11], v[12:13]
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v9, 1
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v4, 0
+; GFX942-G-O0-NEXT:    v_cndmask_b32_e64 v9, v4, v9, s[8:9]
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v10, 1
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v4, 0
+; GFX942-G-O0-NEXT:    v_cndmask_b32_e64 v4, v4, v10, s[6:7]
+; GFX942-G-O0-NEXT:    v_cndmask_b32_e64 v9, v4, v9, s[4:5]
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v10, 1
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v4, 0
+; GFX942-G-O0-NEXT:    v_cndmask_b32_e64 v4, v4, v10, s[2:3]
+; GFX942-G-O0-NEXT:    v_or_b32_e64 v4, v4, v9
+; GFX942-G-O0-NEXT:    s_mov_b32 s2, 0x7f
+; GFX942-G-O0-NEXT:    v_xor_b32_e64 v6, v6, s2
+; GFX942-G-O0-NEXT:    v_or_b32_e64 v6, v6, v8
+; GFX942-G-O0-NEXT:    v_or_b32_e64 v3, v3, v7
+; GFX942-G-O0-NEXT:    ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v7, v3
+; GFX942-G-O0-NEXT:    v_mov_b64_e32 v[8:9], s[0:1]
+; GFX942-G-O0-NEXT:    v_cmp_eq_u64_e64 s[0:1], v[6:7], v[8:9]
+; GFX942-G-O0-NEXT:    v_and_b32_e32 v3, 1, v4
+; GFX942-G-O0-NEXT:    v_cmp_ne_u32_e64 s[2:3], 0, v3
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v3, 0
+; GFX942-G-O0-NEXT:    v_cndmask_b32_e64 v0, v0, v6, s[2:3]
+; GFX942-G-O0-NEXT:    v_cndmask_b32_e64 v3, v1, v3, s[2:3]
+; GFX942-G-O0-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v1, v3
+; GFX942-G-O0-NEXT:    v_and_b32_e32 v3, 1, v4
+; GFX942-G-O0-NEXT:    v_cmp_ne_u32_e64 s[2:3], 0, v3
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v3, 0
+; GFX942-G-O0-NEXT:    v_cndmask_b32_e64 v6, v5, v6, s[2:3]
+; GFX942-G-O0-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[2:3]
+; GFX942-G-O0-NEXT:    ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v7, v2
+; GFX942-G-O0-NEXT:    ; kill: def $vgpr0_vgpr1 killed $vgpr0_vgpr1 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec
+; GFX942-G-O0-NEXT:    v_mov_b64_e32 v[2:3], v[6:7]
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v6, 1
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v5, 0
+; GFX942-G-O0-NEXT:    v_cndmask_b32_e64 v5, v5, v6, s[0:1]
+; GFX942-G-O0-NEXT:    v_or_b32_e64 v4, v4, v5
+; GFX942-G-O0-NEXT:    v_and_b32_e32 v4, 1, v4
+; GFX942-G-O0-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v4
+; GFX942-G-O0-NEXT:    s_mov_b64 s[2:3], -1
+; GFX942-G-O0-NEXT:    s_xor_b64 s[2:3], s[0:1], s[2:3]
+; GFX942-G-O0-NEXT:    v_accvgpr_write_b32 a20, v3 ; Reload Reuse
+; GFX942-G-O0-NEXT:    v_accvgpr_write_b32 a21, v2 ; Reload Reuse
+; GFX942-G-O0-NEXT:    v_accvgpr_write_b32 a22, v1 ; Reload Reuse
+; GFX942-G-O0-NEXT:    v_accvgpr_write_b32 a23, v0 ; Reload Reuse
+; GFX942-G-O0-NEXT:    s_mov_b64 s[0:1], exec
+; GFX942-G-O0-NEXT:    ; implicit-def: $vgpr32 : SGPR spill to VGPR lane
+; GFX942-G-O0-NEXT:    v_writelane_b32 v32, s0, 0
+; GFX942-G-O0-NEXT:    s_nop 1
+; GFX942-G-O0-NEXT:    v_writelane_b32 v32, s1, 1
+; GFX942-G-O0-NEXT:    s_or_saveexec_b64 s[16:17], -1
+; GFX942-G-O0-NEXT:    v_accvgpr_write_b32 a24, v32 ; Reload Reuse
+; GFX942-G-O0-NEXT:    s_mov_b64 exec, s[16:17]
+; GFX942-G-O0-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
+; GFX942-G-O0-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX942-G-O0-NEXT:    s_cbranch_execz .LBB0_3
+; GFX942-G-O0-NEXT:    s_branch .LBB0_8
+; GFX942-G-O0-NEXT:  .LBB0_1: ; %Flow
+; GFX942-G-O0-NEXT:    s_or_saveexec_b64 s[16:17], -1
+; GFX942-G-O0-NEXT:    v_accvgpr_read_b32 v32, a24 ; Reload Reuse
+; GFX942-G-O0-NEXT:    s_mov_b64 exec, s[16:17]
+; GFX942-G-O0-NEXT:    v_readlane_b32 s0, v32, 2
+; GFX942-G-O0-NEXT:    v_readlane_b32 s1, v32, 3
+; GFX942-G-O0-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX942-G-O0-NEXT:  ; %bb.2: ; %Flow
+; GFX942-G-O0-NEXT:    v_accvgpr_read_b32 v7, a25 ; Reload Reuse
+; GFX942-G-O0-NEXT:    v_accvgpr_read_b32 v6, a26 ; Reload Reuse
+; GFX942-G-O0-NEXT:    v_accvgpr_read_b32 v5, a27 ; Reload Reuse
+; GFX942-G-O0-NEXT:    v_accvgpr_read_b32 v4, a28 ; Reload Reuse
+; GFX942-G-O0-NEXT:    v_accvgpr_read_b32 v3, a29 ; Reload Reuse
+; GFX942-G-O0-NEXT:    v_accvgpr_read_b32 v2, a30 ; Reload Reuse
+; GFX942-G-O0-NEXT:    v_accvgpr_read_b32 v1, a31 ; Reload Reuse
+; GFX942-G-O0-NEXT:    scratch_load_dword v0, off, s32 offset:32 ; 4-byte Folded Reload
+; GFX942-G-O0-NEXT:    s_nop 0
+; GFX942-G-O0-NEXT:    scratch_store_dwordx4 off, v[4:7], s32 offset:16 ; 16-byte Folded Spill
+; GFX942-G-O0-NEXT:    s_waitcnt vmcnt(1)
+; GFX942-G-O0-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 ; 16-byte Folded Spill
+; GFX942-G-O0-NEXT:    s_branch .LBB0_5
+; GFX942-G-O0-NEXT:  .LBB0_3: ; %Flow2
+; GFX942-G-O0-NEXT:    s_or_saveexec_b64 s[16:17], -1
+; GFX942-G-O0-NEXT:    v_accvgpr_read_b32 v32, a24 ; Reload Reuse
+; GFX942-G-O0-NEXT:    s_mov_b64 exec, s[16:17]
+; GFX942-G-O0-NEXT:    v_readlane_b32 s0, v32, 0
+; GFX942-G-O0-NEXT:    v_readlane_b32 s1, v32, 1
+; GFX942-G-O0-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX942-G-O0-NEXT:    v_accvgpr_read_b32 v3, a20 ; Reload Reuse
+; GFX942-G-O0-NEXT:    v_accvgpr_read_b32 v2, a21 ; Reload Reuse
+; GFX942-G-O0-NEXT:    v_accvgpr_read_b32 v1, a22 ; Reload Reuse
+; GFX942-G-O0-NEXT:    v_accvgpr_read_b32 v0, a23 ; Reload Reuse
+; GFX942-G-O0-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:48 ; 16-byte Folded Spill
+; GFX942-G-O0-NEXT:    s_branch .LBB0_9
+; GFX942-G-O0-NEXT:  .LBB0_4: ; %udiv-loop-exit
+; GFX942-G-O0-NEXT:    scratch_load_dwordx4 v[12:15], off, s32 offset:64 ; 16-byte Folded Reload
+; GFX942-G-O0-NEXT:    scratch_load_dwordx4 v[4:7], off, s32 offset:80 ; 16-byte Folded Reload
+; GFX942-G-O0-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-G-O0-NEXT:    v_mov_b64_e32 v[2:3], v[4:5]
+; GFX942-G-O0-NEXT:    v_mov_b64_e32 v[4:5], v[6:7]
+; GFX942-G-O0-NEXT:    s_mov_b32 s0, 1
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-G-O0-NEXT:    v_lshlrev_b64 v[10:11], v0, v[2:3]
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-G-O0-NEXT:    v_lshlrev_b64 v[0:1], v0, v[4:5]
+; GFX942-G-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr2 killed $exec
+; GFX942-G-O0-NEXT:    ; kill: def $vgpr3 killed $vgpr3 killed $vgpr2_vgpr3 killed $exec
+; GFX942-G-O0-NEXT:    s_mov_b32 s0, 31
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v2, s0
+; GFX942-G-O0-NEXT:    v_lshrrev_b32_e64 v5, v2, v3
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v4, v0
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-G-O0-NEXT:    v_mov_b64_e32 v[6:7], v[12:13]
+; GFX942-G-O0-NEXT:    v_mov_b64_e32 v[8:9], v[14:15]
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v0, v6
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v1, v7
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v6, v10
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v2, v11
+; GFX942-G-O0-NEXT:    v_or_b32_e64 v0, v0, v6
+; GFX942-G-O0-NEXT:    v_or_b32_e64 v2, v1, v2
+; GFX942-G-O0-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v1, v2
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v6, v8
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v2, v9
+; GFX942-G-O0-NEXT:    v_or3_b32 v4, v4, v5, v6
+; GFX942-G-O0-NEXT:    v_or_b32_e64 v2, v2, v3
+; GFX942-G-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v5, v2
+; GFX942-G-O0-NEXT:    ; kill: def $vgpr0_vgpr1 killed $vgpr0_vgpr1 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec
+; GFX942-G-O0-NEXT:    v_mov_b64_e32 v[2:3], v[4:5]
+; GFX942-G-O0-NEXT:    v_accvgpr_write_b32 a20, v3 ; Reload Reuse
+; GFX942-G-O0-NEXT:    v_accvgpr_write_b32 a21, v2 ; Reload Reuse
+; GFX942-G-O0-NEXT:    v_accvgpr_write_b32 a22, v1 ; Reload Reuse
+; GFX942-G-O0-NEXT:    v_accvgpr_write_b32 a23, v0 ; Reload Reuse
+; GFX942-G-O0-NEXT:    s_branch .LBB0_3
+; GFX942-G-O0-NEXT:  .LBB0_5: ; %Flow1
+; GFX942-G-O0-NEXT:    s_or_saveexec_b64 s[16:17], -1
+; GFX942-G-O0-NEXT:    v_accvgpr_read_b32 v32, a24 ; Reload Reuse
+; GFX942-G-O0-NEXT:    s_mov_b64 exec, s[16:17]
+; GFX942-G-O0-NEXT:    v_readlane_b32 s0, v32, 4
+; GFX942-G-O0-NEXT:    v_readlane_b32 s1, v32, 5
+; GFX942-G-O0-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX942-G-O0-NEXT:    scratch_load_dwordx4 v[0:3], off, s32 offset:16 ; 16-byte Folded Reload
+; GFX942-G-O0-NEXT:    scratch_load_dwordx4 v[4:7], off, s32 ; 16-byte Folded Reload
+; GFX942-G-O0-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-G-O0-NEXT:    scratch_store_dwordx4 off, v[4:7], s32 offset:64 ; 16-byte Folded Spill
+; GFX942-G-O0-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:80 ; 16-byte Folded Spill
+; GFX942-G-O0-NEXT:    s_branch .LBB0_4
+; GFX942-G-O0-NEXT:  .LBB0_6: ; %udiv-do-while
+; GFX942-G-O0-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-G-O0-NEXT:    s_or_saveexec_b64 s[16:17], -1
+; GFX942-G-O0-NEXT:    v_accvgpr_read_b32 v32, a24 ; Reload Reuse
+; GFX942-G-O0-NEXT:    s_mov_b64 exec, s[16:17]
+; GFX942-G-O0-NEXT:    v_readlane_b32 s2, v32, 6
+; GFX942-G-O0-NEXT:    v_readlane_b32 s3, v32, 7
+; GFX942-G-O0-NEXT:    scratch_load_dwordx4 v[22:25], off, s32 offset:96 ; 16-byte Folded Reload
+; GFX942-G-O0-NEXT:    scratch_load_dwordx4 v[2:5], off, s32 offset:112 ; 16-byte Folded Reload
+; GFX942-G-O0-NEXT:    scratch_load_dwordx4 v[16:19], off, s32 offset:128 ; 16-byte Folded Reload
+; GFX942-G-O0-NEXT:    scratch_load_dwordx4 v[28:31], off, s32 offset:144 ; 16-byte Folded Reload
+; GFX942-G-O0-NEXT:    v_accvgpr_read_b32 v20, a11 ; Reload Reuse
+; GFX942-G-O0-NEXT:    v_accvgpr_read_b32 v8, a10 ; Reload Reuse
+; GFX942-G-O0-NEXT:    v_accvgpr_read_b32 v21, a9 ; Reload Reuse
+; GFX942-G-O0-NEXT:    v_accvgpr_read_b32 v11, a8 ; Reload Reuse
+; GFX942-G-O0-NEXT:    scratch_load_dword v6, off, s32 offset:160 ; 4-byte Folded Reload
+; GFX942-G-O0-NEXT:    scratch_load_dword v10, off, s32 offset:164 ; 4-byte Folded Reload
+; GFX942-G-O0-NEXT:    scratch_load_dword v12, off, s32 offset:168 ; 4-byte Folded Reload
+; GFX942-G-O0-NEXT:    scratch_load_dword v13, off, s32 offset:172 ; 4-byte Folded Reload
+; GFX942-G-O0-NEXT:    s_mov_b64 s[0:1], 0
+; GFX942-G-O0-NEXT:    s_waitcnt vmcnt(6)
+; GFX942-G-O0-NEXT:    v_mov_b64_e32 v[0:1], v[2:3]
+; GFX942-G-O0-NEXT:    v_mov_b64_e32 v[14:15], v[4:5]
+; GFX942-G-O0-NEXT:    s_mov_b32 s4, 1
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v2, s4
+; GFX942-G-O0-NEXT:    v_lshlrev_b64 v[2:3], v2, v[0:1]
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v4, s4
+; GFX942-G-O0-NEXT:    v_lshlrev_b64 v[4:5], v4, v[14:15]
+; GFX942-G-O0-NEXT:    ; kill: def $vgpr7 killed $vgpr0 killed $exec
+; GFX942-G-O0-NEXT:    ; kill: def $vgpr1 killed $vgpr1 killed $vgpr0_vgpr1 killed $exec
+; GFX942-G-O0-NEXT:    s_mov_b32 s5, 31
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v0, s5
+; GFX942-G-O0-NEXT:    v_lshrrev_b32_e64 v1, v0, v1
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v0, v4
+; GFX942-G-O0-NEXT:    ; kill: def $vgpr5 killed $vgpr5 killed $vgpr4_vgpr5 killed $exec
+; GFX942-G-O0-NEXT:    v_or_b32_e64 v7, v0, v1
+; GFX942-G-O0-NEXT:    ; kill: def $vgpr0_vgpr1 killed $vgpr22_vgpr23 killed $exec
+; GFX942-G-O0-NEXT:    v_mov_b64_e32 v[0:1], v[24:25]
+; GFX942-G-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr0 killed $exec
+; GFX942-G-O0-NEXT:    ; kill: def $vgpr1 killed $vgpr1 killed $vgpr0_vgpr1 killed $exec
+; GFX942-G-O0-NEXT:    s_mov_b32 s5, 31
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v0, s5
+; GFX942-G-O0-NEXT:    v_lshrrev_b32_e64 v1, v0, v1
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v0, v2
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v9, v3
+; GFX942-G-O0-NEXT:    v_or_b32_e64 v4, v0, v1
+; GFX942-G-O0-NEXT:    v_mov_b64_e32 v[2:3], v[22:23]
+; GFX942-G-O0-NEXT:    v_mov_b64_e32 v[14:15], v[24:25]
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v0, s4
+; GFX942-G-O0-NEXT:    v_lshlrev_b64 v[26:27], v0, v[2:3]
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v0, s4
+; GFX942-G-O0-NEXT:    v_lshlrev_b64 v[0:1], v0, v[14:15]
+; GFX942-G-O0-NEXT:    ; kill: def $vgpr14 killed $vgpr2 killed $exec
+; GFX942-G-O0-NEXT:    ; kill: def $vgpr3 killed $vgpr3 killed $vgpr2_vgpr3 killed $exec
+; GFX942-G-O0-NEXT:    s_mov_b32 s4, 31
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v2, s4
+; GFX942-G-O0-NEXT:    v_lshrrev_b32_e64 v15, v2, v3
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v14, v0
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-G-O0-NEXT:    s_waitcnt vmcnt(4)
+; GFX942-G-O0-NEXT:    v_mov_b64_e32 v[22:23], v[28:29]
+; GFX942-G-O0-NEXT:    v_mov_b64_e32 v[24:25], v[30:31]
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v0, v22
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v1, v23
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v22, v26
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v2, v27
+; GFX942-G-O0-NEXT:    v_or_b32_e64 v0, v0, v22
+; GFX942-G-O0-NEXT:    v_or_b32_e64 v2, v1, v2
+; GFX942-G-O0-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v1, v2
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v22, v24
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v2, v25
+; GFX942-G-O0-NEXT:    v_or3_b32 v14, v14, v15, v22
+; GFX942-G-O0-NEXT:    v_or_b32_e64 v2, v2, v3
+; GFX942-G-O0-NEXT:    ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v15, v2
+; GFX942-G-O0-NEXT:    ; kill: def $vgpr0_vgpr1 killed $vgpr0_vgpr1 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec
+; GFX942-G-O0-NEXT:    v_mov_b64_e32 v[2:3], v[14:15]
+; GFX942-G-O0-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-G-O0-NEXT:    v_sub_co_u32_e64 v13, s[4:5], v13, v4
+; GFX942-G-O0-NEXT:    s_nop 1
+; GFX942-G-O0-NEXT:    v_subb_co_u32_e64 v12, s[4:5], v12, v9, s[4:5]
+; GFX942-G-O0-NEXT:    s_nop 1
+; GFX942-G-O0-NEXT:    v_subb_co_u32_e64 v10, s[4:5], v10, v7, s[4:5]
+; GFX942-G-O0-NEXT:    s_nop 1
+; GFX942-G-O0-NEXT:    v_subb_co_u32_e64 v12, s[4:5], v6, v5, s[4:5]
+; GFX942-G-O0-NEXT:    s_mov_b32 s4, 31
+; GFX942-G-O0-NEXT:    s_nop 0
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v6, s4
+; GFX942-G-O0-NEXT:    v_ashrrev_i32_e64 v10, v6, v12
+; GFX942-G-O0-NEXT:    s_mov_b32 s4, 31
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v6, s4
+; GFX942-G-O0-NEXT:    v_ashrrev_i32_e64 v6, v6, v12
+; GFX942-G-O0-NEXT:    s_mov_b32 s5, 1
+; GFX942-G-O0-NEXT:    s_mov_b32 s4, 0
+; GFX942-G-O0-NEXT:    v_and_b32_e64 v12, v10, s5
+; GFX942-G-O0-NEXT:    v_and_b32_e64 v14, v10, s4
+; GFX942-G-O0-NEXT:    ; kill: def $vgpr12 killed $vgpr12 def $vgpr12_vgpr13 killed $exec
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v13, v14
+; GFX942-G-O0-NEXT:    v_mov_b64_e32 v[22:23], s[0:1]
+; GFX942-G-O0-NEXT:    ; kill: def $vgpr12_vgpr13 killed $vgpr12_vgpr13 def $vgpr12_vgpr13_vgpr14_vgpr15 killed $exec
+; GFX942-G-O0-NEXT:    v_mov_b64_e32 v[14:15], v[22:23]
+; GFX942-G-O0-NEXT:    v_and_b32_e64 v11, v10, v11
+; GFX942-G-O0-NEXT:    v_and_b32_e64 v10, v10, v21
+; GFX942-G-O0-NEXT:    v_and_b32_e64 v8, v6, v8
+; GFX942-G-O0-NEXT:    v_and_b32_e64 v6, v6, v20
+; GFX942-G-O0-NEXT:    v_sub_co_u32_e64 v4, s[4:5], v4, v11
+; GFX942-G-O0-NEXT:    s_nop 1
+; GFX942-G-O0-NEXT:    v_subb_co_u32_e64 v10, s[4:5], v9, v10, s[4:5]
+; GFX942-G-O0-NEXT:    s_nop 1
+; GFX942-G-O0-NEXT:    v_subb_co_u32_e64 v9, s[4:5], v7, v8, s[4:5]
+; GFX942-G-O0-NEXT:    s_nop 1
+; GFX942-G-O0-NEXT:    v_subb_co_u32_e64 v8, s[4:5], v5, v6, s[4:5]
+; GFX942-G-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5_vgpr6_vgpr7 killed $exec
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v5, v10
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v6, v9
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v7, v8
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v11, v16
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v10, v17
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v9, v18
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v8, v19
+; GFX942-G-O0-NEXT:    s_mov_b32 s4, -1
+; GFX942-G-O0-NEXT:    s_mov_b32 s8, -1
+; GFX942-G-O0-NEXT:    s_mov_b32 s7, -1
+; GFX942-G-O0-NEXT:    s_mov_b32 s6, -1
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v16, s4
+; GFX942-G-O0-NEXT:    v_add_co_u32_e64 v16, s[4:5], v11, v16
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v11, s8
+; GFX942-G-O0-NEXT:    s_nop 0
+; GFX942-G-O0-NEXT:    v_addc_co_u32_e64 v17, s[4:5], v10, v11, s[4:5]
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v10, s7
+; GFX942-G-O0-NEXT:    s_nop 0
+; GFX942-G-O0-NEXT:    v_addc_co_u32_e64 v19, s[4:5], v9, v10, s[4:5]
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v9, s6
+; GFX942-G-O0-NEXT:    s_nop 0
+; GFX942-G-O0-NEXT:    v_addc_co_u32_e64 v18, s[4:5], v8, v9, s[4:5]
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v8, v16
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v9, v17
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v10, v19
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v11, v18
+; GFX942-G-O0-NEXT:    v_or_b32_e64 v16, v16, v19
+; GFX942-G-O0-NEXT:    v_or_b32_e64 v18, v17, v18
+; GFX942-G-O0-NEXT:    ; kill: def $vgpr16 killed $vgpr16 def $vgpr16_vgpr17 killed $exec
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v17, v18
+; GFX942-G-O0-NEXT:    v_mov_b64_e32 v[18:19], s[0:1]
+; GFX942-G-O0-NEXT:    v_cmp_eq_u64_e64 s[0:1], v[16:17], v[18:19]
+; GFX942-G-O0-NEXT:    s_or_b64 s[0:1], s[0:1], s[2:3]
+; GFX942-G-O0-NEXT:    v_mov_b64_e32 v[18:19], v[2:3]
+; GFX942-G-O0-NEXT:    v_mov_b64_e32 v[16:17], v[0:1]
+; GFX942-G-O0-NEXT:    v_accvgpr_write_b32 a25, v19 ; Reload Reuse
+; GFX942-G-O0-NEXT:    v_accvgpr_write_b32 a26, v18 ; Reload Reuse
+; GFX942-G-O0-NEXT:    v_accvgpr_write_b32 a27, v17 ; Reload Reuse
+; GFX942-G-O0-NEXT:    v_accvgpr_write_b32 a28, v16 ; Reload Reuse
+; GFX942-G-O0-NEXT:    v_mov_b64_e32 v[18:19], v[14:15]
+; GFX942-G-O0-NEXT:    v_mov_b64_e32 v[16:17], v[12:13]
+; GFX942-G-O0-NEXT:    v_accvgpr_write_b32 a29, v19 ; Reload Reuse
+; GFX942-G-O0-NEXT:    v_accvgpr_write_b32 a30, v18 ; Reload Reuse
+; GFX942-G-O0-NEXT:    v_accvgpr_write_b32 a31, v17 ; Reload Reuse
+; GFX942-G-O0-NEXT:    scratch_store_dword off, v16, s32 offset:32 ; 4-byte Folded Spill
+; GFX942-G-O0-NEXT:    s_mov_b64 s[2:3], s[0:1]
+; GFX942-G-O0-NEXT:    v_writelane_b32 v32, s2, 2
+; GFX942-G-O0-NEXT:    s_nop 1
+; GFX942-G-O0-NEXT:    v_writelane_b32 v32, s3, 3
+; GFX942-G-O0-NEXT:    s_mov_b64 s[2:3], s[0:1]
+; GFX942-G-O0-NEXT:    v_writelane_b32 v32, s2, 6
+; GFX942-G-O0-NEXT:    s_nop 1
+; GFX942-G-O0-NEXT:    v_writelane_b32 v32, s3, 7
+; GFX942-G-O0-NEXT:    s_or_saveexec_b64 s[16:17], -1
+; GFX942-G-O0-NEXT:    v_accvgpr_write_b32 a24, v32 ; Reload Reuse
+; GFX942-G-O0-NEXT:    s_mov_b64 exec, s[16:17]
+; GFX942-G-O0-NEXT:    scratch_store_dwordx4 off, v[12:15], s32 offset:144 ; 16-byte Folded Spill
+; GFX942-G-O0-NEXT:    scratch_store_dwordx4 off, v[8:11], s32 offset:128 ; 16-byte Folded Spill
+; GFX942-G-O0-NEXT:    scratch_store_dwordx4 off, v[4:7], s32 offset:112 ; 16-byte Folded Spill
+; GFX942-G-O0-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:96 ; 16-byte Folded Spill
+; GFX942-G-O0-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; GFX942-G-O0-NEXT:    s_cbranch_execnz .LBB0_6
+; GFX942-G-O0-NEXT:    s_branch .LBB0_1
+; GFX942-G-O0-NEXT:  .LBB0_7: ; %udiv-preheader
+; GFX942-G-O0-NEXT:    s_or_saveexec_b64 s[16:17], -1
+; GFX942-G-O0-NEXT:    v_accvgpr_read_b32 v32, a24 ; Reload Reuse
+; GFX942-G-O0-NEXT:    s_mov_b64 exec, s[16:17]
+; GFX942-G-O0-NEXT:    scratch_load_dwordx4 v[0:3], off, s32 offset:176 ; 16-byte Folded Reload
+; GFX942-G-O0-NEXT:    scratch_load_dwordx4 v[8:11], off, s32 offset:192 ; 16-byte Folded Reload
+; GFX942-G-O0-NEXT:    v_accvgpr_read_b32 v12, a11 ; Reload Reuse
+; GFX942-G-O0-NEXT:    v_accvgpr_read_b32 v13, a10 ; Reload Reuse
+; GFX942-G-O0-NEXT:    v_accvgpr_read_b32 v14, a9 ; Reload Reuse
+; GFX942-G-O0-NEXT:    v_accvgpr_read_b32 v15, a8 ; Reload Reuse
+; GFX942-G-O0-NEXT:    v_accvgpr_read_b32 v16, a5 ; Reload Reuse
+; GFX942-G-O0-NEXT:    v_accvgpr_read_b32 v17, a4 ; Reload Reuse
+; GFX942-G-O0-NEXT:    scratch_load_dword v18, off, s32 offset:208 ; 4-byte Folded Reload
+; GFX942-G-O0-NEXT:    v_accvgpr_read_b32 v4, a7 ; Reload Reuse
+; GFX942-G-O0-NEXT:    v_accvgpr_read_b32 v20, a6 ; Reload Reuse
+; GFX942-G-O0-NEXT:    s_mov_b32 s0, 64
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v22, v17
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v23, v16
+; GFX942-G-O0-NEXT:    ; kill: def $vgpr20 killed $vgpr20 def $vgpr20_vgpr21 killed $exec
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v21, v4
+; GFX942-G-O0-NEXT:    s_mov_b32 s1, 0xffffffc0
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v4, s1
+; GFX942-G-O0-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-G-O0-NEXT:    v_add_u32_e64 v4, v18, v4
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v5, s0
+; GFX942-G-O0-NEXT:    v_sub_u32_e64 v5, v5, v18
+; GFX942-G-O0-NEXT:    s_mov_b32 s2, 0
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v6, s0
+; GFX942-G-O0-NEXT:    v_cmp_lt_u32_e64 s[0:1], v18, v6
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v6, s2
+; GFX942-G-O0-NEXT:    v_cmp_eq_u32_e64 s[2:3], v18, v6
+; GFX942-G-O0-NEXT:    v_lshrrev_b64 v[6:7], v18, v[20:21]
+; GFX942-G-O0-NEXT:    v_lshrrev_b64 v[22:23], v18, v[22:23]
+; GFX942-G-O0-NEXT:    v_lshlrev_b64 v[24:25], v5, v[20:21]
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v19, v22
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v5, v23
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v22, v24
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v18, v25
+; GFX942-G-O0-NEXT:    v_or_b32_e64 v19, v19, v22
+; GFX942-G-O0-NEXT:    v_or_b32_e64 v18, v5, v18
+; GFX942-G-O0-NEXT:    s_mov_b64 s[4:5], 0
+; GFX942-G-O0-NEXT:    v_lshrrev_b64 v[20:21], v4, v[20:21]
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v4, v20
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v5, v21
+; GFX942-G-O0-NEXT:    v_cndmask_b32_e64 v4, v4, v19, s[0:1]
+; GFX942-G-O0-NEXT:    v_cndmask_b32_e64 v5, v5, v18, s[0:1]
+; GFX942-G-O0-NEXT:    v_cndmask_b32_e64 v4, v4, v17, s[2:3]
+; GFX942-G-O0-NEXT:    v_cndmask_b32_e64 v16, v5, v16, s[2:3]
+; GFX942-G-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v5, v16
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v17, v6
+; GFX942-G-O0-NEXT:    ; kill: def $vgpr7 killed $vgpr7 killed $vgpr6_vgpr7 killed $exec
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v16, 0
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942-G-O0-NEXT:    v_cndmask_b32_e64 v16, v16, v17, s[0:1]
+; GFX942-G-O0-NEXT:    v_cndmask_b32_e64 v6, v6, v7, s[0:1]
+; GFX942-G-O0-NEXT:    ; kill: def $vgpr16 killed $vgpr16 def $vgpr16_vgpr17 killed $exec
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v17, v6
+; GFX942-G-O0-NEXT:    ; kill: def $vgpr4_vgpr5 killed $vgpr4_vgpr5 def $vgpr4_vgpr5_vgpr6_vgpr7 killed $exec
+; GFX942-G-O0-NEXT:    v_mov_b64_e32 v[6:7], v[16:17]
+; GFX942-G-O0-NEXT:    s_mov_b32 s0, -1
+; GFX942-G-O0-NEXT:    s_mov_b32 s6, -1
+; GFX942-G-O0-NEXT:    s_mov_b32 s3, -1
+; GFX942-G-O0-NEXT:    s_mov_b32 s2, -1
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v16, s0
+; GFX942-G-O0-NEXT:    v_add_co_u32_e64 v15, s[0:1], v15, v16
+; GFX942-G-O0-NEXT:    scratch_store_dword off, v15, s32 offset:172 ; 4-byte Folded Spill
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v15, s6
+; GFX942-G-O0-NEXT:    v_addc_co_u32_e64 v14, s[0:1], v14, v15, s[0:1]
+; GFX942-G-O0-NEXT:    scratch_store_dword off, v14, s32 offset:168 ; 4-byte Folded Spill
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v14, s3
+; GFX942-G-O0-NEXT:    v_addc_co_u32_e64 v13, s[0:1], v13, v14, s[0:1]
+; GFX942-G-O0-NEXT:    scratch_store_dword off, v13, s32 offset:164 ; 4-byte Folded Spill
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v13, s2
+; GFX942-G-O0-NEXT:    v_addc_co_u32_e64 v12, s[0:1], v12, v13, s[0:1]
+; GFX942-G-O0-NEXT:    scratch_store_dword off, v12, s32 offset:160 ; 4-byte Folded Spill
+; GFX942-G-O0-NEXT:    s_mov_b64 s[0:1], s[4:5]
+; GFX942-G-O0-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; GFX942-G-O0-NEXT:    v_writelane_b32 v32, s4, 6
+; GFX942-G-O0-NEXT:    s_nop 1
+; GFX942-G-O0-NEXT:    v_writelane_b32 v32, s5, 7
+; GFX942-G-O0-NEXT:    s_or_saveexec_b64 s[16:17], -1
+; GFX942-G-O0-NEXT:    v_accvgpr_write_b32 a24, v32 ; Reload Reuse
+; GFX942-G-O0-NEXT:    s_mov_b64 exec, s[16:17]
+; GFX942-G-O0-NEXT:    v_mov_b64_e32 v[14:15], s[2:3]
+; GFX942-G-O0-NEXT:    v_mov_b64_e32 v[12:13], s[0:1]
+; GFX942-G-O0-NEXT:    scratch_store_dwordx4 off, v[12:15], s32 offset:144 ; 16-byte Folded Spill
+; GFX942-G-O0-NEXT:    scratch_store_dwordx4 off, v[8:11], s32 offset:128 ; 16-byte Folded Spill
+; GFX942-G-O0-NEXT:    scratch_store_dwordx4 off, v[4:7], s32 offset:112 ; 16-byte Folded Spill
+; GFX942-G-O0-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:96 ; 16-byte Folded Spill
+; GFX942-G-O0-NEXT:    s_branch .LBB0_6
+; GFX942-G-O0-NEXT:  .LBB0_8: ; %udiv-bb1
+; GFX942-G-O0-NEXT:    s_or_saveexec_b64 s[16:17], -1
+; GFX942-G-O0-NEXT:    v_accvgpr_read_b32 v32, a24 ; Reload Reuse
+; GFX942-G-O0-NEXT:    s_mov_b64 exec, s[16:17]
+; GFX942-G-O0-NEXT:    v_accvgpr_read_b32 v3, a7 ; Reload Reuse
+; GFX942-G-O0-NEXT:    v_accvgpr_read_b32 v9, a6 ; Reload Reuse
+; GFX942-G-O0-NEXT:    v_accvgpr_read_b32 v0, a5 ; Reload Reuse
+; GFX942-G-O0-NEXT:    v_accvgpr_read_b32 v12, a4 ; Reload Reuse
+; GFX942-G-O0-NEXT:    v_accvgpr_read_b32 v2, a16 ; Reload Reuse
+; GFX942-G-O0-NEXT:    v_accvgpr_read_b32 v1, a19 ; Reload Reuse
+; GFX942-G-O0-NEXT:    v_accvgpr_read_b32 v6, a18 ; Reload Reuse
+; GFX942-G-O0-NEXT:    v_accvgpr_read_b32 v5, a17 ; Reload Reuse
+; GFX942-G-O0-NEXT:    s_mov_b64 s[0:1], 0
+; GFX942-G-O0-NEXT:    s_mov_b32 s2, 1
+; GFX942-G-O0-NEXT:    s_mov_b32 s6, 0
+; GFX942-G-O0-NEXT:    s_mov_b32 s5, 0
+; GFX942-G-O0-NEXT:    s_mov_b32 s4, 0
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v4, s2
+; GFX942-G-O0-NEXT:    v_add_co_u32_e64 v4, s[2:3], v2, v4
+; GFX942-G-O0-NEXT:    scratch_store_dword off, v4, s32 offset:208 ; 4-byte Folded Spill
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v7, s6
+; GFX942-G-O0-NEXT:    v_addc_co_u32_e64 v5, s[2:3], v5, v7, s[2:3]
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v7, s5
+; GFX942-G-O0-NEXT:    s_nop 0
+; GFX942-G-O0-NEXT:    v_addc_co_u32_e64 v7, s[2:3], v6, v7, s[2:3]
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v6, s4
+; GFX942-G-O0-NEXT:    s_nop 0
+; GFX942-G-O0-NEXT:    v_addc_co_u32_e64 v6, s[2:3], v1, v6, s[2:3]
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v14, v4
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v15, v5
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v16, v7
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v17, v6
+; GFX942-G-O0-NEXT:    scratch_store_dwordx4 off, v[14:17], s32 offset:192 ; 16-byte Folded Spill
+; GFX942-G-O0-NEXT:    s_mov_b32 s2, 0x7f
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-G-O0-NEXT:    v_sub_co_u32_e64 v8, s[2:3], v1, v2
+; GFX942-G-O0-NEXT:    s_mov_b32 s3, 64
+; GFX942-G-O0-NEXT:    ; kill: def $vgpr12 killed $vgpr12 def $vgpr12_vgpr13 killed $exec
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v13, v0
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v10, v9
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v11, v3
+; GFX942-G-O0-NEXT:    s_mov_b32 s2, 0xffffffc0
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-G-O0-NEXT:    v_add_u32_e64 v2, v8, v0
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v0, s3
+; GFX942-G-O0-NEXT:    v_sub_u32_e64 v14, v0, v8
+; GFX942-G-O0-NEXT:    s_mov_b32 s2, 0
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v0, s3
+; GFX942-G-O0-NEXT:    v_cmp_lt_u32_e64 s[4:5], v8, v0
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-G-O0-NEXT:    v_cmp_eq_u32_e64 s[2:3], v8, v0
+; GFX942-G-O0-NEXT:    v_lshlrev_b64 v[0:1], v8, v[12:13]
+; GFX942-G-O0-NEXT:    v_lshrrev_b64 v[14:15], v14, v[12:13]
+; GFX942-G-O0-NEXT:    v_lshlrev_b64 v[16:17], v8, v[10:11]
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v11, v14
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v8, v15
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v14, v16
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v10, v17
+; GFX942-G-O0-NEXT:    v_or_b32_e64 v11, v11, v14
+; GFX942-G-O0-NEXT:    v_or_b32_e64 v10, v8, v10
+; GFX942-G-O0-NEXT:    v_lshlrev_b64 v[12:13], v2, v[12:13]
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v8, v0
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v2, v1
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-G-O0-NEXT:    v_cndmask_b32_e64 v0, v0, v8, s[4:5]
+; GFX942-G-O0-NEXT:    v_cndmask_b32_e64 v2, v1, v2, s[4:5]
+; GFX942-G-O0-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v1, v2
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v8, v12
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v2, v13
+; GFX942-G-O0-NEXT:    v_cndmask_b32_e64 v8, v8, v11, s[4:5]
+; GFX942-G-O0-NEXT:    v_cndmask_b32_e64 v2, v2, v10, s[4:5]
+; GFX942-G-O0-NEXT:    v_cndmask_b32_e64 v8, v8, v9, s[2:3]
+; GFX942-G-O0-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[2:3]
+; GFX942-G-O0-NEXT:    ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v9, v2
+; GFX942-G-O0-NEXT:    ; kill: def $vgpr0_vgpr1 killed $vgpr0_vgpr1 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec
+; GFX942-G-O0-NEXT:    v_mov_b64_e32 v[2:3], v[8:9]
+; GFX942-G-O0-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:176 ; 16-byte Folded Spill
+; GFX942-G-O0-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX942-G-O0-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; GFX942-G-O0-NEXT:    v_or_b32_e64 v4, v4, v7
+; GFX942-G-O0-NEXT:    v_or_b32_e64 v6, v5, v6
+; GFX942-G-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v5, v6
+; GFX942-G-O0-NEXT:    v_mov_b64_e32 v[6:7], s[0:1]
+; GFX942-G-O0-NEXT:    v_cmp_ne_u64_e64 s[0:1], v[4:5], v[6:7]
+; GFX942-G-O0-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:16 ; 16-byte Folded Spill
+; GFX942-G-O0-NEXT:    s_nop 1
+; GFX942-G-O0-NEXT:    v_mov_b64_e32 v[0:1], s[4:5]
+; GFX942-G-O0-NEXT:    v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-G-O0-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 ; 16-byte Folded Spill
+; GFX942-G-O0-NEXT:    s_mov_b64 s[2:3], exec
+; GFX942-G-O0-NEXT:    s_and_b64 s[0:1], s[2:3], s[0:1]
+; GFX942-G-O0-NEXT:    s_xor_b64 s[2:3], s[0:1], s[2:3]
+; GFX942-G-O0-NEXT:    v_writelane_b32 v32, s2, 4
+; GFX942-G-O0-NEXT:    s_nop 1
+; GFX942-G-O0-NEXT:    v_writelane_b32 v32, s3, 5
+; GFX942-G-O0-NEXT:    s_or_saveexec_b64 s[16:17], -1
+; GFX942-G-O0-NEXT:    v_accvgpr_write_b32 a24, v32 ; Reload Reuse
+; GFX942-G-O0-NEXT:    s_mov_b64 exec, s[16:17]
+; GFX942-G-O0-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX942-G-O0-NEXT:    s_cbranch_execz .LBB0_5
+; GFX942-G-O0-NEXT:    s_branch .LBB0_7
+; GFX942-G-O0-NEXT:  .LBB0_9: ; %udiv-end
+; GFX942-G-O0-NEXT:    v_accvgpr_read_b32 v4, a15 ; Reload Reuse
+; GFX942-G-O0-NEXT:    v_accvgpr_read_b32 v5, a14 ; Reload Reuse
+; GFX942-G-O0-NEXT:    v_accvgpr_read_b32 v6, a13 ; Reload Reuse
+; GFX942-G-O0-NEXT:    v_accvgpr_read_b32 v7, a12 ; Reload Reuse
+; GFX942-G-O0-NEXT:    scratch_load_dwordx4 v[8:11], off, s32 offset:48 ; 16-byte Folded Reload
+; GFX942-G-O0-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-G-O0-NEXT:    v_mov_b64_e32 v[2:3], v[8:9]
+; GFX942-G-O0-NEXT:    v_mov_b64_e32 v[8:9], v[10:11]
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v0, v2
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v1, v3
+; GFX942-G-O0-NEXT:    v_xor_b32_e64 v0, v0, v7
+; GFX942-G-O0-NEXT:    v_xor_b32_e64 v1, v1, v6
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v2, v8
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v3, v9
+; GFX942-G-O0-NEXT:    v_xor_b32_e64 v2, v2, v5
+; GFX942-G-O0-NEXT:    v_xor_b32_e64 v3, v3, v4
+; GFX942-G-O0-NEXT:    v_sub_co_u32_e64 v0, s[0:1], v0, v7
+; GFX942-G-O0-NEXT:    s_nop 1
+; GFX942-G-O0-NEXT:    v_subb_co_u32_e64 v1, s[0:1], v1, v6, s[0:1]
+; GFX942-G-O0-NEXT:    s_nop 1
+; GFX942-G-O0-NEXT:    v_subb_co_u32_e64 v2, s[0:1], v2, v5, s[0:1]
+; GFX942-G-O0-NEXT:    s_nop 1
+; GFX942-G-O0-NEXT:    v_subb_co_u32_e64 v3, s[0:1], v3, v4, s[0:1]
+; GFX942-G-O0-NEXT:    s_xor_saveexec_b64 s[0:1], -1
+; GFX942-G-O0-NEXT:    scratch_load_dword v32, off, s32 offset:212 ; 4-byte Folded Reload
+; GFX942-G-O0-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX942-G-O0-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-G-O0-NEXT:    s_setpc_b64 s[30:31]
   %div = sdiv i128 %lhs, %rhs
   ret i128 %div
 }
@@ -4386,6 +6479,1847 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-G-O0-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX9-G-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-G-O0-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: v_udiv_i128_vv:
+; GFX942:       ; %bb.0: ; %_udiv-special-cases
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_or_b32_e32 v9, v5, v7
+; GFX942-NEXT:    v_or_b32_e32 v8, v4, v6
+; GFX942-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[8:9]
+; GFX942-NEXT:    v_or_b32_e32 v9, v1, v3
+; GFX942-NEXT:    v_or_b32_e32 v8, v0, v2
+; GFX942-NEXT:    v_cmp_eq_u64_e64 s[0:1], 0, v[8:9]
+; GFX942-NEXT:    v_ffbh_u32_e32 v8, v6
+; GFX942-NEXT:    v_add_u32_e32 v8, 32, v8
+; GFX942-NEXT:    v_ffbh_u32_e32 v9, v7
+; GFX942-NEXT:    v_min_u32_e32 v12, v8, v9
+; GFX942-NEXT:    v_ffbh_u32_e32 v8, v4
+; GFX942-NEXT:    v_add_u32_e32 v8, 32, v8
+; GFX942-NEXT:    v_ffbh_u32_e32 v10, v5
+; GFX942-NEXT:    v_mov_b32_e32 v9, 0
+; GFX942-NEXT:    v_min_u32_e32 v8, v8, v10
+; GFX942-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT:    v_lshl_add_u64 v[10:11], v[8:9], 0, 64
+; GFX942-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[6:7]
+; GFX942-NEXT:    v_ffbh_u32_e32 v8, v2
+; GFX942-NEXT:    v_add_u32_e32 v8, 32, v8
+; GFX942-NEXT:    v_cndmask_b32_e32 v12, v10, v12, vcc
+; GFX942-NEXT:    v_ffbh_u32_e32 v10, v3
+; GFX942-NEXT:    v_min_u32_e32 v14, v8, v10
+; GFX942-NEXT:    v_ffbh_u32_e32 v8, v0
+; GFX942-NEXT:    v_add_u32_e32 v8, 32, v8
+; GFX942-NEXT:    v_ffbh_u32_e32 v10, v1
+; GFX942-NEXT:    v_min_u32_e32 v8, v8, v10
+; GFX942-NEXT:    v_cndmask_b32_e64 v13, v11, 0, vcc
+; GFX942-NEXT:    v_lshl_add_u64 v[10:11], v[8:9], 0, 64
+; GFX942-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[2:3]
+; GFX942-NEXT:    s_mov_b64 s[2:3], 0x7f
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_cndmask_b32_e32 v10, v10, v14, vcc
+; GFX942-NEXT:    v_cndmask_b32_e64 v8, v11, 0, vcc
+; GFX942-NEXT:    v_sub_co_u32_e32 v12, vcc, v12, v10
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_subb_co_u32_e32 v13, vcc, v13, v8, vcc
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_subb_co_u32_e32 v14, vcc, 0, v9, vcc
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_subb_co_u32_e32 v15, vcc, 0, v9, vcc
+; GFX942-NEXT:    v_cmp_lt_u64_e32 vcc, s[2:3], v[12:13]
+; GFX942-NEXT:    v_or_b32_e32 v11, v13, v15
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; GFX942-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[14:15]
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
+; GFX942-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[14:15]
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v8, v9, v8, vcc
+; GFX942-NEXT:    v_and_b32_e32 v8, 1, v8
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v8
+; GFX942-NEXT:    v_xor_b32_e32 v8, 0x7f, v12
+; GFX942-NEXT:    s_or_b64 s[0:1], s[0:1], vcc
+; GFX942-NEXT:    v_or_b32_e32 v10, v8, v14
+; GFX942-NEXT:    s_xor_b64 s[2:3], s[0:1], -1
+; GFX942-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[10:11]
+; GFX942-NEXT:    v_cndmask_b32_e64 v9, v3, 0, s[0:1]
+; GFX942-NEXT:    v_cndmask_b32_e64 v8, v2, 0, s[0:1]
+; GFX942-NEXT:    v_cndmask_b32_e64 v11, v1, 0, s[0:1]
+; GFX942-NEXT:    v_cndmask_b32_e64 v10, v0, 0, s[0:1]
+; GFX942-NEXT:    s_and_b64 s[0:1], s[2:3], vcc
+; GFX942-NEXT:    s_and_saveexec_b64 s[4:5], s[0:1]
+; GFX942-NEXT:    s_cbranch_execz .LBB1_6
+; GFX942-NEXT:  ; %bb.1: ; %udiv-bb1
+; GFX942-NEXT:    v_add_co_u32_e32 v8, vcc, 1, v12
+; GFX942-NEXT:    v_sub_u32_e32 v18, 0x7f, v12
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_addc_co_u32_e32 v9, vcc, 0, v13, vcc
+; GFX942-NEXT:    v_sub_u32_e32 v13, 64, v18
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_addc_co_u32_e32 v10, vcc, 0, v14, vcc
+; GFX942-NEXT:    v_or_b32_e32 v14, v8, v10
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_addc_co_u32_e32 v11, vcc, 0, v15, vcc
+; GFX942-NEXT:    v_or_b32_e32 v15, v9, v11
+; GFX942-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[14:15]
+; GFX942-NEXT:    v_lshlrev_b64 v[14:15], v18, v[2:3]
+; GFX942-NEXT:    v_lshrrev_b64 v[16:17], v13, v[0:1]
+; GFX942-NEXT:    v_sub_u32_e32 v12, 63, v12
+; GFX942-NEXT:    v_or_b32_e32 v15, v15, v17
+; GFX942-NEXT:    v_or_b32_e32 v14, v14, v16
+; GFX942-NEXT:    v_lshlrev_b64 v[12:13], v12, v[0:1]
+; GFX942-NEXT:    v_cmp_gt_u32_e64 s[0:1], 64, v18
+; GFX942-NEXT:    v_cmp_eq_u32_e64 s[2:3], 0, v18
+; GFX942-NEXT:    v_mov_b64_e32 v[16:17], 0
+; GFX942-NEXT:    v_cndmask_b32_e64 v13, v13, v15, s[0:1]
+; GFX942-NEXT:    v_cndmask_b32_e64 v12, v12, v14, s[0:1]
+; GFX942-NEXT:    v_lshlrev_b64 v[14:15], v18, v[0:1]
+; GFX942-NEXT:    v_cndmask_b32_e64 v13, v13, v3, s[2:3]
+; GFX942-NEXT:    v_cndmask_b32_e64 v12, v12, v2, s[2:3]
+; GFX942-NEXT:    v_cndmask_b32_e64 v15, 0, v15, s[0:1]
+; GFX942-NEXT:    v_cndmask_b32_e64 v14, 0, v14, s[0:1]
+; GFX942-NEXT:    v_mov_b64_e32 v[18:19], 0
+; GFX942-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942-NEXT:    s_xor_b64 s[2:3], exec, s[0:1]
+; GFX942-NEXT:    s_cbranch_execz .LBB1_5
+; GFX942-NEXT:  ; %bb.2: ; %udiv-preheader
+; GFX942-NEXT:    v_sub_u32_e32 v18, 64, v8
+; GFX942-NEXT:    v_lshrrev_b64 v[16:17], v8, v[0:1]
+; GFX942-NEXT:    v_lshlrev_b64 v[18:19], v18, v[2:3]
+; GFX942-NEXT:    v_or_b32_e32 v18, v16, v18
+; GFX942-NEXT:    v_subrev_u32_e32 v16, 64, v8
+; GFX942-NEXT:    v_or_b32_e32 v19, v17, v19
+; GFX942-NEXT:    v_lshrrev_b64 v[16:17], v16, v[2:3]
+; GFX942-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v8
+; GFX942-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v8
+; GFX942-NEXT:    v_mov_b64_e32 v[24:25], 0
+; GFX942-NEXT:    v_cndmask_b32_e32 v17, v17, v19, vcc
+; GFX942-NEXT:    v_cndmask_b32_e64 v21, v17, v1, s[0:1]
+; GFX942-NEXT:    v_cndmask_b32_e32 v1, v16, v18, vcc
+; GFX942-NEXT:    v_cndmask_b32_e64 v20, v1, v0, s[0:1]
+; GFX942-NEXT:    v_lshrrev_b64 v[0:1], v8, v[2:3]
+; GFX942-NEXT:    v_cndmask_b32_e32 v23, 0, v1, vcc
+; GFX942-NEXT:    v_cndmask_b32_e32 v22, 0, v0, vcc
+; GFX942-NEXT:    v_add_co_u32_e32 v0, vcc, -1, v4
+; GFX942-NEXT:    v_mov_b64_e32 v[18:19], 0
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v5, vcc
+; GFX942-NEXT:    s_mov_b64 s[0:1], 0
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_addc_co_u32_e32 v2, vcc, -1, v6, vcc
+; GFX942-NEXT:    v_mov_b32_e32 v17, 0
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_addc_co_u32_e32 v3, vcc, -1, v7, vcc
+; GFX942-NEXT:  .LBB1_3: ; %udiv-do-while
+; GFX942-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-NEXT:    v_lshrrev_b32_e32 v16, 31, v21
+; GFX942-NEXT:    v_lshlrev_b64 v[20:21], 1, v[20:21]
+; GFX942-NEXT:    v_lshrrev_b32_e32 v28, 31, v13
+; GFX942-NEXT:    v_lshlrev_b64 v[22:23], 1, v[22:23]
+; GFX942-NEXT:    v_or_b32_e32 v20, v20, v28
+; GFX942-NEXT:    v_or_b32_e32 v22, v22, v16
+; GFX942-NEXT:    v_sub_co_u32_e32 v16, vcc, v0, v20
+; GFX942-NEXT:    v_lshlrev_b64 v[26:27], 1, v[14:15]
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_subb_co_u32_e32 v16, vcc, v1, v21, vcc
+; GFX942-NEXT:    v_lshrrev_b32_e32 v14, 31, v15
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_subb_co_u32_e32 v16, vcc, v2, v22, vcc
+; GFX942-NEXT:    v_lshlrev_b64 v[12:13], 1, v[12:13]
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_subb_co_u32_e32 v16, vcc, v3, v23, vcc
+; GFX942-NEXT:    v_or3_b32 v12, v12, v14, v18
+; GFX942-NEXT:    v_or_b32_e32 v14, v24, v26
+; GFX942-NEXT:    v_ashrrev_i32_e32 v24, 31, v16
+; GFX942-NEXT:    v_or_b32_e32 v15, v25, v27
+; GFX942-NEXT:    v_and_b32_e32 v16, 1, v24
+; GFX942-NEXT:    v_and_b32_e32 v25, v24, v7
+; GFX942-NEXT:    v_and_b32_e32 v26, v24, v6
+; GFX942-NEXT:    v_and_b32_e32 v27, v24, v5
+; GFX942-NEXT:    v_and_b32_e32 v24, v24, v4
+; GFX942-NEXT:    v_sub_co_u32_e32 v20, vcc, v20, v24
+; GFX942-NEXT:    v_or3_b32 v13, v13, 0, v19
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_subb_co_u32_e32 v21, vcc, v21, v27, vcc
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_subb_co_u32_e32 v22, vcc, v22, v26, vcc
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_subb_co_u32_e32 v23, vcc, v23, v25, vcc
+; GFX942-NEXT:    v_add_co_u32_e32 v8, vcc, -1, v8
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_addc_co_u32_e32 v9, vcc, -1, v9, vcc
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_addc_co_u32_e32 v10, vcc, -1, v10, vcc
+; GFX942-NEXT:    v_or_b32_e32 v24, v8, v10
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_addc_co_u32_e32 v11, vcc, -1, v11, vcc
+; GFX942-NEXT:    v_or_b32_e32 v25, v9, v11
+; GFX942-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[24:25]
+; GFX942-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT:    v_mov_b64_e32 v[24:25], v[16:17]
+; GFX942-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; GFX942-NEXT:    s_cbranch_execnz .LBB1_3
+; GFX942-NEXT:  ; %bb.4: ; %Flow
+; GFX942-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX942-NEXT:  .LBB1_5: ; %Flow2
+; GFX942-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX942-NEXT:    v_lshlrev_b64 v[0:1], 1, v[14:15]
+; GFX942-NEXT:    v_lshrrev_b32_e32 v4, 31, v15
+; GFX942-NEXT:    v_lshlrev_b64 v[2:3], 1, v[12:13]
+; GFX942-NEXT:    v_or3_b32 v9, v3, 0, v19
+; GFX942-NEXT:    v_or3_b32 v8, v2, v4, v18
+; GFX942-NEXT:    v_or_b32_e32 v11, v17, v1
+; GFX942-NEXT:    v_or_b32_e32 v10, v16, v0
+; GFX942-NEXT:  .LBB1_6: ; %Flow3
+; GFX942-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX942-NEXT:    v_mov_b32_e32 v0, v10
+; GFX942-NEXT:    v_mov_b32_e32 v1, v11
+; GFX942-NEXT:    v_mov_b32_e32 v2, v8
+; GFX942-NEXT:    v_mov_b32_e32 v3, v9
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-O0-LABEL: v_udiv_i128_vv:
+; GFX942-O0:       ; %bb.0: ; %_udiv-special-cases
+; GFX942-O0-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-O0-NEXT:    s_xor_saveexec_b64 s[0:1], -1
+; GFX942-O0-NEXT:    scratch_store_dword off, v29, s32 offset:168 ; 4-byte Folded Spill
+; GFX942-O0-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX942-O0-NEXT:    v_mov_b32_e32 v10, v6
+; GFX942-O0-NEXT:    v_accvgpr_write_b32 a0, v4 ; Reload Reuse
+; GFX942-O0-NEXT:    v_mov_b32_e32 v12, v2
+; GFX942-O0-NEXT:    v_mov_b32_e32 v14, v0
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v0, a0 ; Reload Reuse
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0
+; GFX942-O0-NEXT:    ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v15, v1
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0
+; GFX942-O0-NEXT:    ; kill: def $vgpr12 killed $vgpr12 def $vgpr12_vgpr13 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v13, v3
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0
+; GFX942-O0-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v1, v5
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0
+; GFX942-O0-NEXT:    ; kill: def $vgpr10 killed $vgpr10 def $vgpr10_vgpr11 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v11, v7
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0_sgpr1
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0_sgpr1
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0_sgpr1
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0_sgpr1
+; GFX942-O0-NEXT:    v_mov_b64_e32 v[2:3], v[10:11]
+; GFX942-O0-NEXT:    v_accvgpr_write_b32 a1, v3 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_write_b32 a2, v2 ; Reload Reuse
+; GFX942-O0-NEXT:    v_mov_b64_e32 v[2:3], v[0:1]
+; GFX942-O0-NEXT:    v_accvgpr_write_b32 a3, v3 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_write_b32 a4, v2 ; Reload Reuse
+; GFX942-O0-NEXT:    v_mov_b64_e32 v[2:3], v[12:13]
+; GFX942-O0-NEXT:    v_accvgpr_write_b32 a5, v3 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_write_b32 a6, v2 ; Reload Reuse
+; GFX942-O0-NEXT:    v_mov_b64_e32 v[2:3], v[14:15]
+; GFX942-O0-NEXT:    v_accvgpr_write_b32 a7, v3 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_write_b32 a8, v2 ; Reload Reuse
+; GFX942-O0-NEXT:    v_mov_b32_e32 v6, v11
+; GFX942-O0-NEXT:    v_mov_b32_e32 v8, v1
+; GFX942-O0-NEXT:    v_or_b32_e64 v2, v8, v6
+; GFX942-O0-NEXT:    v_mov_b32_e32 v5, v10
+; GFX942-O0-NEXT:    v_mov_b32_e32 v4, v0
+; GFX942-O0-NEXT:    v_or_b32_e64 v0, v4, v5
+; GFX942-O0-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v1, v2
+; GFX942-O0-NEXT:    s_mov_b64 s[2:3], 0
+; GFX942-O0-NEXT:    ; implicit-def: $vgpr29 : SGPR spill to VGPR lane
+; GFX942-O0-NEXT:    v_writelane_b32 v29, s2, 0
+; GFX942-O0-NEXT:    s_nop 1
+; GFX942-O0-NEXT:    v_writelane_b32 v29, s3, 1
+; GFX942-O0-NEXT:    v_cmp_eq_u64_e64 s[0:1], v[0:1], s[2:3]
+; GFX942-O0-NEXT:    v_mov_b32_e32 v1, v13
+; GFX942-O0-NEXT:    v_mov_b32_e32 v3, v15
+; GFX942-O0-NEXT:    v_or_b32_e64 v7, v3, v1
+; GFX942-O0-NEXT:    v_mov_b32_e32 v0, v12
+; GFX942-O0-NEXT:    v_mov_b32_e32 v2, v14
+; GFX942-O0-NEXT:    v_or_b32_e64 v14, v2, v0
+; GFX942-O0-NEXT:    ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v15, v7
+; GFX942-O0-NEXT:    v_cmp_eq_u64_e64 s[4:5], v[14:15], s[2:3]
+; GFX942-O0-NEXT:    s_or_b64 s[0:1], s[0:1], s[4:5]
+; GFX942-O0-NEXT:    v_ffbh_u32_e64 v5, v5
+; GFX942-O0-NEXT:    s_mov_b32 s7, 32
+; GFX942-O0-NEXT:    v_add_u32_e64 v5, v5, s7
+; GFX942-O0-NEXT:    v_ffbh_u32_e64 v6, v6
+; GFX942-O0-NEXT:    v_min_u32_e64 v6, v5, v6
+; GFX942-O0-NEXT:    s_mov_b32 s6, 0
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr4
+; GFX942-O0-NEXT:    v_mov_b32_e32 v5, s6
+; GFX942-O0-NEXT:    ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v7, v5
+; GFX942-O0-NEXT:    v_mov_b32_e32 v5, v7
+; GFX942-O0-NEXT:    v_ffbh_u32_e64 v4, v4
+; GFX942-O0-NEXT:    v_add_u32_e64 v4, v4, s7
+; GFX942-O0-NEXT:    v_ffbh_u32_e64 v8, v8
+; GFX942-O0-NEXT:    v_min_u32_e64 v8, v4, v8
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr4
+; GFX942-O0-NEXT:    v_mov_b32_e32 v4, s6
+; GFX942-O0-NEXT:    ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v9, v4
+; GFX942-O0-NEXT:    s_mov_b64 s[4:5], 64
+; GFX942-O0-NEXT:    v_lshl_add_u64 v[8:9], v[8:9], 0, s[4:5]
+; GFX942-O0-NEXT:    v_mov_b32_e32 v4, v9
+; GFX942-O0-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-O0-NEXT:    v_cmp_ne_u64_e64 s[8:9], v[10:11], s[8:9]
+; GFX942-O0-NEXT:    s_nop 1
+; GFX942-O0-NEXT:    v_cndmask_b32_e64 v4, v4, v5, s[8:9]
+; GFX942-O0-NEXT:    ; kill: def $vgpr6 killed $vgpr6 killed $vgpr6_vgpr7 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v5, v8
+; GFX942-O0-NEXT:    v_cndmask_b32_e64 v8, v5, v6, s[8:9]
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr8
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr8
+; GFX942-O0-NEXT:    ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v9, v4
+; GFX942-O0-NEXT:    v_ffbh_u32_e64 v4, v0
+; GFX942-O0-NEXT:    v_add_u32_e64 v4, v4, s7
+; GFX942-O0-NEXT:    v_ffbh_u32_e64 v5, v1
+; GFX942-O0-NEXT:    v_min_u32_e64 v6, v4, v5
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr8
+; GFX942-O0-NEXT:    v_mov_b32_e32 v4, s6
+; GFX942-O0-NEXT:    ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v7, v4
+; GFX942-O0-NEXT:    v_mov_b32_e32 v5, v7
+; GFX942-O0-NEXT:    v_ffbh_u32_e64 v4, v2
+; GFX942-O0-NEXT:    v_add_u32_e64 v4, v4, s7
+; GFX942-O0-NEXT:    v_ffbh_u32_e64 v10, v3
+; GFX942-O0-NEXT:    v_min_u32_e64 v10, v4, v10
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr7
+; GFX942-O0-NEXT:    v_mov_b32_e32 v4, s6
+; GFX942-O0-NEXT:    ; kill: def $vgpr10 killed $vgpr10 def $vgpr10_vgpr11 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v11, v4
+; GFX942-O0-NEXT:    v_lshl_add_u64 v[10:11], v[10:11], 0, s[4:5]
+; GFX942-O0-NEXT:    v_mov_b32_e32 v4, v11
+; GFX942-O0-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX942-O0-NEXT:    v_cmp_ne_u64_e64 s[4:5], v[12:13], s[4:5]
+; GFX942-O0-NEXT:    s_nop 1
+; GFX942-O0-NEXT:    v_cndmask_b32_e64 v4, v4, v5, s[4:5]
+; GFX942-O0-NEXT:    ; kill: def $vgpr6 killed $vgpr6 killed $vgpr6_vgpr7 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v5, v10
+; GFX942-O0-NEXT:    v_cndmask_b32_e64 v10, v5, v6, s[4:5]
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr4
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr4
+; GFX942-O0-NEXT:    ; kill: def $vgpr10 killed $vgpr10 def $vgpr10_vgpr11 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v11, v4
+; GFX942-O0-NEXT:    v_mov_b32_e32 v7, v10
+; GFX942-O0-NEXT:    v_mov_b32_e32 v4, v8
+; GFX942-O0-NEXT:    v_mov_b32_e32 v6, v11
+; GFX942-O0-NEXT:    v_mov_b32_e32 v5, v9
+; GFX942-O0-NEXT:    s_mov_b32 s6, s2
+; GFX942-O0-NEXT:    s_mov_b32 s7, s3
+; GFX942-O0-NEXT:    v_sub_co_u32_e32 v4, vcc, v4, v7
+; GFX942-O0-NEXT:    s_nop 1
+; GFX942-O0-NEXT:    v_subb_co_u32_e32 v7, vcc, v5, v6, vcc
+; GFX942-O0-NEXT:    v_mov_b32_e32 v6, s6
+; GFX942-O0-NEXT:    v_mov_b32_e32 v5, s6
+; GFX942-O0-NEXT:    v_subb_co_u32_e32 v8, vcc, v5, v6, vcc
+; GFX942-O0-NEXT:    v_mov_b32_e32 v6, s7
+; GFX942-O0-NEXT:    v_mov_b32_e32 v5, s7
+; GFX942-O0-NEXT:    v_subb_co_u32_e32 v6, vcc, v5, v6, vcc
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr4
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr4
+; GFX942-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v5, v7
+; GFX942-O0-NEXT:    v_accvgpr_write_b32 a9, v5 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_write_b32 a10, v4 ; Reload Reuse
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr4
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr4
+; GFX942-O0-NEXT:    ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v9, v6
+; GFX942-O0-NEXT:    v_accvgpr_write_b32 a11, v9 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_write_b32 a12, v8 ; Reload Reuse
+; GFX942-O0-NEXT:    v_cmp_eq_u64_e64 s[4:5], v[8:9], s[2:3]
+; GFX942-O0-NEXT:    s_mov_b64 s[8:9], 0x7f
+; GFX942-O0-NEXT:    v_cmp_gt_u64_e64 s[10:11], v[4:5], s[8:9]
+; GFX942-O0-NEXT:    s_nop 1
+; GFX942-O0-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s[10:11]
+; GFX942-O0-NEXT:    v_cmp_ne_u64_e64 s[10:11], v[8:9], s[2:3]
+; GFX942-O0-NEXT:    s_nop 1
+; GFX942-O0-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s[10:11]
+; GFX942-O0-NEXT:    v_cndmask_b32_e64 v6, v6, v7, s[4:5]
+; GFX942-O0-NEXT:    v_and_b32_e64 v6, 1, v6
+; GFX942-O0-NEXT:    v_cmp_eq_u32_e64 s[4:5], v6, 1
+; GFX942-O0-NEXT:    s_or_b64 s[4:5], s[0:1], s[4:5]
+; GFX942-O0-NEXT:    s_mov_b64 s[0:1], -1
+; GFX942-O0-NEXT:    s_xor_b64 s[0:1], s[4:5], s[0:1]
+; GFX942-O0-NEXT:    v_mov_b32_e32 v6, v5
+; GFX942-O0-NEXT:    s_mov_b32 s10, s9
+; GFX942-O0-NEXT:    v_xor_b32_e64 v6, v6, s10
+; GFX942-O0-NEXT:    ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9
+; GFX942-O0-NEXT:    v_xor_b32_e64 v4, v4, s8
+; GFX942-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v5, v6
+; GFX942-O0-NEXT:    v_mov_b32_e32 v6, v5
+; GFX942-O0-NEXT:    v_mov_b32_e32 v7, v9
+; GFX942-O0-NEXT:    v_or_b32_e64 v6, v6, v7
+; GFX942-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v5, v8
+; GFX942-O0-NEXT:    v_or_b32_e64 v4, v4, v5
+; GFX942-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v5, v6
+; GFX942-O0-NEXT:    v_cmp_ne_u64_e64 s[2:3], v[4:5], s[2:3]
+; GFX942-O0-NEXT:    v_mov_b32_e32 v4, s7
+; GFX942-O0-NEXT:    v_cndmask_b32_e64 v4, v1, v4, s[4:5]
+; GFX942-O0-NEXT:    v_mov_b32_e32 v1, s6
+; GFX942-O0-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[4:5]
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr8
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr8
+; GFX942-O0-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v1, v4
+; GFX942-O0-NEXT:    v_mov_b32_e32 v4, s7
+; GFX942-O0-NEXT:    v_cndmask_b32_e64 v4, v3, v4, s[4:5]
+; GFX942-O0-NEXT:    v_mov_b32_e32 v3, s6
+; GFX942-O0-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[4:5]
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr4
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr4
+; GFX942-O0-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v3, v4
+; GFX942-O0-NEXT:    s_and_b64 s[2:3], s[0:1], s[2:3]
+; GFX942-O0-NEXT:    v_accvgpr_write_b32 a13, v3 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_write_b32 a14, v2 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_write_b32 a15, v1 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_write_b32 a16, v0 ; Reload Reuse
+; GFX942-O0-NEXT:    s_mov_b64 s[0:1], exec
+; GFX942-O0-NEXT:    v_writelane_b32 v29, s0, 2
+; GFX942-O0-NEXT:    s_nop 1
+; GFX942-O0-NEXT:    v_writelane_b32 v29, s1, 3
+; GFX942-O0-NEXT:    s_or_saveexec_b64 s[14:15], -1
+; GFX942-O0-NEXT:    v_accvgpr_write_b32 a17, v29 ; Reload Reuse
+; GFX942-O0-NEXT:    s_mov_b64 exec, s[14:15]
+; GFX942-O0-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
+; GFX942-O0-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX942-O0-NEXT:    s_cbranch_execz .LBB1_3
+; GFX942-O0-NEXT:    s_branch .LBB1_8
+; GFX942-O0-NEXT:  .LBB1_1: ; %Flow
+; GFX942-O0-NEXT:    s_or_saveexec_b64 s[14:15], -1
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v29, a17 ; Reload Reuse
+; GFX942-O0-NEXT:    s_mov_b64 exec, s[14:15]
+; GFX942-O0-NEXT:    v_readlane_b32 s0, v29, 4
+; GFX942-O0-NEXT:    v_readlane_b32 s1, v29, 5
+; GFX942-O0-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX942-O0-NEXT:  ; %bb.2: ; %Flow
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v7, a18 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v6, a19 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v5, a20 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v4, a21 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v3, a22 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v2, a23 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v1, a24 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v0, a25 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_write_b32 a26, v7 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_write_b32 a27, v6 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_write_b32 a28, v5 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_write_b32 a29, v4 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_write_b32 a30, v3 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_write_b32 a31, v2 ; Reload Reuse
+; GFX942-O0-NEXT:    scratch_store_dwordx2 off, v[0:1], s32 ; 8-byte Folded Spill
+; GFX942-O0-NEXT:    s_branch .LBB1_5
+; GFX942-O0-NEXT:  .LBB1_3: ; %Flow2
+; GFX942-O0-NEXT:    s_or_saveexec_b64 s[14:15], -1
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v29, a17 ; Reload Reuse
+; GFX942-O0-NEXT:    s_mov_b64 exec, s[14:15]
+; GFX942-O0-NEXT:    v_readlane_b32 s0, v29, 2
+; GFX942-O0-NEXT:    v_readlane_b32 s1, v29, 3
+; GFX942-O0-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v1, a13 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v0, a14 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v3, a15 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v2, a16 ; Reload Reuse
+; GFX942-O0-NEXT:    scratch_store_dwordx2 off, v[2:3], s32 offset:16 ; 8-byte Folded Spill
+; GFX942-O0-NEXT:    scratch_store_dwordx2 off, v[0:1], s32 offset:8 ; 8-byte Folded Spill
+; GFX942-O0-NEXT:    s_branch .LBB1_9
+; GFX942-O0-NEXT:  .LBB1_4: ; %udiv-loop-exit
+; GFX942-O0-NEXT:    scratch_load_dwordx2 v[6:7], off, s32 offset:24 ; 8-byte Folded Reload
+; GFX942-O0-NEXT:    scratch_load_dwordx2 v[8:9], off, s32 offset:32 ; 8-byte Folded Reload
+; GFX942-O0-NEXT:    scratch_load_dwordx2 v[0:1], off, s32 offset:40 ; 8-byte Folded Reload
+; GFX942-O0-NEXT:    scratch_load_dwordx2 v[4:5], off, s32 offset:48 ; 8-byte Folded Reload
+; GFX942-O0-NEXT:    s_mov_b32 s0, 1
+; GFX942-O0-NEXT:    s_waitcnt vmcnt(1)
+; GFX942-O0-NEXT:    v_lshlrev_b64 v[2:3], s0, v[0:1]
+; GFX942-O0-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-O0-NEXT:    v_lshlrev_b64 v[10:11], s0, v[4:5]
+; GFX942-O0-NEXT:    s_mov_b32 s0, 63
+; GFX942-O0-NEXT:    v_lshrrev_b64 v[0:1], s0, v[0:1]
+; GFX942-O0-NEXT:    v_mov_b32_e32 v5, v1
+; GFX942-O0-NEXT:    v_mov_b32_e32 v4, v11
+; GFX942-O0-NEXT:    v_mov_b32_e32 v12, v9
+; GFX942-O0-NEXT:    v_or3_b32 v4, v4, v5, v12
+; GFX942-O0-NEXT:    v_mov_b32_e32 v1, v0
+; GFX942-O0-NEXT:    v_mov_b32_e32 v0, v10
+; GFX942-O0-NEXT:    v_mov_b32_e32 v5, v8
+; GFX942-O0-NEXT:    v_or3_b32 v0, v0, v1, v5
+; GFX942-O0-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v1, v4
+; GFX942-O0-NEXT:    v_mov_b32_e32 v5, v3
+; GFX942-O0-NEXT:    v_mov_b32_e32 v4, v7
+; GFX942-O0-NEXT:    v_or_b32_e64 v4, v4, v5
+; GFX942-O0-NEXT:    v_mov_b32_e32 v3, v2
+; GFX942-O0-NEXT:    v_mov_b32_e32 v2, v6
+; GFX942-O0-NEXT:    v_or_b32_e64 v2, v2, v3
+; GFX942-O0-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v3, v4
+; GFX942-O0-NEXT:    v_accvgpr_write_b32 a13, v3 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_write_b32 a14, v2 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_write_b32 a15, v1 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_write_b32 a16, v0 ; Reload Reuse
+; GFX942-O0-NEXT:    s_branch .LBB1_3
+; GFX942-O0-NEXT:  .LBB1_5: ; %Flow1
+; GFX942-O0-NEXT:    s_or_saveexec_b64 s[14:15], -1
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v29, a17 ; Reload Reuse
+; GFX942-O0-NEXT:    s_mov_b64 exec, s[14:15]
+; GFX942-O0-NEXT:    v_readlane_b32 s0, v29, 6
+; GFX942-O0-NEXT:    v_readlane_b32 s1, v29, 7
+; GFX942-O0-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v1, a26 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v0, a27 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v3, a28 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v2, a29 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v5, a30 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v4, a31 ; Reload Reuse
+; GFX942-O0-NEXT:    scratch_load_dwordx2 v[6:7], off, s32 ; 8-byte Folded Reload
+; GFX942-O0-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-O0-NEXT:    scratch_store_dwordx2 off, v[6:7], s32 offset:32 ; 8-byte Folded Spill
+; GFX942-O0-NEXT:    scratch_store_dwordx2 off, v[4:5], s32 offset:24 ; 8-byte Folded Spill
+; GFX942-O0-NEXT:    scratch_store_dwordx2 off, v[2:3], s32 offset:48 ; 8-byte Folded Spill
+; GFX942-O0-NEXT:    scratch_store_dwordx2 off, v[0:1], s32 offset:40 ; 8-byte Folded Spill
+; GFX942-O0-NEXT:    s_branch .LBB1_4
+; GFX942-O0-NEXT:  .LBB1_6: ; %udiv-do-while
+; GFX942-O0-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-O0-NEXT:    s_or_saveexec_b64 s[14:15], -1
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v29, a17 ; Reload Reuse
+; GFX942-O0-NEXT:    s_mov_b64 exec, s[14:15]
+; GFX942-O0-NEXT:    v_readlane_b32 s2, v29, 8
+; GFX942-O0-NEXT:    v_readlane_b32 s3, v29, 9
+; GFX942-O0-NEXT:    scratch_load_dwordx2 v[6:7], off, s32 offset:56 ; 8-byte Folded Reload
+; GFX942-O0-NEXT:    scratch_load_dwordx2 v[0:1], off, s32 offset:64 ; 8-byte Folded Reload
+; GFX942-O0-NEXT:    scratch_load_dwordx2 v[26:27], off, s32 offset:72 ; 8-byte Folded Reload
+; GFX942-O0-NEXT:    scratch_load_dwordx2 v[2:3], off, s32 offset:80 ; 8-byte Folded Reload
+; GFX942-O0-NEXT:    scratch_load_dwordx2 v[16:17], off, s32 offset:88 ; 8-byte Folded Reload
+; GFX942-O0-NEXT:    scratch_load_dwordx2 v[8:9], off, s32 offset:96 ; 8-byte Folded Reload
+; GFX942-O0-NEXT:    scratch_load_dwordx2 v[24:25], off, s32 offset:104 ; 8-byte Folded Reload
+; GFX942-O0-NEXT:    scratch_load_dwordx2 v[10:11], off, s32 offset:112 ; 8-byte Folded Reload
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v19, a3 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v18, a4 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v21, a1 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v20, a2 ; Reload Reuse
+; GFX942-O0-NEXT:    scratch_load_dwordx2 v[14:15], off, s32 offset:120 ; 8-byte Folded Reload
+; GFX942-O0-NEXT:    scratch_load_dwordx2 v[22:23], off, s32 offset:128 ; 8-byte Folded Reload
+; GFX942-O0-NEXT:    s_mov_b32 s0, 63
+; GFX942-O0-NEXT:    s_waitcnt vmcnt(6)
+; GFX942-O0-NEXT:    v_lshrrev_b64 v[12:13], s0, v[2:3]
+; GFX942-O0-NEXT:    v_mov_b32_e32 v5, v13
+; GFX942-O0-NEXT:    s_mov_b32 s1, 1
+; GFX942-O0-NEXT:    v_lshlrev_b64 v[26:27], s1, v[26:27]
+; GFX942-O0-NEXT:    v_mov_b32_e32 v4, v27
+; GFX942-O0-NEXT:    v_or_b32_e64 v4, v4, v5
+; GFX942-O0-NEXT:    ; kill: def $vgpr12 killed $vgpr12 killed $vgpr12_vgpr13 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v5, v26
+; GFX942-O0-NEXT:    v_or_b32_e64 v12, v5, v12
+; GFX942-O0-NEXT:    ; kill: def $vgpr12 killed $vgpr12 def $vgpr12_vgpr13 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v13, v4
+; GFX942-O0-NEXT:    v_lshlrev_b64 v[26:27], s1, v[2:3]
+; GFX942-O0-NEXT:    v_lshrrev_b64 v[4:5], s0, v[6:7]
+; GFX942-O0-NEXT:    v_mov_b32_e32 v2, v27
+; GFX942-O0-NEXT:    v_mov_b32_e32 v3, v5
+; GFX942-O0-NEXT:    v_or_b32_e64 v2, v2, v3
+; GFX942-O0-NEXT:    v_mov_b32_e32 v3, v26
+; GFX942-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec
+; GFX942-O0-NEXT:    v_or_b32_e64 v4, v3, v4
+; GFX942-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v5, v2
+; GFX942-O0-NEXT:    v_lshlrev_b64 v[2:3], s1, v[0:1]
+; GFX942-O0-NEXT:    v_lshlrev_b64 v[26:27], s1, v[6:7]
+; GFX942-O0-NEXT:    v_lshrrev_b64 v[0:1], s0, v[0:1]
+; GFX942-O0-NEXT:    v_mov_b32_e32 v7, v1
+; GFX942-O0-NEXT:    v_mov_b32_e32 v6, v27
+; GFX942-O0-NEXT:    s_waitcnt vmcnt(3)
+; GFX942-O0-NEXT:    v_mov_b32_e32 v28, v25
+; GFX942-O0-NEXT:    v_or3_b32 v6, v6, v7, v28
+; GFX942-O0-NEXT:    v_mov_b32_e32 v1, v0
+; GFX942-O0-NEXT:    v_mov_b32_e32 v0, v26
+; GFX942-O0-NEXT:    v_mov_b32_e32 v7, v24
+; GFX942-O0-NEXT:    v_or3_b32 v0, v0, v1, v7
+; GFX942-O0-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v1, v6
+; GFX942-O0-NEXT:    v_mov_b32_e32 v7, v3
+; GFX942-O0-NEXT:    s_waitcnt vmcnt(2)
+; GFX942-O0-NEXT:    v_mov_b32_e32 v6, v11
+; GFX942-O0-NEXT:    v_or_b32_e64 v6, v6, v7
+; GFX942-O0-NEXT:    v_mov_b32_e32 v3, v2
+; GFX942-O0-NEXT:    v_mov_b32_e32 v2, v10
+; GFX942-O0-NEXT:    v_or_b32_e64 v2, v2, v3
+; GFX942-O0-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v3, v6
+; GFX942-O0-NEXT:    v_mov_b32_e32 v6, v4
+; GFX942-O0-NEXT:    v_mov_b32_e32 v10, v5
+; GFX942-O0-NEXT:    v_mov_b32_e32 v4, v12
+; GFX942-O0-NEXT:    v_mov_b32_e32 v5, v13
+; GFX942-O0-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-O0-NEXT:    v_mov_b32_e32 v13, v22
+; GFX942-O0-NEXT:    v_mov_b32_e32 v12, v23
+; GFX942-O0-NEXT:    v_mov_b32_e32 v11, v14
+; GFX942-O0-NEXT:    v_mov_b32_e32 v7, v15
+; GFX942-O0-NEXT:    v_sub_co_u32_e32 v13, vcc, v13, v6
+; GFX942-O0-NEXT:    s_nop 1
+; GFX942-O0-NEXT:    v_subb_co_u32_e32 v12, vcc, v12, v10, vcc
+; GFX942-O0-NEXT:    s_nop 1
+; GFX942-O0-NEXT:    v_subb_co_u32_e32 v12, vcc, v11, v4, vcc
+; GFX942-O0-NEXT:    s_nop 1
+; GFX942-O0-NEXT:    v_subb_co_u32_e32 v7, vcc, v7, v5, vcc
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr1
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr1
+; GFX942-O0-NEXT:    ; kill: def $vgpr12 killed $vgpr12 def $vgpr12_vgpr13 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v13, v7
+; GFX942-O0-NEXT:    v_ashrrev_i64 v[14:15], s0, v[12:13]
+; GFX942-O0-NEXT:    v_mov_b32_e32 v7, v15
+; GFX942-O0-NEXT:    s_mov_b64 s[0:1], 1
+; GFX942-O0-NEXT:    s_mov_b32 s4, s1
+; GFX942-O0-NEXT:    v_and_b32_e64 v12, v7, s4
+; GFX942-O0-NEXT:    v_mov_b32_e32 v11, v14
+; GFX942-O0-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; GFX942-O0-NEXT:    v_and_b32_e64 v14, v11, s0
+; GFX942-O0-NEXT:    ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v15, v12
+; GFX942-O0-NEXT:    v_mov_b64_e32 v[12:13], 0
+; GFX942-O0-NEXT:    v_mov_b32_e32 v22, v21
+; GFX942-O0-NEXT:    v_and_b32_e64 v22, v7, v22
+; GFX942-O0-NEXT:    v_and_b32_e64 v20, v11, v20
+; GFX942-O0-NEXT:    ; kill: def $vgpr20 killed $vgpr20 def $vgpr20_vgpr21 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v21, v22
+; GFX942-O0-NEXT:    v_mov_b32_e32 v22, v19
+; GFX942-O0-NEXT:    v_and_b32_e64 v7, v7, v22
+; GFX942-O0-NEXT:    v_and_b32_e64 v22, v11, v18
+; GFX942-O0-NEXT:    ; kill: def $vgpr22 killed $vgpr22 def $vgpr22_vgpr23 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v23, v7
+; GFX942-O0-NEXT:    v_mov_b32_e32 v19, v22
+; GFX942-O0-NEXT:    v_mov_b32_e32 v18, v23
+; GFX942-O0-NEXT:    v_mov_b32_e32 v11, v20
+; GFX942-O0-NEXT:    v_mov_b32_e32 v7, v21
+; GFX942-O0-NEXT:    v_sub_co_u32_e32 v6, vcc, v6, v19
+; GFX942-O0-NEXT:    s_nop 1
+; GFX942-O0-NEXT:    v_subb_co_u32_e32 v10, vcc, v10, v18, vcc
+; GFX942-O0-NEXT:    s_nop 1
+; GFX942-O0-NEXT:    v_subb_co_u32_e32 v4, vcc, v4, v11, vcc
+; GFX942-O0-NEXT:    s_nop 1
+; GFX942-O0-NEXT:    v_subb_co_u32_e32 v7, vcc, v5, v7, vcc
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0
+; GFX942-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v5, v7
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0
+; GFX942-O0-NEXT:    ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v7, v10
+; GFX942-O0-NEXT:    v_mov_b32_e32 v11, v8
+; GFX942-O0-NEXT:    ; kill: def $vgpr9 killed $vgpr9 killed $vgpr8_vgpr9 killed $exec
+; GFX942-O0-NEXT:    s_mov_b64 s[4:5], -1
+; GFX942-O0-NEXT:    s_mov_b32 s1, s4
+; GFX942-O0-NEXT:    s_mov_b32 s0, s5
+; GFX942-O0-NEXT:    v_mov_b32_e32 v10, v16
+; GFX942-O0-NEXT:    v_mov_b32_e32 v8, v17
+; GFX942-O0-NEXT:    v_mov_b32_e32 v16, s1
+; GFX942-O0-NEXT:    v_add_co_u32_e32 v20, vcc, v11, v16
+; GFX942-O0-NEXT:    v_mov_b32_e32 v11, s0
+; GFX942-O0-NEXT:    s_nop 0
+; GFX942-O0-NEXT:    v_addc_co_u32_e32 v9, vcc, v9, v11, vcc
+; GFX942-O0-NEXT:    v_mov_b32_e32 v11, s1
+; GFX942-O0-NEXT:    s_nop 0
+; GFX942-O0-NEXT:    v_addc_co_u32_e32 v16, vcc, v10, v11, vcc
+; GFX942-O0-NEXT:    v_mov_b32_e32 v10, s0
+; GFX942-O0-NEXT:    s_nop 0
+; GFX942-O0-NEXT:    v_addc_co_u32_e32 v8, vcc, v8, v10, vcc
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0
+; GFX942-O0-NEXT:    ; kill: def $vgpr20 killed $vgpr20 def $vgpr20_vgpr21 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v21, v9
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0
+; GFX942-O0-NEXT:    ; kill: def $vgpr16 killed $vgpr16 def $vgpr16_vgpr17 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v17, v8
+; GFX942-O0-NEXT:    v_mov_b64_e32 v[8:9], v[16:17]
+; GFX942-O0-NEXT:    v_mov_b64_e32 v[10:11], v[20:21]
+; GFX942-O0-NEXT:    v_mov_b32_e32 v19, v17
+; GFX942-O0-NEXT:    v_mov_b32_e32 v18, v21
+; GFX942-O0-NEXT:    v_or_b32_e64 v18, v18, v19
+; GFX942-O0-NEXT:    v_mov_b32_e32 v17, v16
+; GFX942-O0-NEXT:    v_mov_b32_e32 v16, v20
+; GFX942-O0-NEXT:    v_or_b32_e64 v16, v16, v17
+; GFX942-O0-NEXT:    ; kill: def $vgpr16 killed $vgpr16 def $vgpr16_vgpr17 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v17, v18
+; GFX942-O0-NEXT:    v_cmp_eq_u64_e64 s[0:1], v[16:17], v[12:13]
+; GFX942-O0-NEXT:    s_or_b64 s[0:1], s[0:1], s[2:3]
+; GFX942-O0-NEXT:    v_mov_b64_e32 v[16:17], v[2:3]
+; GFX942-O0-NEXT:    v_accvgpr_write_b32 a18, v17 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_write_b32 a19, v16 ; Reload Reuse
+; GFX942-O0-NEXT:    v_mov_b64_e32 v[16:17], v[0:1]
+; GFX942-O0-NEXT:    v_accvgpr_write_b32 a20, v17 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_write_b32 a21, v16 ; Reload Reuse
+; GFX942-O0-NEXT:    v_mov_b64_e32 v[16:17], v[14:15]
+; GFX942-O0-NEXT:    v_accvgpr_write_b32 a22, v17 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_write_b32 a23, v16 ; Reload Reuse
+; GFX942-O0-NEXT:    v_mov_b64_e32 v[16:17], v[12:13]
+; GFX942-O0-NEXT:    v_accvgpr_write_b32 a24, v17 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_write_b32 a25, v16 ; Reload Reuse
+; GFX942-O0-NEXT:    s_mov_b64 s[2:3], s[0:1]
+; GFX942-O0-NEXT:    v_writelane_b32 v29, s2, 4
+; GFX942-O0-NEXT:    s_nop 1
+; GFX942-O0-NEXT:    v_writelane_b32 v29, s3, 5
+; GFX942-O0-NEXT:    s_mov_b64 s[2:3], s[0:1]
+; GFX942-O0-NEXT:    v_writelane_b32 v29, s2, 8
+; GFX942-O0-NEXT:    s_nop 1
+; GFX942-O0-NEXT:    v_writelane_b32 v29, s3, 9
+; GFX942-O0-NEXT:    s_or_saveexec_b64 s[14:15], -1
+; GFX942-O0-NEXT:    v_accvgpr_write_b32 a17, v29 ; Reload Reuse
+; GFX942-O0-NEXT:    s_mov_b64 exec, s[14:15]
+; GFX942-O0-NEXT:    scratch_store_dwordx2 off, v[14:15], s32 offset:112 ; 8-byte Folded Spill
+; GFX942-O0-NEXT:    scratch_store_dwordx2 off, v[12:13], s32 offset:104 ; 8-byte Folded Spill
+; GFX942-O0-NEXT:    scratch_store_dwordx2 off, v[10:11], s32 offset:96 ; 8-byte Folded Spill
+; GFX942-O0-NEXT:    scratch_store_dwordx2 off, v[8:9], s32 offset:88 ; 8-byte Folded Spill
+; GFX942-O0-NEXT:    scratch_store_dwordx2 off, v[6:7], s32 offset:80 ; 8-byte Folded Spill
+; GFX942-O0-NEXT:    scratch_store_dwordx2 off, v[4:5], s32 offset:72 ; 8-byte Folded Spill
+; GFX942-O0-NEXT:    scratch_store_dwordx2 off, v[2:3], s32 offset:64 ; 8-byte Folded Spill
+; GFX942-O0-NEXT:    scratch_store_dwordx2 off, v[0:1], s32 offset:56 ; 8-byte Folded Spill
+; GFX942-O0-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; GFX942-O0-NEXT:    s_cbranch_execnz .LBB1_6
+; GFX942-O0-NEXT:    s_branch .LBB1_1
+; GFX942-O0-NEXT:  .LBB1_7: ; %udiv-preheader
+; GFX942-O0-NEXT:    s_or_saveexec_b64 s[14:15], -1
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v29, a17 ; Reload Reuse
+; GFX942-O0-NEXT:    s_mov_b64 exec, s[14:15]
+; GFX942-O0-NEXT:    scratch_load_dwordx2 v[0:1], off, s32 offset:136 ; 8-byte Folded Reload
+; GFX942-O0-NEXT:    scratch_load_dwordx2 v[2:3], off, s32 offset:144 ; 8-byte Folded Reload
+; GFX942-O0-NEXT:    scratch_load_dwordx2 v[8:9], off, s32 offset:152 ; 8-byte Folded Reload
+; GFX942-O0-NEXT:    scratch_load_dwordx2 v[10:11], off, s32 offset:160 ; 8-byte Folded Reload
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v17, a1 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v16, a2 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v13, a3 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v12, a4 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v15, a5 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v14, a6 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v19, a7 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v18, a8 ; Reload Reuse
+; GFX942-O0-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-O0-NEXT:    v_mov_b32_e32 v4, v10
+; GFX942-O0-NEXT:    v_lshrrev_b64 v[6:7], v4, v[18:19]
+; GFX942-O0-NEXT:    v_mov_b32_e32 v5, v7
+; GFX942-O0-NEXT:    s_mov_b32 s0, 64
+; GFX942-O0-NEXT:    v_sub_u32_e64 v20, s0, v4
+; GFX942-O0-NEXT:    v_lshlrev_b64 v[20:21], v20, v[14:15]
+; GFX942-O0-NEXT:    v_mov_b32_e32 v22, v21
+; GFX942-O0-NEXT:    v_or_b32_e64 v5, v5, v22
+; GFX942-O0-NEXT:    ; kill: def $vgpr6 killed $vgpr6 killed $vgpr6_vgpr7 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v7, v20
+; GFX942-O0-NEXT:    v_or_b32_e64 v6, v6, v7
+; GFX942-O0-NEXT:    ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v7, v5
+; GFX942-O0-NEXT:    v_mov_b32_e32 v22, v7
+; GFX942-O0-NEXT:    v_cmp_lt_u32_e64 s[2:3], v4, s0
+; GFX942-O0-NEXT:    v_sub_u32_e64 v5, v4, s0
+; GFX942-O0-NEXT:    v_lshrrev_b64 v[20:21], v5, v[14:15]
+; GFX942-O0-NEXT:    v_mov_b32_e32 v5, v21
+; GFX942-O0-NEXT:    v_cndmask_b32_e64 v5, v5, v22, s[2:3]
+; GFX942-O0-NEXT:    s_mov_b32 s0, 0
+; GFX942-O0-NEXT:    v_cmp_eq_u32_e64 s[0:1], v4, s0
+; GFX942-O0-NEXT:    v_mov_b32_e32 v22, v19
+; GFX942-O0-NEXT:    s_nop 0
+; GFX942-O0-NEXT:    v_cndmask_b32_e64 v5, v5, v22, s[0:1]
+; GFX942-O0-NEXT:    v_mov_b32_e32 v7, v6
+; GFX942-O0-NEXT:    v_mov_b32_e32 v6, v20
+; GFX942-O0-NEXT:    v_cndmask_b32_e64 v6, v6, v7, s[2:3]
+; GFX942-O0-NEXT:    v_mov_b32_e32 v7, v18
+; GFX942-O0-NEXT:    v_cndmask_b32_e64 v6, v6, v7, s[0:1]
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0
+; GFX942-O0-NEXT:    ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v7, v5
+; GFX942-O0-NEXT:    v_lshrrev_b64 v[4:5], v4, v[14:15]
+; GFX942-O0-NEXT:    v_mov_b32_e32 v15, v5
+; GFX942-O0-NEXT:    s_mov_b64 s[0:1], 0
+; GFX942-O0-NEXT:    s_mov_b32 s4, s1
+; GFX942-O0-NEXT:    v_mov_b32_e32 v14, s4
+; GFX942-O0-NEXT:    v_cndmask_b32_e64 v14, v14, v15, s[2:3]
+; GFX942-O0-NEXT:    v_mov_b32_e32 v5, v4
+; GFX942-O0-NEXT:    s_mov_b32 s4, s0
+; GFX942-O0-NEXT:    v_mov_b32_e32 v4, s4
+; GFX942-O0-NEXT:    v_cndmask_b32_e64 v4, v4, v5, s[2:3]
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr2
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr2
+; GFX942-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v5, v14
+; GFX942-O0-NEXT:    v_mov_b32_e32 v15, v12
+; GFX942-O0-NEXT:    v_mov_b32_e32 v12, v13
+; GFX942-O0-NEXT:    s_mov_b64 s[4:5], -1
+; GFX942-O0-NEXT:    s_mov_b32 s3, s4
+; GFX942-O0-NEXT:    s_mov_b32 s2, s5
+; GFX942-O0-NEXT:    v_mov_b32_e32 v14, v16
+; GFX942-O0-NEXT:    v_mov_b32_e32 v13, v17
+; GFX942-O0-NEXT:    v_mov_b32_e32 v16, s3
+; GFX942-O0-NEXT:    v_add_co_u32_e32 v16, vcc, v15, v16
+; GFX942-O0-NEXT:    v_mov_b32_e32 v15, s2
+; GFX942-O0-NEXT:    s_nop 0
+; GFX942-O0-NEXT:    v_addc_co_u32_e32 v12, vcc, v12, v15, vcc
+; GFX942-O0-NEXT:    v_mov_b32_e32 v15, s3
+; GFX942-O0-NEXT:    s_nop 0
+; GFX942-O0-NEXT:    v_addc_co_u32_e32 v18, vcc, v14, v15, vcc
+; GFX942-O0-NEXT:    v_mov_b32_e32 v14, s2
+; GFX942-O0-NEXT:    s_nop 0
+; GFX942-O0-NEXT:    v_addc_co_u32_e32 v13, vcc, v13, v14, vcc
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr2
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr2
+; GFX942-O0-NEXT:    ; kill: def $vgpr18 killed $vgpr18 def $vgpr18_vgpr19 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v19, v13
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr2
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr2
+; GFX942-O0-NEXT:    ; kill: def $vgpr16 killed $vgpr16 def $vgpr16_vgpr17 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v17, v12
+; GFX942-O0-NEXT:    v_mov_b64_e32 v[12:13], s[0:1]
+; GFX942-O0-NEXT:    v_mov_b64_e32 v[14:15], s[0:1]
+; GFX942-O0-NEXT:    scratch_store_dwordx2 off, v[18:19], s32 offset:120 ; 8-byte Folded Spill
+; GFX942-O0-NEXT:    scratch_store_dwordx2 off, v[16:17], s32 offset:128 ; 8-byte Folded Spill
+; GFX942-O0-NEXT:    v_writelane_b32 v29, s0, 8
+; GFX942-O0-NEXT:    s_nop 1
+; GFX942-O0-NEXT:    v_writelane_b32 v29, s1, 9
+; GFX942-O0-NEXT:    s_or_saveexec_b64 s[14:15], -1
+; GFX942-O0-NEXT:    v_accvgpr_write_b32 a17, v29 ; Reload Reuse
+; GFX942-O0-NEXT:    s_mov_b64 exec, s[14:15]
+; GFX942-O0-NEXT:    scratch_store_dwordx2 off, v[14:15], s32 offset:112 ; 8-byte Folded Spill
+; GFX942-O0-NEXT:    scratch_store_dwordx2 off, v[12:13], s32 offset:104 ; 8-byte Folded Spill
+; GFX942-O0-NEXT:    scratch_store_dwordx2 off, v[10:11], s32 offset:96 ; 8-byte Folded Spill
+; GFX942-O0-NEXT:    scratch_store_dwordx2 off, v[8:9], s32 offset:88 ; 8-byte Folded Spill
+; GFX942-O0-NEXT:    scratch_store_dwordx2 off, v[6:7], s32 offset:80 ; 8-byte Folded Spill
+; GFX942-O0-NEXT:    scratch_store_dwordx2 off, v[4:5], s32 offset:72 ; 8-byte Folded Spill
+; GFX942-O0-NEXT:    scratch_store_dwordx2 off, v[2:3], s32 offset:64 ; 8-byte Folded Spill
+; GFX942-O0-NEXT:    scratch_store_dwordx2 off, v[0:1], s32 offset:56 ; 8-byte Folded Spill
+; GFX942-O0-NEXT:    s_branch .LBB1_6
+; GFX942-O0-NEXT:  .LBB1_8: ; %udiv-bb1
+; GFX942-O0-NEXT:    s_or_saveexec_b64 s[14:15], -1
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v29, a17 ; Reload Reuse
+; GFX942-O0-NEXT:    s_mov_b64 exec, s[14:15]
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v7, a7 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v6, a8 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v11, a5 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v10, a6 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v5, a11 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v4, a12 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v1, a9 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v0, a10 ; Reload Reuse
+; GFX942-O0-NEXT:    s_mov_b64 s[2:3], 1
+; GFX942-O0-NEXT:    s_mov_b32 s1, s2
+; GFX942-O0-NEXT:    v_mov_b32_e32 v3, v0
+; GFX942-O0-NEXT:    s_mov_b32 s0, s3
+; GFX942-O0-NEXT:    s_mov_b64 s[2:3], 0
+; GFX942-O0-NEXT:    s_mov_b32 s4, s2
+; GFX942-O0-NEXT:    s_mov_b32 s5, s3
+; GFX942-O0-NEXT:    v_mov_b32_e32 v0, v4
+; GFX942-O0-NEXT:    v_mov_b32_e32 v2, v5
+; GFX942-O0-NEXT:    v_mov_b32_e32 v4, s1
+; GFX942-O0-NEXT:    v_add_co_u32_e32 v8, vcc, v3, v4
+; GFX942-O0-NEXT:    v_mov_b32_e32 v4, s0
+; GFX942-O0-NEXT:    s_nop 0
+; GFX942-O0-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v4, vcc
+; GFX942-O0-NEXT:    v_mov_b32_e32 v4, s4
+; GFX942-O0-NEXT:    s_nop 0
+; GFX942-O0-NEXT:    v_addc_co_u32_e32 v0, vcc, v0, v4, vcc
+; GFX942-O0-NEXT:    v_mov_b32_e32 v4, s5
+; GFX942-O0-NEXT:    s_nop 0
+; GFX942-O0-NEXT:    v_addc_co_u32_e32 v2, vcc, v2, v4, vcc
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0
+; GFX942-O0-NEXT:    ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v9, v1
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0
+; GFX942-O0-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v1, v2
+; GFX942-O0-NEXT:    v_mov_b64_e32 v[4:5], v[0:1]
+; GFX942-O0-NEXT:    scratch_store_dwordx2 off, v[4:5], s32 offset:152 ; 8-byte Folded Spill
+; GFX942-O0-NEXT:    v_mov_b64_e32 v[4:5], v[8:9]
+; GFX942-O0-NEXT:    scratch_store_dwordx2 off, v[4:5], s32 offset:160 ; 8-byte Folded Spill
+; GFX942-O0-NEXT:    s_mov_b32 s0, 0x7f
+; GFX942-O0-NEXT:    v_sub_u32_e64 v2, s0, v3
+; GFX942-O0-NEXT:    v_lshlrev_b64 v[4:5], v2, v[10:11]
+; GFX942-O0-NEXT:    v_mov_b32_e32 v12, v5
+; GFX942-O0-NEXT:    s_mov_b32 s0, 64
+; GFX942-O0-NEXT:    v_sub_u32_e64 v13, s0, v2
+; GFX942-O0-NEXT:    v_lshrrev_b64 v[14:15], v13, v[6:7]
+; GFX942-O0-NEXT:    v_mov_b32_e32 v13, v15
+; GFX942-O0-NEXT:    v_or_b32_e64 v12, v12, v13
+; GFX942-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v5, v14
+; GFX942-O0-NEXT:    v_or_b32_e64 v4, v4, v5
+; GFX942-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v5, v12
+; GFX942-O0-NEXT:    v_mov_b32_e32 v14, v5
+; GFX942-O0-NEXT:    v_cmp_lt_u32_e64 s[0:1], v2, s0
+; GFX942-O0-NEXT:    s_mov_b32 s6, 63
+; GFX942-O0-NEXT:    v_sub_u32_e64 v3, s6, v3
+; GFX942-O0-NEXT:    v_lshlrev_b64 v[12:13], v3, v[6:7]
+; GFX942-O0-NEXT:    v_mov_b32_e32 v3, v13
+; GFX942-O0-NEXT:    v_cndmask_b32_e64 v3, v3, v14, s[0:1]
+; GFX942-O0-NEXT:    s_mov_b32 s6, 0
+; GFX942-O0-NEXT:    v_cmp_eq_u32_e64 s[6:7], v2, s6
+; GFX942-O0-NEXT:    v_mov_b32_e32 v14, v11
+; GFX942-O0-NEXT:    s_nop 0
+; GFX942-O0-NEXT:    v_cndmask_b32_e64 v3, v3, v14, s[6:7]
+; GFX942-O0-NEXT:    v_mov_b32_e32 v5, v4
+; GFX942-O0-NEXT:    v_mov_b32_e32 v4, v12
+; GFX942-O0-NEXT:    v_cndmask_b32_e64 v4, v4, v5, s[0:1]
+; GFX942-O0-NEXT:    v_mov_b32_e32 v5, v10
+; GFX942-O0-NEXT:    v_cndmask_b32_e64 v4, v4, v5, s[6:7]
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr6
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr6
+; GFX942-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v5, v3
+; GFX942-O0-NEXT:    v_lshlrev_b64 v[6:7], v2, v[6:7]
+; GFX942-O0-NEXT:    v_mov_b32_e32 v3, v7
+; GFX942-O0-NEXT:    v_mov_b32_e32 v2, s5
+; GFX942-O0-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[0:1]
+; GFX942-O0-NEXT:    ; kill: def $vgpr6 killed $vgpr6 killed $vgpr6_vgpr7 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v3, s4
+; GFX942-O0-NEXT:    v_cndmask_b32_e64 v6, v3, v6, s[0:1]
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0
+; GFX942-O0-NEXT:    ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v7, v2
+; GFX942-O0-NEXT:    scratch_store_dwordx2 off, v[6:7], s32 offset:144 ; 8-byte Folded Spill
+; GFX942-O0-NEXT:    scratch_store_dwordx2 off, v[4:5], s32 offset:136 ; 8-byte Folded Spill
+; GFX942-O0-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-O0-NEXT:    v_mov_b32_e32 v2, v9
+; GFX942-O0-NEXT:    v_or_b32_e64 v2, v2, v3
+; GFX942-O0-NEXT:    v_mov_b32_e32 v1, v0
+; GFX942-O0-NEXT:    v_mov_b32_e32 v0, v8
+; GFX942-O0-NEXT:    v_or_b32_e64 v0, v0, v1
+; GFX942-O0-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v1, v2
+; GFX942-O0-NEXT:    v_cmp_ne_u64_e64 s[0:1], v[0:1], s[2:3]
+; GFX942-O0-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX942-O0-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
+; GFX942-O0-NEXT:    v_accvgpr_write_b32 a26, v7 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_write_b32 a27, v6 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_write_b32 a28, v5 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_write_b32 a29, v4 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_write_b32 a30, v3 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_write_b32 a31, v2 ; Reload Reuse
+; GFX942-O0-NEXT:    scratch_store_dwordx2 off, v[0:1], s32 ; 8-byte Folded Spill
+; GFX942-O0-NEXT:    s_mov_b64 s[2:3], exec
+; GFX942-O0-NEXT:    s_and_b64 s[0:1], s[2:3], s[0:1]
+; GFX942-O0-NEXT:    s_xor_b64 s[2:3], s[0:1], s[2:3]
+; GFX942-O0-NEXT:    v_writelane_b32 v29, s2, 6
+; GFX942-O0-NEXT:    s_nop 1
+; GFX942-O0-NEXT:    v_writelane_b32 v29, s3, 7
+; GFX942-O0-NEXT:    s_or_saveexec_b64 s[14:15], -1
+; GFX942-O0-NEXT:    v_accvgpr_write_b32 a17, v29 ; Reload Reuse
+; GFX942-O0-NEXT:    s_mov_b64 exec, s[14:15]
+; GFX942-O0-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX942-O0-NEXT:    s_cbranch_execz .LBB1_5
+; GFX942-O0-NEXT:    s_branch .LBB1_7
+; GFX942-O0-NEXT:  .LBB1_9: ; %udiv-end
+; GFX942-O0-NEXT:    scratch_load_dwordx2 v[4:5], off, s32 offset:16 ; 8-byte Folded Reload
+; GFX942-O0-NEXT:    scratch_load_dwordx2 v[6:7], off, s32 offset:8 ; 8-byte Folded Reload
+; GFX942-O0-NEXT:    s_mov_b32 s0, 32
+; GFX942-O0-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-O0-NEXT:    v_lshrrev_b64 v[0:1], s0, v[6:7]
+; GFX942-O0-NEXT:    v_mov_b32_e32 v1, v0
+; GFX942-O0-NEXT:    v_lshrrev_b64 v[2:3], s0, v[4:5]
+; GFX942-O0-NEXT:    v_mov_b32_e32 v3, v2
+; GFX942-O0-NEXT:    v_mov_b32_e32 v0, v6
+; GFX942-O0-NEXT:    v_mov_b32_e32 v2, v4
+; GFX942-O0-NEXT:    s_xor_saveexec_b64 s[0:1], -1
+; GFX942-O0-NEXT:    scratch_load_dword v29, off, s32 offset:168 ; 4-byte Folded Reload
+; GFX942-O0-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX942-O0-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-O0-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-G-LABEL: v_udiv_i128_vv:
+; GFX942-G:       ; %bb.0: ; %_udiv-special-cases
+; GFX942-G-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-G-NEXT:    v_mov_b32_e32 v8, v0
+; GFX942-G-NEXT:    v_mov_b32_e32 v9, v1
+; GFX942-G-NEXT:    v_or_b32_e32 v0, v4, v6
+; GFX942-G-NEXT:    v_or_b32_e32 v1, v5, v7
+; GFX942-G-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[0:1]
+; GFX942-G-NEXT:    v_or_b32_e32 v0, v8, v2
+; GFX942-G-NEXT:    v_or_b32_e32 v1, v9, v3
+; GFX942-G-NEXT:    v_cmp_eq_u64_e64 s[0:1], 0, v[0:1]
+; GFX942-G-NEXT:    v_ffbh_u32_e32 v1, v4
+; GFX942-G-NEXT:    v_ffbh_u32_e32 v0, v5
+; GFX942-G-NEXT:    v_add_u32_e32 v1, 32, v1
+; GFX942-G-NEXT:    v_ffbh_u32_e32 v10, v6
+; GFX942-G-NEXT:    v_min_u32_e32 v0, v0, v1
+; GFX942-G-NEXT:    v_ffbh_u32_e32 v1, v7
+; GFX942-G-NEXT:    v_add_u32_e32 v10, 32, v10
+; GFX942-G-NEXT:    v_add_u32_e32 v0, 64, v0
+; GFX942-G-NEXT:    v_min_u32_e32 v1, v1, v10
+; GFX942-G-NEXT:    v_cmp_eq_u64_e64 s[2:3], 0, v[6:7]
+; GFX942-G-NEXT:    v_ffbh_u32_e32 v10, v8
+; GFX942-G-NEXT:    v_add_u32_e32 v10, 32, v10
+; GFX942-G-NEXT:    v_cndmask_b32_e64 v0, v1, v0, s[2:3]
+; GFX942-G-NEXT:    v_ffbh_u32_e32 v1, v9
+; GFX942-G-NEXT:    v_ffbh_u32_e32 v11, v2
+; GFX942-G-NEXT:    v_min_u32_e32 v1, v1, v10
+; GFX942-G-NEXT:    v_ffbh_u32_e32 v10, v3
+; GFX942-G-NEXT:    v_add_u32_e32 v11, 32, v11
+; GFX942-G-NEXT:    v_add_u32_e32 v1, 64, v1
+; GFX942-G-NEXT:    v_min_u32_e32 v10, v10, v11
+; GFX942-G-NEXT:    v_cmp_eq_u64_e64 s[2:3], 0, v[2:3]
+; GFX942-G-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-G-NEXT:    s_mov_b64 s[4:5], 0
+; GFX942-G-NEXT:    v_cndmask_b32_e64 v1, v10, v1, s[2:3]
+; GFX942-G-NEXT:    v_sub_co_u32_e64 v12, s[2:3], v0, v1
+; GFX942-G-NEXT:    v_mov_b64_e32 v[0:1], 0x7f
+; GFX942-G-NEXT:    s_nop 0
+; GFX942-G-NEXT:    v_subb_co_u32_e64 v13, s[2:3], 0, 0, s[2:3]
+; GFX942-G-NEXT:    s_nop 1
+; GFX942-G-NEXT:    v_subb_co_u32_e64 v14, s[2:3], 0, 0, s[2:3]
+; GFX942-G-NEXT:    s_nop 1
+; GFX942-G-NEXT:    v_subb_co_u32_e64 v15, s[2:3], 0, 0, s[2:3]
+; GFX942-G-NEXT:    v_cmp_gt_u64_e64 s[2:3], v[12:13], v[0:1]
+; GFX942-G-NEXT:    v_or_b32_e32 v17, v13, v15
+; GFX942-G-NEXT:    s_nop 0
+; GFX942-G-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[2:3]
+; GFX942-G-NEXT:    v_cmp_lt_u64_e64 s[2:3], 0, v[14:15]
+; GFX942-G-NEXT:    s_nop 1
+; GFX942-G-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[2:3]
+; GFX942-G-NEXT:    v_cmp_eq_u64_e64 s[2:3], 0, v[14:15]
+; GFX942-G-NEXT:    s_nop 1
+; GFX942-G-NEXT:    v_cndmask_b32_e64 v0, v1, v0, s[2:3]
+; GFX942-G-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[0:1]
+; GFX942-G-NEXT:    v_or_b32_e32 v18, v1, v0
+; GFX942-G-NEXT:    v_xor_b32_e32 v0, 0x7f, v12
+; GFX942-G-NEXT:    v_or_b32_e32 v16, v0, v14
+; GFX942-G-NEXT:    v_and_b32_e32 v0, 1, v18
+; GFX942-G-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
+; GFX942-G-NEXT:    s_nop 1
+; GFX942-G-NEXT:    v_cndmask_b32_e64 v0, v8, 0, vcc
+; GFX942-G-NEXT:    v_cndmask_b32_e64 v1, v9, 0, vcc
+; GFX942-G-NEXT:    v_cndmask_b32_e64 v10, v2, 0, vcc
+; GFX942-G-NEXT:    v_cndmask_b32_e64 v11, v3, 0, vcc
+; GFX942-G-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[16:17]
+; GFX942-G-NEXT:    s_nop 1
+; GFX942-G-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
+; GFX942-G-NEXT:    v_or_b32_e32 v16, v18, v16
+; GFX942-G-NEXT:    v_and_b32_e32 v16, 1, v16
+; GFX942-G-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v16
+; GFX942-G-NEXT:    s_xor_b64 s[0:1], vcc, -1
+; GFX942-G-NEXT:    s_and_saveexec_b64 s[2:3], s[0:1]
+; GFX942-G-NEXT:    s_cbranch_execz .LBB1_6
+; GFX942-G-NEXT:  ; %bb.1: ; %udiv-bb1
+; GFX942-G-NEXT:    v_add_co_u32_e32 v0, vcc, 1, v12
+; GFX942-G-NEXT:    s_mov_b64 s[6:7], s[4:5]
+; GFX942-G-NEXT:    s_nop 0
+; GFX942-G-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v13, vcc
+; GFX942-G-NEXT:    s_nop 1
+; GFX942-G-NEXT:    v_addc_co_u32_e32 v10, vcc, 0, v14, vcc
+; GFX942-G-NEXT:    s_nop 1
+; GFX942-G-NEXT:    v_addc_co_u32_e32 v11, vcc, 0, v15, vcc
+; GFX942-G-NEXT:    s_xor_b64 s[0:1], vcc, -1
+; GFX942-G-NEXT:    v_sub_co_u32_e32 v18, vcc, 0x7f, v12
+; GFX942-G-NEXT:    v_sub_u32_e32 v14, 64, v18
+; GFX942-G-NEXT:    v_add_u32_e32 v19, 0xffffffc0, v18
+; GFX942-G-NEXT:    v_lshrrev_b64 v[14:15], v14, v[8:9]
+; GFX942-G-NEXT:    v_lshlrev_b64 v[16:17], v18, v[2:3]
+; GFX942-G-NEXT:    v_lshlrev_b64 v[12:13], v18, v[8:9]
+; GFX942-G-NEXT:    v_or_b32_e32 v20, v14, v16
+; GFX942-G-NEXT:    v_or_b32_e32 v21, v15, v17
+; GFX942-G-NEXT:    v_lshlrev_b64 v[14:15], v19, v[8:9]
+; GFX942-G-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v18
+; GFX942-G-NEXT:    s_nop 1
+; GFX942-G-NEXT:    v_cndmask_b32_e32 v16, 0, v12, vcc
+; GFX942-G-NEXT:    v_cndmask_b32_e32 v17, 0, v13, vcc
+; GFX942-G-NEXT:    v_cndmask_b32_e32 v12, v14, v20, vcc
+; GFX942-G-NEXT:    v_cndmask_b32_e32 v13, v15, v21, vcc
+; GFX942-G-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v18
+; GFX942-G-NEXT:    s_nop 1
+; GFX942-G-NEXT:    v_cndmask_b32_e32 v18, v12, v2, vcc
+; GFX942-G-NEXT:    v_cndmask_b32_e32 v19, v13, v3, vcc
+; GFX942-G-NEXT:    v_mov_b64_e32 v[14:15], s[6:7]
+; GFX942-G-NEXT:    v_mov_b64_e32 v[12:13], s[4:5]
+; GFX942-G-NEXT:    s_and_saveexec_b64 s[4:5], s[0:1]
+; GFX942-G-NEXT:    s_xor_b64 s[8:9], exec, s[4:5]
+; GFX942-G-NEXT:    s_cbranch_execz .LBB1_5
+; GFX942-G-NEXT:  ; %bb.2: ; %udiv-preheader
+; GFX942-G-NEXT:    v_sub_u32_e32 v20, 64, v0
+; GFX942-G-NEXT:    v_add_u32_e32 v22, 0xffffffc0, v0
+; GFX942-G-NEXT:    v_lshrrev_b64 v[14:15], v0, v[8:9]
+; GFX942-G-NEXT:    v_lshlrev_b64 v[20:21], v20, v[2:3]
+; GFX942-G-NEXT:    v_lshrrev_b64 v[12:13], v0, v[2:3]
+; GFX942-G-NEXT:    v_or_b32_e32 v14, v14, v20
+; GFX942-G-NEXT:    v_or_b32_e32 v15, v15, v21
+; GFX942-G-NEXT:    v_lshrrev_b64 v[2:3], v22, v[2:3]
+; GFX942-G-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v0
+; GFX942-G-NEXT:    s_mov_b64 s[4:5], 0
+; GFX942-G-NEXT:    s_mov_b64 s[6:7], s[4:5]
+; GFX942-G-NEXT:    v_cndmask_b32_e32 v2, v2, v14, vcc
+; GFX942-G-NEXT:    v_cndmask_b32_e32 v3, v3, v15, vcc
+; GFX942-G-NEXT:    v_cndmask_b32_e32 v20, 0, v12, vcc
+; GFX942-G-NEXT:    v_cndmask_b32_e32 v21, 0, v13, vcc
+; GFX942-G-NEXT:    v_add_co_u32_e32 v22, vcc, -1, v4
+; GFX942-G-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v0
+; GFX942-G-NEXT:    s_nop 0
+; GFX942-G-NEXT:    v_addc_co_u32_e32 v23, vcc, -1, v5, vcc
+; GFX942-G-NEXT:    v_mov_b64_e32 v[14:15], s[6:7]
+; GFX942-G-NEXT:    s_nop 0
+; GFX942-G-NEXT:    v_addc_co_u32_e32 v24, vcc, -1, v6, vcc
+; GFX942-G-NEXT:    v_cndmask_b32_e64 v8, v2, v8, s[0:1]
+; GFX942-G-NEXT:    v_cndmask_b32_e64 v9, v3, v9, s[0:1]
+; GFX942-G-NEXT:    v_addc_co_u32_e32 v25, vcc, -1, v7, vcc
+; GFX942-G-NEXT:    v_mov_b32_e32 v3, 0
+; GFX942-G-NEXT:    v_mov_b64_e32 v[12:13], s[4:5]
+; GFX942-G-NEXT:  .LBB1_3: ; %udiv-do-while
+; GFX942-G-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-G-NEXT:    v_lshlrev_b64 v[20:21], 1, v[20:21]
+; GFX942-G-NEXT:    v_lshrrev_b32_e32 v2, 31, v9
+; GFX942-G-NEXT:    v_lshlrev_b64 v[14:15], 1, v[8:9]
+; GFX942-G-NEXT:    v_or_b32_e32 v20, v20, v2
+; GFX942-G-NEXT:    v_lshrrev_b32_e32 v2, 31, v19
+; GFX942-G-NEXT:    v_or_b32_e32 v14, v14, v2
+; GFX942-G-NEXT:    v_lshlrev_b64 v[18:19], 1, v[18:19]
+; GFX942-G-NEXT:    v_lshrrev_b32_e32 v2, 31, v17
+; GFX942-G-NEXT:    v_or_b32_e32 v18, v18, v2
+; GFX942-G-NEXT:    v_sub_co_u32_e32 v2, vcc, v22, v14
+; GFX942-G-NEXT:    v_lshlrev_b64 v[8:9], 1, v[16:17]
+; GFX942-G-NEXT:    s_nop 0
+; GFX942-G-NEXT:    v_subb_co_u32_e32 v2, vcc, v23, v15, vcc
+; GFX942-G-NEXT:    v_or_b32_e32 v16, v12, v8
+; GFX942-G-NEXT:    s_nop 0
+; GFX942-G-NEXT:    v_subb_co_u32_e32 v2, vcc, v24, v20, vcc
+; GFX942-G-NEXT:    v_add_co_u32_e64 v0, s[0:1], -1, v0
+; GFX942-G-NEXT:    s_nop 0
+; GFX942-G-NEXT:    v_subb_co_u32_e32 v2, vcc, v25, v21, vcc
+; GFX942-G-NEXT:    v_ashrrev_i32_e32 v8, 31, v2
+; GFX942-G-NEXT:    v_and_b32_e32 v2, 1, v8
+; GFX942-G-NEXT:    v_addc_co_u32_e64 v1, s[0:1], -1, v1, s[0:1]
+; GFX942-G-NEXT:    v_or_b32_e32 v17, v13, v9
+; GFX942-G-NEXT:    v_mov_b64_e32 v[12:13], v[2:3]
+; GFX942-G-NEXT:    v_and_b32_e32 v2, v8, v4
+; GFX942-G-NEXT:    v_addc_co_u32_e64 v10, s[0:1], -1, v10, s[0:1]
+; GFX942-G-NEXT:    v_and_b32_e32 v9, v8, v5
+; GFX942-G-NEXT:    v_and_b32_e32 v26, v8, v6
+; GFX942-G-NEXT:    v_and_b32_e32 v27, v8, v7
+; GFX942-G-NEXT:    v_sub_co_u32_e32 v8, vcc, v14, v2
+; GFX942-G-NEXT:    v_addc_co_u32_e64 v11, s[0:1], -1, v11, s[0:1]
+; GFX942-G-NEXT:    s_nop 0
+; GFX942-G-NEXT:    v_subb_co_u32_e32 v9, vcc, v15, v9, vcc
+; GFX942-G-NEXT:    v_or_b32_e32 v14, v0, v10
+; GFX942-G-NEXT:    v_or_b32_e32 v15, v1, v11
+; GFX942-G-NEXT:    v_cmp_eq_u64_e64 s[0:1], 0, v[14:15]
+; GFX942-G-NEXT:    v_subb_co_u32_e32 v20, vcc, v20, v26, vcc
+; GFX942-G-NEXT:    s_or_b64 s[4:5], s[0:1], s[4:5]
+; GFX942-G-NEXT:    s_nop 0
+; GFX942-G-NEXT:    v_subb_co_u32_e32 v21, vcc, v21, v27, vcc
+; GFX942-G-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX942-G-NEXT:    s_cbranch_execnz .LBB1_3
+; GFX942-G-NEXT:  ; %bb.4: ; %Flow
+; GFX942-G-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX942-G-NEXT:  .LBB1_5: ; %Flow2
+; GFX942-G-NEXT:    s_or_b64 exec, exec, s[8:9]
+; GFX942-G-NEXT:    v_lshlrev_b64 v[0:1], 1, v[16:17]
+; GFX942-G-NEXT:    v_lshlrev_b64 v[10:11], 1, v[18:19]
+; GFX942-G-NEXT:    v_lshrrev_b32_e32 v2, 31, v17
+; GFX942-G-NEXT:    v_or_b32_e32 v10, v10, v2
+; GFX942-G-NEXT:    v_or_b32_e32 v0, v12, v0
+; GFX942-G-NEXT:    v_or_b32_e32 v1, v13, v1
+; GFX942-G-NEXT:  .LBB1_6: ; %Flow3
+; GFX942-G-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX942-G-NEXT:    v_mov_b32_e32 v2, v10
+; GFX942-G-NEXT:    v_mov_b32_e32 v3, v11
+; GFX942-G-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-G-O0-LABEL: v_udiv_i128_vv:
+; GFX942-G-O0:       ; %bb.0: ; %_udiv-special-cases
+; GFX942-G-O0-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-G-O0-NEXT:    s_xor_saveexec_b64 s[0:1], -1
+; GFX942-G-O0-NEXT:    scratch_store_dword off, v32, s32 offset:180 ; 4-byte Folded Spill
+; GFX942-G-O0-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v10, v1
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v9, v2
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v8, v3
+; GFX942-G-O0-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v1, v10
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v2, v9
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v3, v8
+; GFX942-G-O0-NEXT:    v_accvgpr_write_b32 a0, v3 ; Reload Reuse
+; GFX942-G-O0-NEXT:    v_accvgpr_write_b32 a1, v2 ; Reload Reuse
+; GFX942-G-O0-NEXT:    v_accvgpr_write_b32 a2, v1 ; Reload Reuse
+; GFX942-G-O0-NEXT:    v_accvgpr_write_b32 a3, v0 ; Reload Reuse
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v8, v4
+; GFX942-G-O0-NEXT:    ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9_vgpr10_vgpr11 killed $exec
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v9, v5
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v10, v6
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v11, v7
+; GFX942-G-O0-NEXT:    v_accvgpr_write_b32 a4, v11 ; Reload Reuse
+; GFX942-G-O0-NEXT:    v_accvgpr_write_b32 a5, v10 ; Reload Reuse
+; GFX942-G-O0-NEXT:    v_accvgpr_write_b32 a6, v9 ; Reload Reuse
+; GFX942-G-O0-NEXT:    v_accvgpr_write_b32 a7, v8 ; Reload Reuse
+; GFX942-G-O0-NEXT:    s_mov_b64 s[0:1], 0
+; GFX942-G-O0-NEXT:    v_mov_b64_e32 v[6:7], v[8:9]
+; GFX942-G-O0-NEXT:    v_mov_b64_e32 v[12:13], v[10:11]
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v4, v6
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v5, v7
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v7, v12
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v6, v13
+; GFX942-G-O0-NEXT:    v_or_b32_e64 v4, v4, v7
+; GFX942-G-O0-NEXT:    v_or_b32_e64 v6, v5, v6
+; GFX942-G-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v5, v6
+; GFX942-G-O0-NEXT:    v_mov_b64_e32 v[6:7], s[0:1]
+; GFX942-G-O0-NEXT:    v_cmp_eq_u64_e64 s[2:3], v[4:5], v[6:7]
+; GFX942-G-O0-NEXT:    v_mov_b64_e32 v[6:7], v[0:1]
+; GFX942-G-O0-NEXT:    v_mov_b64_e32 v[12:13], v[2:3]
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v4, v6
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v5, v7
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v7, v12
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v6, v13
+; GFX942-G-O0-NEXT:    v_or_b32_e64 v4, v4, v7
+; GFX942-G-O0-NEXT:    v_or_b32_e64 v6, v5, v6
+; GFX942-G-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v5, v6
+; GFX942-G-O0-NEXT:    v_mov_b64_e32 v[6:7], s[0:1]
+; GFX942-G-O0-NEXT:    v_cmp_eq_u64_e64 s[4:5], v[4:5], v[6:7]
+; GFX942-G-O0-NEXT:    s_or_b64 s[2:3], s[2:3], s[4:5]
+; GFX942-G-O0-NEXT:    v_mov_b64_e32 v[6:7], v[8:9]
+; GFX942-G-O0-NEXT:    v_mov_b64_e32 v[8:9], v[10:11]
+; GFX942-G-O0-NEXT:    v_mov_b64_e32 v[4:5], s[0:1]
+; GFX942-G-O0-NEXT:    v_cmp_eq_u64_e64 s[4:5], v[8:9], v[4:5]
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v5, v6
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v4, v7
+; GFX942-G-O0-NEXT:    v_ffbh_u32_e64 v4, v4
+; GFX942-G-O0-NEXT:    v_ffbh_u32_e64 v5, v5
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v6, 32
+; GFX942-G-O0-NEXT:    v_add_u32_e64 v5, v5, v6
+; GFX942-G-O0-NEXT:    v_min_u32_e64 v4, v4, v5
+; GFX942-G-O0-NEXT:    s_mov_b32 s6, 64
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v5, s6
+; GFX942-G-O0-NEXT:    v_add_u32_e64 v5, v4, v5
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v6, v8
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v4, v9
+; GFX942-G-O0-NEXT:    v_ffbh_u32_e64 v4, v4
+; GFX942-G-O0-NEXT:    v_ffbh_u32_e64 v6, v6
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v7, 32
+; GFX942-G-O0-NEXT:    v_add_u32_e64 v6, v6, v7
+; GFX942-G-O0-NEXT:    v_min_u32_e64 v4, v4, v6
+; GFX942-G-O0-NEXT:    v_cndmask_b32_e64 v4, v4, v5, s[4:5]
+; GFX942-G-O0-NEXT:    s_mov_b32 s10, 0
+; GFX942-G-O0-NEXT:    v_mov_b64_e32 v[10:11], v[0:1]
+; GFX942-G-O0-NEXT:    v_mov_b64_e32 v[8:9], v[2:3]
+; GFX942-G-O0-NEXT:    v_mov_b64_e32 v[6:7], s[0:1]
+; GFX942-G-O0-NEXT:    v_cmp_eq_u64_e64 s[4:5], v[8:9], v[6:7]
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v6, v10
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v5, v11
+; GFX942-G-O0-NEXT:    v_ffbh_u32_e64 v5, v5
+; GFX942-G-O0-NEXT:    v_ffbh_u32_e64 v6, v6
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v7, 32
+; GFX942-G-O0-NEXT:    v_add_u32_e64 v6, v6, v7
+; GFX942-G-O0-NEXT:    v_min_u32_e64 v5, v5, v6
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v6, s6
+; GFX942-G-O0-NEXT:    v_add_u32_e64 v6, v5, v6
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v7, v8
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v5, v9
+; GFX942-G-O0-NEXT:    v_ffbh_u32_e64 v5, v5
+; GFX942-G-O0-NEXT:    v_ffbh_u32_e64 v7, v7
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v8, 32
+; GFX942-G-O0-NEXT:    v_add_u32_e64 v7, v7, v8
+; GFX942-G-O0-NEXT:    v_min_u32_e64 v5, v5, v7
+; GFX942-G-O0-NEXT:    v_cndmask_b32_e64 v5, v5, v6, s[4:5]
+; GFX942-G-O0-NEXT:    s_mov_b32 s9, 0
+; GFX942-G-O0-NEXT:    s_mov_b32 s7, 0
+; GFX942-G-O0-NEXT:    s_mov_b32 s8, 0
+; GFX942-G-O0-NEXT:    s_mov_b32 s6, 0
+; GFX942-G-O0-NEXT:    v_sub_co_u32_e64 v6, s[4:5], v4, v5
+; GFX942-G-O0-NEXT:    v_accvgpr_write_b32 a8, v6 ; Reload Reuse
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v4, s10
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v5, s10
+; GFX942-G-O0-NEXT:    v_subb_co_u32_e64 v5, s[4:5], v4, v5, s[4:5]
+; GFX942-G-O0-NEXT:    v_accvgpr_write_b32 a9, v5 ; Reload Reuse
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v4, s9
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v7, s8
+; GFX942-G-O0-NEXT:    v_subb_co_u32_e64 v8, s[4:5], v4, v7, s[4:5]
+; GFX942-G-O0-NEXT:    v_accvgpr_write_b32 a10, v8 ; Reload Reuse
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v4, s7
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v7, s6
+; GFX942-G-O0-NEXT:    v_subb_co_u32_e64 v7, s[4:5], v4, v7, s[4:5]
+; GFX942-G-O0-NEXT:    v_accvgpr_write_b32 a11, v7 ; Reload Reuse
+; GFX942-G-O0-NEXT:    s_mov_b64 s[4:5], 0x7f
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v12, v6
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v13, v5
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v10, v8
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v11, v7
+; GFX942-G-O0-NEXT:    v_mov_b64_e32 v[14:15], s[4:5]
+; GFX942-G-O0-NEXT:    v_cmp_gt_u64_e64 s[8:9], v[12:13], v[14:15]
+; GFX942-G-O0-NEXT:    v_mov_b64_e32 v[12:13], s[0:1]
+; GFX942-G-O0-NEXT:    v_cmp_gt_u64_e64 s[6:7], v[10:11], v[12:13]
+; GFX942-G-O0-NEXT:    v_mov_b64_e32 v[12:13], s[0:1]
+; GFX942-G-O0-NEXT:    v_cmp_eq_u64_e64 s[4:5], v[10:11], v[12:13]
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v9, 1
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v4, 0
+; GFX942-G-O0-NEXT:    v_cndmask_b32_e64 v9, v4, v9, s[8:9]
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v10, 1
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v4, 0
+; GFX942-G-O0-NEXT:    v_cndmask_b32_e64 v4, v4, v10, s[6:7]
+; GFX942-G-O0-NEXT:    v_cndmask_b32_e64 v9, v4, v9, s[4:5]
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v10, 1
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v4, 0
+; GFX942-G-O0-NEXT:    v_cndmask_b32_e64 v4, v4, v10, s[2:3]
+; GFX942-G-O0-NEXT:    v_or_b32_e64 v4, v4, v9
+; GFX942-G-O0-NEXT:    s_mov_b32 s2, 0x7f
+; GFX942-G-O0-NEXT:    v_xor_b32_e64 v6, v6, s2
+; GFX942-G-O0-NEXT:    v_or_b32_e64 v6, v6, v8
+; GFX942-G-O0-NEXT:    v_or_b32_e64 v5, v5, v7
+; GFX942-G-O0-NEXT:    ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v7, v5
+; GFX942-G-O0-NEXT:    v_mov_b64_e32 v[8:9], s[0:1]
+; GFX942-G-O0-NEXT:    v_cmp_eq_u64_e64 s[0:1], v[6:7], v[8:9]
+; GFX942-G-O0-NEXT:    v_mov_b64_e32 v[6:7], v[0:1]
+; GFX942-G-O0-NEXT:    v_mov_b64_e32 v[8:9], v[2:3]
+; GFX942-G-O0-NEXT:    v_and_b32_e32 v0, 1, v4
+; GFX942-G-O0-NEXT:    v_cmp_ne_u32_e64 s[2:3], 0, v0
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v3, 0
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v0, v6
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v1, v7
+; GFX942-G-O0-NEXT:    v_cndmask_b32_e64 v0, v0, v3, s[2:3]
+; GFX942-G-O0-NEXT:    v_cndmask_b32_e64 v2, v1, v2, s[2:3]
+; GFX942-G-O0-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v1, v2
+; GFX942-G-O0-NEXT:    v_and_b32_e32 v2, 1, v4
+; GFX942-G-O0-NEXT:    v_cmp_ne_u32_e64 s[2:3], 0, v2
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v3, 0
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v5, v8
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v2, v9
+; GFX942-G-O0-NEXT:    v_cndmask_b32_e64 v6, v5, v6, s[2:3]
+; GFX942-G-O0-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[2:3]
+; GFX942-G-O0-NEXT:    ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v7, v2
+; GFX942-G-O0-NEXT:    ; kill: def $vgpr0_vgpr1 killed $vgpr0_vgpr1 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec
+; GFX942-G-O0-NEXT:    v_mov_b64_e32 v[2:3], v[6:7]
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v6, 1
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v5, 0
+; GFX942-G-O0-NEXT:    v_cndmask_b32_e64 v5, v5, v6, s[0:1]
+; GFX942-G-O0-NEXT:    v_or_b32_e64 v4, v4, v5
+; GFX942-G-O0-NEXT:    v_and_b32_e32 v4, 1, v4
+; GFX942-G-O0-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v4
+; GFX942-G-O0-NEXT:    s_mov_b64 s[2:3], -1
+; GFX942-G-O0-NEXT:    s_xor_b64 s[2:3], s[0:1], s[2:3]
+; GFX942-G-O0-NEXT:    v_accvgpr_write_b32 a12, v3 ; Reload Reuse
+; GFX942-G-O0-NEXT:    v_accvgpr_write_b32 a13, v2 ; Reload Reuse
+; GFX942-G-O0-NEXT:    v_accvgpr_write_b32 a14, v1 ; Reload Reuse
+; GFX942-G-O0-NEXT:    v_accvgpr_write_b32 a15, v0 ; Reload Reuse
+; GFX942-G-O0-NEXT:    s_mov_b64 s[0:1], exec
+; GFX942-G-O0-NEXT:    ; implicit-def: $vgpr32 : SGPR spill to VGPR lane
+; GFX942-G-O0-NEXT:    v_writelane_b32 v32, s0, 0
+; GFX942-G-O0-NEXT:    s_nop 1
+; GFX942-G-O0-NEXT:    v_writelane_b32 v32, s1, 1
+; GFX942-G-O0-NEXT:    s_or_saveexec_b64 s[14:15], -1
+; GFX942-G-O0-NEXT:    v_accvgpr_write_b32 a16, v32 ; Reload Reuse
+; GFX942-G-O0-NEXT:    s_mov_b64 exec, s[14:15]
+; GFX942-G-O0-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
+; GFX942-G-O0-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX942-G-O0-NEXT:    s_cbranch_execz .LBB1_3
+; GFX942-G-O0-NEXT:    s_branch .LBB1_8
+; GFX942-G-O0-NEXT:  .LBB1_1: ; %Flow
+; GFX942-G-O0-NEXT:    s_or_saveexec_b64 s[14:15], -1
+; GFX942-G-O0-NEXT:    v_accvgpr_read_b32 v32, a16 ; Reload Reuse
+; GFX942-G-O0-NEXT:    s_mov_b64 exec, s[14:15]
+; GFX942-G-O0-NEXT:    v_readlane_b32 s0, v32, 2
+; GFX942-G-O0-NEXT:    v_readlane_b32 s1, v32, 3
+; GFX942-G-O0-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX942-G-O0-NEXT:  ; %bb.2: ; %Flow
+; GFX942-G-O0-NEXT:    v_accvgpr_read_b32 v7, a17 ; Reload Reuse
+; GFX942-G-O0-NEXT:    v_accvgpr_read_b32 v6, a18 ; Reload Reuse
+; GFX942-G-O0-NEXT:    v_accvgpr_read_b32 v5, a19 ; Reload Reuse
+; GFX942-G-O0-NEXT:    v_accvgpr_read_b32 v4, a20 ; Reload Reuse
+; GFX942-G-O0-NEXT:    v_accvgpr_read_b32 v3, a21 ; Reload Reuse
+; GFX942-G-O0-NEXT:    v_accvgpr_read_b32 v2, a22 ; Reload Reuse
+; GFX942-G-O0-NEXT:    v_accvgpr_read_b32 v1, a23 ; Reload Reuse
+; GFX942-G-O0-NEXT:    v_accvgpr_read_b32 v0, a24 ; Reload Reuse
+; GFX942-G-O0-NEXT:    v_accvgpr_write_b32 a25, v7 ; Reload Reuse
+; GFX942-G-O0-NEXT:    v_accvgpr_write_b32 a26, v6 ; Reload Reuse
+; GFX942-G-O0-NEXT:    v_accvgpr_write_b32 a27, v5 ; Reload Reuse
+; GFX942-G-O0-NEXT:    v_accvgpr_write_b32 a28, v4 ; Reload Reuse
+; GFX942-G-O0-NEXT:    v_accvgpr_write_b32 a29, v3 ; Reload Reuse
+; GFX942-G-O0-NEXT:    v_accvgpr_write_b32 a30, v2 ; Reload Reuse
+; GFX942-G-O0-NEXT:    v_accvgpr_write_b32 a31, v1 ; Reload Reuse
+; GFX942-G-O0-NEXT:    scratch_store_dword off, v0, s32 ; 4-byte Folded Spill
+; GFX942-G-O0-NEXT:    s_branch .LBB1_5
+; GFX942-G-O0-NEXT:  .LBB1_3: ; %Flow2
+; GFX942-G-O0-NEXT:    s_or_saveexec_b64 s[14:15], -1
+; GFX942-G-O0-NEXT:    v_accvgpr_read_b32 v32, a16 ; Reload Reuse
+; GFX942-G-O0-NEXT:    s_mov_b64 exec, s[14:15]
+; GFX942-G-O0-NEXT:    v_readlane_b32 s0, v32, 0
+; GFX942-G-O0-NEXT:    v_readlane_b32 s1, v32, 1
+; GFX942-G-O0-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX942-G-O0-NEXT:    v_accvgpr_read_b32 v3, a12 ; Reload Reuse
+; GFX942-G-O0-NEXT:    v_accvgpr_read_b32 v2, a13 ; Reload Reuse
+; GFX942-G-O0-NEXT:    v_accvgpr_read_b32 v1, a14 ; Reload Reuse
+; GFX942-G-O0-NEXT:    v_accvgpr_read_b32 v0, a15 ; Reload Reuse
+; GFX942-G-O0-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:16 ; 16-byte Folded Spill
+; GFX942-G-O0-NEXT:    s_branch .LBB1_9
+; GFX942-G-O0-NEXT:  .LBB1_4: ; %udiv-loop-exit
+; GFX942-G-O0-NEXT:    scratch_load_dwordx4 v[12:15], off, s32 offset:32 ; 16-byte Folded Reload
+; GFX942-G-O0-NEXT:    scratch_load_dwordx4 v[4:7], off, s32 offset:48 ; 16-byte Folded Reload
+; GFX942-G-O0-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-G-O0-NEXT:    v_mov_b64_e32 v[2:3], v[4:5]
+; GFX942-G-O0-NEXT:    v_mov_b64_e32 v[4:5], v[6:7]
+; GFX942-G-O0-NEXT:    s_mov_b32 s0, 1
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-G-O0-NEXT:    v_lshlrev_b64 v[10:11], v0, v[2:3]
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-G-O0-NEXT:    v_lshlrev_b64 v[0:1], v0, v[4:5]
+; GFX942-G-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr2 killed $exec
+; GFX942-G-O0-NEXT:    ; kill: def $vgpr3 killed $vgpr3 killed $vgpr2_vgpr3 killed $exec
+; GFX942-G-O0-NEXT:    s_mov_b32 s0, 31
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v2, s0
+; GFX942-G-O0-NEXT:    v_lshrrev_b32_e64 v5, v2, v3
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v4, v0
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-G-O0-NEXT:    v_mov_b64_e32 v[6:7], v[12:13]
+; GFX942-G-O0-NEXT:    v_mov_b64_e32 v[8:9], v[14:15]
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v0, v6
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v1, v7
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v6, v10
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v2, v11
+; GFX942-G-O0-NEXT:    v_or_b32_e64 v0, v0, v6
+; GFX942-G-O0-NEXT:    v_or_b32_e64 v2, v1, v2
+; GFX942-G-O0-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v1, v2
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v6, v8
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v2, v9
+; GFX942-G-O0-NEXT:    v_or3_b32 v4, v4, v5, v6
+; GFX942-G-O0-NEXT:    v_or_b32_e64 v2, v2, v3
+; GFX942-G-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v5, v2
+; GFX942-G-O0-NEXT:    ; kill: def $vgpr0_vgpr1 killed $vgpr0_vgpr1 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec
+; GFX942-G-O0-NEXT:    v_mov_b64_e32 v[2:3], v[4:5]
+; GFX942-G-O0-NEXT:    v_accvgpr_write_b32 a12, v3 ; Reload Reuse
+; GFX942-G-O0-NEXT:    v_accvgpr_write_b32 a13, v2 ; Reload Reuse
+; GFX942-G-O0-NEXT:    v_accvgpr_write_b32 a14, v1 ; Reload Reuse
+; GFX942-G-O0-NEXT:    v_accvgpr_write_b32 a15, v0 ; Reload Reuse
+; GFX942-G-O0-NEXT:    s_branch .LBB1_3
+; GFX942-G-O0-NEXT:  .LBB1_5: ; %Flow1
+; GFX942-G-O0-NEXT:    s_or_saveexec_b64 s[14:15], -1
+; GFX942-G-O0-NEXT:    v_accvgpr_read_b32 v32, a16 ; Reload Reuse
+; GFX942-G-O0-NEXT:    s_mov_b64 exec, s[14:15]
+; GFX942-G-O0-NEXT:    v_readlane_b32 s0, v32, 4
+; GFX942-G-O0-NEXT:    v_readlane_b32 s1, v32, 5
+; GFX942-G-O0-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX942-G-O0-NEXT:    v_accvgpr_read_b32 v3, a25 ; Reload Reuse
+; GFX942-G-O0-NEXT:    v_accvgpr_read_b32 v2, a26 ; Reload Reuse
+; GFX942-G-O0-NEXT:    v_accvgpr_read_b32 v1, a27 ; Reload Reuse
+; GFX942-G-O0-NEXT:    v_accvgpr_read_b32 v0, a28 ; Reload Reuse
+; GFX942-G-O0-NEXT:    v_accvgpr_read_b32 v7, a29 ; Reload Reuse
+; GFX942-G-O0-NEXT:    v_accvgpr_read_b32 v6, a30 ; Reload Reuse
+; GFX942-G-O0-NEXT:    v_accvgpr_read_b32 v5, a31 ; Reload Reuse
+; GFX942-G-O0-NEXT:    scratch_load_dword v4, off, s32 ; 4-byte Folded Reload
+; GFX942-G-O0-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-G-O0-NEXT:    scratch_store_dwordx4 off, v[4:7], s32 offset:32 ; 16-byte Folded Spill
+; GFX942-G-O0-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:48 ; 16-byte Folded Spill
+; GFX942-G-O0-NEXT:    s_branch .LBB1_4
+; GFX942-G-O0-NEXT:  .LBB1_6: ; %udiv-do-while
+; GFX942-G-O0-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-G-O0-NEXT:    s_or_saveexec_b64 s[14:15], -1
+; GFX942-G-O0-NEXT:    v_accvgpr_read_b32 v32, a16 ; Reload Reuse
+; GFX942-G-O0-NEXT:    s_mov_b64 exec, s[14:15]
+; GFX942-G-O0-NEXT:    v_readlane_b32 s2, v32, 6
+; GFX942-G-O0-NEXT:    v_readlane_b32 s3, v32, 7
+; GFX942-G-O0-NEXT:    scratch_load_dwordx4 v[12:15], off, s32 offset:64 ; 16-byte Folded Reload
+; GFX942-G-O0-NEXT:    scratch_load_dwordx4 v[2:5], off, s32 offset:80 ; 16-byte Folded Reload
+; GFX942-G-O0-NEXT:    scratch_load_dwordx4 v[16:19], off, s32 offset:96 ; 16-byte Folded Reload
+; GFX942-G-O0-NEXT:    scratch_load_dwordx4 v[28:31], off, s32 offset:112 ; 16-byte Folded Reload
+; GFX942-G-O0-NEXT:    v_accvgpr_read_b32 v27, a4 ; Reload Reuse
+; GFX942-G-O0-NEXT:    v_accvgpr_read_b32 v26, a5 ; Reload Reuse
+; GFX942-G-O0-NEXT:    v_accvgpr_read_b32 v25, a6 ; Reload Reuse
+; GFX942-G-O0-NEXT:    v_accvgpr_read_b32 v24, a7 ; Reload Reuse
+; GFX942-G-O0-NEXT:    scratch_load_dword v6, off, s32 offset:128 ; 4-byte Folded Reload
+; GFX942-G-O0-NEXT:    scratch_load_dword v8, off, s32 offset:132 ; 4-byte Folded Reload
+; GFX942-G-O0-NEXT:    scratch_load_dword v10, off, s32 offset:136 ; 4-byte Folded Reload
+; GFX942-G-O0-NEXT:    scratch_load_dword v11, off, s32 offset:140 ; 4-byte Folded Reload
+; GFX942-G-O0-NEXT:    s_mov_b64 s[0:1], 0
+; GFX942-G-O0-NEXT:    s_waitcnt vmcnt(6)
+; GFX942-G-O0-NEXT:    v_mov_b64_e32 v[0:1], v[2:3]
+; GFX942-G-O0-NEXT:    v_mov_b64_e32 v[20:21], v[4:5]
+; GFX942-G-O0-NEXT:    s_mov_b32 s4, 1
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v2, s4
+; GFX942-G-O0-NEXT:    v_lshlrev_b64 v[2:3], v2, v[0:1]
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v4, s4
+; GFX942-G-O0-NEXT:    v_lshlrev_b64 v[4:5], v4, v[20:21]
+; GFX942-G-O0-NEXT:    ; kill: def $vgpr7 killed $vgpr0 killed $exec
+; GFX942-G-O0-NEXT:    ; kill: def $vgpr1 killed $vgpr1 killed $vgpr0_vgpr1 killed $exec
+; GFX942-G-O0-NEXT:    s_mov_b32 s5, 31
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v0, s5
+; GFX942-G-O0-NEXT:    v_lshrrev_b32_e64 v1, v0, v1
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v0, v4
+; GFX942-G-O0-NEXT:    ; kill: def $vgpr5 killed $vgpr5 killed $vgpr4_vgpr5 killed $exec
+; GFX942-G-O0-NEXT:    v_or_b32_e64 v7, v0, v1
+; GFX942-G-O0-NEXT:    ; kill: def $vgpr0_vgpr1 killed $vgpr12_vgpr13 killed $exec
+; GFX942-G-O0-NEXT:    v_mov_b64_e32 v[0:1], v[14:15]
+; GFX942-G-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr0 killed $exec
+; GFX942-G-O0-NEXT:    ; kill: def $vgpr1 killed $vgpr1 killed $vgpr0_vgpr1 killed $exec
+; GFX942-G-O0-NEXT:    s_mov_b32 s5, 31
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v0, s5
+; GFX942-G-O0-NEXT:    v_lshrrev_b32_e64 v1, v0, v1
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v0, v2
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v9, v3
+; GFX942-G-O0-NEXT:    v_or_b32_e64 v4, v0, v1
+; GFX942-G-O0-NEXT:    v_mov_b64_e32 v[2:3], v[12:13]
+; GFX942-G-O0-NEXT:    v_mov_b64_e32 v[12:13], v[14:15]
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v0, s4
+; GFX942-G-O0-NEXT:    v_lshlrev_b64 v[22:23], v0, v[2:3]
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v0, s4
+; GFX942-G-O0-NEXT:    v_lshlrev_b64 v[0:1], v0, v[12:13]
+; GFX942-G-O0-NEXT:    ; kill: def $vgpr12 killed $vgpr2 killed $exec
+; GFX942-G-O0-NEXT:    ; kill: def $vgpr3 killed $vgpr3 killed $vgpr2_vgpr3 killed $exec
+; GFX942-G-O0-NEXT:    s_mov_b32 s4, 31
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v2, s4
+; GFX942-G-O0-NEXT:    v_lshrrev_b32_e64 v13, v2, v3
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v12, v0
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-G-O0-NEXT:    s_waitcnt vmcnt(4)
+; GFX942-G-O0-NEXT:    v_mov_b64_e32 v[14:15], v[28:29]
+; GFX942-G-O0-NEXT:    v_mov_b64_e32 v[20:21], v[30:31]
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v0, v14
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v1, v15
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v14, v22
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v2, v23
+; GFX942-G-O0-NEXT:    v_or_b32_e64 v0, v0, v14
+; GFX942-G-O0-NEXT:    v_or_b32_e64 v2, v1, v2
+; GFX942-G-O0-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v1, v2
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v14, v20
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v2, v21
+; GFX942-G-O0-NEXT:    v_or3_b32 v12, v12, v13, v14
+; GFX942-G-O0-NEXT:    v_or_b32_e64 v2, v2, v3
+; GFX942-G-O0-NEXT:    ; kill: def $vgpr12 killed $vgpr12 def $vgpr12_vgpr13 killed $exec
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v13, v2
+; GFX942-G-O0-NEXT:    ; kill: def $vgpr0_vgpr1 killed $vgpr0_vgpr1 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec
+; GFX942-G-O0-NEXT:    v_mov_b64_e32 v[2:3], v[12:13]
+; GFX942-G-O0-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-G-O0-NEXT:    v_sub_co_u32_e64 v11, s[4:5], v11, v4
+; GFX942-G-O0-NEXT:    s_nop 1
+; GFX942-G-O0-NEXT:    v_subb_co_u32_e64 v10, s[4:5], v10, v9, s[4:5]
+; GFX942-G-O0-NEXT:    s_nop 1
+; GFX942-G-O0-NEXT:    v_subb_co_u32_e64 v8, s[4:5], v8, v7, s[4:5]
+; GFX942-G-O0-NEXT:    s_nop 1
+; GFX942-G-O0-NEXT:    v_subb_co_u32_e64 v10, s[4:5], v6, v5, s[4:5]
+; GFX942-G-O0-NEXT:    s_mov_b32 s4, 31
+; GFX942-G-O0-NEXT:    s_nop 0
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v6, s4
+; GFX942-G-O0-NEXT:    v_ashrrev_i32_e64 v8, v6, v10
+; GFX942-G-O0-NEXT:    s_mov_b32 s4, 31
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v6, s4
+; GFX942-G-O0-NEXT:    v_ashrrev_i32_e64 v6, v6, v10
+; GFX942-G-O0-NEXT:    s_mov_b32 s5, 1
+; GFX942-G-O0-NEXT:    s_mov_b32 s4, 0
+; GFX942-G-O0-NEXT:    v_and_b32_e64 v12, v8, s5
+; GFX942-G-O0-NEXT:    v_and_b32_e64 v10, v8, s4
+; GFX942-G-O0-NEXT:    ; kill: def $vgpr12 killed $vgpr12 def $vgpr12_vgpr13 killed $exec
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v13, v10
+; GFX942-G-O0-NEXT:    v_mov_b64_e32 v[10:11], s[0:1]
+; GFX942-G-O0-NEXT:    ; kill: def $vgpr12_vgpr13 killed $vgpr12_vgpr13 def $vgpr12_vgpr13_vgpr14_vgpr15 killed $exec
+; GFX942-G-O0-NEXT:    v_mov_b64_e32 v[14:15], v[10:11]
+; GFX942-G-O0-NEXT:    v_mov_b64_e32 v[22:23], v[24:25]
+; GFX942-G-O0-NEXT:    v_mov_b64_e32 v[20:21], v[26:27]
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v11, v22
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v10, v23
+; GFX942-G-O0-NEXT:    v_and_b32_e64 v11, v8, v11
+; GFX942-G-O0-NEXT:    v_and_b32_e64 v10, v8, v10
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v8, v20
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v20, v21
+; GFX942-G-O0-NEXT:    v_and_b32_e64 v8, v6, v8
+; GFX942-G-O0-NEXT:    v_and_b32_e64 v6, v6, v20
+; GFX942-G-O0-NEXT:    v_sub_co_u32_e64 v4, s[4:5], v4, v11
+; GFX942-G-O0-NEXT:    s_nop 1
+; GFX942-G-O0-NEXT:    v_subb_co_u32_e64 v10, s[4:5], v9, v10, s[4:5]
+; GFX942-G-O0-NEXT:    s_nop 1
+; GFX942-G-O0-NEXT:    v_subb_co_u32_e64 v9, s[4:5], v7, v8, s[4:5]
+; GFX942-G-O0-NEXT:    s_nop 1
+; GFX942-G-O0-NEXT:    v_subb_co_u32_e64 v8, s[4:5], v5, v6, s[4:5]
+; GFX942-G-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5_vgpr6_vgpr7 killed $exec
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v5, v10
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v6, v9
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v7, v8
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v11, v16
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v10, v17
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v9, v18
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v8, v19
+; GFX942-G-O0-NEXT:    s_mov_b32 s4, -1
+; GFX942-G-O0-NEXT:    s_mov_b32 s8, -1
+; GFX942-G-O0-NEXT:    s_mov_b32 s7, -1
+; GFX942-G-O0-NEXT:    s_mov_b32 s6, -1
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v16, s4
+; GFX942-G-O0-NEXT:    v_add_co_u32_e64 v16, s[4:5], v11, v16
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v11, s8
+; GFX942-G-O0-NEXT:    s_nop 0
+; GFX942-G-O0-NEXT:    v_addc_co_u32_e64 v17, s[4:5], v10, v11, s[4:5]
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v10, s7
+; GFX942-G-O0-NEXT:    s_nop 0
+; GFX942-G-O0-NEXT:    v_addc_co_u32_e64 v19, s[4:5], v9, v10, s[4:5]
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v9, s6
+; GFX942-G-O0-NEXT:    s_nop 0
+; GFX942-G-O0-NEXT:    v_addc_co_u32_e64 v18, s[4:5], v8, v9, s[4:5]
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v8, v16
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v9, v17
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v10, v19
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v11, v18
+; GFX942-G-O0-NEXT:    v_or_b32_e64 v16, v16, v19
+; GFX942-G-O0-NEXT:    v_or_b32_e64 v18, v17, v18
+; GFX942-G-O0-NEXT:    ; kill: def $vgpr16 killed $vgpr16 def $vgpr16_vgpr17 killed $exec
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v17, v18
+; GFX942-G-O0-NEXT:    v_mov_b64_e32 v[18:19], s[0:1]
+; GFX942-G-O0-NEXT:    v_cmp_eq_u64_e64 s[0:1], v[16:17], v[18:19]
+; GFX942-G-O0-NEXT:    s_or_b64 s[0:1], s[0:1], s[2:3]
+; GFX942-G-O0-NEXT:    v_mov_b64_e32 v[18:19], v[2:3]
+; GFX942-G-O0-NEXT:    v_mov_b64_e32 v[16:17], v[0:1]
+; GFX942-G-O0-NEXT:    v_accvgpr_write_b32 a17, v19 ; Reload Reuse
+; GFX942-G-O0-NEXT:    v_accvgpr_write_b32 a18, v18 ; Reload Reuse
+; GFX942-G-O0-NEXT:    v_accvgpr_write_b32 a19, v17 ; Reload Reuse
+; GFX942-G-O0-NEXT:    v_accvgpr_write_b32 a20, v16 ; Reload Reuse
+; GFX942-G-O0-NEXT:    v_mov_b64_e32 v[18:19], v[14:15]
+; GFX942-G-O0-NEXT:    v_mov_b64_e32 v[16:17], v[12:13]
+; GFX942-G-O0-NEXT:    v_accvgpr_write_b32 a21, v19 ; Reload Reuse
+; GFX942-G-O0-NEXT:    v_accvgpr_write_b32 a22, v18 ; Reload Reuse
+; GFX942-G-O0-NEXT:    v_accvgpr_write_b32 a23, v17 ; Reload Reuse
+; GFX942-G-O0-NEXT:    v_accvgpr_write_b32 a24, v16 ; Reload Reuse
+; GFX942-G-O0-NEXT:    s_mov_b64 s[2:3], s[0:1]
+; GFX942-G-O0-NEXT:    v_writelane_b32 v32, s2, 2
+; GFX942-G-O0-NEXT:    s_nop 1
+; GFX942-G-O0-NEXT:    v_writelane_b32 v32, s3, 3
+; GFX942-G-O0-NEXT:    s_mov_b64 s[2:3], s[0:1]
+; GFX942-G-O0-NEXT:    v_writelane_b32 v32, s2, 6
+; GFX942-G-O0-NEXT:    s_nop 1
+; GFX942-G-O0-NEXT:    v_writelane_b32 v32, s3, 7
+; GFX942-G-O0-NEXT:    s_or_saveexec_b64 s[14:15], -1
+; GFX942-G-O0-NEXT:    v_accvgpr_write_b32 a16, v32 ; Reload Reuse
+; GFX942-G-O0-NEXT:    s_mov_b64 exec, s[14:15]
+; GFX942-G-O0-NEXT:    scratch_store_dwordx4 off, v[12:15], s32 offset:112 ; 16-byte Folded Spill
+; GFX942-G-O0-NEXT:    scratch_store_dwordx4 off, v[8:11], s32 offset:96 ; 16-byte Folded Spill
+; GFX942-G-O0-NEXT:    scratch_store_dwordx4 off, v[4:7], s32 offset:80 ; 16-byte Folded Spill
+; GFX942-G-O0-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:64 ; 16-byte Folded Spill
+; GFX942-G-O0-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; GFX942-G-O0-NEXT:    s_cbranch_execnz .LBB1_6
+; GFX942-G-O0-NEXT:    s_branch .LBB1_1
+; GFX942-G-O0-NEXT:  .LBB1_7: ; %udiv-preheader
+; GFX942-G-O0-NEXT:    s_or_saveexec_b64 s[14:15], -1
+; GFX942-G-O0-NEXT:    v_accvgpr_read_b32 v32, a16 ; Reload Reuse
+; GFX942-G-O0-NEXT:    s_mov_b64 exec, s[14:15]
+; GFX942-G-O0-NEXT:    scratch_load_dwordx4 v[0:3], off, s32 offset:144 ; 16-byte Folded Reload
+; GFX942-G-O0-NEXT:    scratch_load_dwordx4 v[8:11], off, s32 offset:160 ; 16-byte Folded Reload
+; GFX942-G-O0-NEXT:    v_accvgpr_read_b32 v19, a4 ; Reload Reuse
+; GFX942-G-O0-NEXT:    v_accvgpr_read_b32 v18, a5 ; Reload Reuse
+; GFX942-G-O0-NEXT:    v_accvgpr_read_b32 v17, a6 ; Reload Reuse
+; GFX942-G-O0-NEXT:    v_accvgpr_read_b32 v16, a7 ; Reload Reuse
+; GFX942-G-O0-NEXT:    scratch_load_dword v12, off, s32 offset:176 ; 4-byte Folded Reload
+; GFX942-G-O0-NEXT:    v_accvgpr_read_b32 v7, a0 ; Reload Reuse
+; GFX942-G-O0-NEXT:    v_accvgpr_read_b32 v6, a1 ; Reload Reuse
+; GFX942-G-O0-NEXT:    v_accvgpr_read_b32 v5, a2 ; Reload Reuse
+; GFX942-G-O0-NEXT:    v_accvgpr_read_b32 v4, a3 ; Reload Reuse
+; GFX942-G-O0-NEXT:    s_mov_b32 s0, 64
+; GFX942-G-O0-NEXT:    v_mov_b64_e32 v[14:15], v[4:5]
+; GFX942-G-O0-NEXT:    v_mov_b64_e32 v[20:21], v[6:7]
+; GFX942-G-O0-NEXT:    s_mov_b32 s1, 0xffffffc0
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v4, s1
+; GFX942-G-O0-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-G-O0-NEXT:    v_add_u32_e64 v4, v12, v4
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v5, s0
+; GFX942-G-O0-NEXT:    v_sub_u32_e64 v5, v5, v12
+; GFX942-G-O0-NEXT:    s_mov_b32 s2, 0
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v6, s0
+; GFX942-G-O0-NEXT:    v_cmp_lt_u32_e64 s[0:1], v12, v6
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v6, s2
+; GFX942-G-O0-NEXT:    v_cmp_eq_u32_e64 s[2:3], v12, v6
+; GFX942-G-O0-NEXT:    v_lshrrev_b64 v[6:7], v12, v[20:21]
+; GFX942-G-O0-NEXT:    v_lshrrev_b64 v[22:23], v12, v[14:15]
+; GFX942-G-O0-NEXT:    v_lshlrev_b64 v[24:25], v5, v[20:21]
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v13, v22
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v5, v23
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v22, v24
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v12, v25
+; GFX942-G-O0-NEXT:    v_or_b32_e64 v13, v13, v22
+; GFX942-G-O0-NEXT:    v_or_b32_e64 v12, v5, v12
+; GFX942-G-O0-NEXT:    s_mov_b64 s[4:5], 0
+; GFX942-G-O0-NEXT:    v_lshrrev_b64 v[20:21], v4, v[20:21]
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v4, v20
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v5, v21
+; GFX942-G-O0-NEXT:    v_cndmask_b32_e64 v4, v4, v13, s[0:1]
+; GFX942-G-O0-NEXT:    v_cndmask_b32_e64 v5, v5, v12, s[0:1]
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v13, v14
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v12, v15
+; GFX942-G-O0-NEXT:    v_cndmask_b32_e64 v4, v4, v13, s[2:3]
+; GFX942-G-O0-NEXT:    v_cndmask_b32_e64 v12, v5, v12, s[2:3]
+; GFX942-G-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v5, v12
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v13, v6
+; GFX942-G-O0-NEXT:    ; kill: def $vgpr7 killed $vgpr7 killed $vgpr6_vgpr7 killed $exec
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v12, 0
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942-G-O0-NEXT:    v_cndmask_b32_e64 v12, v12, v13, s[0:1]
+; GFX942-G-O0-NEXT:    v_cndmask_b32_e64 v6, v6, v7, s[0:1]
+; GFX942-G-O0-NEXT:    ; kill: def $vgpr12 killed $vgpr12 def $vgpr12_vgpr13 killed $exec
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v13, v6
+; GFX942-G-O0-NEXT:    ; kill: def $vgpr4_vgpr5 killed $vgpr4_vgpr5 def $vgpr4_vgpr5_vgpr6_vgpr7 killed $exec
+; GFX942-G-O0-NEXT:    v_mov_b64_e32 v[6:7], v[12:13]
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v15, v16
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v14, v17
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v13, v18
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v12, v19
+; GFX942-G-O0-NEXT:    s_mov_b32 s0, -1
+; GFX942-G-O0-NEXT:    s_mov_b32 s6, -1
+; GFX942-G-O0-NEXT:    s_mov_b32 s3, -1
+; GFX942-G-O0-NEXT:    s_mov_b32 s2, -1
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v16, s0
+; GFX942-G-O0-NEXT:    v_add_co_u32_e64 v15, s[0:1], v15, v16
+; GFX942-G-O0-NEXT:    scratch_store_dword off, v15, s32 offset:140 ; 4-byte Folded Spill
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v15, s6
+; GFX942-G-O0-NEXT:    v_addc_co_u32_e64 v14, s[0:1], v14, v15, s[0:1]
+; GFX942-G-O0-NEXT:    scratch_store_dword off, v14, s32 offset:136 ; 4-byte Folded Spill
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v14, s3
+; GFX942-G-O0-NEXT:    v_addc_co_u32_e64 v13, s[0:1], v13, v14, s[0:1]
+; GFX942-G-O0-NEXT:    scratch_store_dword off, v13, s32 offset:132 ; 4-byte Folded Spill
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v13, s2
+; GFX942-G-O0-NEXT:    v_addc_co_u32_e64 v12, s[0:1], v12, v13, s[0:1]
+; GFX942-G-O0-NEXT:    scratch_store_dword off, v12, s32 offset:128 ; 4-byte Folded Spill
+; GFX942-G-O0-NEXT:    s_mov_b64 s[0:1], s[4:5]
+; GFX942-G-O0-NEXT:    s_mov_b64 s[2:3], s[4:5]
+; GFX942-G-O0-NEXT:    v_writelane_b32 v32, s4, 6
+; GFX942-G-O0-NEXT:    s_nop 1
+; GFX942-G-O0-NEXT:    v_writelane_b32 v32, s5, 7
+; GFX942-G-O0-NEXT:    s_or_saveexec_b64 s[14:15], -1
+; GFX942-G-O0-NEXT:    v_accvgpr_write_b32 a16, v32 ; Reload Reuse
+; GFX942-G-O0-NEXT:    s_mov_b64 exec, s[14:15]
+; GFX942-G-O0-NEXT:    v_mov_b64_e32 v[14:15], s[2:3]
+; GFX942-G-O0-NEXT:    v_mov_b64_e32 v[12:13], s[0:1]
+; GFX942-G-O0-NEXT:    scratch_store_dwordx4 off, v[12:15], s32 offset:112 ; 16-byte Folded Spill
+; GFX942-G-O0-NEXT:    scratch_store_dwordx4 off, v[8:11], s32 offset:96 ; 16-byte Folded Spill
+; GFX942-G-O0-NEXT:    scratch_store_dwordx4 off, v[4:7], s32 offset:80 ; 16-byte Folded Spill
+; GFX942-G-O0-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:64 ; 16-byte Folded Spill
+; GFX942-G-O0-NEXT:    s_branch .LBB1_6
+; GFX942-G-O0-NEXT:  .LBB1_8: ; %udiv-bb1
+; GFX942-G-O0-NEXT:    s_or_saveexec_b64 s[14:15], -1
+; GFX942-G-O0-NEXT:    v_accvgpr_read_b32 v32, a16 ; Reload Reuse
+; GFX942-G-O0-NEXT:    s_mov_b64 exec, s[14:15]
+; GFX942-G-O0-NEXT:    v_accvgpr_read_b32 v11, a0 ; Reload Reuse
+; GFX942-G-O0-NEXT:    v_accvgpr_read_b32 v10, a1 ; Reload Reuse
+; GFX942-G-O0-NEXT:    v_accvgpr_read_b32 v9, a2 ; Reload Reuse
+; GFX942-G-O0-NEXT:    v_accvgpr_read_b32 v8, a3 ; Reload Reuse
+; GFX942-G-O0-NEXT:    v_accvgpr_read_b32 v1, a8 ; Reload Reuse
+; GFX942-G-O0-NEXT:    v_accvgpr_read_b32 v0, a11 ; Reload Reuse
+; GFX942-G-O0-NEXT:    v_accvgpr_read_b32 v2, a10 ; Reload Reuse
+; GFX942-G-O0-NEXT:    v_accvgpr_read_b32 v3, a9 ; Reload Reuse
+; GFX942-G-O0-NEXT:    s_mov_b64 s[0:1], 0
+; GFX942-G-O0-NEXT:    s_mov_b32 s2, 1
+; GFX942-G-O0-NEXT:    s_mov_b32 s6, 0
+; GFX942-G-O0-NEXT:    s_mov_b32 s5, 0
+; GFX942-G-O0-NEXT:    s_mov_b32 s4, 0
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v4, s2
+; GFX942-G-O0-NEXT:    v_add_co_u32_e64 v4, s[2:3], v1, v4
+; GFX942-G-O0-NEXT:    scratch_store_dword off, v4, s32 offset:176 ; 4-byte Folded Spill
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v5, s6
+; GFX942-G-O0-NEXT:    v_addc_co_u32_e64 v5, s[2:3], v3, v5, s[2:3]
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v3, s5
+; GFX942-G-O0-NEXT:    s_nop 0
+; GFX942-G-O0-NEXT:    v_addc_co_u32_e64 v7, s[2:3], v2, v3, s[2:3]
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v2, s4
+; GFX942-G-O0-NEXT:    s_nop 0
+; GFX942-G-O0-NEXT:    v_addc_co_u32_e64 v6, s[2:3], v0, v2, s[2:3]
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v12, v4
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v13, v5
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v14, v7
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v15, v6
+; GFX942-G-O0-NEXT:    scratch_store_dwordx4 off, v[12:15], s32 offset:160 ; 16-byte Folded Spill
+; GFX942-G-O0-NEXT:    s_mov_b32 s2, 0x7f
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-G-O0-NEXT:    v_sub_co_u32_e64 v3, s[2:3], v0, v1
+; GFX942-G-O0-NEXT:    s_mov_b32 s3, 64
+; GFX942-G-O0-NEXT:    v_mov_b64_e32 v[12:13], v[8:9]
+; GFX942-G-O0-NEXT:    s_mov_b32 s2, 0xffffffc0
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-G-O0-NEXT:    v_add_u32_e64 v2, v3, v0
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v0, s3
+; GFX942-G-O0-NEXT:    v_sub_u32_e64 v8, v0, v3
+; GFX942-G-O0-NEXT:    s_mov_b32 s2, 0
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v0, s3
+; GFX942-G-O0-NEXT:    v_cmp_lt_u32_e64 s[4:5], v3, v0
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-G-O0-NEXT:    v_cmp_eq_u32_e64 s[2:3], v3, v0
+; GFX942-G-O0-NEXT:    v_lshlrev_b64 v[0:1], v3, v[12:13]
+; GFX942-G-O0-NEXT:    v_lshrrev_b64 v[14:15], v8, v[12:13]
+; GFX942-G-O0-NEXT:    v_lshlrev_b64 v[16:17], v3, v[10:11]
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v9, v14
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v3, v15
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v14, v16
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v8, v17
+; GFX942-G-O0-NEXT:    v_or_b32_e64 v9, v9, v14
+; GFX942-G-O0-NEXT:    v_or_b32_e64 v3, v3, v8
+; GFX942-G-O0-NEXT:    v_lshlrev_b64 v[12:13], v2, v[12:13]
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v8, v0
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v2, v1
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-G-O0-NEXT:    v_cndmask_b32_e64 v0, v0, v8, s[4:5]
+; GFX942-G-O0-NEXT:    v_cndmask_b32_e64 v2, v1, v2, s[4:5]
+; GFX942-G-O0-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v1, v2
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v8, v12
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v2, v13
+; GFX942-G-O0-NEXT:    v_cndmask_b32_e64 v8, v8, v9, s[4:5]
+; GFX942-G-O0-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[4:5]
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v9, v10
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v3, v11
+; GFX942-G-O0-NEXT:    v_cndmask_b32_e64 v8, v8, v9, s[2:3]
+; GFX942-G-O0-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[2:3]
+; GFX942-G-O0-NEXT:    ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v9, v2
+; GFX942-G-O0-NEXT:    ; kill: def $vgpr0_vgpr1 killed $vgpr0_vgpr1 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec
+; GFX942-G-O0-NEXT:    v_mov_b64_e32 v[2:3], v[8:9]
+; GFX942-G-O0-NEXT:    scratch_store_dwordx4 off, v[0:3], s32 offset:144 ; 16-byte Folded Spill
+; GFX942-G-O0-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX942-G-O0-NEXT:    s_mov_b64 s[6:7], s[0:1]
+; GFX942-G-O0-NEXT:    v_or_b32_e64 v4, v4, v7
+; GFX942-G-O0-NEXT:    v_or_b32_e64 v6, v5, v6
+; GFX942-G-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v5, v6
+; GFX942-G-O0-NEXT:    v_mov_b64_e32 v[6:7], s[0:1]
+; GFX942-G-O0-NEXT:    v_cmp_ne_u64_e64 s[0:1], v[4:5], v[6:7]
+; GFX942-G-O0-NEXT:    v_accvgpr_write_b32 a25, v3 ; Reload Reuse
+; GFX942-G-O0-NEXT:    v_accvgpr_write_b32 a26, v2 ; Reload Reuse
+; GFX942-G-O0-NEXT:    v_accvgpr_write_b32 a27, v1 ; Reload Reuse
+; GFX942-G-O0-NEXT:    v_accvgpr_write_b32 a28, v0 ; Reload Reuse
+; GFX942-G-O0-NEXT:    v_mov_b64_e32 v[0:1], s[4:5]
+; GFX942-G-O0-NEXT:    v_mov_b64_e32 v[2:3], s[6:7]
+; GFX942-G-O0-NEXT:    v_accvgpr_write_b32 a29, v3 ; Reload Reuse
+; GFX942-G-O0-NEXT:    v_accvgpr_write_b32 a30, v2 ; Reload Reuse
+; GFX942-G-O0-NEXT:    v_accvgpr_write_b32 a31, v1 ; Reload Reuse
+; GFX942-G-O0-NEXT:    scratch_store_dword off, v0, s32 ; 4-byte Folded Spill
+; GFX942-G-O0-NEXT:    s_mov_b64 s[2:3], exec
+; GFX942-G-O0-NEXT:    s_and_b64 s[0:1], s[2:3], s[0:1]
+; GFX942-G-O0-NEXT:    s_xor_b64 s[2:3], s[0:1], s[2:3]
+; GFX942-G-O0-NEXT:    v_writelane_b32 v32, s2, 4
+; GFX942-G-O0-NEXT:    s_nop 1
+; GFX942-G-O0-NEXT:    v_writelane_b32 v32, s3, 5
+; GFX942-G-O0-NEXT:    s_or_saveexec_b64 s[14:15], -1
+; GFX942-G-O0-NEXT:    v_accvgpr_write_b32 a16, v32 ; Reload Reuse
+; GFX942-G-O0-NEXT:    s_mov_b64 exec, s[14:15]
+; GFX942-G-O0-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX942-G-O0-NEXT:    s_cbranch_execz .LBB1_5
+; GFX942-G-O0-NEXT:    s_branch .LBB1_7
+; GFX942-G-O0-NEXT:  .LBB1_9: ; %udiv-end
+; GFX942-G-O0-NEXT:    scratch_load_dwordx4 v[4:7], off, s32 offset:16 ; 16-byte Folded Reload
+; GFX942-G-O0-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v0, v4
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v1, v5
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v2, v6
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v3, v7
+; GFX942-G-O0-NEXT:    s_xor_saveexec_b64 s[0:1], -1
+; GFX942-G-O0-NEXT:    scratch_load_dword v32, off, s32 offset:180 ; 4-byte Folded Reload
+; GFX942-G-O0-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX942-G-O0-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-G-O0-NEXT:    s_setpc_b64 s[30:31]
   %div = udiv i128 %lhs, %rhs
   ret i128 %div
 }
@@ -4528,6 +8462,152 @@ define i128 @v_sdiv_i128_v_pow2k(i128 %lhs) {
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX9-G-O0-NEXT:    v_ashrrev_i32_e64 v2, v2, v4
 ; GFX9-G-O0-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: v_sdiv_i128_v_pow2k:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_ashrrev_i32_e32 v4, 31, v3
+; GFX942-NEXT:    v_mov_b32_e32 v5, v4
+; GFX942-NEXT:    v_lshrrev_b64 v[4:5], 31, v[4:5]
+; GFX942-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v4
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_addc_co_u32_e32 v0, vcc, v1, v5, vcc
+; GFX942-NEXT:    v_lshrrev_b32_e32 v4, 1, v0
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_addc_co_u32_e32 v2, vcc, 0, v2, vcc
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX942-NEXT:    v_lshlrev_b64 v[0:1], 31, v[2:3]
+; GFX942-NEXT:    v_or_b32_e32 v0, v4, v0
+; GFX942-NEXT:    v_ashrrev_i32_e32 v2, 1, v3
+; GFX942-NEXT:    v_ashrrev_i32_e32 v3, 31, v3
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-O0-LABEL: v_sdiv_i128_v_pow2k:
+; GFX942-O0:       ; %bb.0:
+; GFX942-O0-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-O0-NEXT:    v_mov_b32_e32 v4, v2
+; GFX942-O0-NEXT:    v_mov_b32_e32 v2, v0
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0
+; GFX942-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v5, v3
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0
+; GFX942-O0-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0_sgpr1
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0_sgpr1
+; GFX942-O0-NEXT:    v_mov_b32_e32 v0, v2
+; GFX942-O0-NEXT:    ; kill: def $vgpr3 killed $vgpr3 killed $vgpr2_vgpr3 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v2, v4
+; GFX942-O0-NEXT:    v_mov_b32_e32 v1, v5
+; GFX942-O0-NEXT:    s_mov_b32 s0, 63
+; GFX942-O0-NEXT:    v_ashrrev_i64 v[4:5], s0, v[4:5]
+; GFX942-O0-NEXT:    s_mov_b32 s1, 31
+; GFX942-O0-NEXT:    v_lshrrev_b64 v[6:7], s1, v[4:5]
+; GFX942-O0-NEXT:    v_mov_b32_e32 v5, v6
+; GFX942-O0-NEXT:    v_mov_b32_e32 v4, v7
+; GFX942-O0-NEXT:    s_mov_b64 s[4:5], 0
+; GFX942-O0-NEXT:    s_mov_b32 s2, s4
+; GFX942-O0-NEXT:    s_mov_b32 s0, s5
+; GFX942-O0-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v5
+; GFX942-O0-NEXT:    s_nop 1
+; GFX942-O0-NEXT:    v_addc_co_u32_e32 v3, vcc, v3, v4, vcc
+; GFX942-O0-NEXT:    v_mov_b32_e32 v4, s2
+; GFX942-O0-NEXT:    s_nop 0
+; GFX942-O0-NEXT:    v_addc_co_u32_e32 v6, vcc, v2, v4, vcc
+; GFX942-O0-NEXT:    v_mov_b32_e32 v2, s0
+; GFX942-O0-NEXT:    s_nop 0
+; GFX942-O0-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v2, vcc
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0
+; GFX942-O0-NEXT:    ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v7, v1
+; GFX942-O0-NEXT:    v_mov_b32_e32 v2, v6
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0
+; GFX942-O0-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v1, v3
+; GFX942-O0-NEXT:    s_mov_b32 s0, 33
+; GFX942-O0-NEXT:    v_lshrrev_b64 v[0:1], s0, v[0:1]
+; GFX942-O0-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX942-O0-NEXT:    v_lshl_or_b32 v0, v2, s1, v0
+; GFX942-O0-NEXT:    v_mov_b64_e32 v[4:5], v[6:7]
+; GFX942-O0-NEXT:    v_ashrrev_i64 v[4:5], s0, v[4:5]
+; GFX942-O0-NEXT:    v_mov_b32_e32 v1, v7
+; GFX942-O0-NEXT:    s_mov_b32 s0, 1
+; GFX942-O0-NEXT:    v_alignbit_b32 v1, v1, v2, s0
+; GFX942-O0-NEXT:    v_mov_b32_e32 v2, v4
+; GFX942-O0-NEXT:    s_mov_b32 s0, 32
+; GFX942-O0-NEXT:    v_lshrrev_b64 v[4:5], s0, v[4:5]
+; GFX942-O0-NEXT:    v_mov_b32_e32 v3, v4
+; GFX942-O0-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-G-LABEL: v_sdiv_i128_v_pow2k:
+; GFX942-G:       ; %bb.0:
+; GFX942-G-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-G-NEXT:    v_ashrrev_i32_e32 v4, 31, v3
+; GFX942-G-NEXT:    v_mov_b32_e32 v5, v4
+; GFX942-G-NEXT:    v_lshrrev_b64 v[4:5], 31, v[4:5]
+; GFX942-G-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v4
+; GFX942-G-NEXT:    s_nop 1
+; GFX942-G-NEXT:    v_addc_co_u32_e32 v0, vcc, v1, v5, vcc
+; GFX942-G-NEXT:    s_nop 1
+; GFX942-G-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v2, vcc
+; GFX942-G-NEXT:    v_lshrrev_b32_e32 v2, 1, v0
+; GFX942-G-NEXT:    s_nop 0
+; GFX942-G-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v3, vcc
+; GFX942-G-NEXT:    v_lshlrev_b64 v[0:1], 31, v[4:5]
+; GFX942-G-NEXT:    v_or_b32_e32 v0, v2, v0
+; GFX942-G-NEXT:    v_ashrrev_i32_e32 v3, 31, v5
+; GFX942-G-NEXT:    v_ashrrev_i32_e32 v2, 1, v5
+; GFX942-G-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-G-O0-LABEL: v_sdiv_i128_v_pow2k:
+; GFX942-G-O0:       ; %bb.0:
+; GFX942-G-O0-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v4, v0
+; GFX942-G-O0-NEXT:    s_mov_b32 s0, 31
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-G-O0-NEXT:    v_ashrrev_i32_e64 v0, v0, v3
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v6, v0
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v7, v0
+; GFX942-G-O0-NEXT:    s_mov_b32 s0, 31
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-G-O0-NEXT:    v_lshrrev_b64 v[6:7], v0, v[6:7]
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v5, v6
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v0, v7
+; GFX942-G-O0-NEXT:    s_mov_b32 s4, 0
+; GFX942-G-O0-NEXT:    s_mov_b32 s1, 0
+; GFX942-G-O0-NEXT:    v_add_co_u32_e64 v4, s[2:3], v4, v5
+; GFX942-G-O0-NEXT:    s_nop 1
+; GFX942-G-O0-NEXT:    v_addc_co_u32_e64 v1, s[2:3], v1, v0, s[2:3]
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v0, s4
+; GFX942-G-O0-NEXT:    s_nop 0
+; GFX942-G-O0-NEXT:    v_addc_co_u32_e64 v6, s[2:3], v2, v0, s[2:3]
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v0, s1
+; GFX942-G-O0-NEXT:    s_nop 0
+; GFX942-G-O0-NEXT:    v_addc_co_u32_e64 v4, s[2:3], v3, v0, s[2:3]
+; GFX942-G-O0-NEXT:    ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v7, v4
+; GFX942-G-O0-NEXT:    s_mov_b32 s1, 1
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v0, s1
+; GFX942-G-O0-NEXT:    v_lshrrev_b32_e64 v0, v0, v1
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v2, s0
+; GFX942-G-O0-NEXT:    v_lshlrev_b64 v[6:7], v2, v[6:7]
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v3, v6
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v2, v7
+; GFX942-G-O0-NEXT:    v_or_b32_e64 v0, v0, v3
+; GFX942-G-O0-NEXT:    v_or_b32_e64 v1, v1, v2
+; GFX942-G-O0-NEXT:    s_mov_b32 s0, 31
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v2, s0
+; GFX942-G-O0-NEXT:    v_ashrrev_i32_e64 v3, v2, v4
+; GFX942-G-O0-NEXT:    s_mov_b32 s0, 1
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v2, s0
+; GFX942-G-O0-NEXT:    v_ashrrev_i32_e64 v2, v2, v4
+; GFX942-G-O0-NEXT:    s_setpc_b64 s[30:31]
   %div = sdiv i128 %lhs, 8589934592
   ret i128 %div
 }
@@ -4609,6 +8689,80 @@ define i128 @v_udiv_i128_v_pow2k(i128 %lhs) {
 ; GFX9-G-O0-NEXT:    s_mov_b32 s4, 0
 ; GFX9-G-O0-NEXT:    v_mov_b32_e32 v3, s4
 ; GFX9-G-O0-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: v_udiv_i128_v_pow2k:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_lshrrev_b32_e32 v4, 1, v1
+; GFX942-NEXT:    v_lshlrev_b64 v[0:1], 31, v[2:3]
+; GFX942-NEXT:    v_or_b32_e32 v0, v4, v0
+; GFX942-NEXT:    v_lshrrev_b32_e32 v2, 1, v3
+; GFX942-NEXT:    v_mov_b32_e32 v3, 0
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-O0-LABEL: v_udiv_i128_v_pow2k:
+; GFX942-O0:       ; %bb.0:
+; GFX942-O0-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-O0-NEXT:    v_mov_b32_e32 v6, v2
+; GFX942-O0-NEXT:    v_mov_b32_e32 v2, v1
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0
+; GFX942-O0-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v1, v2
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0
+; GFX942-O0-NEXT:    ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v7, v3
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0_sgpr1
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0_sgpr1
+; GFX942-O0-NEXT:    v_mov_b32_e32 v4, v6
+; GFX942-O0-NEXT:    s_mov_b32 s0, 33
+; GFX942-O0-NEXT:    v_lshrrev_b64 v[0:1], s0, v[0:1]
+; GFX942-O0-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX942-O0-NEXT:    s_mov_b32 s1, 31
+; GFX942-O0-NEXT:    v_lshl_or_b32 v0, v4, s1, v0
+; GFX942-O0-NEXT:    v_mov_b64_e32 v[2:3], v[6:7]
+; GFX942-O0-NEXT:    v_lshrrev_b64 v[2:3], s0, v[2:3]
+; GFX942-O0-NEXT:    v_mov_b32_e32 v1, v7
+; GFX942-O0-NEXT:    s_mov_b32 s0, 1
+; GFX942-O0-NEXT:    v_alignbit_b32 v1, v1, v4, s0
+; GFX942-O0-NEXT:    ; kill: def $vgpr2 killed $vgpr2 killed $vgpr2_vgpr3 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v3, 0
+; GFX942-O0-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-G-LABEL: v_udiv_i128_v_pow2k:
+; GFX942-G:       ; %bb.0:
+; GFX942-G-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-G-NEXT:    v_lshrrev_b32_e32 v4, 1, v1
+; GFX942-G-NEXT:    v_lshlrev_b64 v[0:1], 31, v[2:3]
+; GFX942-G-NEXT:    v_or_b32_e32 v0, v4, v0
+; GFX942-G-NEXT:    v_lshrrev_b32_e32 v2, 1, v3
+; GFX942-G-NEXT:    v_mov_b32_e32 v3, 0
+; GFX942-G-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-G-O0-LABEL: v_udiv_i128_v_pow2k:
+; GFX942-G-O0:       ; %bb.0:
+; GFX942-G-O0-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v4, v2
+; GFX942-G-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v5, v3
+; GFX942-G-O0-NEXT:    s_mov_b32 s0, 1
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-G-O0-NEXT:    v_lshrrev_b32_e64 v0, v0, v1
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-G-O0-NEXT:    s_mov_b32 s0, 31
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v2, s0
+; GFX942-G-O0-NEXT:    v_lshlrev_b64 v[6:7], v2, v[4:5]
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v4, v6
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v2, v7
+; GFX942-G-O0-NEXT:    v_or_b32_e64 v0, v0, v4
+; GFX942-G-O0-NEXT:    v_or_b32_e64 v1, v1, v2
+; GFX942-G-O0-NEXT:    s_mov_b32 s0, 1
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v2, s0
+; GFX942-G-O0-NEXT:    v_lshrrev_b32_e64 v2, v2, v3
+; GFX942-G-O0-NEXT:    s_mov_b32 s0, 0
+; GFX942-G-O0-NEXT:    v_mov_b32_e32 v3, s0
+; GFX942-G-O0-NEXT:    s_setpc_b64 s[30:31]
   %div = udiv i128 %lhs, 8589934592
   ret i128 %div
 }
diff --git a/llvm/test/CodeGen/AMDGPU/div_v2i128.ll b/llvm/test/CodeGen/AMDGPU/div_v2i128.ll
index 7ea98a16e3b84..e6c7cca3881fb 100644
--- a/llvm/test/CodeGen/AMDGPU/div_v2i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/div_v2i128.ll
@@ -2,6 +2,9 @@
 ; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -o - %s | FileCheck -check-prefix=SDAG %s
 ; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -o - %s | FileCheck -check-prefix=GISEL %s
 
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -o - %s | FileCheck -check-prefix=GFX942-SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -o - %s | FileCheck -check-prefix=GFX942-GISEL %s
+
 define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; SDAG-LABEL: v_sdiv_v2i128_vv:
 ; SDAG:       ; %bb.0: ; %_udiv-special-cases_udiv-special-cases
@@ -826,6 +829,920 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GISEL-NEXT:    v_subb_u32_e64 v6, vcc, v8, v7, s[4:5]
 ; GISEL-NEXT:    v_subb_u32_e32 v7, vcc, v9, v7, vcc
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-SDAG-LABEL: v_sdiv_v2i128_vv:
+; GFX942-SDAG:       ; %bb.0: ; %_udiv-special-cases_udiv-special-cases
+; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-SDAG-NEXT:    v_sub_co_u32_e32 v20, vcc, 0, v0
+; GFX942-SDAG-NEXT:    v_ashrrev_i32_e32 v16, 31, v3
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_subb_co_u32_e32 v21, vcc, 0, v1, vcc
+; GFX942-SDAG-NEXT:    v_ashrrev_i32_e32 v18, 31, v11
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_subb_co_u32_e32 v22, vcc, 0, v2, vcc
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v25, 0
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_subb_co_u32_e32 v23, vcc, 0, v3, vcc
+; GFX942-SDAG-NEXT:    v_cmp_gt_i64_e32 vcc, 0, v[2:3]
+; GFX942-SDAG-NEXT:    s_mov_b64 s[2:3], 0x7f
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v17, v16
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e32 v21, v1, v21, vcc
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e32 v20, v0, v20, vcc
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e32 v23, v3, v23, vcc
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e32 v22, v2, v22, vcc
+; GFX942-SDAG-NEXT:    v_sub_co_u32_e32 v0, vcc, 0, v8
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v19, v18
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_subb_co_u32_e32 v1, vcc, 0, v9, vcc
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_subb_co_u32_e32 v2, vcc, 0, v10, vcc
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_subb_co_u32_e32 v3, vcc, 0, v11, vcc
+; GFX942-SDAG-NEXT:    v_cmp_gt_i64_e32 vcc, 0, v[10:11]
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e32 v9, v9, v1, vcc
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e32 v8, v8, v0, vcc
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e32 v11, v11, v3, vcc
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e32 v10, v10, v2, vcc
+; GFX942-SDAG-NEXT:    v_or_b32_e32 v1, v9, v11
+; GFX942-SDAG-NEXT:    v_or_b32_e32 v0, v8, v10
+; GFX942-SDAG-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[0:1]
+; GFX942-SDAG-NEXT:    v_or_b32_e32 v1, v21, v23
+; GFX942-SDAG-NEXT:    v_or_b32_e32 v0, v20, v22
+; GFX942-SDAG-NEXT:    v_cmp_eq_u64_e64 s[0:1], 0, v[0:1]
+; GFX942-SDAG-NEXT:    v_ffbh_u32_e32 v0, v10
+; GFX942-SDAG-NEXT:    v_add_u32_e32 v0, 32, v0
+; GFX942-SDAG-NEXT:    v_ffbh_u32_e32 v1, v11
+; GFX942-SDAG-NEXT:    v_min_u32_e32 v2, v0, v1
+; GFX942-SDAG-NEXT:    v_ffbh_u32_e32 v0, v8
+; GFX942-SDAG-NEXT:    v_add_u32_e32 v0, 32, v0
+; GFX942-SDAG-NEXT:    v_ffbh_u32_e32 v1, v9
+; GFX942-SDAG-NEXT:    v_min_u32_e32 v24, v0, v1
+; GFX942-SDAG-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-SDAG-NEXT:    v_lshl_add_u64 v[0:1], v[24:25], 0, 64
+; GFX942-SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[10:11]
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e32 v2, v0, v2, vcc
+; GFX942-SDAG-NEXT:    v_ffbh_u32_e32 v0, v22
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v3, v1, 0, vcc
+; GFX942-SDAG-NEXT:    v_add_u32_e32 v0, 32, v0
+; GFX942-SDAG-NEXT:    v_ffbh_u32_e32 v1, v23
+; GFX942-SDAG-NEXT:    v_min_u32_e32 v26, v0, v1
+; GFX942-SDAG-NEXT:    v_ffbh_u32_e32 v0, v20
+; GFX942-SDAG-NEXT:    v_add_u32_e32 v0, 32, v0
+; GFX942-SDAG-NEXT:    v_ffbh_u32_e32 v1, v21
+; GFX942-SDAG-NEXT:    v_min_u32_e32 v24, v0, v1
+; GFX942-SDAG-NEXT:    v_lshl_add_u64 v[0:1], v[24:25], 0, 64
+; GFX942-SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[22:23]
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v26, vcc
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v1, v1, 0, vcc
+; GFX942-SDAG-NEXT:    v_sub_co_u32_e32 v26, vcc, v2, v0
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_subb_co_u32_e32 v27, vcc, v3, v1, vcc
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_subb_co_u32_e32 v24, vcc, 0, v25, vcc
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_subb_co_u32_e32 v25, vcc, 0, v25, vcc
+; GFX942-SDAG-NEXT:    v_cmp_lt_u64_e32 vcc, s[2:3], v[26:27]
+; GFX942-SDAG-NEXT:    v_or_b32_e32 v3, v27, v25
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX942-SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[24:25]
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX942-SDAG-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[24:25]
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX942-SDAG-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX942-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX942-SDAG-NEXT:    v_xor_b32_e32 v0, 0x7f, v26
+; GFX942-SDAG-NEXT:    s_or_b64 s[0:1], s[0:1], vcc
+; GFX942-SDAG-NEXT:    v_or_b32_e32 v2, v0, v24
+; GFX942-SDAG-NEXT:    s_xor_b64 s[2:3], s[0:1], -1
+; GFX942-SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[2:3]
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v1, v23, 0, s[0:1]
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v0, v22, 0, s[0:1]
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v3, v21, 0, s[0:1]
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v2, v20, 0, s[0:1]
+; GFX942-SDAG-NEXT:    s_and_b64 s[0:1], s[2:3], vcc
+; GFX942-SDAG-NEXT:    s_and_saveexec_b64 s[4:5], s[0:1]
+; GFX942-SDAG-NEXT:    s_cbranch_execz .LBB0_6
+; GFX942-SDAG-NEXT:  ; %bb.1: ; %udiv-bb15
+; GFX942-SDAG-NEXT:    v_add_co_u32_e32 v0, vcc, 1, v26
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[30:31], 0
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v27, vcc
+; GFX942-SDAG-NEXT:    v_sub_u32_e32 v27, 0x7f, v26
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_addc_co_u32_e32 v2, vcc, 0, v24, vcc
+; GFX942-SDAG-NEXT:    v_or_b32_e32 v24, v0, v2
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v25, vcc
+; GFX942-SDAG-NEXT:    v_or_b32_e32 v25, v1, v3
+; GFX942-SDAG-NEXT:    v_sub_u32_e32 v28, 64, v27
+; GFX942-SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[24:25]
+; GFX942-SDAG-NEXT:    v_lshlrev_b64 v[24:25], v27, v[22:23]
+; GFX942-SDAG-NEXT:    v_lshrrev_b64 v[28:29], v28, v[20:21]
+; GFX942-SDAG-NEXT:    v_or_b32_e32 v28, v24, v28
+; GFX942-SDAG-NEXT:    v_sub_u32_e32 v24, 63, v26
+; GFX942-SDAG-NEXT:    v_or_b32_e32 v29, v25, v29
+; GFX942-SDAG-NEXT:    v_lshlrev_b64 v[24:25], v24, v[20:21]
+; GFX942-SDAG-NEXT:    v_cmp_gt_u32_e64 s[0:1], 64, v27
+; GFX942-SDAG-NEXT:    v_cmp_eq_u32_e64 s[2:3], 0, v27
+; GFX942-SDAG-NEXT:    v_lshlrev_b64 v[26:27], v27, v[20:21]
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v25, v25, v29, s[0:1]
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v24, v24, v28, s[0:1]
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v25, v25, v23, s[2:3]
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v24, v24, v22, s[2:3]
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v27, 0, v27, s[0:1]
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v26, 0, v26, s[0:1]
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[28:29], 0
+; GFX942-SDAG-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942-SDAG-NEXT:    s_xor_b64 s[2:3], exec, s[0:1]
+; GFX942-SDAG-NEXT:    s_cbranch_execz .LBB0_5
+; GFX942-SDAG-NEXT:  ; %bb.2: ; %udiv-preheader4
+; GFX942-SDAG-NEXT:    v_sub_u32_e32 v30, 64, v0
+; GFX942-SDAG-NEXT:    v_lshrrev_b64 v[28:29], v0, v[20:21]
+; GFX942-SDAG-NEXT:    v_lshlrev_b64 v[30:31], v30, v[22:23]
+; GFX942-SDAG-NEXT:    v_or_b32_e32 v30, v28, v30
+; GFX942-SDAG-NEXT:    v_subrev_u32_e32 v28, 64, v0
+; GFX942-SDAG-NEXT:    v_or_b32_e32 v31, v29, v31
+; GFX942-SDAG-NEXT:    v_lshrrev_b64 v[28:29], v28, v[22:23]
+; GFX942-SDAG-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v0
+; GFX942-SDAG-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v0
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[36:37], 0
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e32 v29, v29, v31, vcc
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v33, v29, v21, s[0:1]
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e32 v21, v28, v30, vcc
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v32, v21, v20, s[0:1]
+; GFX942-SDAG-NEXT:    v_lshrrev_b64 v[20:21], v0, v[22:23]
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e32 v35, 0, v21, vcc
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e32 v34, 0, v20, vcc
+; GFX942-SDAG-NEXT:    v_add_co_u32_e32 v20, vcc, -1, v8
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[30:31], 0
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_addc_co_u32_e32 v21, vcc, -1, v9, vcc
+; GFX942-SDAG-NEXT:    s_mov_b64 s[0:1], 0
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_addc_co_u32_e32 v22, vcc, -1, v10, vcc
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v29, 0
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_addc_co_u32_e32 v23, vcc, -1, v11, vcc
+; GFX942-SDAG-NEXT:  .LBB0_3: ; %udiv-do-while3
+; GFX942-SDAG-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-SDAG-NEXT:    v_lshrrev_b32_e32 v28, 31, v33
+; GFX942-SDAG-NEXT:    v_lshlrev_b64 v[32:33], 1, v[32:33]
+; GFX942-SDAG-NEXT:    v_lshrrev_b32_e32 v48, 31, v25
+; GFX942-SDAG-NEXT:    v_lshlrev_b64 v[34:35], 1, v[34:35]
+; GFX942-SDAG-NEXT:    v_or_b32_e32 v32, v32, v48
+; GFX942-SDAG-NEXT:    v_or_b32_e32 v34, v34, v28
+; GFX942-SDAG-NEXT:    v_sub_co_u32_e32 v28, vcc, v20, v32
+; GFX942-SDAG-NEXT:    v_lshlrev_b64 v[38:39], 1, v[26:27]
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_subb_co_u32_e32 v28, vcc, v21, v33, vcc
+; GFX942-SDAG-NEXT:    v_lshrrev_b32_e32 v26, 31, v27
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_subb_co_u32_e32 v28, vcc, v22, v34, vcc
+; GFX942-SDAG-NEXT:    v_lshlrev_b64 v[24:25], 1, v[24:25]
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_subb_co_u32_e32 v28, vcc, v23, v35, vcc
+; GFX942-SDAG-NEXT:    v_or3_b32 v24, v24, v26, v30
+; GFX942-SDAG-NEXT:    v_or_b32_e32 v26, v36, v38
+; GFX942-SDAG-NEXT:    v_ashrrev_i32_e32 v36, 31, v28
+; GFX942-SDAG-NEXT:    v_or_b32_e32 v27, v37, v39
+; GFX942-SDAG-NEXT:    v_and_b32_e32 v28, 1, v36
+; GFX942-SDAG-NEXT:    v_and_b32_e32 v37, v36, v11
+; GFX942-SDAG-NEXT:    v_and_b32_e32 v38, v36, v10
+; GFX942-SDAG-NEXT:    v_and_b32_e32 v39, v36, v9
+; GFX942-SDAG-NEXT:    v_and_b32_e32 v36, v36, v8
+; GFX942-SDAG-NEXT:    v_sub_co_u32_e32 v32, vcc, v32, v36
+; GFX942-SDAG-NEXT:    v_or3_b32 v25, v25, 0, v31
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_subb_co_u32_e32 v33, vcc, v33, v39, vcc
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_subb_co_u32_e32 v34, vcc, v34, v38, vcc
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_subb_co_u32_e32 v35, vcc, v35, v37, vcc
+; GFX942-SDAG-NEXT:    v_add_co_u32_e32 v0, vcc, -1, v0
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_addc_co_u32_e32 v2, vcc, -1, v2, vcc
+; GFX942-SDAG-NEXT:    v_or_b32_e32 v36, v0, v2
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_addc_co_u32_e32 v3, vcc, -1, v3, vcc
+; GFX942-SDAG-NEXT:    v_or_b32_e32 v37, v1, v3
+; GFX942-SDAG-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[36:37]
+; GFX942-SDAG-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[36:37], v[28:29]
+; GFX942-SDAG-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; GFX942-SDAG-NEXT:    s_cbranch_execnz .LBB0_3
+; GFX942-SDAG-NEXT:  ; %bb.4: ; %Flow13
+; GFX942-SDAG-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX942-SDAG-NEXT:  .LBB0_5: ; %Flow14
+; GFX942-SDAG-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX942-SDAG-NEXT:    v_lshlrev_b64 v[2:3], 1, v[26:27]
+; GFX942-SDAG-NEXT:    v_lshrrev_b32_e32 v8, 31, v27
+; GFX942-SDAG-NEXT:    v_lshlrev_b64 v[0:1], 1, v[24:25]
+; GFX942-SDAG-NEXT:    v_or3_b32 v1, v1, 0, v31
+; GFX942-SDAG-NEXT:    v_or3_b32 v0, v0, v8, v30
+; GFX942-SDAG-NEXT:    v_or_b32_e32 v3, v29, v3
+; GFX942-SDAG-NEXT:    v_or_b32_e32 v2, v28, v2
+; GFX942-SDAG-NEXT:  .LBB0_6: ; %Flow16
+; GFX942-SDAG-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX942-SDAG-NEXT:    v_sub_co_u32_e32 v20, vcc, 0, v4
+; GFX942-SDAG-NEXT:    v_ashrrev_i32_e32 v8, 31, v7
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_subb_co_u32_e32 v21, vcc, 0, v5, vcc
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v25, 0
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_subb_co_u32_e32 v22, vcc, 0, v6, vcc
+; GFX942-SDAG-NEXT:    v_ashrrev_i32_e32 v10, 31, v15
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_subb_co_u32_e32 v23, vcc, 0, v7, vcc
+; GFX942-SDAG-NEXT:    v_cmp_gt_i64_e32 vcc, 0, v[6:7]
+; GFX942-SDAG-NEXT:    s_mov_b64 s[2:3], 0x7f
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v9, v8
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e32 v21, v5, v21, vcc
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e32 v20, v4, v20, vcc
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e32 v23, v7, v23, vcc
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e32 v22, v6, v22, vcc
+; GFX942-SDAG-NEXT:    v_sub_co_u32_e32 v4, vcc, 0, v12
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v11, v10
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_subb_co_u32_e32 v5, vcc, 0, v13, vcc
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_subb_co_u32_e32 v6, vcc, 0, v14, vcc
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_subb_co_u32_e32 v7, vcc, 0, v15, vcc
+; GFX942-SDAG-NEXT:    v_cmp_gt_i64_e32 vcc, 0, v[14:15]
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e32 v5, v13, v5, vcc
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e32 v4, v12, v4, vcc
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e32 v7, v15, v7, vcc
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e32 v6, v14, v6, vcc
+; GFX942-SDAG-NEXT:    v_or_b32_e32 v13, v5, v7
+; GFX942-SDAG-NEXT:    v_or_b32_e32 v12, v4, v6
+; GFX942-SDAG-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[12:13]
+; GFX942-SDAG-NEXT:    v_or_b32_e32 v13, v21, v23
+; GFX942-SDAG-NEXT:    v_or_b32_e32 v12, v20, v22
+; GFX942-SDAG-NEXT:    v_cmp_eq_u64_e64 s[0:1], 0, v[12:13]
+; GFX942-SDAG-NEXT:    v_ffbh_u32_e32 v12, v6
+; GFX942-SDAG-NEXT:    v_add_u32_e32 v12, 32, v12
+; GFX942-SDAG-NEXT:    v_ffbh_u32_e32 v13, v7
+; GFX942-SDAG-NEXT:    v_min_u32_e32 v14, v12, v13
+; GFX942-SDAG-NEXT:    v_ffbh_u32_e32 v12, v4
+; GFX942-SDAG-NEXT:    v_add_u32_e32 v12, 32, v12
+; GFX942-SDAG-NEXT:    v_ffbh_u32_e32 v13, v5
+; GFX942-SDAG-NEXT:    v_min_u32_e32 v24, v12, v13
+; GFX942-SDAG-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-SDAG-NEXT:    v_lshl_add_u64 v[12:13], v[24:25], 0, 64
+; GFX942-SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[6:7]
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e32 v14, v12, v14, vcc
+; GFX942-SDAG-NEXT:    v_ffbh_u32_e32 v12, v22
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v15, v13, 0, vcc
+; GFX942-SDAG-NEXT:    v_add_u32_e32 v12, 32, v12
+; GFX942-SDAG-NEXT:    v_ffbh_u32_e32 v13, v23
+; GFX942-SDAG-NEXT:    v_min_u32_e32 v26, v12, v13
+; GFX942-SDAG-NEXT:    v_ffbh_u32_e32 v12, v20
+; GFX942-SDAG-NEXT:    v_add_u32_e32 v12, 32, v12
+; GFX942-SDAG-NEXT:    v_ffbh_u32_e32 v13, v21
+; GFX942-SDAG-NEXT:    v_min_u32_e32 v24, v12, v13
+; GFX942-SDAG-NEXT:    v_lshl_add_u64 v[12:13], v[24:25], 0, 64
+; GFX942-SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[22:23]
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e32 v12, v12, v26, vcc
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v13, v13, 0, vcc
+; GFX942-SDAG-NEXT:    v_sub_co_u32_e32 v26, vcc, v14, v12
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_subb_co_u32_e32 v27, vcc, v15, v13, vcc
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_subb_co_u32_e32 v14, vcc, 0, v25, vcc
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_subb_co_u32_e32 v15, vcc, 0, v25, vcc
+; GFX942-SDAG-NEXT:    v_cmp_lt_u64_e32 vcc, s[2:3], v[26:27]
+; GFX942-SDAG-NEXT:    v_or_b32_e32 v25, v27, v15
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v12, 0, 1, vcc
+; GFX942-SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[14:15]
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v13, 0, 1, vcc
+; GFX942-SDAG-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[14:15]
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e32 v12, v13, v12, vcc
+; GFX942-SDAG-NEXT:    v_and_b32_e32 v12, 1, v12
+; GFX942-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v12
+; GFX942-SDAG-NEXT:    v_xor_b32_e32 v12, 0x7f, v26
+; GFX942-SDAG-NEXT:    s_or_b64 s[0:1], s[0:1], vcc
+; GFX942-SDAG-NEXT:    v_or_b32_e32 v24, v12, v14
+; GFX942-SDAG-NEXT:    s_xor_b64 s[2:3], s[0:1], -1
+; GFX942-SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[24:25]
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v13, v23, 0, s[0:1]
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v12, v22, 0, s[0:1]
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v25, v21, 0, s[0:1]
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v24, v20, 0, s[0:1]
+; GFX942-SDAG-NEXT:    s_and_b64 s[0:1], s[2:3], vcc
+; GFX942-SDAG-NEXT:    s_and_saveexec_b64 s[4:5], s[0:1]
+; GFX942-SDAG-NEXT:    s_cbranch_execz .LBB0_12
+; GFX942-SDAG-NEXT:  ; %bb.7: ; %udiv-bb1
+; GFX942-SDAG-NEXT:    v_add_co_u32_e32 v12, vcc, 1, v26
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[30:31], 0
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_addc_co_u32_e32 v13, vcc, 0, v27, vcc
+; GFX942-SDAG-NEXT:    v_sub_u32_e32 v27, 0x7f, v26
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_addc_co_u32_e32 v14, vcc, 0, v14, vcc
+; GFX942-SDAG-NEXT:    v_or_b32_e32 v24, v12, v14
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_addc_co_u32_e32 v15, vcc, 0, v15, vcc
+; GFX942-SDAG-NEXT:    v_or_b32_e32 v25, v13, v15
+; GFX942-SDAG-NEXT:    v_sub_u32_e32 v28, 64, v27
+; GFX942-SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[24:25]
+; GFX942-SDAG-NEXT:    v_lshlrev_b64 v[24:25], v27, v[22:23]
+; GFX942-SDAG-NEXT:    v_lshrrev_b64 v[28:29], v28, v[20:21]
+; GFX942-SDAG-NEXT:    v_or_b32_e32 v28, v24, v28
+; GFX942-SDAG-NEXT:    v_sub_u32_e32 v24, 63, v26
+; GFX942-SDAG-NEXT:    v_or_b32_e32 v29, v25, v29
+; GFX942-SDAG-NEXT:    v_lshlrev_b64 v[24:25], v24, v[20:21]
+; GFX942-SDAG-NEXT:    v_cmp_gt_u32_e64 s[0:1], 64, v27
+; GFX942-SDAG-NEXT:    v_cmp_eq_u32_e64 s[2:3], 0, v27
+; GFX942-SDAG-NEXT:    v_lshlrev_b64 v[26:27], v27, v[20:21]
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v25, v25, v29, s[0:1]
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v24, v24, v28, s[0:1]
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v25, v25, v23, s[2:3]
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v24, v24, v22, s[2:3]
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v27, 0, v27, s[0:1]
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v26, 0, v26, s[0:1]
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[28:29], 0
+; GFX942-SDAG-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942-SDAG-NEXT:    s_xor_b64 s[2:3], exec, s[0:1]
+; GFX942-SDAG-NEXT:    s_cbranch_execz .LBB0_11
+; GFX942-SDAG-NEXT:  ; %bb.8: ; %udiv-preheader
+; GFX942-SDAG-NEXT:    v_sub_u32_e32 v30, 64, v12
+; GFX942-SDAG-NEXT:    v_lshrrev_b64 v[28:29], v12, v[20:21]
+; GFX942-SDAG-NEXT:    v_lshlrev_b64 v[30:31], v30, v[22:23]
+; GFX942-SDAG-NEXT:    v_or_b32_e32 v30, v28, v30
+; GFX942-SDAG-NEXT:    v_subrev_u32_e32 v28, 64, v12
+; GFX942-SDAG-NEXT:    v_or_b32_e32 v31, v29, v31
+; GFX942-SDAG-NEXT:    v_lshrrev_b64 v[28:29], v28, v[22:23]
+; GFX942-SDAG-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v12
+; GFX942-SDAG-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v12
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[36:37], 0
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e32 v29, v29, v31, vcc
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v33, v29, v21, s[0:1]
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e32 v21, v28, v30, vcc
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v32, v21, v20, s[0:1]
+; GFX942-SDAG-NEXT:    v_lshrrev_b64 v[20:21], v12, v[22:23]
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e32 v35, 0, v21, vcc
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e32 v34, 0, v20, vcc
+; GFX942-SDAG-NEXT:    v_add_co_u32_e32 v20, vcc, -1, v4
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[30:31], 0
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_addc_co_u32_e32 v21, vcc, -1, v5, vcc
+; GFX942-SDAG-NEXT:    s_mov_b64 s[0:1], 0
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_addc_co_u32_e32 v22, vcc, -1, v6, vcc
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v29, 0
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_addc_co_u32_e32 v23, vcc, -1, v7, vcc
+; GFX942-SDAG-NEXT:  .LBB0_9: ; %udiv-do-while
+; GFX942-SDAG-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-SDAG-NEXT:    v_lshrrev_b32_e32 v28, 31, v33
+; GFX942-SDAG-NEXT:    v_lshlrev_b64 v[32:33], 1, v[32:33]
+; GFX942-SDAG-NEXT:    v_lshrrev_b32_e32 v48, 31, v25
+; GFX942-SDAG-NEXT:    v_lshlrev_b64 v[34:35], 1, v[34:35]
+; GFX942-SDAG-NEXT:    v_or_b32_e32 v32, v32, v48
+; GFX942-SDAG-NEXT:    v_or_b32_e32 v34, v34, v28
+; GFX942-SDAG-NEXT:    v_sub_co_u32_e32 v28, vcc, v20, v32
+; GFX942-SDAG-NEXT:    v_lshlrev_b64 v[38:39], 1, v[26:27]
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_subb_co_u32_e32 v28, vcc, v21, v33, vcc
+; GFX942-SDAG-NEXT:    v_lshrrev_b32_e32 v26, 31, v27
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_subb_co_u32_e32 v28, vcc, v22, v34, vcc
+; GFX942-SDAG-NEXT:    v_lshlrev_b64 v[24:25], 1, v[24:25]
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_subb_co_u32_e32 v28, vcc, v23, v35, vcc
+; GFX942-SDAG-NEXT:    v_or3_b32 v24, v24, v26, v30
+; GFX942-SDAG-NEXT:    v_or_b32_e32 v26, v36, v38
+; GFX942-SDAG-NEXT:    v_ashrrev_i32_e32 v36, 31, v28
+; GFX942-SDAG-NEXT:    v_or_b32_e32 v27, v37, v39
+; GFX942-SDAG-NEXT:    v_and_b32_e32 v28, 1, v36
+; GFX942-SDAG-NEXT:    v_and_b32_e32 v37, v36, v7
+; GFX942-SDAG-NEXT:    v_and_b32_e32 v38, v36, v6
+; GFX942-SDAG-NEXT:    v_and_b32_e32 v39, v36, v5
+; GFX942-SDAG-NEXT:    v_and_b32_e32 v36, v36, v4
+; GFX942-SDAG-NEXT:    v_sub_co_u32_e32 v32, vcc, v32, v36
+; GFX942-SDAG-NEXT:    v_or3_b32 v25, v25, 0, v31
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_subb_co_u32_e32 v33, vcc, v33, v39, vcc
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_subb_co_u32_e32 v34, vcc, v34, v38, vcc
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_subb_co_u32_e32 v35, vcc, v35, v37, vcc
+; GFX942-SDAG-NEXT:    v_add_co_u32_e32 v12, vcc, -1, v12
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_addc_co_u32_e32 v13, vcc, -1, v13, vcc
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_addc_co_u32_e32 v14, vcc, -1, v14, vcc
+; GFX942-SDAG-NEXT:    v_or_b32_e32 v36, v12, v14
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_addc_co_u32_e32 v15, vcc, -1, v15, vcc
+; GFX942-SDAG-NEXT:    v_or_b32_e32 v37, v13, v15
+; GFX942-SDAG-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[36:37]
+; GFX942-SDAG-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[36:37], v[28:29]
+; GFX942-SDAG-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; GFX942-SDAG-NEXT:    s_cbranch_execnz .LBB0_9
+; GFX942-SDAG-NEXT:  ; %bb.10: ; %Flow
+; GFX942-SDAG-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX942-SDAG-NEXT:  .LBB0_11: ; %Flow11
+; GFX942-SDAG-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX942-SDAG-NEXT:    v_lshlrev_b64 v[4:5], 1, v[26:27]
+; GFX942-SDAG-NEXT:    v_lshrrev_b32_e32 v12, 31, v27
+; GFX942-SDAG-NEXT:    v_lshlrev_b64 v[6:7], 1, v[24:25]
+; GFX942-SDAG-NEXT:    v_or3_b32 v13, v7, 0, v31
+; GFX942-SDAG-NEXT:    v_or3_b32 v12, v6, v12, v30
+; GFX942-SDAG-NEXT:    v_or_b32_e32 v25, v29, v5
+; GFX942-SDAG-NEXT:    v_or_b32_e32 v24, v28, v4
+; GFX942-SDAG-NEXT:  .LBB0_12: ; %Flow12
+; GFX942-SDAG-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX942-SDAG-NEXT:    v_xor_b32_e32 v5, v18, v16
+; GFX942-SDAG-NEXT:    v_xor_b32_e32 v4, v19, v17
+; GFX942-SDAG-NEXT:    v_xor_b32_e32 v7, v0, v5
+; GFX942-SDAG-NEXT:    v_xor_b32_e32 v0, v2, v5
+; GFX942-SDAG-NEXT:    v_xor_b32_e32 v6, v1, v4
+; GFX942-SDAG-NEXT:    v_xor_b32_e32 v1, v3, v4
+; GFX942-SDAG-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v5
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v4, vcc
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_subb_co_u32_e32 v2, vcc, v7, v5, vcc
+; GFX942-SDAG-NEXT:    v_xor_b32_e32 v7, v11, v9
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_subb_co_u32_e32 v3, vcc, v6, v4, vcc
+; GFX942-SDAG-NEXT:    v_xor_b32_e32 v6, v10, v8
+; GFX942-SDAG-NEXT:    v_xor_b32_e32 v4, v24, v6
+; GFX942-SDAG-NEXT:    v_xor_b32_e32 v5, v25, v7
+; GFX942-SDAG-NEXT:    v_sub_co_u32_e32 v4, vcc, v4, v6
+; GFX942-SDAG-NEXT:    v_xor_b32_e32 v9, v12, v6
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_subb_co_u32_e32 v5, vcc, v5, v7, vcc
+; GFX942-SDAG-NEXT:    v_xor_b32_e32 v8, v13, v7
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_subb_co_u32_e32 v6, vcc, v9, v6, vcc
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_subb_co_u32_e32 v7, vcc, v8, v7, vcc
+; GFX942-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-GISEL-LABEL: v_sdiv_v2i128_vv:
+; GFX942-GISEL:       ; %bb.0: ; %_udiv-special-cases_udiv-special-cases
+; GFX942-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-GISEL-NEXT:    v_ashrrev_i32_e32 v28, 31, v3
+; GFX942-GISEL-NEXT:    v_xor_b32_e32 v0, v28, v0
+; GFX942-GISEL-NEXT:    v_xor_b32_e32 v1, v28, v1
+; GFX942-GISEL-NEXT:    v_sub_co_u32_e32 v18, vcc, v0, v28
+; GFX942-GISEL-NEXT:    v_xor_b32_e32 v2, v28, v2
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_subb_co_u32_e32 v19, vcc, v1, v28, vcc
+; GFX942-GISEL-NEXT:    v_ashrrev_i32_e32 v29, 31, v11
+; GFX942-GISEL-NEXT:    v_xor_b32_e32 v3, v28, v3
+; GFX942-GISEL-NEXT:    v_subb_co_u32_e32 v20, vcc, v2, v28, vcc
+; GFX942-GISEL-NEXT:    v_xor_b32_e32 v0, v29, v8
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_subb_co_u32_e32 v21, vcc, v3, v28, vcc
+; GFX942-GISEL-NEXT:    v_xor_b32_e32 v1, v29, v9
+; GFX942-GISEL-NEXT:    v_sub_co_u32_e32 v30, vcc, v0, v29
+; GFX942-GISEL-NEXT:    v_xor_b32_e32 v2, v29, v10
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_subb_co_u32_e32 v31, vcc, v1, v29, vcc
+; GFX942-GISEL-NEXT:    v_xor_b32_e32 v3, v29, v11
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_subb_co_u32_e32 v16, vcc, v2, v29, vcc
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v0, v30, v16
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_subb_co_u32_e32 v17, vcc, v3, v29, vcc
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v1, v31, v17
+; GFX942-GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[0:1]
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v0, v18, v20
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v1, v19, v21
+; GFX942-GISEL-NEXT:    v_cmp_eq_u64_e64 s[0:1], 0, v[0:1]
+; GFX942-GISEL-NEXT:    v_ffbh_u32_e32 v1, v30
+; GFX942-GISEL-NEXT:    v_ffbh_u32_e32 v0, v31
+; GFX942-GISEL-NEXT:    v_add_u32_e32 v1, 32, v1
+; GFX942-GISEL-NEXT:    v_ffbh_u32_e32 v2, v16
+; GFX942-GISEL-NEXT:    v_min_u32_e32 v0, v0, v1
+; GFX942-GISEL-NEXT:    v_ffbh_u32_e32 v1, v17
+; GFX942-GISEL-NEXT:    v_add_u32_e32 v2, 32, v2
+; GFX942-GISEL-NEXT:    v_add_u32_e32 v0, 64, v0
+; GFX942-GISEL-NEXT:    v_min_u32_e32 v1, v1, v2
+; GFX942-GISEL-NEXT:    v_cmp_eq_u64_e64 s[2:3], 0, v[16:17]
+; GFX942-GISEL-NEXT:    v_ffbh_u32_e32 v2, v18
+; GFX942-GISEL-NEXT:    v_add_u32_e32 v2, 32, v2
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v0, v1, v0, s[2:3]
+; GFX942-GISEL-NEXT:    v_ffbh_u32_e32 v1, v19
+; GFX942-GISEL-NEXT:    v_ffbh_u32_e32 v3, v20
+; GFX942-GISEL-NEXT:    v_min_u32_e32 v1, v1, v2
+; GFX942-GISEL-NEXT:    v_ffbh_u32_e32 v2, v21
+; GFX942-GISEL-NEXT:    v_add_u32_e32 v3, 32, v3
+; GFX942-GISEL-NEXT:    v_add_u32_e32 v1, 64, v1
+; GFX942-GISEL-NEXT:    v_min_u32_e32 v2, v2, v3
+; GFX942-GISEL-NEXT:    v_cmp_eq_u64_e64 s[2:3], 0, v[20:21]
+; GFX942-GISEL-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-GISEL-NEXT:    s_mov_b64 s[4:5], 0
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v1, v2, v1, s[2:3]
+; GFX942-GISEL-NEXT:    v_sub_co_u32_e64 v8, s[2:3], v0, v1
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[0:1], 0x7f
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_subb_co_u32_e64 v9, s[2:3], 0, 0, s[2:3]
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    v_subb_co_u32_e64 v2, s[2:3], 0, 0, s[2:3]
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    v_subb_co_u32_e64 v3, s[2:3], 0, 0, s[2:3]
+; GFX942-GISEL-NEXT:    v_cmp_gt_u64_e64 s[2:3], v[8:9], v[0:1]
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v23, v9, v3
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[2:3]
+; GFX942-GISEL-NEXT:    v_cmp_lt_u64_e64 s[2:3], 0, v[2:3]
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[2:3]
+; GFX942-GISEL-NEXT:    v_cmp_eq_u64_e64 s[2:3], 0, v[2:3]
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v0, v1, v0, s[2:3]
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[0:1]
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v24, v1, v0
+; GFX942-GISEL-NEXT:    v_xor_b32_e32 v0, 0x7f, v8
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v22, v0, v2
+; GFX942-GISEL-NEXT:    v_and_b32_e32 v0, 1, v24
+; GFX942-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v0, v18, 0, vcc
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v1, v19, 0, vcc
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v10, v20, 0, vcc
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v11, v21, 0, vcc
+; GFX942-GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[22:23]
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v22, 0, 1, vcc
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v22, v24, v22
+; GFX942-GISEL-NEXT:    v_and_b32_e32 v22, 1, v22
+; GFX942-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v22
+; GFX942-GISEL-NEXT:    s_xor_b64 s[0:1], vcc, -1
+; GFX942-GISEL-NEXT:    s_and_saveexec_b64 s[2:3], s[0:1]
+; GFX942-GISEL-NEXT:    s_cbranch_execz .LBB0_6
+; GFX942-GISEL-NEXT:  ; %bb.1: ; %udiv-bb15
+; GFX942-GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, 1, v8
+; GFX942-GISEL-NEXT:    s_mov_b64 s[6:7], s[4:5]
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v9, vcc
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e32 v2, vcc, 0, v2, vcc
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX942-GISEL-NEXT:    s_xor_b64 s[0:1], vcc, -1
+; GFX942-GISEL-NEXT:    v_sub_co_u32_e32 v24, vcc, 0x7f, v8
+; GFX942-GISEL-NEXT:    v_sub_u32_e32 v10, 64, v24
+; GFX942-GISEL-NEXT:    v_add_u32_e32 v25, 0xffffffc0, v24
+; GFX942-GISEL-NEXT:    v_lshrrev_b64 v[10:11], v10, v[18:19]
+; GFX942-GISEL-NEXT:    v_lshlrev_b64 v[22:23], v24, v[20:21]
+; GFX942-GISEL-NEXT:    v_lshlrev_b64 v[8:9], v24, v[18:19]
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v26, v10, v22
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v27, v11, v23
+; GFX942-GISEL-NEXT:    v_lshlrev_b64 v[10:11], v25, v[18:19]
+; GFX942-GISEL-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v24
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e32 v22, 0, v8, vcc
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e32 v23, 0, v9, vcc
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e32 v8, v10, v26, vcc
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e32 v9, v11, v27, vcc
+; GFX942-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v24
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e32 v24, v8, v20, vcc
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e32 v25, v9, v21, vcc
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[6:7]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[4:5]
+; GFX942-GISEL-NEXT:    s_and_saveexec_b64 s[4:5], s[0:1]
+; GFX942-GISEL-NEXT:    s_xor_b64 s[8:9], exec, s[4:5]
+; GFX942-GISEL-NEXT:    s_cbranch_execz .LBB0_5
+; GFX942-GISEL-NEXT:  ; %bb.2: ; %udiv-preheader4
+; GFX942-GISEL-NEXT:    v_sub_u32_e32 v26, 64, v0
+; GFX942-GISEL-NEXT:    v_add_u32_e32 v32, 0xffffffc0, v0
+; GFX942-GISEL-NEXT:    v_lshrrev_b64 v[10:11], v0, v[18:19]
+; GFX942-GISEL-NEXT:    v_lshlrev_b64 v[26:27], v26, v[20:21]
+; GFX942-GISEL-NEXT:    v_lshrrev_b64 v[8:9], v0, v[20:21]
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v26, v10, v26
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v27, v11, v27
+; GFX942-GISEL-NEXT:    v_lshrrev_b64 v[10:11], v32, v[20:21]
+; GFX942-GISEL-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v0
+; GFX942-GISEL-NEXT:    s_mov_b64 s[4:5], 0
+; GFX942-GISEL-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v0
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e32 v10, v10, v26, vcc
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e32 v11, v11, v27, vcc
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e32 v26, 0, v8, vcc
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e32 v27, 0, v9, vcc
+; GFX942-GISEL-NEXT:    v_add_co_u32_e32 v32, vcc, -1, v30
+; GFX942-GISEL-NEXT:    s_mov_b64 s[6:7], s[4:5]
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e32 v33, vcc, -1, v31, vcc
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v20, v10, v18, s[0:1]
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v21, v11, v19, s[0:1]
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e32 v34, vcc, -1, v16, vcc
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[6:7]
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e32 v35, vcc, -1, v17, vcc
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v19, 0
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[4:5]
+; GFX942-GISEL-NEXT:  .LBB0_3: ; %udiv-do-while3
+; GFX942-GISEL-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-GISEL-NEXT:    v_lshlrev_b64 v[26:27], 1, v[26:27]
+; GFX942-GISEL-NEXT:    v_lshrrev_b32_e32 v18, 31, v21
+; GFX942-GISEL-NEXT:    v_lshlrev_b64 v[10:11], 1, v[20:21]
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v26, v26, v18
+; GFX942-GISEL-NEXT:    v_lshrrev_b32_e32 v18, 31, v25
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v10, v10, v18
+; GFX942-GISEL-NEXT:    v_lshlrev_b64 v[20:21], 1, v[22:23]
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v22, v8, v20
+; GFX942-GISEL-NEXT:    v_sub_co_u32_e32 v8, vcc, v32, v10
+; GFX942-GISEL-NEXT:    v_lshlrev_b64 v[24:25], 1, v[24:25]
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_subb_co_u32_e32 v8, vcc, v33, v11, vcc
+; GFX942-GISEL-NEXT:    v_lshrrev_b32_e32 v18, 31, v23
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_subb_co_u32_e32 v8, vcc, v34, v26, vcc
+; GFX942-GISEL-NEXT:    v_add_co_u32_e64 v0, s[0:1], -1, v0
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_subb_co_u32_e32 v8, vcc, v35, v27, vcc
+; GFX942-GISEL-NEXT:    v_ashrrev_i32_e32 v20, 31, v8
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v24, v24, v18
+; GFX942-GISEL-NEXT:    v_and_b32_e32 v18, 1, v20
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e64 v1, s[0:1], -1, v1, s[0:1]
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v23, v9, v21
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[8:9], v[18:19]
+; GFX942-GISEL-NEXT:    v_and_b32_e32 v18, v20, v30
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e64 v2, s[0:1], -1, v2, s[0:1]
+; GFX942-GISEL-NEXT:    v_and_b32_e32 v21, v20, v31
+; GFX942-GISEL-NEXT:    v_and_b32_e32 v36, v20, v16
+; GFX942-GISEL-NEXT:    v_and_b32_e32 v37, v20, v17
+; GFX942-GISEL-NEXT:    v_sub_co_u32_e32 v20, vcc, v10, v18
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e64 v3, s[0:1], -1, v3, s[0:1]
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_subb_co_u32_e32 v21, vcc, v11, v21, vcc
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v10, v0, v2
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v11, v1, v3
+; GFX942-GISEL-NEXT:    v_cmp_eq_u64_e64 s[0:1], 0, v[10:11]
+; GFX942-GISEL-NEXT:    v_subb_co_u32_e32 v26, vcc, v26, v36, vcc
+; GFX942-GISEL-NEXT:    s_or_b64 s[4:5], s[0:1], s[4:5]
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_subb_co_u32_e32 v27, vcc, v27, v37, vcc
+; GFX942-GISEL-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX942-GISEL-NEXT:    s_cbranch_execnz .LBB0_3
+; GFX942-GISEL-NEXT:  ; %bb.4: ; %Flow13
+; GFX942-GISEL-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX942-GISEL-NEXT:  .LBB0_5: ; %Flow14
+; GFX942-GISEL-NEXT:    s_or_b64 exec, exec, s[8:9]
+; GFX942-GISEL-NEXT:    v_lshlrev_b64 v[0:1], 1, v[22:23]
+; GFX942-GISEL-NEXT:    v_lshlrev_b64 v[10:11], 1, v[24:25]
+; GFX942-GISEL-NEXT:    v_lshrrev_b32_e32 v2, 31, v23
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v10, v10, v2
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v0, v8, v0
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v1, v9, v1
+; GFX942-GISEL-NEXT:  .LBB0_6: ; %Flow16
+; GFX942-GISEL-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX942-GISEL-NEXT:    v_ashrrev_i32_e32 v24, 31, v7
+; GFX942-GISEL-NEXT:    v_xor_b32_e32 v2, v24, v4
+; GFX942-GISEL-NEXT:    v_xor_b32_e32 v3, v24, v5
+; GFX942-GISEL-NEXT:    v_sub_co_u32_e32 v16, vcc, v2, v24
+; GFX942-GISEL-NEXT:    v_xor_b32_e32 v4, v24, v6
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_subb_co_u32_e32 v17, vcc, v3, v24, vcc
+; GFX942-GISEL-NEXT:    v_ashrrev_i32_e32 v25, 31, v15
+; GFX942-GISEL-NEXT:    v_xor_b32_e32 v5, v24, v7
+; GFX942-GISEL-NEXT:    v_subb_co_u32_e32 v18, vcc, v4, v24, vcc
+; GFX942-GISEL-NEXT:    v_xor_b32_e32 v2, v25, v12
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_subb_co_u32_e32 v19, vcc, v5, v24, vcc
+; GFX942-GISEL-NEXT:    v_xor_b32_e32 v3, v25, v13
+; GFX942-GISEL-NEXT:    v_sub_co_u32_e32 v26, vcc, v2, v25
+; GFX942-GISEL-NEXT:    v_xor_b32_e32 v4, v25, v14
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_subb_co_u32_e32 v27, vcc, v3, v25, vcc
+; GFX942-GISEL-NEXT:    v_xor_b32_e32 v5, v25, v15
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_subb_co_u32_e32 v12, vcc, v4, v25, vcc
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v2, v26, v12
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_subb_co_u32_e32 v13, vcc, v5, v25, vcc
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v3, v27, v13
+; GFX942-GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[2:3]
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v2, v16, v18
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v3, v17, v19
+; GFX942-GISEL-NEXT:    v_cmp_eq_u64_e64 s[0:1], 0, v[2:3]
+; GFX942-GISEL-NEXT:    v_ffbh_u32_e32 v3, v26
+; GFX942-GISEL-NEXT:    v_ffbh_u32_e32 v2, v27
+; GFX942-GISEL-NEXT:    v_add_u32_e32 v3, 32, v3
+; GFX942-GISEL-NEXT:    v_ffbh_u32_e32 v4, v12
+; GFX942-GISEL-NEXT:    v_min_u32_e32 v2, v2, v3
+; GFX942-GISEL-NEXT:    v_ffbh_u32_e32 v3, v13
+; GFX942-GISEL-NEXT:    v_add_u32_e32 v4, 32, v4
+; GFX942-GISEL-NEXT:    v_add_u32_e32 v2, 64, v2
+; GFX942-GISEL-NEXT:    v_min_u32_e32 v3, v3, v4
+; GFX942-GISEL-NEXT:    v_cmp_eq_u64_e64 s[2:3], 0, v[12:13]
+; GFX942-GISEL-NEXT:    v_ffbh_u32_e32 v4, v16
+; GFX942-GISEL-NEXT:    v_add_u32_e32 v4, 32, v4
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v2, v3, v2, s[2:3]
+; GFX942-GISEL-NEXT:    v_ffbh_u32_e32 v3, v17
+; GFX942-GISEL-NEXT:    v_ffbh_u32_e32 v5, v18
+; GFX942-GISEL-NEXT:    v_min_u32_e32 v3, v3, v4
+; GFX942-GISEL-NEXT:    v_ffbh_u32_e32 v4, v19
+; GFX942-GISEL-NEXT:    v_add_u32_e32 v5, 32, v5
+; GFX942-GISEL-NEXT:    v_add_u32_e32 v3, 64, v3
+; GFX942-GISEL-NEXT:    v_min_u32_e32 v4, v4, v5
+; GFX942-GISEL-NEXT:    v_cmp_eq_u64_e64 s[2:3], 0, v[18:19]
+; GFX942-GISEL-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-GISEL-NEXT:    s_mov_b64 s[4:5], 0
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v3, v4, v3, s[2:3]
+; GFX942-GISEL-NEXT:    v_sub_co_u32_e64 v6, s[2:3], v2, v3
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[2:3], 0x7f
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_subb_co_u32_e64 v7, s[2:3], 0, 0, s[2:3]
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    v_subb_co_u32_e64 v14, s[2:3], 0, 0, s[2:3]
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    v_subb_co_u32_e64 v15, s[2:3], 0, 0, s[2:3]
+; GFX942-GISEL-NEXT:    v_cmp_gt_u64_e64 s[2:3], v[6:7], v[2:3]
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[2:3]
+; GFX942-GISEL-NEXT:    v_cmp_lt_u64_e64 s[2:3], 0, v[14:15]
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s[2:3]
+; GFX942-GISEL-NEXT:    v_cmp_eq_u64_e64 s[2:3], 0, v[14:15]
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v2, v3, v2, s[2:3]
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s[0:1]
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v20, v3, v2
+; GFX942-GISEL-NEXT:    v_xor_b32_e32 v2, 0x7f, v6
+; GFX942-GISEL-NEXT:    v_and_b32_e32 v4, 1, v20
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v2, v2, v14
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v3, v7, v15
+; GFX942-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v4, v16, 0, vcc
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v5, v17, 0, vcc
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v8, v18, 0, vcc
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v9, v19, 0, vcc
+; GFX942-GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[2:3]
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v2, v20, v2
+; GFX942-GISEL-NEXT:    v_and_b32_e32 v2, 1, v2
+; GFX942-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
+; GFX942-GISEL-NEXT:    s_xor_b64 s[0:1], vcc, -1
+; GFX942-GISEL-NEXT:    s_and_saveexec_b64 s[2:3], s[0:1]
+; GFX942-GISEL-NEXT:    s_cbranch_execz .LBB0_12
+; GFX942-GISEL-NEXT:  ; %bb.7: ; %udiv-bb1
+; GFX942-GISEL-NEXT:    v_add_co_u32_e32 v2, vcc, 1, v6
+; GFX942-GISEL-NEXT:    s_mov_b64 s[6:7], s[4:5]
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v7, vcc
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v14, vcc
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v15, vcc
+; GFX942-GISEL-NEXT:    s_xor_b64 s[0:1], vcc, -1
+; GFX942-GISEL-NEXT:    v_sub_co_u32_e32 v20, vcc, 0x7f, v6
+; GFX942-GISEL-NEXT:    v_sub_u32_e32 v8, 64, v20
+; GFX942-GISEL-NEXT:    v_add_u32_e32 v21, 0xffffffc0, v20
+; GFX942-GISEL-NEXT:    v_lshrrev_b64 v[8:9], v8, v[16:17]
+; GFX942-GISEL-NEXT:    v_lshlrev_b64 v[14:15], v20, v[18:19]
+; GFX942-GISEL-NEXT:    v_lshlrev_b64 v[6:7], v20, v[16:17]
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v22, v8, v14
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v23, v9, v15
+; GFX942-GISEL-NEXT:    v_lshlrev_b64 v[8:9], v21, v[16:17]
+; GFX942-GISEL-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v20
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e32 v14, 0, v6, vcc
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e32 v15, 0, v7, vcc
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e32 v6, v8, v22, vcc
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e32 v7, v9, v23, vcc
+; GFX942-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v20
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e32 v20, v6, v18, vcc
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e32 v21, v7, v19, vcc
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[6:7]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[4:5]
+; GFX942-GISEL-NEXT:    s_and_saveexec_b64 s[4:5], s[0:1]
+; GFX942-GISEL-NEXT:    s_xor_b64 s[8:9], exec, s[4:5]
+; GFX942-GISEL-NEXT:    s_cbranch_execz .LBB0_11
+; GFX942-GISEL-NEXT:  ; %bb.8: ; %udiv-preheader
+; GFX942-GISEL-NEXT:    v_sub_u32_e32 v22, 64, v2
+; GFX942-GISEL-NEXT:    v_add_u32_e32 v30, 0xffffffc0, v2
+; GFX942-GISEL-NEXT:    v_lshrrev_b64 v[8:9], v2, v[16:17]
+; GFX942-GISEL-NEXT:    v_lshlrev_b64 v[22:23], v22, v[18:19]
+; GFX942-GISEL-NEXT:    v_lshrrev_b64 v[6:7], v2, v[18:19]
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v22, v8, v22
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v23, v9, v23
+; GFX942-GISEL-NEXT:    v_lshrrev_b64 v[8:9], v30, v[18:19]
+; GFX942-GISEL-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v2
+; GFX942-GISEL-NEXT:    s_mov_b64 s[4:5], 0
+; GFX942-GISEL-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v2
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e32 v8, v8, v22, vcc
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e32 v9, v9, v23, vcc
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e32 v22, 0, v6, vcc
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e32 v23, 0, v7, vcc
+; GFX942-GISEL-NEXT:    v_add_co_u32_e32 v30, vcc, -1, v26
+; GFX942-GISEL-NEXT:    s_mov_b64 s[6:7], s[4:5]
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e32 v31, vcc, -1, v27, vcc
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v18, v8, v16, s[0:1]
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v19, v9, v17, s[0:1]
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e32 v32, vcc, -1, v12, vcc
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[6:7]
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e32 v33, vcc, -1, v13, vcc
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v17, 0
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[4:5]
+; GFX942-GISEL-NEXT:  .LBB0_9: ; %udiv-do-while
+; GFX942-GISEL-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-GISEL-NEXT:    v_lshlrev_b64 v[22:23], 1, v[22:23]
+; GFX942-GISEL-NEXT:    v_lshrrev_b32_e32 v16, 31, v19
+; GFX942-GISEL-NEXT:    v_lshlrev_b64 v[8:9], 1, v[18:19]
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v22, v22, v16
+; GFX942-GISEL-NEXT:    v_lshrrev_b32_e32 v16, 31, v21
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v8, v8, v16
+; GFX942-GISEL-NEXT:    v_lshlrev_b64 v[18:19], 1, v[14:15]
+; GFX942-GISEL-NEXT:    v_lshlrev_b64 v[20:21], 1, v[20:21]
+; GFX942-GISEL-NEXT:    v_lshrrev_b32_e32 v14, 31, v15
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v20, v20, v14
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v14, v6, v18
+; GFX942-GISEL-NEXT:    v_sub_co_u32_e32 v6, vcc, v30, v8
+; GFX942-GISEL-NEXT:    v_add_co_u32_e64 v2, s[0:1], -1, v2
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_subb_co_u32_e32 v6, vcc, v31, v9, vcc
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e64 v3, s[0:1], -1, v3, s[0:1]
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_subb_co_u32_e32 v6, vcc, v32, v22, vcc
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v15, v7, v19
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_subb_co_u32_e32 v6, vcc, v33, v23, vcc
+; GFX942-GISEL-NEXT:    v_ashrrev_i32_e32 v18, 31, v6
+; GFX942-GISEL-NEXT:    v_and_b32_e32 v16, 1, v18
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[6:7], v[16:17]
+; GFX942-GISEL-NEXT:    v_and_b32_e32 v16, v18, v26
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e64 v4, s[0:1], -1, v4, s[0:1]
+; GFX942-GISEL-NEXT:    v_and_b32_e32 v19, v18, v27
+; GFX942-GISEL-NEXT:    v_and_b32_e32 v34, v18, v12
+; GFX942-GISEL-NEXT:    v_and_b32_e32 v35, v18, v13
+; GFX942-GISEL-NEXT:    v_sub_co_u32_e32 v18, vcc, v8, v16
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e64 v5, s[0:1], -1, v5, s[0:1]
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_subb_co_u32_e32 v19, vcc, v9, v19, vcc
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v8, v2, v4
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v9, v3, v5
+; GFX942-GISEL-NEXT:    v_cmp_eq_u64_e64 s[0:1], 0, v[8:9]
+; GFX942-GISEL-NEXT:    v_subb_co_u32_e32 v22, vcc, v22, v34, vcc
+; GFX942-GISEL-NEXT:    s_or_b64 s[4:5], s[0:1], s[4:5]
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_subb_co_u32_e32 v23, vcc, v23, v35, vcc
+; GFX942-GISEL-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX942-GISEL-NEXT:    s_cbranch_execnz .LBB0_9
+; GFX942-GISEL-NEXT:  ; %bb.10: ; %Flow
+; GFX942-GISEL-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX942-GISEL-NEXT:  .LBB0_11: ; %Flow11
+; GFX942-GISEL-NEXT:    s_or_b64 exec, exec, s[8:9]
+; GFX942-GISEL-NEXT:    v_lshlrev_b64 v[2:3], 1, v[14:15]
+; GFX942-GISEL-NEXT:    v_lshlrev_b64 v[8:9], 1, v[20:21]
+; GFX942-GISEL-NEXT:    v_lshrrev_b32_e32 v4, 31, v15
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v8, v8, v4
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v4, v6, v2
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v5, v7, v3
+; GFX942-GISEL-NEXT:  .LBB0_12: ; %Flow12
+; GFX942-GISEL-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX942-GISEL-NEXT:    v_xor_b32_e32 v3, v29, v28
+; GFX942-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v3
+; GFX942-GISEL-NEXT:    v_xor_b32_e32 v1, v1, v3
+; GFX942-GISEL-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v3
+; GFX942-GISEL-NEXT:    v_xor_b32_e32 v2, v10, v3
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX942-GISEL-NEXT:    v_xor_b32_e32 v6, v11, v3
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_subb_co_u32_e32 v2, vcc, v2, v3, vcc
+; GFX942-GISEL-NEXT:    v_xor_b32_e32 v7, v25, v24
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_subb_co_u32_e32 v3, vcc, v6, v3, vcc
+; GFX942-GISEL-NEXT:    v_xor_b32_e32 v4, v4, v7
+; GFX942-GISEL-NEXT:    v_xor_b32_e32 v5, v5, v7
+; GFX942-GISEL-NEXT:    v_sub_co_u32_e32 v4, vcc, v4, v7
+; GFX942-GISEL-NEXT:    v_xor_b32_e32 v6, v8, v7
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_subb_co_u32_e32 v5, vcc, v5, v7, vcc
+; GFX942-GISEL-NEXT:    v_xor_b32_e32 v8, v9, v7
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_subb_co_u32_e32 v6, vcc, v6, v7, vcc
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    v_subb_co_u32_e32 v7, vcc, v8, v7, vcc
+; GFX942-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %shl = sdiv <2 x i128> %lhs, %rhs
   ret <2 x i128> %shl
 }
@@ -1552,6 +2469,781 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GISEL-NEXT:    v_mov_b32_e32 v6, v8
 ; GISEL-NEXT:    v_mov_b32_e32 v7, v9
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-SDAG-LABEL: v_udiv_v2i128_vv:
+; GFX942-SDAG:       ; %bb.0: ; %_udiv-special-cases_udiv-special-cases
+; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v17, v3
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v16, v2
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v19, v1
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v18, v0
+; GFX942-SDAG-NEXT:    v_or_b32_e32 v1, v9, v11
+; GFX942-SDAG-NEXT:    v_or_b32_e32 v0, v8, v10
+; GFX942-SDAG-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[0:1]
+; GFX942-SDAG-NEXT:    v_or_b32_e32 v1, v19, v17
+; GFX942-SDAG-NEXT:    v_or_b32_e32 v0, v18, v16
+; GFX942-SDAG-NEXT:    v_cmp_eq_u64_e64 s[0:1], 0, v[0:1]
+; GFX942-SDAG-NEXT:    v_ffbh_u32_e32 v0, v10
+; GFX942-SDAG-NEXT:    v_add_u32_e32 v0, 32, v0
+; GFX942-SDAG-NEXT:    v_ffbh_u32_e32 v1, v11
+; GFX942-SDAG-NEXT:    v_min_u32_e32 v20, v0, v1
+; GFX942-SDAG-NEXT:    v_ffbh_u32_e32 v0, v8
+; GFX942-SDAG-NEXT:    v_add_u32_e32 v0, 32, v0
+; GFX942-SDAG-NEXT:    v_ffbh_u32_e32 v2, v9
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-SDAG-NEXT:    v_min_u32_e32 v0, v0, v2
+; GFX942-SDAG-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-SDAG-NEXT:    v_lshl_add_u64 v[2:3], v[0:1], 0, 64
+; GFX942-SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[10:11]
+; GFX942-SDAG-NEXT:    v_ffbh_u32_e32 v0, v16
+; GFX942-SDAG-NEXT:    v_add_u32_e32 v0, 32, v0
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e32 v20, v2, v20, vcc
+; GFX942-SDAG-NEXT:    v_ffbh_u32_e32 v2, v17
+; GFX942-SDAG-NEXT:    v_min_u32_e32 v22, v0, v2
+; GFX942-SDAG-NEXT:    v_ffbh_u32_e32 v0, v18
+; GFX942-SDAG-NEXT:    v_add_u32_e32 v0, 32, v0
+; GFX942-SDAG-NEXT:    v_ffbh_u32_e32 v2, v19
+; GFX942-SDAG-NEXT:    v_min_u32_e32 v0, v0, v2
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v21, v3, 0, vcc
+; GFX942-SDAG-NEXT:    v_lshl_add_u64 v[2:3], v[0:1], 0, 64
+; GFX942-SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[16:17]
+; GFX942-SDAG-NEXT:    s_mov_b64 s[2:3], 0x7f
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e32 v2, v2, v22, vcc
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v0, v3, 0, vcc
+; GFX942-SDAG-NEXT:    v_sub_co_u32_e32 v20, vcc, v20, v2
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_subb_co_u32_e32 v21, vcc, v21, v0, vcc
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_subb_co_u32_e32 v22, vcc, 0, v1, vcc
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_subb_co_u32_e32 v23, vcc, 0, v1, vcc
+; GFX942-SDAG-NEXT:    v_cmp_lt_u64_e32 vcc, s[2:3], v[20:21]
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX942-SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[22:23]
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX942-SDAG-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[22:23]
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX942-SDAG-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX942-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX942-SDAG-NEXT:    v_xor_b32_e32 v0, 0x7f, v20
+; GFX942-SDAG-NEXT:    s_or_b64 s[0:1], s[0:1], vcc
+; GFX942-SDAG-NEXT:    v_or_b32_e32 v1, v21, v23
+; GFX942-SDAG-NEXT:    v_or_b32_e32 v0, v0, v22
+; GFX942-SDAG-NEXT:    s_xor_b64 s[2:3], s[0:1], -1
+; GFX942-SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v3, v17, 0, s[0:1]
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v2, v16, 0, s[0:1]
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v1, v19, 0, s[0:1]
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v0, v18, 0, s[0:1]
+; GFX942-SDAG-NEXT:    s_and_b64 s[0:1], s[2:3], vcc
+; GFX942-SDAG-NEXT:    s_and_saveexec_b64 s[4:5], s[0:1]
+; GFX942-SDAG-NEXT:    s_cbranch_execz .LBB1_6
+; GFX942-SDAG-NEXT:  ; %bb.1: ; %udiv-bb15
+; GFX942-SDAG-NEXT:    v_add_co_u32_e32 v0, vcc, 1, v20
+; GFX942-SDAG-NEXT:    v_sub_u32_e32 v26, 0x7f, v20
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v21, vcc
+; GFX942-SDAG-NEXT:    v_sub_u32_e32 v21, 64, v26
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_addc_co_u32_e32 v2, vcc, 0, v22, vcc
+; GFX942-SDAG-NEXT:    v_or_b32_e32 v22, v0, v2
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v23, vcc
+; GFX942-SDAG-NEXT:    v_or_b32_e32 v23, v1, v3
+; GFX942-SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[22:23]
+; GFX942-SDAG-NEXT:    v_lshlrev_b64 v[22:23], v26, v[16:17]
+; GFX942-SDAG-NEXT:    v_lshrrev_b64 v[24:25], v21, v[18:19]
+; GFX942-SDAG-NEXT:    v_sub_u32_e32 v20, 63, v20
+; GFX942-SDAG-NEXT:    v_or_b32_e32 v23, v23, v25
+; GFX942-SDAG-NEXT:    v_or_b32_e32 v22, v22, v24
+; GFX942-SDAG-NEXT:    v_lshlrev_b64 v[20:21], v20, v[18:19]
+; GFX942-SDAG-NEXT:    v_cmp_gt_u32_e64 s[0:1], 64, v26
+; GFX942-SDAG-NEXT:    v_cmp_eq_u32_e64 s[2:3], 0, v26
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[24:25], 0
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v21, v21, v23, s[0:1]
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v20, v20, v22, s[0:1]
+; GFX942-SDAG-NEXT:    v_lshlrev_b64 v[22:23], v26, v[18:19]
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v21, v21, v17, s[2:3]
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v20, v20, v16, s[2:3]
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v23, 0, v23, s[0:1]
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v22, 0, v22, s[0:1]
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[26:27], 0
+; GFX942-SDAG-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942-SDAG-NEXT:    s_xor_b64 s[2:3], exec, s[0:1]
+; GFX942-SDAG-NEXT:    s_cbranch_execz .LBB1_5
+; GFX942-SDAG-NEXT:  ; %bb.2: ; %udiv-preheader4
+; GFX942-SDAG-NEXT:    v_sub_u32_e32 v26, 64, v0
+; GFX942-SDAG-NEXT:    v_lshrrev_b64 v[24:25], v0, v[18:19]
+; GFX942-SDAG-NEXT:    v_lshlrev_b64 v[26:27], v26, v[16:17]
+; GFX942-SDAG-NEXT:    v_or_b32_e32 v26, v24, v26
+; GFX942-SDAG-NEXT:    v_subrev_u32_e32 v24, 64, v0
+; GFX942-SDAG-NEXT:    v_or_b32_e32 v27, v25, v27
+; GFX942-SDAG-NEXT:    v_lshrrev_b64 v[24:25], v24, v[16:17]
+; GFX942-SDAG-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v0
+; GFX942-SDAG-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v0
+; GFX942-SDAG-NEXT:    v_lshrrev_b64 v[16:17], v0, v[16:17]
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e32 v25, v25, v27, vcc
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v29, v25, v19, s[0:1]
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e32 v19, v24, v26, vcc
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e32 v31, 0, v17, vcc
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e32 v30, 0, v16, vcc
+; GFX942-SDAG-NEXT:    v_add_co_u32_e32 v16, vcc, -1, v8
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v28, v19, v18, s[0:1]
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_addc_co_u32_e32 v17, vcc, -1, v9, vcc
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[26:27], 0
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_addc_co_u32_e32 v18, vcc, -1, v10, vcc
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[32:33], 0
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_addc_co_u32_e32 v19, vcc, -1, v11, vcc
+; GFX942-SDAG-NEXT:    s_mov_b64 s[0:1], 0
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v25, 0
+; GFX942-SDAG-NEXT:  .LBB1_3: ; %udiv-do-while3
+; GFX942-SDAG-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-SDAG-NEXT:    v_lshrrev_b32_e32 v24, 31, v29
+; GFX942-SDAG-NEXT:    v_lshlrev_b64 v[28:29], 1, v[28:29]
+; GFX942-SDAG-NEXT:    v_lshrrev_b32_e32 v36, 31, v21
+; GFX942-SDAG-NEXT:    v_lshlrev_b64 v[30:31], 1, v[30:31]
+; GFX942-SDAG-NEXT:    v_or_b32_e32 v28, v28, v36
+; GFX942-SDAG-NEXT:    v_or_b32_e32 v30, v30, v24
+; GFX942-SDAG-NEXT:    v_sub_co_u32_e32 v24, vcc, v16, v28
+; GFX942-SDAG-NEXT:    v_lshlrev_b64 v[34:35], 1, v[22:23]
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_subb_co_u32_e32 v24, vcc, v17, v29, vcc
+; GFX942-SDAG-NEXT:    v_lshrrev_b32_e32 v22, 31, v23
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_subb_co_u32_e32 v24, vcc, v18, v30, vcc
+; GFX942-SDAG-NEXT:    v_lshlrev_b64 v[20:21], 1, v[20:21]
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_subb_co_u32_e32 v24, vcc, v19, v31, vcc
+; GFX942-SDAG-NEXT:    v_or3_b32 v20, v20, v22, v26
+; GFX942-SDAG-NEXT:    v_or_b32_e32 v22, v32, v34
+; GFX942-SDAG-NEXT:    v_ashrrev_i32_e32 v32, 31, v24
+; GFX942-SDAG-NEXT:    v_or_b32_e32 v23, v33, v35
+; GFX942-SDAG-NEXT:    v_and_b32_e32 v24, 1, v32
+; GFX942-SDAG-NEXT:    v_and_b32_e32 v33, v32, v11
+; GFX942-SDAG-NEXT:    v_and_b32_e32 v34, v32, v10
+; GFX942-SDAG-NEXT:    v_and_b32_e32 v35, v32, v9
+; GFX942-SDAG-NEXT:    v_and_b32_e32 v32, v32, v8
+; GFX942-SDAG-NEXT:    v_sub_co_u32_e32 v28, vcc, v28, v32
+; GFX942-SDAG-NEXT:    v_or3_b32 v21, v21, 0, v27
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_subb_co_u32_e32 v29, vcc, v29, v35, vcc
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_subb_co_u32_e32 v30, vcc, v30, v34, vcc
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_subb_co_u32_e32 v31, vcc, v31, v33, vcc
+; GFX942-SDAG-NEXT:    v_add_co_u32_e32 v0, vcc, -1, v0
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_addc_co_u32_e32 v2, vcc, -1, v2, vcc
+; GFX942-SDAG-NEXT:    v_or_b32_e32 v32, v0, v2
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_addc_co_u32_e32 v3, vcc, -1, v3, vcc
+; GFX942-SDAG-NEXT:    v_or_b32_e32 v33, v1, v3
+; GFX942-SDAG-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[32:33]
+; GFX942-SDAG-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[32:33], v[24:25]
+; GFX942-SDAG-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; GFX942-SDAG-NEXT:    s_cbranch_execnz .LBB1_3
+; GFX942-SDAG-NEXT:  ; %bb.4: ; %Flow13
+; GFX942-SDAG-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX942-SDAG-NEXT:  .LBB1_5: ; %Flow14
+; GFX942-SDAG-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX942-SDAG-NEXT:    v_lshlrev_b64 v[0:1], 1, v[22:23]
+; GFX942-SDAG-NEXT:    v_lshrrev_b32_e32 v8, 31, v23
+; GFX942-SDAG-NEXT:    v_lshlrev_b64 v[2:3], 1, v[20:21]
+; GFX942-SDAG-NEXT:    v_or3_b32 v3, v3, 0, v27
+; GFX942-SDAG-NEXT:    v_or3_b32 v2, v2, v8, v26
+; GFX942-SDAG-NEXT:    v_or_b32_e32 v1, v25, v1
+; GFX942-SDAG-NEXT:    v_or_b32_e32 v0, v24, v0
+; GFX942-SDAG-NEXT:  .LBB1_6: ; %Flow16
+; GFX942-SDAG-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX942-SDAG-NEXT:    v_or_b32_e32 v9, v13, v15
+; GFX942-SDAG-NEXT:    v_or_b32_e32 v8, v12, v14
+; GFX942-SDAG-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[8:9]
+; GFX942-SDAG-NEXT:    v_or_b32_e32 v9, v5, v7
+; GFX942-SDAG-NEXT:    v_or_b32_e32 v8, v4, v6
+; GFX942-SDAG-NEXT:    v_cmp_eq_u64_e64 s[0:1], 0, v[8:9]
+; GFX942-SDAG-NEXT:    v_ffbh_u32_e32 v8, v14
+; GFX942-SDAG-NEXT:    v_add_u32_e32 v8, 32, v8
+; GFX942-SDAG-NEXT:    v_ffbh_u32_e32 v9, v15
+; GFX942-SDAG-NEXT:    v_min_u32_e32 v16, v8, v9
+; GFX942-SDAG-NEXT:    v_ffbh_u32_e32 v8, v12
+; GFX942-SDAG-NEXT:    v_add_u32_e32 v8, 32, v8
+; GFX942-SDAG-NEXT:    v_ffbh_u32_e32 v10, v13
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v9, 0
+; GFX942-SDAG-NEXT:    v_min_u32_e32 v8, v8, v10
+; GFX942-SDAG-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-SDAG-NEXT:    v_lshl_add_u64 v[10:11], v[8:9], 0, 64
+; GFX942-SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[14:15]
+; GFX942-SDAG-NEXT:    v_ffbh_u32_e32 v8, v6
+; GFX942-SDAG-NEXT:    v_add_u32_e32 v8, 32, v8
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e32 v16, v10, v16, vcc
+; GFX942-SDAG-NEXT:    v_ffbh_u32_e32 v10, v7
+; GFX942-SDAG-NEXT:    v_min_u32_e32 v18, v8, v10
+; GFX942-SDAG-NEXT:    v_ffbh_u32_e32 v8, v4
+; GFX942-SDAG-NEXT:    v_add_u32_e32 v8, 32, v8
+; GFX942-SDAG-NEXT:    v_ffbh_u32_e32 v10, v5
+; GFX942-SDAG-NEXT:    v_min_u32_e32 v8, v8, v10
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v17, v11, 0, vcc
+; GFX942-SDAG-NEXT:    v_lshl_add_u64 v[10:11], v[8:9], 0, 64
+; GFX942-SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[6:7]
+; GFX942-SDAG-NEXT:    s_mov_b64 s[2:3], 0x7f
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e32 v10, v10, v18, vcc
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v8, v11, 0, vcc
+; GFX942-SDAG-NEXT:    v_sub_co_u32_e32 v16, vcc, v16, v10
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_subb_co_u32_e32 v17, vcc, v17, v8, vcc
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_subb_co_u32_e32 v18, vcc, 0, v9, vcc
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_subb_co_u32_e32 v19, vcc, 0, v9, vcc
+; GFX942-SDAG-NEXT:    v_cmp_lt_u64_e32 vcc, s[2:3], v[16:17]
+; GFX942-SDAG-NEXT:    v_or_b32_e32 v11, v17, v19
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; GFX942-SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[18:19]
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
+; GFX942-SDAG-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[18:19]
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e32 v8, v9, v8, vcc
+; GFX942-SDAG-NEXT:    v_and_b32_e32 v8, 1, v8
+; GFX942-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v8
+; GFX942-SDAG-NEXT:    v_xor_b32_e32 v8, 0x7f, v16
+; GFX942-SDAG-NEXT:    s_or_b64 s[0:1], s[0:1], vcc
+; GFX942-SDAG-NEXT:    v_or_b32_e32 v10, v8, v18
+; GFX942-SDAG-NEXT:    s_xor_b64 s[2:3], s[0:1], -1
+; GFX942-SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[10:11]
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v9, v7, 0, s[0:1]
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v8, v6, 0, s[0:1]
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v11, v5, 0, s[0:1]
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v10, v4, 0, s[0:1]
+; GFX942-SDAG-NEXT:    s_and_b64 s[0:1], s[2:3], vcc
+; GFX942-SDAG-NEXT:    s_and_saveexec_b64 s[4:5], s[0:1]
+; GFX942-SDAG-NEXT:    s_cbranch_execz .LBB1_12
+; GFX942-SDAG-NEXT:  ; %bb.7: ; %udiv-bb1
+; GFX942-SDAG-NEXT:    v_add_co_u32_e32 v8, vcc, 1, v16
+; GFX942-SDAG-NEXT:    v_sub_u32_e32 v22, 0x7f, v16
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_addc_co_u32_e32 v9, vcc, 0, v17, vcc
+; GFX942-SDAG-NEXT:    v_sub_u32_e32 v17, 64, v22
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_addc_co_u32_e32 v10, vcc, 0, v18, vcc
+; GFX942-SDAG-NEXT:    v_or_b32_e32 v18, v8, v10
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_addc_co_u32_e32 v11, vcc, 0, v19, vcc
+; GFX942-SDAG-NEXT:    v_or_b32_e32 v19, v9, v11
+; GFX942-SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[18:19]
+; GFX942-SDAG-NEXT:    v_lshlrev_b64 v[18:19], v22, v[6:7]
+; GFX942-SDAG-NEXT:    v_lshrrev_b64 v[20:21], v17, v[4:5]
+; GFX942-SDAG-NEXT:    v_sub_u32_e32 v16, 63, v16
+; GFX942-SDAG-NEXT:    v_or_b32_e32 v19, v19, v21
+; GFX942-SDAG-NEXT:    v_or_b32_e32 v18, v18, v20
+; GFX942-SDAG-NEXT:    v_lshlrev_b64 v[16:17], v16, v[4:5]
+; GFX942-SDAG-NEXT:    v_cmp_gt_u32_e64 s[0:1], 64, v22
+; GFX942-SDAG-NEXT:    v_cmp_eq_u32_e64 s[2:3], 0, v22
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[20:21], 0
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v17, v17, v19, s[0:1]
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v16, v16, v18, s[0:1]
+; GFX942-SDAG-NEXT:    v_lshlrev_b64 v[18:19], v22, v[4:5]
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v17, v17, v7, s[2:3]
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v16, v16, v6, s[2:3]
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v19, 0, v19, s[0:1]
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v18, 0, v18, s[0:1]
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[22:23], 0
+; GFX942-SDAG-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942-SDAG-NEXT:    s_xor_b64 s[2:3], exec, s[0:1]
+; GFX942-SDAG-NEXT:    s_cbranch_execz .LBB1_11
+; GFX942-SDAG-NEXT:  ; %bb.8: ; %udiv-preheader
+; GFX942-SDAG-NEXT:    v_sub_u32_e32 v22, 64, v8
+; GFX942-SDAG-NEXT:    v_lshrrev_b64 v[20:21], v8, v[4:5]
+; GFX942-SDAG-NEXT:    v_lshlrev_b64 v[22:23], v22, v[6:7]
+; GFX942-SDAG-NEXT:    v_or_b32_e32 v22, v20, v22
+; GFX942-SDAG-NEXT:    v_subrev_u32_e32 v20, 64, v8
+; GFX942-SDAG-NEXT:    v_or_b32_e32 v23, v21, v23
+; GFX942-SDAG-NEXT:    v_lshrrev_b64 v[20:21], v20, v[6:7]
+; GFX942-SDAG-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v8
+; GFX942-SDAG-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v8
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[28:29], 0
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e32 v21, v21, v23, vcc
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v25, v21, v5, s[0:1]
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e32 v5, v20, v22, vcc
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v24, v5, v4, s[0:1]
+; GFX942-SDAG-NEXT:    v_lshrrev_b64 v[4:5], v8, v[6:7]
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e32 v27, 0, v5, vcc
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e32 v26, 0, v4, vcc
+; GFX942-SDAG-NEXT:    v_add_co_u32_e32 v4, vcc, -1, v12
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[22:23], 0
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_addc_co_u32_e32 v5, vcc, -1, v13, vcc
+; GFX942-SDAG-NEXT:    s_mov_b64 s[0:1], 0
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_addc_co_u32_e32 v6, vcc, -1, v14, vcc
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v21, 0
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_addc_co_u32_e32 v7, vcc, -1, v15, vcc
+; GFX942-SDAG-NEXT:  .LBB1_9: ; %udiv-do-while
+; GFX942-SDAG-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-SDAG-NEXT:    v_lshrrev_b32_e32 v20, 31, v25
+; GFX942-SDAG-NEXT:    v_lshlrev_b64 v[24:25], 1, v[24:25]
+; GFX942-SDAG-NEXT:    v_lshrrev_b32_e32 v32, 31, v17
+; GFX942-SDAG-NEXT:    v_lshlrev_b64 v[26:27], 1, v[26:27]
+; GFX942-SDAG-NEXT:    v_or_b32_e32 v24, v24, v32
+; GFX942-SDAG-NEXT:    v_or_b32_e32 v26, v26, v20
+; GFX942-SDAG-NEXT:    v_sub_co_u32_e32 v20, vcc, v4, v24
+; GFX942-SDAG-NEXT:    v_lshlrev_b64 v[30:31], 1, v[18:19]
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_subb_co_u32_e32 v20, vcc, v5, v25, vcc
+; GFX942-SDAG-NEXT:    v_lshrrev_b32_e32 v18, 31, v19
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_subb_co_u32_e32 v20, vcc, v6, v26, vcc
+; GFX942-SDAG-NEXT:    v_lshlrev_b64 v[16:17], 1, v[16:17]
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_subb_co_u32_e32 v20, vcc, v7, v27, vcc
+; GFX942-SDAG-NEXT:    v_or3_b32 v16, v16, v18, v22
+; GFX942-SDAG-NEXT:    v_or_b32_e32 v18, v28, v30
+; GFX942-SDAG-NEXT:    v_ashrrev_i32_e32 v28, 31, v20
+; GFX942-SDAG-NEXT:    v_or_b32_e32 v19, v29, v31
+; GFX942-SDAG-NEXT:    v_and_b32_e32 v20, 1, v28
+; GFX942-SDAG-NEXT:    v_and_b32_e32 v29, v28, v15
+; GFX942-SDAG-NEXT:    v_and_b32_e32 v30, v28, v14
+; GFX942-SDAG-NEXT:    v_and_b32_e32 v31, v28, v13
+; GFX942-SDAG-NEXT:    v_and_b32_e32 v28, v28, v12
+; GFX942-SDAG-NEXT:    v_sub_co_u32_e32 v24, vcc, v24, v28
+; GFX942-SDAG-NEXT:    v_or3_b32 v17, v17, 0, v23
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_subb_co_u32_e32 v25, vcc, v25, v31, vcc
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_subb_co_u32_e32 v26, vcc, v26, v30, vcc
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_subb_co_u32_e32 v27, vcc, v27, v29, vcc
+; GFX942-SDAG-NEXT:    v_add_co_u32_e32 v8, vcc, -1, v8
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_addc_co_u32_e32 v9, vcc, -1, v9, vcc
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_addc_co_u32_e32 v10, vcc, -1, v10, vcc
+; GFX942-SDAG-NEXT:    v_or_b32_e32 v28, v8, v10
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_addc_co_u32_e32 v11, vcc, -1, v11, vcc
+; GFX942-SDAG-NEXT:    v_or_b32_e32 v29, v9, v11
+; GFX942-SDAG-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[28:29]
+; GFX942-SDAG-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[28:29], v[20:21]
+; GFX942-SDAG-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; GFX942-SDAG-NEXT:    s_cbranch_execnz .LBB1_9
+; GFX942-SDAG-NEXT:  ; %bb.10: ; %Flow
+; GFX942-SDAG-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX942-SDAG-NEXT:  .LBB1_11: ; %Flow11
+; GFX942-SDAG-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX942-SDAG-NEXT:    v_lshlrev_b64 v[4:5], 1, v[18:19]
+; GFX942-SDAG-NEXT:    v_lshrrev_b32_e32 v8, 31, v19
+; GFX942-SDAG-NEXT:    v_lshlrev_b64 v[6:7], 1, v[16:17]
+; GFX942-SDAG-NEXT:    v_or3_b32 v9, v7, 0, v23
+; GFX942-SDAG-NEXT:    v_or3_b32 v8, v6, v8, v22
+; GFX942-SDAG-NEXT:    v_or_b32_e32 v11, v21, v5
+; GFX942-SDAG-NEXT:    v_or_b32_e32 v10, v20, v4
+; GFX942-SDAG-NEXT:  .LBB1_12: ; %Flow12
+; GFX942-SDAG-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v4, v10
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v5, v11
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v6, v8
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v7, v9
+; GFX942-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-GISEL-LABEL: v_udiv_v2i128_vv:
+; GFX942-GISEL:       ; %bb.0: ; %_udiv-special-cases_udiv-special-cases
+; GFX942-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v18, v0
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v19, v1
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v20, v2
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v21, v3
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v0, v8, v10
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v1, v9, v11
+; GFX942-GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[0:1]
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v0, v18, v20
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v1, v19, v21
+; GFX942-GISEL-NEXT:    v_cmp_eq_u64_e64 s[0:1], 0, v[0:1]
+; GFX942-GISEL-NEXT:    v_ffbh_u32_e32 v1, v8
+; GFX942-GISEL-NEXT:    v_ffbh_u32_e32 v0, v9
+; GFX942-GISEL-NEXT:    v_add_u32_e32 v1, 32, v1
+; GFX942-GISEL-NEXT:    v_ffbh_u32_e32 v2, v10
+; GFX942-GISEL-NEXT:    v_min_u32_e32 v0, v0, v1
+; GFX942-GISEL-NEXT:    v_ffbh_u32_e32 v1, v11
+; GFX942-GISEL-NEXT:    v_add_u32_e32 v2, 32, v2
+; GFX942-GISEL-NEXT:    v_add_u32_e32 v0, 64, v0
+; GFX942-GISEL-NEXT:    v_min_u32_e32 v1, v1, v2
+; GFX942-GISEL-NEXT:    v_cmp_eq_u64_e64 s[2:3], 0, v[10:11]
+; GFX942-GISEL-NEXT:    v_ffbh_u32_e32 v2, v18
+; GFX942-GISEL-NEXT:    v_add_u32_e32 v2, 32, v2
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v0, v1, v0, s[2:3]
+; GFX942-GISEL-NEXT:    v_ffbh_u32_e32 v1, v19
+; GFX942-GISEL-NEXT:    v_ffbh_u32_e32 v3, v20
+; GFX942-GISEL-NEXT:    v_min_u32_e32 v1, v1, v2
+; GFX942-GISEL-NEXT:    v_ffbh_u32_e32 v2, v21
+; GFX942-GISEL-NEXT:    v_add_u32_e32 v3, 32, v3
+; GFX942-GISEL-NEXT:    v_add_u32_e32 v1, 64, v1
+; GFX942-GISEL-NEXT:    v_min_u32_e32 v2, v2, v3
+; GFX942-GISEL-NEXT:    v_cmp_eq_u64_e64 s[2:3], 0, v[20:21]
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v16, v4
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v17, v5
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v1, v2, v1, s[2:3]
+; GFX942-GISEL-NEXT:    v_sub_co_u32_e64 v4, s[2:3], v0, v1
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[0:1], 0x7f
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_subb_co_u32_e64 v5, s[2:3], 0, 0, s[2:3]
+; GFX942-GISEL-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_subb_co_u32_e64 v22, s[2:3], 0, 0, s[2:3]
+; GFX942-GISEL-NEXT:    s_mov_b64 s[4:5], 0
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_subb_co_u32_e64 v23, s[2:3], 0, 0, s[2:3]
+; GFX942-GISEL-NEXT:    v_cmp_gt_u64_e64 s[2:3], v[4:5], v[0:1]
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v25, v5, v23
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[2:3]
+; GFX942-GISEL-NEXT:    v_cmp_lt_u64_e64 s[2:3], 0, v[22:23]
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[2:3]
+; GFX942-GISEL-NEXT:    v_cmp_eq_u64_e64 s[2:3], 0, v[22:23]
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v0, v1, v0, s[2:3]
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[0:1]
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v26, v1, v0
+; GFX942-GISEL-NEXT:    v_xor_b32_e32 v0, 0x7f, v4
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v24, v0, v22
+; GFX942-GISEL-NEXT:    v_and_b32_e32 v0, 1, v26
+; GFX942-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v0, v18, 0, vcc
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v1, v19, 0, vcc
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v2, v20, 0, vcc
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v3, v21, 0, vcc
+; GFX942-GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[24:25]
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v24, 0, 1, vcc
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v24, v26, v24
+; GFX942-GISEL-NEXT:    v_and_b32_e32 v24, 1, v24
+; GFX942-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v24
+; GFX942-GISEL-NEXT:    s_xor_b64 s[0:1], vcc, -1
+; GFX942-GISEL-NEXT:    s_and_saveexec_b64 s[2:3], s[0:1]
+; GFX942-GISEL-NEXT:    s_cbranch_execz .LBB1_6
+; GFX942-GISEL-NEXT:  ; %bb.1: ; %udiv-bb15
+; GFX942-GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, 1, v4
+; GFX942-GISEL-NEXT:    s_mov_b64 s[6:7], s[4:5]
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v5, vcc
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e32 v2, vcc, 0, v22, vcc
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v23, vcc
+; GFX942-GISEL-NEXT:    s_xor_b64 s[0:1], vcc, -1
+; GFX942-GISEL-NEXT:    v_sub_co_u32_e32 v26, vcc, 0x7f, v4
+; GFX942-GISEL-NEXT:    v_sub_u32_e32 v22, 64, v26
+; GFX942-GISEL-NEXT:    v_add_u32_e32 v27, 0xffffffc0, v26
+; GFX942-GISEL-NEXT:    v_lshrrev_b64 v[22:23], v22, v[18:19]
+; GFX942-GISEL-NEXT:    v_lshlrev_b64 v[24:25], v26, v[20:21]
+; GFX942-GISEL-NEXT:    v_lshlrev_b64 v[4:5], v26, v[18:19]
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v24, v22, v24
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v25, v23, v25
+; GFX942-GISEL-NEXT:    v_lshlrev_b64 v[22:23], v27, v[18:19]
+; GFX942-GISEL-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v26
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v4, vcc
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e32 v5, 0, v5, vcc
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e32 v22, v22, v24, vcc
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e32 v23, v23, v25, vcc
+; GFX942-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v26
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e32 v26, v22, v20, vcc
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e32 v27, v23, v21, vcc
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[24:25], s[6:7]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[22:23], s[4:5]
+; GFX942-GISEL-NEXT:    s_and_saveexec_b64 s[4:5], s[0:1]
+; GFX942-GISEL-NEXT:    s_xor_b64 s[8:9], exec, s[4:5]
+; GFX942-GISEL-NEXT:    s_cbranch_execz .LBB1_5
+; GFX942-GISEL-NEXT:  ; %bb.2: ; %udiv-preheader4
+; GFX942-GISEL-NEXT:    v_sub_u32_e32 v28, 64, v0
+; GFX942-GISEL-NEXT:    v_add_u32_e32 v30, 0xffffffc0, v0
+; GFX942-GISEL-NEXT:    v_lshrrev_b64 v[24:25], v0, v[18:19]
+; GFX942-GISEL-NEXT:    v_lshlrev_b64 v[28:29], v28, v[20:21]
+; GFX942-GISEL-NEXT:    v_lshrrev_b64 v[22:23], v0, v[20:21]
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v24, v24, v28
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v25, v25, v29
+; GFX942-GISEL-NEXT:    v_lshrrev_b64 v[20:21], v30, v[20:21]
+; GFX942-GISEL-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v0
+; GFX942-GISEL-NEXT:    s_mov_b64 s[4:5], 0
+; GFX942-GISEL-NEXT:    s_mov_b64 s[6:7], s[4:5]
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e32 v20, v20, v24, vcc
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e32 v21, v21, v25, vcc
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e32 v28, 0, v22, vcc
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e32 v29, 0, v23, vcc
+; GFX942-GISEL-NEXT:    v_add_co_u32_e32 v30, vcc, -1, v8
+; GFX942-GISEL-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v0
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e32 v31, vcc, -1, v9, vcc
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[24:25], s[6:7]
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e32 v32, vcc, -1, v10, vcc
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v20, v20, v18, s[0:1]
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v21, v21, v19, s[0:1]
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e32 v33, vcc, -1, v11, vcc
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v19, 0
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[22:23], s[4:5]
+; GFX942-GISEL-NEXT:  .LBB1_3: ; %udiv-do-while3
+; GFX942-GISEL-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-GISEL-NEXT:    v_lshlrev_b64 v[28:29], 1, v[28:29]
+; GFX942-GISEL-NEXT:    v_lshrrev_b32_e32 v18, 31, v21
+; GFX942-GISEL-NEXT:    v_lshlrev_b64 v[24:25], 1, v[20:21]
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v28, v28, v18
+; GFX942-GISEL-NEXT:    v_lshrrev_b32_e32 v18, 31, v27
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v24, v24, v18
+; GFX942-GISEL-NEXT:    v_sub_co_u32_e32 v18, vcc, v30, v24
+; GFX942-GISEL-NEXT:    v_lshlrev_b64 v[20:21], 1, v[4:5]
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_subb_co_u32_e32 v18, vcc, v31, v25, vcc
+; GFX942-GISEL-NEXT:    v_lshlrev_b64 v[26:27], 1, v[26:27]
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_subb_co_u32_e32 v18, vcc, v32, v28, vcc
+; GFX942-GISEL-NEXT:    v_lshrrev_b32_e32 v4, 31, v5
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_subb_co_u32_e32 v18, vcc, v33, v29, vcc
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v26, v26, v4
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v4, v22, v20
+; GFX942-GISEL-NEXT:    v_ashrrev_i32_e32 v20, 31, v18
+; GFX942-GISEL-NEXT:    v_add_co_u32_e64 v0, s[0:1], -1, v0
+; GFX942-GISEL-NEXT:    v_and_b32_e32 v18, 1, v20
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e64 v1, s[0:1], -1, v1, s[0:1]
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v5, v23, v21
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[22:23], v[18:19]
+; GFX942-GISEL-NEXT:    v_and_b32_e32 v18, v20, v8
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e64 v2, s[0:1], -1, v2, s[0:1]
+; GFX942-GISEL-NEXT:    v_and_b32_e32 v21, v20, v9
+; GFX942-GISEL-NEXT:    v_and_b32_e32 v34, v20, v10
+; GFX942-GISEL-NEXT:    v_and_b32_e32 v35, v20, v11
+; GFX942-GISEL-NEXT:    v_sub_co_u32_e32 v20, vcc, v24, v18
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e64 v3, s[0:1], -1, v3, s[0:1]
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_subb_co_u32_e32 v21, vcc, v25, v21, vcc
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v24, v0, v2
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v25, v1, v3
+; GFX942-GISEL-NEXT:    v_cmp_eq_u64_e64 s[0:1], 0, v[24:25]
+; GFX942-GISEL-NEXT:    v_subb_co_u32_e32 v28, vcc, v28, v34, vcc
+; GFX942-GISEL-NEXT:    s_or_b64 s[4:5], s[0:1], s[4:5]
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_subb_co_u32_e32 v29, vcc, v29, v35, vcc
+; GFX942-GISEL-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX942-GISEL-NEXT:    s_cbranch_execnz .LBB1_3
+; GFX942-GISEL-NEXT:  ; %bb.4: ; %Flow13
+; GFX942-GISEL-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX942-GISEL-NEXT:  .LBB1_5: ; %Flow14
+; GFX942-GISEL-NEXT:    s_or_b64 exec, exec, s[8:9]
+; GFX942-GISEL-NEXT:    v_lshlrev_b64 v[0:1], 1, v[4:5]
+; GFX942-GISEL-NEXT:    v_lshlrev_b64 v[2:3], 1, v[26:27]
+; GFX942-GISEL-NEXT:    v_lshrrev_b32_e32 v4, 31, v5
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v2, v2, v4
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v0, v22, v0
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v1, v23, v1
+; GFX942-GISEL-NEXT:  .LBB1_6: ; %Flow16
+; GFX942-GISEL-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v4, v12, v14
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v5, v13, v15
+; GFX942-GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[4:5]
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v4, v16, v6
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v5, v17, v7
+; GFX942-GISEL-NEXT:    v_cmp_eq_u64_e64 s[0:1], 0, v[4:5]
+; GFX942-GISEL-NEXT:    v_ffbh_u32_e32 v5, v12
+; GFX942-GISEL-NEXT:    v_ffbh_u32_e32 v4, v13
+; GFX942-GISEL-NEXT:    v_add_u32_e32 v5, 32, v5
+; GFX942-GISEL-NEXT:    v_ffbh_u32_e32 v8, v14
+; GFX942-GISEL-NEXT:    v_min_u32_e32 v4, v4, v5
+; GFX942-GISEL-NEXT:    v_ffbh_u32_e32 v5, v15
+; GFX942-GISEL-NEXT:    v_add_u32_e32 v8, 32, v8
+; GFX942-GISEL-NEXT:    v_add_u32_e32 v4, 64, v4
+; GFX942-GISEL-NEXT:    v_min_u32_e32 v5, v5, v8
+; GFX942-GISEL-NEXT:    v_cmp_eq_u64_e64 s[2:3], 0, v[14:15]
+; GFX942-GISEL-NEXT:    v_ffbh_u32_e32 v8, v16
+; GFX942-GISEL-NEXT:    v_add_u32_e32 v8, 32, v8
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v4, v5, v4, s[2:3]
+; GFX942-GISEL-NEXT:    v_ffbh_u32_e32 v5, v17
+; GFX942-GISEL-NEXT:    v_ffbh_u32_e32 v9, v6
+; GFX942-GISEL-NEXT:    v_min_u32_e32 v5, v5, v8
+; GFX942-GISEL-NEXT:    v_ffbh_u32_e32 v8, v7
+; GFX942-GISEL-NEXT:    v_add_u32_e32 v9, 32, v9
+; GFX942-GISEL-NEXT:    v_add_u32_e32 v5, 64, v5
+; GFX942-GISEL-NEXT:    v_min_u32_e32 v8, v8, v9
+; GFX942-GISEL-NEXT:    v_cmp_eq_u64_e64 s[2:3], 0, v[6:7]
+; GFX942-GISEL-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-GISEL-NEXT:    s_mov_b64 s[4:5], 0
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v5, v8, v5, s[2:3]
+; GFX942-GISEL-NEXT:    v_sub_co_u32_e64 v10, s[2:3], v4, v5
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[4:5], 0x7f
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_subb_co_u32_e64 v11, s[2:3], 0, 0, s[2:3]
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    v_subb_co_u32_e64 v18, s[2:3], 0, 0, s[2:3]
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    v_subb_co_u32_e64 v19, s[2:3], 0, 0, s[2:3]
+; GFX942-GISEL-NEXT:    v_cmp_gt_u64_e64 s[2:3], v[10:11], v[4:5]
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v21, v11, v19
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s[2:3]
+; GFX942-GISEL-NEXT:    v_cmp_lt_u64_e64 s[2:3], 0, v[18:19]
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s[2:3]
+; GFX942-GISEL-NEXT:    v_cmp_eq_u64_e64 s[2:3], 0, v[18:19]
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v4, v5, v4, s[2:3]
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v5, 0, 1, s[0:1]
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v22, v5, v4
+; GFX942-GISEL-NEXT:    v_xor_b32_e32 v4, 0x7f, v10
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v20, v4, v18
+; GFX942-GISEL-NEXT:    v_and_b32_e32 v4, 1, v22
+; GFX942-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v4, v16, 0, vcc
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v5, v17, 0, vcc
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v8, v6, 0, vcc
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v9, v7, 0, vcc
+; GFX942-GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[20:21]
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v20, 0, 1, vcc
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v20, v22, v20
+; GFX942-GISEL-NEXT:    v_and_b32_e32 v20, 1, v20
+; GFX942-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v20
+; GFX942-GISEL-NEXT:    s_xor_b64 s[0:1], vcc, -1
+; GFX942-GISEL-NEXT:    s_and_saveexec_b64 s[2:3], s[0:1]
+; GFX942-GISEL-NEXT:    s_cbranch_execz .LBB1_12
+; GFX942-GISEL-NEXT:  ; %bb.7: ; %udiv-bb1
+; GFX942-GISEL-NEXT:    v_add_co_u32_e32 v4, vcc, 1, v10
+; GFX942-GISEL-NEXT:    s_mov_b64 s[6:7], s[4:5]
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v11, vcc
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e32 v8, vcc, 0, v18, vcc
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e32 v9, vcc, 0, v19, vcc
+; GFX942-GISEL-NEXT:    s_xor_b64 s[0:1], vcc, -1
+; GFX942-GISEL-NEXT:    v_sub_co_u32_e32 v22, vcc, 0x7f, v10
+; GFX942-GISEL-NEXT:    v_sub_u32_e32 v18, 64, v22
+; GFX942-GISEL-NEXT:    v_add_u32_e32 v23, 0xffffffc0, v22
+; GFX942-GISEL-NEXT:    v_lshrrev_b64 v[18:19], v18, v[16:17]
+; GFX942-GISEL-NEXT:    v_lshlrev_b64 v[20:21], v22, v[6:7]
+; GFX942-GISEL-NEXT:    v_lshlrev_b64 v[10:11], v22, v[16:17]
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v20, v18, v20
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v21, v19, v21
+; GFX942-GISEL-NEXT:    v_lshlrev_b64 v[18:19], v23, v[16:17]
+; GFX942-GISEL-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v22
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e32 v10, 0, v10, vcc
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e32 v11, 0, v11, vcc
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e32 v18, v18, v20, vcc
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e32 v19, v19, v21, vcc
+; GFX942-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v22
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e32 v22, v18, v6, vcc
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e32 v23, v19, v7, vcc
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[20:21], s[6:7]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[18:19], s[4:5]
+; GFX942-GISEL-NEXT:    s_and_saveexec_b64 s[4:5], s[0:1]
+; GFX942-GISEL-NEXT:    s_xor_b64 s[8:9], exec, s[4:5]
+; GFX942-GISEL-NEXT:    s_cbranch_execz .LBB1_11
+; GFX942-GISEL-NEXT:  ; %bb.8: ; %udiv-preheader
+; GFX942-GISEL-NEXT:    v_sub_u32_e32 v24, 64, v4
+; GFX942-GISEL-NEXT:    v_add_u32_e32 v26, 0xffffffc0, v4
+; GFX942-GISEL-NEXT:    v_lshrrev_b64 v[20:21], v4, v[16:17]
+; GFX942-GISEL-NEXT:    v_lshlrev_b64 v[24:25], v24, v[6:7]
+; GFX942-GISEL-NEXT:    v_lshrrev_b64 v[18:19], v4, v[6:7]
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v20, v20, v24
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v21, v21, v25
+; GFX942-GISEL-NEXT:    v_lshrrev_b64 v[6:7], v26, v[6:7]
+; GFX942-GISEL-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v4
+; GFX942-GISEL-NEXT:    s_mov_b64 s[4:5], 0
+; GFX942-GISEL-NEXT:    s_mov_b64 s[6:7], s[4:5]
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e32 v6, v6, v20, vcc
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e32 v7, v7, v21, vcc
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e32 v24, 0, v18, vcc
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e32 v25, 0, v19, vcc
+; GFX942-GISEL-NEXT:    v_add_co_u32_e32 v26, vcc, -1, v12
+; GFX942-GISEL-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v4
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e32 v27, vcc, -1, v13, vcc
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[20:21], s[6:7]
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e32 v28, vcc, -1, v14, vcc
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v16, v6, v16, s[0:1]
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v17, v7, v17, s[0:1]
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e32 v29, vcc, -1, v15, vcc
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v7, 0
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[18:19], s[4:5]
+; GFX942-GISEL-NEXT:  .LBB1_9: ; %udiv-do-while
+; GFX942-GISEL-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-GISEL-NEXT:    v_lshlrev_b64 v[24:25], 1, v[24:25]
+; GFX942-GISEL-NEXT:    v_lshrrev_b32_e32 v6, 31, v17
+; GFX942-GISEL-NEXT:    v_lshlrev_b64 v[20:21], 1, v[16:17]
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v24, v24, v6
+; GFX942-GISEL-NEXT:    v_lshrrev_b32_e32 v6, 31, v23
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v20, v20, v6
+; GFX942-GISEL-NEXT:    v_lshlrev_b64 v[22:23], 1, v[22:23]
+; GFX942-GISEL-NEXT:    v_lshrrev_b32_e32 v6, 31, v11
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v22, v22, v6
+; GFX942-GISEL-NEXT:    v_sub_co_u32_e32 v6, vcc, v26, v20
+; GFX942-GISEL-NEXT:    v_lshlrev_b64 v[16:17], 1, v[10:11]
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_subb_co_u32_e32 v6, vcc, v27, v21, vcc
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v10, v18, v16
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_subb_co_u32_e32 v6, vcc, v28, v24, vcc
+; GFX942-GISEL-NEXT:    v_add_co_u32_e64 v4, s[0:1], -1, v4
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_subb_co_u32_e32 v6, vcc, v29, v25, vcc
+; GFX942-GISEL-NEXT:    v_ashrrev_i32_e32 v16, 31, v6
+; GFX942-GISEL-NEXT:    v_and_b32_e32 v6, 1, v16
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e64 v5, s[0:1], -1, v5, s[0:1]
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v11, v19, v17
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[18:19], v[6:7]
+; GFX942-GISEL-NEXT:    v_and_b32_e32 v6, v16, v12
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e64 v8, s[0:1], -1, v8, s[0:1]
+; GFX942-GISEL-NEXT:    v_and_b32_e32 v17, v16, v13
+; GFX942-GISEL-NEXT:    v_and_b32_e32 v30, v16, v14
+; GFX942-GISEL-NEXT:    v_and_b32_e32 v31, v16, v15
+; GFX942-GISEL-NEXT:    v_sub_co_u32_e32 v16, vcc, v20, v6
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e64 v9, s[0:1], -1, v9, s[0:1]
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_subb_co_u32_e32 v17, vcc, v21, v17, vcc
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v20, v4, v8
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v21, v5, v9
+; GFX942-GISEL-NEXT:    v_cmp_eq_u64_e64 s[0:1], 0, v[20:21]
+; GFX942-GISEL-NEXT:    v_subb_co_u32_e32 v24, vcc, v24, v30, vcc
+; GFX942-GISEL-NEXT:    s_or_b64 s[4:5], s[0:1], s[4:5]
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_subb_co_u32_e32 v25, vcc, v25, v31, vcc
+; GFX942-GISEL-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX942-GISEL-NEXT:    s_cbranch_execnz .LBB1_9
+; GFX942-GISEL-NEXT:  ; %bb.10: ; %Flow
+; GFX942-GISEL-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX942-GISEL-NEXT:  .LBB1_11: ; %Flow11
+; GFX942-GISEL-NEXT:    s_or_b64 exec, exec, s[8:9]
+; GFX942-GISEL-NEXT:    v_lshlrev_b64 v[4:5], 1, v[10:11]
+; GFX942-GISEL-NEXT:    v_lshlrev_b64 v[8:9], 1, v[22:23]
+; GFX942-GISEL-NEXT:    v_lshrrev_b32_e32 v6, 31, v11
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v8, v8, v6
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v4, v18, v4
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v5, v19, v5
+; GFX942-GISEL-NEXT:  .LBB1_12: ; %Flow12
+; GFX942-GISEL-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v6, v8
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v7, v9
+; GFX942-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %shl = udiv <2 x i128> %lhs, %rhs
   ret <2 x i128> %shl
 }
@@ -2456,6 +4148,1005 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GISEL-NEXT:    v_subb_u32_e64 v6, vcc, v6, v33, s[8:9]
 ; GISEL-NEXT:    v_subb_u32_e32 v7, vcc, v8, v33, vcc
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-SDAG-LABEL: v_srem_v2i128_vv:
+; GFX942-SDAG:       ; %bb.0: ; %_udiv-special-cases_udiv-special-cases
+; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-SDAG-NEXT:    v_sub_co_u32_e32 v18, vcc, 0, v0
+; GFX942-SDAG-NEXT:    v_ashrrev_i32_e32 v16, 31, v3
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_subb_co_u32_e32 v20, vcc, 0, v1, vcc
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v19, 0
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_subb_co_u32_e32 v21, vcc, 0, v2, vcc
+; GFX942-SDAG-NEXT:    s_mov_b64 s[2:3], 0x7f
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_subb_co_u32_e32 v22, vcc, 0, v3, vcc
+; GFX942-SDAG-NEXT:    v_cmp_gt_i64_e32 vcc, 0, v[2:3]
+; GFX942-SDAG-NEXT:    v_accvgpr_write_b32 a0, v40 ; Reload Reuse
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v17, v16
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e32 v1, v1, v20, vcc
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v18, vcc
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e32 v3, v3, v22, vcc
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e32 v2, v2, v21, vcc
+; GFX942-SDAG-NEXT:    v_sub_co_u32_e32 v18, vcc, 0, v8
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_subb_co_u32_e32 v20, vcc, 0, v9, vcc
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_subb_co_u32_e32 v21, vcc, 0, v10, vcc
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_subb_co_u32_e32 v22, vcc, 0, v11, vcc
+; GFX942-SDAG-NEXT:    v_cmp_gt_i64_e32 vcc, 0, v[10:11]
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e32 v9, v9, v20, vcc
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e32 v8, v8, v18, vcc
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e32 v11, v11, v22, vcc
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e32 v10, v10, v21, vcc
+; GFX942-SDAG-NEXT:    v_or_b32_e32 v21, v9, v11
+; GFX942-SDAG-NEXT:    v_or_b32_e32 v20, v8, v10
+; GFX942-SDAG-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[20:21]
+; GFX942-SDAG-NEXT:    v_or_b32_e32 v21, v1, v3
+; GFX942-SDAG-NEXT:    v_or_b32_e32 v20, v0, v2
+; GFX942-SDAG-NEXT:    v_ffbh_u32_e32 v18, v10
+; GFX942-SDAG-NEXT:    v_cmp_eq_u64_e64 s[0:1], 0, v[20:21]
+; GFX942-SDAG-NEXT:    v_add_u32_e32 v18, 32, v18
+; GFX942-SDAG-NEXT:    v_ffbh_u32_e32 v20, v11
+; GFX942-SDAG-NEXT:    v_min_u32_e32 v22, v18, v20
+; GFX942-SDAG-NEXT:    v_ffbh_u32_e32 v18, v8
+; GFX942-SDAG-NEXT:    v_add_u32_e32 v18, 32, v18
+; GFX942-SDAG-NEXT:    v_ffbh_u32_e32 v20, v9
+; GFX942-SDAG-NEXT:    v_min_u32_e32 v18, v18, v20
+; GFX942-SDAG-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-SDAG-NEXT:    v_lshl_add_u64 v[20:21], v[18:19], 0, 64
+; GFX942-SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[10:11]
+; GFX942-SDAG-NEXT:    v_ffbh_u32_e32 v18, v2
+; GFX942-SDAG-NEXT:    v_add_u32_e32 v18, 32, v18
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e32 v22, v20, v22, vcc
+; GFX942-SDAG-NEXT:    v_ffbh_u32_e32 v20, v3
+; GFX942-SDAG-NEXT:    v_min_u32_e32 v24, v18, v20
+; GFX942-SDAG-NEXT:    v_ffbh_u32_e32 v18, v0
+; GFX942-SDAG-NEXT:    v_add_u32_e32 v18, 32, v18
+; GFX942-SDAG-NEXT:    v_ffbh_u32_e32 v20, v1
+; GFX942-SDAG-NEXT:    v_min_u32_e32 v18, v18, v20
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v23, v21, 0, vcc
+; GFX942-SDAG-NEXT:    v_lshl_add_u64 v[20:21], v[18:19], 0, 64
+; GFX942-SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[2:3]
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e32 v20, v20, v24, vcc
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v18, v21, 0, vcc
+; GFX942-SDAG-NEXT:    v_sub_co_u32_e32 v24, vcc, v22, v20
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_subb_co_u32_e32 v25, vcc, v23, v18, vcc
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_subb_co_u32_e32 v26, vcc, 0, v19, vcc
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_subb_co_u32_e32 v27, vcc, 0, v19, vcc
+; GFX942-SDAG-NEXT:    v_cmp_lt_u64_e32 vcc, s[2:3], v[24:25]
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v18, 0, 1, vcc
+; GFX942-SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[26:27]
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v19, 0, 1, vcc
+; GFX942-SDAG-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[26:27]
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e32 v18, v19, v18, vcc
+; GFX942-SDAG-NEXT:    v_and_b32_e32 v18, 1, v18
+; GFX942-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v18
+; GFX942-SDAG-NEXT:    v_xor_b32_e32 v18, 0x7f, v24
+; GFX942-SDAG-NEXT:    s_or_b64 s[0:1], s[0:1], vcc
+; GFX942-SDAG-NEXT:    v_or_b32_e32 v19, v25, v27
+; GFX942-SDAG-NEXT:    v_or_b32_e32 v18, v18, v26
+; GFX942-SDAG-NEXT:    s_xor_b64 s[2:3], s[0:1], -1
+; GFX942-SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[18:19]
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v23, v3, 0, s[0:1]
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v22, v2, 0, s[0:1]
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v21, v1, 0, s[0:1]
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v20, v0, 0, s[0:1]
+; GFX942-SDAG-NEXT:    s_and_b64 s[0:1], s[2:3], vcc
+; GFX942-SDAG-NEXT:    s_and_saveexec_b64 s[4:5], s[0:1]
+; GFX942-SDAG-NEXT:    s_cbranch_execz .LBB2_6
+; GFX942-SDAG-NEXT:  ; %bb.1: ; %udiv-bb15
+; GFX942-SDAG-NEXT:    v_add_co_u32_e32 v18, vcc, 1, v24
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[28:29], 0
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_addc_co_u32_e32 v19, vcc, 0, v25, vcc
+; GFX942-SDAG-NEXT:    v_sub_u32_e32 v25, 0x7f, v24
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_addc_co_u32_e32 v20, vcc, 0, v26, vcc
+; GFX942-SDAG-NEXT:    v_or_b32_e32 v22, v18, v20
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_addc_co_u32_e32 v21, vcc, 0, v27, vcc
+; GFX942-SDAG-NEXT:    v_or_b32_e32 v23, v19, v21
+; GFX942-SDAG-NEXT:    v_sub_u32_e32 v26, 64, v25
+; GFX942-SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[22:23]
+; GFX942-SDAG-NEXT:    v_lshlrev_b64 v[22:23], v25, v[2:3]
+; GFX942-SDAG-NEXT:    v_lshrrev_b64 v[26:27], v26, v[0:1]
+; GFX942-SDAG-NEXT:    v_or_b32_e32 v26, v22, v26
+; GFX942-SDAG-NEXT:    v_sub_u32_e32 v22, 63, v24
+; GFX942-SDAG-NEXT:    v_or_b32_e32 v27, v23, v27
+; GFX942-SDAG-NEXT:    v_lshlrev_b64 v[22:23], v22, v[0:1]
+; GFX942-SDAG-NEXT:    v_cmp_gt_u32_e64 s[0:1], 64, v25
+; GFX942-SDAG-NEXT:    v_cmp_eq_u32_e64 s[2:3], 0, v25
+; GFX942-SDAG-NEXT:    v_lshlrev_b64 v[24:25], v25, v[0:1]
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v23, v23, v27, s[0:1]
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v22, v22, v26, s[0:1]
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v23, v23, v3, s[2:3]
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v22, v22, v2, s[2:3]
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v25, 0, v25, s[0:1]
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v24, 0, v24, s[0:1]
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[26:27], 0
+; GFX942-SDAG-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942-SDAG-NEXT:    s_xor_b64 s[2:3], exec, s[0:1]
+; GFX942-SDAG-NEXT:    s_cbranch_execz .LBB2_5
+; GFX942-SDAG-NEXT:  ; %bb.2: ; %udiv-preheader4
+; GFX942-SDAG-NEXT:    v_sub_u32_e32 v28, 64, v18
+; GFX942-SDAG-NEXT:    v_lshrrev_b64 v[26:27], v18, v[0:1]
+; GFX942-SDAG-NEXT:    v_lshlrev_b64 v[28:29], v28, v[2:3]
+; GFX942-SDAG-NEXT:    v_or_b32_e32 v28, v26, v28
+; GFX942-SDAG-NEXT:    v_subrev_u32_e32 v26, 64, v18
+; GFX942-SDAG-NEXT:    v_or_b32_e32 v29, v27, v29
+; GFX942-SDAG-NEXT:    v_lshrrev_b64 v[26:27], v26, v[2:3]
+; GFX942-SDAG-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v18
+; GFX942-SDAG-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v18
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[38:39], 0
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e32 v27, v27, v29, vcc
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e32 v26, v26, v28, vcc
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v35, v27, v1, s[0:1]
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v34, v26, v0, s[0:1]
+; GFX942-SDAG-NEXT:    v_lshrrev_b64 v[26:27], v18, v[2:3]
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e32 v37, 0, v27, vcc
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e32 v36, 0, v26, vcc
+; GFX942-SDAG-NEXT:    v_add_co_u32_e32 v30, vcc, -1, v8
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[28:29], 0
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_addc_co_u32_e32 v31, vcc, -1, v9, vcc
+; GFX942-SDAG-NEXT:    s_mov_b64 s[0:1], 0
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_addc_co_u32_e32 v32, vcc, -1, v10, vcc
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v27, 0
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_addc_co_u32_e32 v33, vcc, -1, v11, vcc
+; GFX942-SDAG-NEXT:  .LBB2_3: ; %udiv-do-while3
+; GFX942-SDAG-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-SDAG-NEXT:    v_lshrrev_b32_e32 v26, 31, v35
+; GFX942-SDAG-NEXT:    v_lshlrev_b64 v[34:35], 1, v[34:35]
+; GFX942-SDAG-NEXT:    v_lshrrev_b32_e32 v50, 31, v23
+; GFX942-SDAG-NEXT:    v_lshlrev_b64 v[36:37], 1, v[36:37]
+; GFX942-SDAG-NEXT:    v_or_b32_e32 v34, v34, v50
+; GFX942-SDAG-NEXT:    v_or_b32_e32 v36, v36, v26
+; GFX942-SDAG-NEXT:    v_sub_co_u32_e32 v26, vcc, v30, v34
+; GFX942-SDAG-NEXT:    v_lshlrev_b64 v[48:49], 1, v[24:25]
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_subb_co_u32_e32 v26, vcc, v31, v35, vcc
+; GFX942-SDAG-NEXT:    v_lshrrev_b32_e32 v24, 31, v25
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_subb_co_u32_e32 v26, vcc, v32, v36, vcc
+; GFX942-SDAG-NEXT:    v_lshlrev_b64 v[22:23], 1, v[22:23]
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_subb_co_u32_e32 v26, vcc, v33, v37, vcc
+; GFX942-SDAG-NEXT:    v_or3_b32 v22, v22, v24, v28
+; GFX942-SDAG-NEXT:    v_or_b32_e32 v24, v38, v48
+; GFX942-SDAG-NEXT:    v_ashrrev_i32_e32 v38, 31, v26
+; GFX942-SDAG-NEXT:    v_or_b32_e32 v25, v39, v49
+; GFX942-SDAG-NEXT:    v_and_b32_e32 v26, 1, v38
+; GFX942-SDAG-NEXT:    v_and_b32_e32 v39, v38, v11
+; GFX942-SDAG-NEXT:    v_and_b32_e32 v48, v38, v10
+; GFX942-SDAG-NEXT:    v_and_b32_e32 v49, v38, v9
+; GFX942-SDAG-NEXT:    v_and_b32_e32 v38, v38, v8
+; GFX942-SDAG-NEXT:    v_sub_co_u32_e32 v34, vcc, v34, v38
+; GFX942-SDAG-NEXT:    v_or3_b32 v23, v23, 0, v29
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_subb_co_u32_e32 v35, vcc, v35, v49, vcc
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_subb_co_u32_e32 v36, vcc, v36, v48, vcc
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_subb_co_u32_e32 v37, vcc, v37, v39, vcc
+; GFX942-SDAG-NEXT:    v_add_co_u32_e32 v18, vcc, -1, v18
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_addc_co_u32_e32 v19, vcc, -1, v19, vcc
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_addc_co_u32_e32 v20, vcc, -1, v20, vcc
+; GFX942-SDAG-NEXT:    v_or_b32_e32 v38, v18, v20
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_addc_co_u32_e32 v21, vcc, -1, v21, vcc
+; GFX942-SDAG-NEXT:    v_or_b32_e32 v39, v19, v21
+; GFX942-SDAG-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[38:39]
+; GFX942-SDAG-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[38:39], v[26:27]
+; GFX942-SDAG-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; GFX942-SDAG-NEXT:    s_cbranch_execnz .LBB2_3
+; GFX942-SDAG-NEXT:  ; %bb.4: ; %Flow13
+; GFX942-SDAG-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX942-SDAG-NEXT:  .LBB2_5: ; %Flow14
+; GFX942-SDAG-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX942-SDAG-NEXT:    v_lshlrev_b64 v[18:19], 1, v[24:25]
+; GFX942-SDAG-NEXT:    v_lshrrev_b32_e32 v24, 31, v25
+; GFX942-SDAG-NEXT:    v_lshlrev_b64 v[20:21], 1, v[22:23]
+; GFX942-SDAG-NEXT:    v_or3_b32 v23, v21, 0, v29
+; GFX942-SDAG-NEXT:    v_or3_b32 v22, v20, v24, v28
+; GFX942-SDAG-NEXT:    v_or_b32_e32 v21, v27, v19
+; GFX942-SDAG-NEXT:    v_or_b32_e32 v20, v26, v18
+; GFX942-SDAG-NEXT:  .LBB2_6: ; %Flow16
+; GFX942-SDAG-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX942-SDAG-NEXT:    v_sub_co_u32_e32 v24, vcc, 0, v4
+; GFX942-SDAG-NEXT:    v_ashrrev_i32_e32 v18, 31, v7
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_subb_co_u32_e32 v26, vcc, 0, v5, vcc
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v25, 0
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_subb_co_u32_e32 v27, vcc, 0, v6, vcc
+; GFX942-SDAG-NEXT:    s_mov_b64 s[2:3], 0x7f
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_subb_co_u32_e32 v28, vcc, 0, v7, vcc
+; GFX942-SDAG-NEXT:    v_cmp_gt_i64_e32 vcc, 0, v[6:7]
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v19, v18
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e32 v5, v5, v26, vcc
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e32 v4, v4, v24, vcc
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e32 v7, v7, v28, vcc
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e32 v6, v6, v27, vcc
+; GFX942-SDAG-NEXT:    v_sub_co_u32_e32 v24, vcc, 0, v12
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_subb_co_u32_e32 v26, vcc, 0, v13, vcc
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_subb_co_u32_e32 v27, vcc, 0, v14, vcc
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_subb_co_u32_e32 v28, vcc, 0, v15, vcc
+; GFX942-SDAG-NEXT:    v_cmp_gt_i64_e32 vcc, 0, v[14:15]
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e32 v13, v13, v26, vcc
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e32 v12, v12, v24, vcc
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e32 v15, v15, v28, vcc
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e32 v14, v14, v27, vcc
+; GFX942-SDAG-NEXT:    v_or_b32_e32 v27, v13, v15
+; GFX942-SDAG-NEXT:    v_or_b32_e32 v26, v12, v14
+; GFX942-SDAG-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[26:27]
+; GFX942-SDAG-NEXT:    v_or_b32_e32 v27, v5, v7
+; GFX942-SDAG-NEXT:    v_or_b32_e32 v26, v4, v6
+; GFX942-SDAG-NEXT:    v_ffbh_u32_e32 v24, v14
+; GFX942-SDAG-NEXT:    v_cmp_eq_u64_e64 s[0:1], 0, v[26:27]
+; GFX942-SDAG-NEXT:    v_add_u32_e32 v24, 32, v24
+; GFX942-SDAG-NEXT:    v_ffbh_u32_e32 v26, v15
+; GFX942-SDAG-NEXT:    v_min_u32_e32 v28, v24, v26
+; GFX942-SDAG-NEXT:    v_ffbh_u32_e32 v24, v12
+; GFX942-SDAG-NEXT:    v_add_u32_e32 v24, 32, v24
+; GFX942-SDAG-NEXT:    v_ffbh_u32_e32 v26, v13
+; GFX942-SDAG-NEXT:    v_min_u32_e32 v24, v24, v26
+; GFX942-SDAG-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-SDAG-NEXT:    v_lshl_add_u64 v[26:27], v[24:25], 0, 64
+; GFX942-SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[14:15]
+; GFX942-SDAG-NEXT:    v_ffbh_u32_e32 v24, v6
+; GFX942-SDAG-NEXT:    v_add_u32_e32 v24, 32, v24
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e32 v28, v26, v28, vcc
+; GFX942-SDAG-NEXT:    v_ffbh_u32_e32 v26, v7
+; GFX942-SDAG-NEXT:    v_min_u32_e32 v30, v24, v26
+; GFX942-SDAG-NEXT:    v_ffbh_u32_e32 v24, v4
+; GFX942-SDAG-NEXT:    v_add_u32_e32 v24, 32, v24
+; GFX942-SDAG-NEXT:    v_ffbh_u32_e32 v26, v5
+; GFX942-SDAG-NEXT:    v_min_u32_e32 v24, v24, v26
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v29, v27, 0, vcc
+; GFX942-SDAG-NEXT:    v_lshl_add_u64 v[26:27], v[24:25], 0, 64
+; GFX942-SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[6:7]
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e32 v26, v26, v30, vcc
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v24, v27, 0, vcc
+; GFX942-SDAG-NEXT:    v_sub_co_u32_e32 v28, vcc, v28, v26
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_subb_co_u32_e32 v29, vcc, v29, v24, vcc
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_subb_co_u32_e32 v30, vcc, 0, v25, vcc
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_subb_co_u32_e32 v31, vcc, 0, v25, vcc
+; GFX942-SDAG-NEXT:    v_cmp_lt_u64_e32 vcc, s[2:3], v[28:29]
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v24, 0, 1, vcc
+; GFX942-SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[30:31]
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v25, 0, 1, vcc
+; GFX942-SDAG-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[30:31]
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e32 v24, v25, v24, vcc
+; GFX942-SDAG-NEXT:    v_and_b32_e32 v24, 1, v24
+; GFX942-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v24
+; GFX942-SDAG-NEXT:    v_xor_b32_e32 v24, 0x7f, v28
+; GFX942-SDAG-NEXT:    s_or_b64 s[0:1], s[0:1], vcc
+; GFX942-SDAG-NEXT:    v_or_b32_e32 v25, v29, v31
+; GFX942-SDAG-NEXT:    v_or_b32_e32 v24, v24, v30
+; GFX942-SDAG-NEXT:    s_xor_b64 s[2:3], s[0:1], -1
+; GFX942-SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[24:25]
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v27, v7, 0, s[0:1]
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v26, v6, 0, s[0:1]
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v25, v5, 0, s[0:1]
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v24, v4, 0, s[0:1]
+; GFX942-SDAG-NEXT:    s_and_b64 s[0:1], s[2:3], vcc
+; GFX942-SDAG-NEXT:    s_and_saveexec_b64 s[4:5], s[0:1]
+; GFX942-SDAG-NEXT:    s_cbranch_execz .LBB2_12
+; GFX942-SDAG-NEXT:  ; %bb.7: ; %udiv-bb1
+; GFX942-SDAG-NEXT:    v_add_co_u32_e32 v24, vcc, 1, v28
+; GFX942-SDAG-NEXT:    v_sub_u32_e32 v34, 0x7f, v28
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_addc_co_u32_e32 v25, vcc, 0, v29, vcc
+; GFX942-SDAG-NEXT:    v_sub_u32_e32 v29, 64, v34
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_addc_co_u32_e32 v26, vcc, 0, v30, vcc
+; GFX942-SDAG-NEXT:    v_or_b32_e32 v30, v24, v26
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_addc_co_u32_e32 v27, vcc, 0, v31, vcc
+; GFX942-SDAG-NEXT:    v_or_b32_e32 v31, v25, v27
+; GFX942-SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[30:31]
+; GFX942-SDAG-NEXT:    v_lshlrev_b64 v[30:31], v34, v[6:7]
+; GFX942-SDAG-NEXT:    v_lshrrev_b64 v[32:33], v29, v[4:5]
+; GFX942-SDAG-NEXT:    v_sub_u32_e32 v28, 63, v28
+; GFX942-SDAG-NEXT:    v_or_b32_e32 v31, v31, v33
+; GFX942-SDAG-NEXT:    v_or_b32_e32 v30, v30, v32
+; GFX942-SDAG-NEXT:    v_lshlrev_b64 v[28:29], v28, v[4:5]
+; GFX942-SDAG-NEXT:    v_cmp_gt_u32_e64 s[0:1], 64, v34
+; GFX942-SDAG-NEXT:    v_cmp_eq_u32_e64 s[2:3], 0, v34
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[32:33], 0
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v29, v29, v31, s[0:1]
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v28, v28, v30, s[0:1]
+; GFX942-SDAG-NEXT:    v_lshlrev_b64 v[30:31], v34, v[4:5]
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v29, v29, v7, s[2:3]
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v28, v28, v6, s[2:3]
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v31, 0, v31, s[0:1]
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v30, 0, v30, s[0:1]
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[34:35], 0
+; GFX942-SDAG-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942-SDAG-NEXT:    s_xor_b64 s[2:3], exec, s[0:1]
+; GFX942-SDAG-NEXT:    s_cbranch_execz .LBB2_11
+; GFX942-SDAG-NEXT:  ; %bb.8: ; %udiv-preheader
+; GFX942-SDAG-NEXT:    v_sub_u32_e32 v34, 64, v24
+; GFX942-SDAG-NEXT:    v_lshrrev_b64 v[32:33], v24, v[4:5]
+; GFX942-SDAG-NEXT:    v_lshlrev_b64 v[34:35], v34, v[6:7]
+; GFX942-SDAG-NEXT:    v_or_b32_e32 v34, v32, v34
+; GFX942-SDAG-NEXT:    v_subrev_u32_e32 v32, 64, v24
+; GFX942-SDAG-NEXT:    v_or_b32_e32 v35, v33, v35
+; GFX942-SDAG-NEXT:    v_lshrrev_b64 v[32:33], v32, v[6:7]
+; GFX942-SDAG-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v24
+; GFX942-SDAG-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v24
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[52:53], 0
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e32 v33, v33, v35, vcc
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e32 v32, v32, v34, vcc
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v49, v33, v5, s[0:1]
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v48, v32, v4, s[0:1]
+; GFX942-SDAG-NEXT:    v_lshrrev_b64 v[32:33], v24, v[6:7]
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e32 v51, 0, v33, vcc
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e32 v50, 0, v32, vcc
+; GFX942-SDAG-NEXT:    v_add_co_u32_e32 v36, vcc, -1, v12
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[34:35], 0
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_addc_co_u32_e32 v37, vcc, -1, v13, vcc
+; GFX942-SDAG-NEXT:    s_mov_b64 s[0:1], 0
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_addc_co_u32_e32 v38, vcc, -1, v14, vcc
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v33, 0
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_addc_co_u32_e32 v39, vcc, -1, v15, vcc
+; GFX942-SDAG-NEXT:  .LBB2_9: ; %udiv-do-while
+; GFX942-SDAG-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-SDAG-NEXT:    v_lshrrev_b32_e32 v32, 31, v49
+; GFX942-SDAG-NEXT:    v_lshlrev_b64 v[48:49], 1, v[48:49]
+; GFX942-SDAG-NEXT:    v_lshrrev_b32_e32 v40, 31, v29
+; GFX942-SDAG-NEXT:    v_lshlrev_b64 v[50:51], 1, v[50:51]
+; GFX942-SDAG-NEXT:    v_or_b32_e32 v48, v48, v40
+; GFX942-SDAG-NEXT:    v_or_b32_e32 v50, v50, v32
+; GFX942-SDAG-NEXT:    v_sub_co_u32_e32 v32, vcc, v36, v48
+; GFX942-SDAG-NEXT:    v_lshlrev_b64 v[54:55], 1, v[30:31]
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_subb_co_u32_e32 v32, vcc, v37, v49, vcc
+; GFX942-SDAG-NEXT:    v_lshrrev_b32_e32 v30, 31, v31
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_subb_co_u32_e32 v32, vcc, v38, v50, vcc
+; GFX942-SDAG-NEXT:    v_lshlrev_b64 v[28:29], 1, v[28:29]
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_subb_co_u32_e32 v32, vcc, v39, v51, vcc
+; GFX942-SDAG-NEXT:    v_or3_b32 v28, v28, v30, v34
+; GFX942-SDAG-NEXT:    v_or_b32_e32 v30, v52, v54
+; GFX942-SDAG-NEXT:    v_ashrrev_i32_e32 v52, 31, v32
+; GFX942-SDAG-NEXT:    v_or_b32_e32 v31, v53, v55
+; GFX942-SDAG-NEXT:    v_and_b32_e32 v32, 1, v52
+; GFX942-SDAG-NEXT:    v_and_b32_e32 v53, v52, v15
+; GFX942-SDAG-NEXT:    v_and_b32_e32 v54, v52, v14
+; GFX942-SDAG-NEXT:    v_and_b32_e32 v55, v52, v13
+; GFX942-SDAG-NEXT:    v_and_b32_e32 v52, v52, v12
+; GFX942-SDAG-NEXT:    v_sub_co_u32_e32 v48, vcc, v48, v52
+; GFX942-SDAG-NEXT:    v_or3_b32 v29, v29, 0, v35
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_subb_co_u32_e32 v49, vcc, v49, v55, vcc
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_subb_co_u32_e32 v50, vcc, v50, v54, vcc
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_subb_co_u32_e32 v51, vcc, v51, v53, vcc
+; GFX942-SDAG-NEXT:    v_add_co_u32_e32 v24, vcc, -1, v24
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_addc_co_u32_e32 v25, vcc, -1, v25, vcc
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_addc_co_u32_e32 v26, vcc, -1, v26, vcc
+; GFX942-SDAG-NEXT:    v_or_b32_e32 v52, v24, v26
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_addc_co_u32_e32 v27, vcc, -1, v27, vcc
+; GFX942-SDAG-NEXT:    v_or_b32_e32 v53, v25, v27
+; GFX942-SDAG-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[52:53]
+; GFX942-SDAG-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[52:53], v[32:33]
+; GFX942-SDAG-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; GFX942-SDAG-NEXT:    s_cbranch_execnz .LBB2_9
+; GFX942-SDAG-NEXT:  ; %bb.10: ; %Flow
+; GFX942-SDAG-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX942-SDAG-NEXT:  .LBB2_11: ; %Flow11
+; GFX942-SDAG-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX942-SDAG-NEXT:    v_lshlrev_b64 v[24:25], 1, v[30:31]
+; GFX942-SDAG-NEXT:    v_lshrrev_b32_e32 v30, 31, v31
+; GFX942-SDAG-NEXT:    v_lshlrev_b64 v[26:27], 1, v[28:29]
+; GFX942-SDAG-NEXT:    v_or3_b32 v27, v27, 0, v35
+; GFX942-SDAG-NEXT:    v_or3_b32 v26, v26, v30, v34
+; GFX942-SDAG-NEXT:    v_or_b32_e32 v25, v33, v25
+; GFX942-SDAG-NEXT:    v_or_b32_e32 v24, v32, v24
+; GFX942-SDAG-NEXT:  .LBB2_12: ; %Flow12
+; GFX942-SDAG-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX942-SDAG-NEXT:    v_mul_lo_u32 v28, v21, v10
+; GFX942-SDAG-NEXT:    v_mul_lo_u32 v29, v20, v11
+; GFX942-SDAG-NEXT:    v_mad_u64_u32 v[10:11], s[0:1], v20, v10, 0
+; GFX942-SDAG-NEXT:    v_add3_u32 v11, v11, v29, v28
+; GFX942-SDAG-NEXT:    v_mad_u64_u32 v[10:11], s[0:1], v22, v8, v[10:11]
+; GFX942-SDAG-NEXT:    v_mul_lo_u32 v22, v22, v9
+; GFX942-SDAG-NEXT:    v_mul_lo_u32 v23, v23, v8
+; GFX942-SDAG-NEXT:    v_add3_u32 v11, v23, v11, v22
+; GFX942-SDAG-NEXT:    v_mad_u64_u32 v[22:23], s[0:1], v8, v20, 0
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v29, 0
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v28, v23
+; GFX942-SDAG-NEXT:    v_mad_u64_u32 v[30:31], s[0:1], v9, v20, v[28:29]
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v28, v31
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v31, v29
+; GFX942-SDAG-NEXT:    v_mad_u64_u32 v[30:31], s[0:1], v8, v21, v[30:31]
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v32, v31
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v33, v29
+; GFX942-SDAG-NEXT:    v_lshl_add_u64 v[32:33], v[28:29], 0, v[32:33]
+; GFX942-SDAG-NEXT:    v_mad_u64_u32 v[8:9], s[0:1], v9, v21, v[32:33]
+; GFX942-SDAG-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v22
+; GFX942-SDAG-NEXT:    v_lshl_add_u64 v[8:9], v[8:9], 0, v[10:11]
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v30, vcc
+; GFX942-SDAG-NEXT:    v_mul_lo_u32 v10, v25, v14
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_subb_co_u32_e32 v2, vcc, v2, v8, vcc
+; GFX942-SDAG-NEXT:    v_mul_lo_u32 v11, v24, v15
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_subb_co_u32_e32 v3, vcc, v3, v9, vcc
+; GFX942-SDAG-NEXT:    v_mad_u64_u32 v[8:9], s[0:1], v24, v14, 0
+; GFX942-SDAG-NEXT:    v_add3_u32 v9, v9, v11, v10
+; GFX942-SDAG-NEXT:    v_mad_u64_u32 v[8:9], s[0:1], v26, v12, v[8:9]
+; GFX942-SDAG-NEXT:    v_mul_lo_u32 v10, v26, v13
+; GFX942-SDAG-NEXT:    v_mul_lo_u32 v11, v27, v12
+; GFX942-SDAG-NEXT:    v_add3_u32 v9, v11, v9, v10
+; GFX942-SDAG-NEXT:    v_mad_u64_u32 v[10:11], s[0:1], v12, v24, 0
+; GFX942-SDAG-NEXT:    v_xor_b32_e32 v0, v0, v16
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v28, v11
+; GFX942-SDAG-NEXT:    v_xor_b32_e32 v1, v1, v17
+; GFX942-SDAG-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v16
+; GFX942-SDAG-NEXT:    v_mad_u64_u32 v[14:15], s[0:1], v13, v24, v[28:29]
+; GFX942-SDAG-NEXT:    v_xor_b32_e32 v2, v2, v16
+; GFX942-SDAG-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v17, vcc
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v28, v15
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v15, v29
+; GFX942-SDAG-NEXT:    v_xor_b32_e32 v3, v3, v17
+; GFX942-SDAG-NEXT:    v_subb_co_u32_e32 v2, vcc, v2, v16, vcc
+; GFX942-SDAG-NEXT:    v_mad_u64_u32 v[14:15], s[0:1], v12, v25, v[14:15]
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_subb_co_u32_e32 v3, vcc, v3, v17, vcc
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v16, v15
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v17, v29
+; GFX942-SDAG-NEXT:    v_lshl_add_u64 v[16:17], v[28:29], 0, v[16:17]
+; GFX942-SDAG-NEXT:    v_mad_u64_u32 v[12:13], s[0:1], v13, v25, v[16:17]
+; GFX942-SDAG-NEXT:    v_sub_co_u32_e32 v4, vcc, v4, v10
+; GFX942-SDAG-NEXT:    v_lshl_add_u64 v[8:9], v[12:13], 0, v[8:9]
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_subb_co_u32_e32 v5, vcc, v5, v14, vcc
+; GFX942-SDAG-NEXT:    v_xor_b32_e32 v4, v4, v18
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_subb_co_u32_e32 v6, vcc, v6, v8, vcc
+; GFX942-SDAG-NEXT:    v_xor_b32_e32 v5, v5, v19
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_subb_co_u32_e32 v7, vcc, v7, v9, vcc
+; GFX942-SDAG-NEXT:    v_sub_co_u32_e32 v4, vcc, v4, v18
+; GFX942-SDAG-NEXT:    v_xor_b32_e32 v6, v6, v18
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_subb_co_u32_e32 v5, vcc, v5, v19, vcc
+; GFX942-SDAG-NEXT:    v_xor_b32_e32 v7, v7, v19
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_subb_co_u32_e32 v6, vcc, v6, v18, vcc
+; GFX942-SDAG-NEXT:    v_accvgpr_read_b32 v40, a0 ; Reload Reuse
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_subb_co_u32_e32 v7, vcc, v7, v19, vcc
+; GFX942-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-GISEL-LABEL: v_srem_v2i128_vv:
+; GFX942-GISEL:       ; %bb.0: ; %_udiv-special-cases_udiv-special-cases
+; GFX942-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-GISEL-NEXT:    v_ashrrev_i32_e32 v34, 31, v3
+; GFX942-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v34
+; GFX942-GISEL-NEXT:    v_xor_b32_e32 v1, v1, v34
+; GFX942-GISEL-NEXT:    v_sub_co_u32_e32 v18, vcc, v0, v34
+; GFX942-GISEL-NEXT:    v_xor_b32_e32 v2, v2, v34
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_subb_co_u32_e32 v19, vcc, v1, v34, vcc
+; GFX942-GISEL-NEXT:    v_ashrrev_i32_e32 v21, 31, v11
+; GFX942-GISEL-NEXT:    v_xor_b32_e32 v3, v3, v34
+; GFX942-GISEL-NEXT:    v_subb_co_u32_e32 v16, vcc, v2, v34, vcc
+; GFX942-GISEL-NEXT:    v_xor_b32_e32 v8, v8, v21
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_subb_co_u32_e32 v17, vcc, v3, v34, vcc
+; GFX942-GISEL-NEXT:    v_xor_b32_e32 v9, v9, v21
+; GFX942-GISEL-NEXT:    v_sub_co_u32_e32 v36, vcc, v8, v21
+; GFX942-GISEL-NEXT:    v_xor_b32_e32 v10, v10, v21
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_subb_co_u32_e32 v35, vcc, v9, v21, vcc
+; GFX942-GISEL-NEXT:    v_xor_b32_e32 v11, v11, v21
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_subb_co_u32_e32 v20, vcc, v10, v21, vcc
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v0, v36, v20
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_subb_co_u32_e32 v21, vcc, v11, v21, vcc
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v1, v35, v21
+; GFX942-GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[0:1]
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v0, v18, v16
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v1, v19, v17
+; GFX942-GISEL-NEXT:    v_cmp_eq_u64_e64 s[0:1], 0, v[0:1]
+; GFX942-GISEL-NEXT:    v_ffbh_u32_e32 v1, v36
+; GFX942-GISEL-NEXT:    v_ffbh_u32_e32 v0, v35
+; GFX942-GISEL-NEXT:    v_add_u32_e32 v1, 32, v1
+; GFX942-GISEL-NEXT:    v_ffbh_u32_e32 v2, v20
+; GFX942-GISEL-NEXT:    v_min_u32_e32 v0, v0, v1
+; GFX942-GISEL-NEXT:    v_ffbh_u32_e32 v1, v21
+; GFX942-GISEL-NEXT:    v_add_u32_e32 v2, 32, v2
+; GFX942-GISEL-NEXT:    v_add_u32_e32 v0, 64, v0
+; GFX942-GISEL-NEXT:    v_min_u32_e32 v1, v1, v2
+; GFX942-GISEL-NEXT:    v_cmp_eq_u64_e64 s[2:3], 0, v[20:21]
+; GFX942-GISEL-NEXT:    v_ffbh_u32_e32 v2, v18
+; GFX942-GISEL-NEXT:    v_add_u32_e32 v2, 32, v2
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v0, v1, v0, s[2:3]
+; GFX942-GISEL-NEXT:    v_ffbh_u32_e32 v1, v19
+; GFX942-GISEL-NEXT:    v_ffbh_u32_e32 v3, v16
+; GFX942-GISEL-NEXT:    v_min_u32_e32 v1, v1, v2
+; GFX942-GISEL-NEXT:    v_ffbh_u32_e32 v2, v17
+; GFX942-GISEL-NEXT:    v_add_u32_e32 v3, 32, v3
+; GFX942-GISEL-NEXT:    v_add_u32_e32 v1, 64, v1
+; GFX942-GISEL-NEXT:    v_min_u32_e32 v2, v2, v3
+; GFX942-GISEL-NEXT:    v_cmp_eq_u64_e64 s[2:3], 0, v[16:17]
+; GFX942-GISEL-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-GISEL-NEXT:    s_mov_b64 s[4:5], 0
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v1, v2, v1, s[2:3]
+; GFX942-GISEL-NEXT:    v_sub_co_u32_e64 v8, s[2:3], v0, v1
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[0:1], 0x7f
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_subb_co_u32_e64 v9, s[2:3], 0, 0, s[2:3]
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    v_subb_co_u32_e64 v2, s[2:3], 0, 0, s[2:3]
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    v_subb_co_u32_e64 v3, s[2:3], 0, 0, s[2:3]
+; GFX942-GISEL-NEXT:    v_cmp_gt_u64_e64 s[2:3], v[8:9], v[0:1]
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v23, v9, v3
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[2:3]
+; GFX942-GISEL-NEXT:    v_cmp_lt_u64_e64 s[2:3], 0, v[2:3]
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[2:3]
+; GFX942-GISEL-NEXT:    v_cmp_eq_u64_e64 s[2:3], 0, v[2:3]
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v0, v1, v0, s[2:3]
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[0:1]
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v24, v1, v0
+; GFX942-GISEL-NEXT:    v_xor_b32_e32 v0, 0x7f, v8
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v22, v0, v2
+; GFX942-GISEL-NEXT:    v_and_b32_e32 v0, 1, v24
+; GFX942-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v0, v18, 0, vcc
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v1, v19, 0, vcc
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v10, v16, 0, vcc
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v11, v17, 0, vcc
+; GFX942-GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[22:23]
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v22, 0, 1, vcc
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v22, v24, v22
+; GFX942-GISEL-NEXT:    v_and_b32_e32 v22, 1, v22
+; GFX942-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v22
+; GFX942-GISEL-NEXT:    s_xor_b64 s[0:1], vcc, -1
+; GFX942-GISEL-NEXT:    s_and_saveexec_b64 s[2:3], s[0:1]
+; GFX942-GISEL-NEXT:    s_cbranch_execz .LBB2_6
+; GFX942-GISEL-NEXT:  ; %bb.1: ; %udiv-bb15
+; GFX942-GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, 1, v8
+; GFX942-GISEL-NEXT:    s_mov_b64 s[6:7], s[4:5]
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v9, vcc
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e32 v2, vcc, 0, v2, vcc
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX942-GISEL-NEXT:    s_xor_b64 s[0:1], vcc, -1
+; GFX942-GISEL-NEXT:    v_sub_co_u32_e32 v24, vcc, 0x7f, v8
+; GFX942-GISEL-NEXT:    v_sub_u32_e32 v10, 64, v24
+; GFX942-GISEL-NEXT:    v_add_u32_e32 v25, 0xffffffc0, v24
+; GFX942-GISEL-NEXT:    v_lshrrev_b64 v[10:11], v10, v[18:19]
+; GFX942-GISEL-NEXT:    v_lshlrev_b64 v[22:23], v24, v[16:17]
+; GFX942-GISEL-NEXT:    v_lshlrev_b64 v[8:9], v24, v[18:19]
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v26, v10, v22
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v27, v11, v23
+; GFX942-GISEL-NEXT:    v_lshlrev_b64 v[10:11], v25, v[18:19]
+; GFX942-GISEL-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v24
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e32 v22, 0, v8, vcc
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e32 v23, 0, v9, vcc
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e32 v8, v10, v26, vcc
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e32 v9, v11, v27, vcc
+; GFX942-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v24
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e32 v24, v8, v16, vcc
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e32 v25, v9, v17, vcc
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[6:7]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[4:5]
+; GFX942-GISEL-NEXT:    s_and_saveexec_b64 s[4:5], s[0:1]
+; GFX942-GISEL-NEXT:    s_xor_b64 s[8:9], exec, s[4:5]
+; GFX942-GISEL-NEXT:    s_cbranch_execz .LBB2_5
+; GFX942-GISEL-NEXT:  ; %bb.2: ; %udiv-preheader4
+; GFX942-GISEL-NEXT:    v_sub_u32_e32 v26, 64, v0
+; GFX942-GISEL-NEXT:    v_add_u32_e32 v28, 0xffffffc0, v0
+; GFX942-GISEL-NEXT:    v_lshrrev_b64 v[10:11], v0, v[18:19]
+; GFX942-GISEL-NEXT:    v_lshlrev_b64 v[26:27], v26, v[16:17]
+; GFX942-GISEL-NEXT:    v_lshrrev_b64 v[8:9], v0, v[16:17]
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v26, v10, v26
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v27, v11, v27
+; GFX942-GISEL-NEXT:    v_lshrrev_b64 v[10:11], v28, v[16:17]
+; GFX942-GISEL-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v0
+; GFX942-GISEL-NEXT:    s_mov_b64 s[4:5], 0
+; GFX942-GISEL-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v0
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e32 v10, v10, v26, vcc
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e32 v11, v11, v27, vcc
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e32 v30, 0, v8, vcc
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e32 v31, 0, v9, vcc
+; GFX942-GISEL-NEXT:    v_add_co_u32_e32 v32, vcc, -1, v36
+; GFX942-GISEL-NEXT:    s_mov_b64 s[6:7], s[4:5]
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e32 v33, vcc, -1, v35, vcc
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v28, v10, v18, s[0:1]
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v29, v11, v19, s[0:1]
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e32 v37, vcc, -1, v20, vcc
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[10:11], s[6:7]
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e32 v38, vcc, -1, v21, vcc
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v27, 0
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[4:5]
+; GFX942-GISEL-NEXT:  .LBB2_3: ; %udiv-do-while3
+; GFX942-GISEL-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-GISEL-NEXT:    v_lshlrev_b64 v[30:31], 1, v[30:31]
+; GFX942-GISEL-NEXT:    v_lshrrev_b32_e32 v26, 31, v29
+; GFX942-GISEL-NEXT:    v_lshlrev_b64 v[10:11], 1, v[28:29]
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v30, v30, v26
+; GFX942-GISEL-NEXT:    v_lshrrev_b32_e32 v26, 31, v25
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v10, v10, v26
+; GFX942-GISEL-NEXT:    v_lshlrev_b64 v[28:29], 1, v[22:23]
+; GFX942-GISEL-NEXT:    v_lshlrev_b64 v[24:25], 1, v[24:25]
+; GFX942-GISEL-NEXT:    v_lshrrev_b32_e32 v22, 31, v23
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v24, v24, v22
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v22, v8, v28
+; GFX942-GISEL-NEXT:    v_sub_co_u32_e32 v8, vcc, v32, v10
+; GFX942-GISEL-NEXT:    v_add_co_u32_e64 v0, s[0:1], -1, v0
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_subb_co_u32_e32 v8, vcc, v33, v11, vcc
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e64 v1, s[0:1], -1, v1, s[0:1]
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_subb_co_u32_e32 v8, vcc, v37, v30, vcc
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v23, v9, v29
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_subb_co_u32_e32 v8, vcc, v38, v31, vcc
+; GFX942-GISEL-NEXT:    v_ashrrev_i32_e32 v28, 31, v8
+; GFX942-GISEL-NEXT:    v_and_b32_e32 v26, 1, v28
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[8:9], v[26:27]
+; GFX942-GISEL-NEXT:    v_and_b32_e32 v26, v28, v36
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e64 v2, s[0:1], -1, v2, s[0:1]
+; GFX942-GISEL-NEXT:    v_and_b32_e32 v29, v28, v35
+; GFX942-GISEL-NEXT:    v_and_b32_e32 v39, v28, v20
+; GFX942-GISEL-NEXT:    v_and_b32_e32 v48, v28, v21
+; GFX942-GISEL-NEXT:    v_sub_co_u32_e32 v28, vcc, v10, v26
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e64 v3, s[0:1], -1, v3, s[0:1]
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_subb_co_u32_e32 v29, vcc, v11, v29, vcc
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v10, v0, v2
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v11, v1, v3
+; GFX942-GISEL-NEXT:    v_cmp_eq_u64_e64 s[0:1], 0, v[10:11]
+; GFX942-GISEL-NEXT:    v_subb_co_u32_e32 v30, vcc, v30, v39, vcc
+; GFX942-GISEL-NEXT:    s_or_b64 s[4:5], s[0:1], s[4:5]
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_subb_co_u32_e32 v31, vcc, v31, v48, vcc
+; GFX942-GISEL-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX942-GISEL-NEXT:    s_cbranch_execnz .LBB2_3
+; GFX942-GISEL-NEXT:  ; %bb.4: ; %Flow13
+; GFX942-GISEL-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX942-GISEL-NEXT:  .LBB2_5: ; %Flow14
+; GFX942-GISEL-NEXT:    s_or_b64 exec, exec, s[8:9]
+; GFX942-GISEL-NEXT:    v_lshlrev_b64 v[0:1], 1, v[22:23]
+; GFX942-GISEL-NEXT:    v_lshlrev_b64 v[10:11], 1, v[24:25]
+; GFX942-GISEL-NEXT:    v_lshrrev_b32_e32 v2, 31, v23
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v10, v10, v2
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v0, v8, v0
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v1, v9, v1
+; GFX942-GISEL-NEXT:  .LBB2_6: ; %Flow16
+; GFX942-GISEL-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX942-GISEL-NEXT:    v_ashrrev_i32_e32 v37, 31, v7
+; GFX942-GISEL-NEXT:    v_ashrrev_i32_e32 v2, 31, v15
+; GFX942-GISEL-NEXT:    v_xor_b32_e32 v3, v4, v37
+; GFX942-GISEL-NEXT:    v_xor_b32_e32 v4, v5, v37
+; GFX942-GISEL-NEXT:    v_xor_b32_e32 v9, v14, v2
+; GFX942-GISEL-NEXT:    v_sub_co_u32_e32 v14, vcc, v3, v37
+; GFX942-GISEL-NEXT:    v_xor_b32_e32 v5, v6, v37
+; GFX942-GISEL-NEXT:    v_xor_b32_e32 v23, v15, v2
+; GFX942-GISEL-NEXT:    v_subb_co_u32_e32 v15, vcc, v4, v37, vcc
+; GFX942-GISEL-NEXT:    v_xor_b32_e32 v6, v7, v37
+; GFX942-GISEL-NEXT:    v_xor_b32_e32 v7, v12, v2
+; GFX942-GISEL-NEXT:    v_subb_co_u32_e32 v12, vcc, v5, v37, vcc
+; GFX942-GISEL-NEXT:    v_xor_b32_e32 v8, v13, v2
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_subb_co_u32_e32 v13, vcc, v6, v37, vcc
+; GFX942-GISEL-NEXT:    v_sub_co_u32_e32 v39, vcc, v7, v2
+; GFX942-GISEL-NEXT:    v_ffbh_u32_e32 v5, v12
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_subb_co_u32_e32 v38, vcc, v8, v2, vcc
+; GFX942-GISEL-NEXT:    v_add_u32_e32 v5, 32, v5
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_subb_co_u32_e32 v22, vcc, v9, v2, vcc
+; GFX942-GISEL-NEXT:    v_ffbh_u32_e32 v4, v22
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_subb_co_u32_e32 v23, vcc, v23, v2, vcc
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v2, v39, v22
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v3, v38, v23
+; GFX942-GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[2:3]
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v2, v14, v12
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v3, v15, v13
+; GFX942-GISEL-NEXT:    v_cmp_eq_u64_e64 s[0:1], 0, v[2:3]
+; GFX942-GISEL-NEXT:    v_ffbh_u32_e32 v3, v39
+; GFX942-GISEL-NEXT:    v_ffbh_u32_e32 v2, v38
+; GFX942-GISEL-NEXT:    v_add_u32_e32 v3, 32, v3
+; GFX942-GISEL-NEXT:    v_min_u32_e32 v2, v2, v3
+; GFX942-GISEL-NEXT:    v_ffbh_u32_e32 v3, v23
+; GFX942-GISEL-NEXT:    v_add_u32_e32 v4, 32, v4
+; GFX942-GISEL-NEXT:    v_add_u32_e32 v2, 64, v2
+; GFX942-GISEL-NEXT:    v_min_u32_e32 v3, v3, v4
+; GFX942-GISEL-NEXT:    v_cmp_eq_u64_e64 s[2:3], 0, v[22:23]
+; GFX942-GISEL-NEXT:    v_ffbh_u32_e32 v4, v14
+; GFX942-GISEL-NEXT:    v_add_u32_e32 v4, 32, v4
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v2, v3, v2, s[2:3]
+; GFX942-GISEL-NEXT:    v_ffbh_u32_e32 v3, v15
+; GFX942-GISEL-NEXT:    v_min_u32_e32 v3, v3, v4
+; GFX942-GISEL-NEXT:    v_ffbh_u32_e32 v4, v13
+; GFX942-GISEL-NEXT:    v_add_u32_e32 v3, 64, v3
+; GFX942-GISEL-NEXT:    v_min_u32_e32 v4, v4, v5
+; GFX942-GISEL-NEXT:    v_cmp_eq_u64_e64 s[2:3], 0, v[12:13]
+; GFX942-GISEL-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-GISEL-NEXT:    s_mov_b64 s[4:5], 0
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v3, v4, v3, s[2:3]
+; GFX942-GISEL-NEXT:    v_sub_co_u32_e64 v6, s[2:3], v2, v3
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[2:3], 0x7f
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_subb_co_u32_e64 v7, s[2:3], 0, 0, s[2:3]
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    v_subb_co_u32_e64 v24, s[2:3], 0, 0, s[2:3]
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    v_subb_co_u32_e64 v25, s[2:3], 0, 0, s[2:3]
+; GFX942-GISEL-NEXT:    v_cmp_gt_u64_e64 s[2:3], v[6:7], v[2:3]
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[2:3]
+; GFX942-GISEL-NEXT:    v_cmp_lt_u64_e64 s[2:3], 0, v[24:25]
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s[2:3]
+; GFX942-GISEL-NEXT:    v_cmp_eq_u64_e64 s[2:3], 0, v[24:25]
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v2, v3, v2, s[2:3]
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s[0:1]
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v26, v3, v2
+; GFX942-GISEL-NEXT:    v_xor_b32_e32 v2, 0x7f, v6
+; GFX942-GISEL-NEXT:    v_and_b32_e32 v4, 1, v26
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v2, v2, v24
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v3, v7, v25
+; GFX942-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v4, v14, 0, vcc
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v5, v15, 0, vcc
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v8, v12, 0, vcc
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v9, v13, 0, vcc
+; GFX942-GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[2:3]
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v2, v26, v2
+; GFX942-GISEL-NEXT:    v_and_b32_e32 v2, 1, v2
+; GFX942-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
+; GFX942-GISEL-NEXT:    s_xor_b64 s[0:1], vcc, -1
+; GFX942-GISEL-NEXT:    s_and_saveexec_b64 s[2:3], s[0:1]
+; GFX942-GISEL-NEXT:    s_cbranch_execz .LBB2_12
+; GFX942-GISEL-NEXT:  ; %bb.7: ; %udiv-bb1
+; GFX942-GISEL-NEXT:    v_add_co_u32_e32 v2, vcc, 1, v6
+; GFX942-GISEL-NEXT:    s_mov_b64 s[6:7], s[4:5]
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v7, vcc
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v24, vcc
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v25, vcc
+; GFX942-GISEL-NEXT:    s_xor_b64 s[0:1], vcc, -1
+; GFX942-GISEL-NEXT:    v_sub_co_u32_e32 v26, vcc, 0x7f, v6
+; GFX942-GISEL-NEXT:    v_sub_u32_e32 v8, 64, v26
+; GFX942-GISEL-NEXT:    v_add_u32_e32 v27, 0xffffffc0, v26
+; GFX942-GISEL-NEXT:    v_lshrrev_b64 v[8:9], v8, v[14:15]
+; GFX942-GISEL-NEXT:    v_lshlrev_b64 v[24:25], v26, v[12:13]
+; GFX942-GISEL-NEXT:    v_lshlrev_b64 v[6:7], v26, v[14:15]
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v28, v8, v24
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v29, v9, v25
+; GFX942-GISEL-NEXT:    v_lshlrev_b64 v[8:9], v27, v[14:15]
+; GFX942-GISEL-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v26
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e32 v24, 0, v6, vcc
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e32 v25, 0, v7, vcc
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e32 v6, v8, v28, vcc
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e32 v7, v9, v29, vcc
+; GFX942-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v26
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e32 v26, v6, v12, vcc
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e32 v27, v7, v13, vcc
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[6:7]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[4:5]
+; GFX942-GISEL-NEXT:    s_and_saveexec_b64 s[4:5], s[0:1]
+; GFX942-GISEL-NEXT:    s_xor_b64 s[8:9], exec, s[4:5]
+; GFX942-GISEL-NEXT:    s_cbranch_execz .LBB2_11
+; GFX942-GISEL-NEXT:  ; %bb.8: ; %udiv-preheader
+; GFX942-GISEL-NEXT:    v_sub_u32_e32 v28, 64, v2
+; GFX942-GISEL-NEXT:    v_add_u32_e32 v30, 0xffffffc0, v2
+; GFX942-GISEL-NEXT:    v_lshrrev_b64 v[8:9], v2, v[14:15]
+; GFX942-GISEL-NEXT:    v_lshlrev_b64 v[28:29], v28, v[12:13]
+; GFX942-GISEL-NEXT:    v_lshrrev_b64 v[6:7], v2, v[12:13]
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v28, v8, v28
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v29, v9, v29
+; GFX942-GISEL-NEXT:    v_lshrrev_b64 v[8:9], v30, v[12:13]
+; GFX942-GISEL-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v2
+; GFX942-GISEL-NEXT:    s_mov_b64 s[4:5], 0
+; GFX942-GISEL-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v2
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e32 v8, v8, v28, vcc
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e32 v9, v9, v29, vcc
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e32 v32, 0, v6, vcc
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e32 v33, 0, v7, vcc
+; GFX942-GISEL-NEXT:    v_add_co_u32_e32 v48, vcc, -1, v39
+; GFX942-GISEL-NEXT:    s_mov_b64 s[6:7], s[4:5]
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e32 v49, vcc, -1, v38, vcc
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v30, v8, v14, s[0:1]
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v31, v9, v15, s[0:1]
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e32 v50, vcc, -1, v22, vcc
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[6:7]
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e32 v51, vcc, -1, v23, vcc
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v29, 0
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[4:5]
+; GFX942-GISEL-NEXT:  .LBB2_9: ; %udiv-do-while
+; GFX942-GISEL-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-GISEL-NEXT:    v_lshlrev_b64 v[32:33], 1, v[32:33]
+; GFX942-GISEL-NEXT:    v_lshrrev_b32_e32 v28, 31, v31
+; GFX942-GISEL-NEXT:    v_lshlrev_b64 v[8:9], 1, v[30:31]
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v32, v32, v28
+; GFX942-GISEL-NEXT:    v_lshrrev_b32_e32 v28, 31, v27
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v8, v8, v28
+; GFX942-GISEL-NEXT:    v_lshlrev_b64 v[30:31], 1, v[24:25]
+; GFX942-GISEL-NEXT:    v_lshlrev_b64 v[26:27], 1, v[26:27]
+; GFX942-GISEL-NEXT:    v_lshrrev_b32_e32 v24, 31, v25
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v26, v26, v24
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v24, v6, v30
+; GFX942-GISEL-NEXT:    v_sub_co_u32_e32 v6, vcc, v48, v8
+; GFX942-GISEL-NEXT:    v_add_co_u32_e64 v2, s[0:1], -1, v2
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_subb_co_u32_e32 v6, vcc, v49, v9, vcc
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e64 v3, s[0:1], -1, v3, s[0:1]
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_subb_co_u32_e32 v6, vcc, v50, v32, vcc
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v25, v7, v31
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_subb_co_u32_e32 v6, vcc, v51, v33, vcc
+; GFX942-GISEL-NEXT:    v_ashrrev_i32_e32 v30, 31, v6
+; GFX942-GISEL-NEXT:    v_and_b32_e32 v28, 1, v30
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[6:7], v[28:29]
+; GFX942-GISEL-NEXT:    v_and_b32_e32 v28, v30, v39
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e64 v4, s[0:1], -1, v4, s[0:1]
+; GFX942-GISEL-NEXT:    v_and_b32_e32 v31, v30, v38
+; GFX942-GISEL-NEXT:    v_and_b32_e32 v52, v30, v22
+; GFX942-GISEL-NEXT:    v_and_b32_e32 v53, v30, v23
+; GFX942-GISEL-NEXT:    v_sub_co_u32_e32 v30, vcc, v8, v28
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e64 v5, s[0:1], -1, v5, s[0:1]
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_subb_co_u32_e32 v31, vcc, v9, v31, vcc
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v8, v2, v4
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v9, v3, v5
+; GFX942-GISEL-NEXT:    v_cmp_eq_u64_e64 s[0:1], 0, v[8:9]
+; GFX942-GISEL-NEXT:    v_subb_co_u32_e32 v32, vcc, v32, v52, vcc
+; GFX942-GISEL-NEXT:    s_or_b64 s[4:5], s[0:1], s[4:5]
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_subb_co_u32_e32 v33, vcc, v33, v53, vcc
+; GFX942-GISEL-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX942-GISEL-NEXT:    s_cbranch_execnz .LBB2_9
+; GFX942-GISEL-NEXT:  ; %bb.10: ; %Flow
+; GFX942-GISEL-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX942-GISEL-NEXT:  .LBB2_11: ; %Flow11
+; GFX942-GISEL-NEXT:    s_or_b64 exec, exec, s[8:9]
+; GFX942-GISEL-NEXT:    v_lshlrev_b64 v[2:3], 1, v[24:25]
+; GFX942-GISEL-NEXT:    v_lshlrev_b64 v[8:9], 1, v[26:27]
+; GFX942-GISEL-NEXT:    v_lshrrev_b32_e32 v4, 31, v25
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v8, v8, v4
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v4, v6, v2
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v5, v7, v3
+; GFX942-GISEL-NEXT:  .LBB2_12: ; %Flow12
+; GFX942-GISEL-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX942-GISEL-NEXT:    v_mad_u64_u32 v[24:25], s[0:1], v36, v1, 0
+; GFX942-GISEL-NEXT:    v_mad_u64_u32 v[6:7], s[0:1], v36, v10, 0
+; GFX942-GISEL-NEXT:    v_mul_lo_u32 v26, v36, v11
+; GFX942-GISEL-NEXT:    v_mul_lo_u32 v27, v35, v10
+; GFX942-GISEL-NEXT:    v_mad_u64_u32 v[10:11], vcc, v35, v0, v[24:25]
+; GFX942-GISEL-NEXT:    v_mad_u64_u32 v[2:3], s[0:1], v36, v0, 0
+; GFX942-GISEL-NEXT:    v_mad_u64_u32 v[6:7], s[0:1], v35, v1, v[6:7]
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e32 v24, vcc, v26, v27, vcc
+; GFX942-GISEL-NEXT:    v_mad_u64_u32 v[6:7], s[0:1], v20, v0, v[6:7]
+; GFX942-GISEL-NEXT:    v_add_co_u32_e32 v3, vcc, v3, v10
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e32 v6, vcc, v6, v11, vcc
+; GFX942-GISEL-NEXT:    v_mad_u64_u32 v[10:11], s[0:1], v20, v1, v[24:25]
+; GFX942-GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], v21, v0, v[10:11]
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e32 v0, vcc, v7, v0, vcc
+; GFX942-GISEL-NEXT:    v_sub_co_u32_e32 v1, vcc, v18, v2
+; GFX942-GISEL-NEXT:    v_xor_b32_e32 v1, v1, v34
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_subb_co_u32_e32 v2, vcc, v19, v3, vcc
+; GFX942-GISEL-NEXT:    v_xor_b32_e32 v2, v2, v34
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_subb_co_u32_e32 v3, vcc, v16, v6, vcc
+; GFX942-GISEL-NEXT:    v_xor_b32_e32 v3, v3, v34
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_subb_co_u32_e32 v0, vcc, v17, v0, vcc
+; GFX942-GISEL-NEXT:    v_xor_b32_e32 v6, v0, v34
+; GFX942-GISEL-NEXT:    v_sub_co_u32_e32 v0, vcc, v1, v34
+; GFX942-GISEL-NEXT:    v_mad_u64_u32 v[16:17], s[0:1], v39, v5, 0
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_subb_co_u32_e32 v1, vcc, v2, v34, vcc
+; GFX942-GISEL-NEXT:    v_mad_u64_u32 v[10:11], s[0:1], v39, v8, 0
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_subb_co_u32_e32 v2, vcc, v3, v34, vcc
+; GFX942-GISEL-NEXT:    v_mul_lo_u32 v18, v39, v9
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_subb_co_u32_e32 v3, vcc, v6, v34, vcc
+; GFX942-GISEL-NEXT:    v_mul_lo_u32 v19, v38, v8
+; GFX942-GISEL-NEXT:    v_mad_u64_u32 v[8:9], vcc, v38, v4, v[16:17]
+; GFX942-GISEL-NEXT:    v_mad_u64_u32 v[6:7], s[0:1], v39, v4, 0
+; GFX942-GISEL-NEXT:    v_mad_u64_u32 v[10:11], s[0:1], v38, v5, v[10:11]
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e32 v16, vcc, v18, v19, vcc
+; GFX942-GISEL-NEXT:    v_mad_u64_u32 v[10:11], s[0:1], v22, v4, v[10:11]
+; GFX942-GISEL-NEXT:    v_add_co_u32_e32 v7, vcc, v7, v8
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e32 v10, vcc, v10, v9, vcc
+; GFX942-GISEL-NEXT:    v_mad_u64_u32 v[8:9], s[0:1], v22, v5, v[16:17]
+; GFX942-GISEL-NEXT:    v_mad_u64_u32 v[4:5], s[0:1], v23, v4, v[8:9]
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e32 v4, vcc, v11, v4, vcc
+; GFX942-GISEL-NEXT:    v_sub_co_u32_e32 v5, vcc, v14, v6
+; GFX942-GISEL-NEXT:    v_xor_b32_e32 v5, v5, v37
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_subb_co_u32_e32 v6, vcc, v15, v7, vcc
+; GFX942-GISEL-NEXT:    v_xor_b32_e32 v6, v6, v37
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_subb_co_u32_e32 v7, vcc, v12, v10, vcc
+; GFX942-GISEL-NEXT:    v_xor_b32_e32 v7, v7, v37
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_subb_co_u32_e32 v4, vcc, v13, v4, vcc
+; GFX942-GISEL-NEXT:    v_xor_b32_e32 v8, v4, v37
+; GFX942-GISEL-NEXT:    v_sub_co_u32_e32 v4, vcc, v5, v37
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    v_subb_co_u32_e32 v5, vcc, v6, v37, vcc
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    v_subb_co_u32_e32 v6, vcc, v7, v37, vcc
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    v_subb_co_u32_e32 v7, vcc, v8, v37, vcc
+; GFX942-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %shl = srem <2 x i128> %lhs, %rhs
   ret <2 x i128> %shl
 }
@@ -3247,6 +5938,863 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GISEL-NEXT:    v_subb_u32_e64 v6, vcc, v6, v13, s[4:5]
 ; GISEL-NEXT:    v_subb_u32_e32 v7, vcc, v7, v11, vcc
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-SDAG-LABEL: v_urem_v2i128_vv:
+; GFX942-SDAG:       ; %bb.0: ; %_udiv-special-cases_udiv-special-cases
+; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-SDAG-NEXT:    v_or_b32_e32 v17, v9, v11
+; GFX942-SDAG-NEXT:    v_or_b32_e32 v16, v8, v10
+; GFX942-SDAG-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[16:17]
+; GFX942-SDAG-NEXT:    v_or_b32_e32 v17, v1, v3
+; GFX942-SDAG-NEXT:    v_or_b32_e32 v16, v0, v2
+; GFX942-SDAG-NEXT:    v_cmp_eq_u64_e64 s[0:1], 0, v[16:17]
+; GFX942-SDAG-NEXT:    v_ffbh_u32_e32 v16, v10
+; GFX942-SDAG-NEXT:    v_add_u32_e32 v16, 32, v16
+; GFX942-SDAG-NEXT:    v_ffbh_u32_e32 v17, v11
+; GFX942-SDAG-NEXT:    v_min_u32_e32 v20, v16, v17
+; GFX942-SDAG-NEXT:    v_ffbh_u32_e32 v16, v8
+; GFX942-SDAG-NEXT:    v_add_u32_e32 v16, 32, v16
+; GFX942-SDAG-NEXT:    v_ffbh_u32_e32 v18, v9
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v17, 0
+; GFX942-SDAG-NEXT:    v_min_u32_e32 v16, v16, v18
+; GFX942-SDAG-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-SDAG-NEXT:    v_lshl_add_u64 v[18:19], v[16:17], 0, 64
+; GFX942-SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[10:11]
+; GFX942-SDAG-NEXT:    v_ffbh_u32_e32 v16, v2
+; GFX942-SDAG-NEXT:    v_add_u32_e32 v16, 32, v16
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e32 v20, v18, v20, vcc
+; GFX942-SDAG-NEXT:    v_ffbh_u32_e32 v18, v3
+; GFX942-SDAG-NEXT:    v_min_u32_e32 v22, v16, v18
+; GFX942-SDAG-NEXT:    v_ffbh_u32_e32 v16, v0
+; GFX942-SDAG-NEXT:    v_add_u32_e32 v16, 32, v16
+; GFX942-SDAG-NEXT:    v_ffbh_u32_e32 v18, v1
+; GFX942-SDAG-NEXT:    v_min_u32_e32 v16, v16, v18
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v21, v19, 0, vcc
+; GFX942-SDAG-NEXT:    v_lshl_add_u64 v[18:19], v[16:17], 0, 64
+; GFX942-SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[2:3]
+; GFX942-SDAG-NEXT:    s_mov_b64 s[2:3], 0x7f
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e32 v18, v18, v22, vcc
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v16, v19, 0, vcc
+; GFX942-SDAG-NEXT:    v_sub_co_u32_e32 v20, vcc, v20, v18
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_subb_co_u32_e32 v21, vcc, v21, v16, vcc
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_subb_co_u32_e32 v22, vcc, 0, v17, vcc
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_subb_co_u32_e32 v23, vcc, 0, v17, vcc
+; GFX942-SDAG-NEXT:    v_cmp_lt_u64_e32 vcc, s[2:3], v[20:21]
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v16, 0, 1, vcc
+; GFX942-SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[22:23]
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v17, 0, 1, vcc
+; GFX942-SDAG-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[22:23]
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e32 v16, v17, v16, vcc
+; GFX942-SDAG-NEXT:    v_and_b32_e32 v16, 1, v16
+; GFX942-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v16
+; GFX942-SDAG-NEXT:    v_xor_b32_e32 v16, 0x7f, v20
+; GFX942-SDAG-NEXT:    s_or_b64 s[0:1], s[0:1], vcc
+; GFX942-SDAG-NEXT:    v_or_b32_e32 v17, v21, v23
+; GFX942-SDAG-NEXT:    v_or_b32_e32 v16, v16, v22
+; GFX942-SDAG-NEXT:    s_xor_b64 s[2:3], s[0:1], -1
+; GFX942-SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[16:17]
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v19, v3, 0, s[0:1]
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v18, v2, 0, s[0:1]
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v17, v1, 0, s[0:1]
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v16, v0, 0, s[0:1]
+; GFX942-SDAG-NEXT:    s_and_b64 s[0:1], s[2:3], vcc
+; GFX942-SDAG-NEXT:    s_and_saveexec_b64 s[4:5], s[0:1]
+; GFX942-SDAG-NEXT:    s_cbranch_execz .LBB3_6
+; GFX942-SDAG-NEXT:  ; %bb.1: ; %udiv-bb15
+; GFX942-SDAG-NEXT:    v_add_co_u32_e32 v16, vcc, 1, v20
+; GFX942-SDAG-NEXT:    v_sub_u32_e32 v26, 0x7f, v20
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_addc_co_u32_e32 v17, vcc, 0, v21, vcc
+; GFX942-SDAG-NEXT:    v_sub_u32_e32 v21, 64, v26
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_addc_co_u32_e32 v18, vcc, 0, v22, vcc
+; GFX942-SDAG-NEXT:    v_or_b32_e32 v22, v16, v18
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_addc_co_u32_e32 v19, vcc, 0, v23, vcc
+; GFX942-SDAG-NEXT:    v_or_b32_e32 v23, v17, v19
+; GFX942-SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[22:23]
+; GFX942-SDAG-NEXT:    v_lshlrev_b64 v[22:23], v26, v[2:3]
+; GFX942-SDAG-NEXT:    v_lshrrev_b64 v[24:25], v21, v[0:1]
+; GFX942-SDAG-NEXT:    v_sub_u32_e32 v20, 63, v20
+; GFX942-SDAG-NEXT:    v_or_b32_e32 v23, v23, v25
+; GFX942-SDAG-NEXT:    v_or_b32_e32 v22, v22, v24
+; GFX942-SDAG-NEXT:    v_lshlrev_b64 v[20:21], v20, v[0:1]
+; GFX942-SDAG-NEXT:    v_cmp_gt_u32_e64 s[0:1], 64, v26
+; GFX942-SDAG-NEXT:    v_cmp_eq_u32_e64 s[2:3], 0, v26
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[24:25], 0
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v21, v21, v23, s[0:1]
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v20, v20, v22, s[0:1]
+; GFX942-SDAG-NEXT:    v_lshlrev_b64 v[22:23], v26, v[0:1]
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v21, v21, v3, s[2:3]
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v20, v20, v2, s[2:3]
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v23, 0, v23, s[0:1]
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v22, 0, v22, s[0:1]
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[26:27], 0
+; GFX942-SDAG-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942-SDAG-NEXT:    s_xor_b64 s[2:3], exec, s[0:1]
+; GFX942-SDAG-NEXT:    s_cbranch_execz .LBB3_5
+; GFX942-SDAG-NEXT:  ; %bb.2: ; %udiv-preheader4
+; GFX942-SDAG-NEXT:    v_sub_u32_e32 v26, 64, v16
+; GFX942-SDAG-NEXT:    v_lshrrev_b64 v[24:25], v16, v[0:1]
+; GFX942-SDAG-NEXT:    v_lshlrev_b64 v[26:27], v26, v[2:3]
+; GFX942-SDAG-NEXT:    v_or_b32_e32 v26, v24, v26
+; GFX942-SDAG-NEXT:    v_subrev_u32_e32 v24, 64, v16
+; GFX942-SDAG-NEXT:    v_or_b32_e32 v27, v25, v27
+; GFX942-SDAG-NEXT:    v_lshrrev_b64 v[24:25], v24, v[2:3]
+; GFX942-SDAG-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v16
+; GFX942-SDAG-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v16
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[36:37], 0
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e32 v25, v25, v27, vcc
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e32 v24, v24, v26, vcc
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v33, v25, v1, s[0:1]
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v32, v24, v0, s[0:1]
+; GFX942-SDAG-NEXT:    v_lshrrev_b64 v[24:25], v16, v[2:3]
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e32 v35, 0, v25, vcc
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e32 v34, 0, v24, vcc
+; GFX942-SDAG-NEXT:    v_add_co_u32_e32 v28, vcc, -1, v8
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[26:27], 0
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_addc_co_u32_e32 v29, vcc, -1, v9, vcc
+; GFX942-SDAG-NEXT:    s_mov_b64 s[0:1], 0
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_addc_co_u32_e32 v30, vcc, -1, v10, vcc
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v25, 0
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_addc_co_u32_e32 v31, vcc, -1, v11, vcc
+; GFX942-SDAG-NEXT:  .LBB3_3: ; %udiv-do-while3
+; GFX942-SDAG-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-SDAG-NEXT:    v_lshrrev_b32_e32 v24, 31, v33
+; GFX942-SDAG-NEXT:    v_lshlrev_b64 v[32:33], 1, v[32:33]
+; GFX942-SDAG-NEXT:    v_lshrrev_b32_e32 v48, 31, v21
+; GFX942-SDAG-NEXT:    v_lshlrev_b64 v[34:35], 1, v[34:35]
+; GFX942-SDAG-NEXT:    v_or_b32_e32 v32, v32, v48
+; GFX942-SDAG-NEXT:    v_or_b32_e32 v34, v34, v24
+; GFX942-SDAG-NEXT:    v_sub_co_u32_e32 v24, vcc, v28, v32
+; GFX942-SDAG-NEXT:    v_lshlrev_b64 v[38:39], 1, v[22:23]
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_subb_co_u32_e32 v24, vcc, v29, v33, vcc
+; GFX942-SDAG-NEXT:    v_lshrrev_b32_e32 v22, 31, v23
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_subb_co_u32_e32 v24, vcc, v30, v34, vcc
+; GFX942-SDAG-NEXT:    v_lshlrev_b64 v[20:21], 1, v[20:21]
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_subb_co_u32_e32 v24, vcc, v31, v35, vcc
+; GFX942-SDAG-NEXT:    v_or3_b32 v20, v20, v22, v26
+; GFX942-SDAG-NEXT:    v_or_b32_e32 v22, v36, v38
+; GFX942-SDAG-NEXT:    v_ashrrev_i32_e32 v36, 31, v24
+; GFX942-SDAG-NEXT:    v_or_b32_e32 v23, v37, v39
+; GFX942-SDAG-NEXT:    v_and_b32_e32 v24, 1, v36
+; GFX942-SDAG-NEXT:    v_and_b32_e32 v37, v36, v11
+; GFX942-SDAG-NEXT:    v_and_b32_e32 v38, v36, v10
+; GFX942-SDAG-NEXT:    v_and_b32_e32 v39, v36, v9
+; GFX942-SDAG-NEXT:    v_and_b32_e32 v36, v36, v8
+; GFX942-SDAG-NEXT:    v_sub_co_u32_e32 v32, vcc, v32, v36
+; GFX942-SDAG-NEXT:    v_or3_b32 v21, v21, 0, v27
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_subb_co_u32_e32 v33, vcc, v33, v39, vcc
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_subb_co_u32_e32 v34, vcc, v34, v38, vcc
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_subb_co_u32_e32 v35, vcc, v35, v37, vcc
+; GFX942-SDAG-NEXT:    v_add_co_u32_e32 v16, vcc, -1, v16
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_addc_co_u32_e32 v17, vcc, -1, v17, vcc
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_addc_co_u32_e32 v18, vcc, -1, v18, vcc
+; GFX942-SDAG-NEXT:    v_or_b32_e32 v36, v16, v18
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_addc_co_u32_e32 v19, vcc, -1, v19, vcc
+; GFX942-SDAG-NEXT:    v_or_b32_e32 v37, v17, v19
+; GFX942-SDAG-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[36:37]
+; GFX942-SDAG-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[36:37], v[24:25]
+; GFX942-SDAG-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; GFX942-SDAG-NEXT:    s_cbranch_execnz .LBB3_3
+; GFX942-SDAG-NEXT:  ; %bb.4: ; %Flow13
+; GFX942-SDAG-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX942-SDAG-NEXT:  .LBB3_5: ; %Flow14
+; GFX942-SDAG-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX942-SDAG-NEXT:    v_lshlrev_b64 v[16:17], 1, v[22:23]
+; GFX942-SDAG-NEXT:    v_lshrrev_b32_e32 v22, 31, v23
+; GFX942-SDAG-NEXT:    v_lshlrev_b64 v[18:19], 1, v[20:21]
+; GFX942-SDAG-NEXT:    v_or3_b32 v19, v19, 0, v27
+; GFX942-SDAG-NEXT:    v_or3_b32 v18, v18, v22, v26
+; GFX942-SDAG-NEXT:    v_or_b32_e32 v17, v25, v17
+; GFX942-SDAG-NEXT:    v_or_b32_e32 v16, v24, v16
+; GFX942-SDAG-NEXT:  .LBB3_6: ; %Flow16
+; GFX942-SDAG-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX942-SDAG-NEXT:    v_or_b32_e32 v21, v13, v15
+; GFX942-SDAG-NEXT:    v_or_b32_e32 v20, v12, v14
+; GFX942-SDAG-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[20:21]
+; GFX942-SDAG-NEXT:    v_or_b32_e32 v21, v5, v7
+; GFX942-SDAG-NEXT:    v_or_b32_e32 v20, v4, v6
+; GFX942-SDAG-NEXT:    v_cmp_eq_u64_e64 s[0:1], 0, v[20:21]
+; GFX942-SDAG-NEXT:    v_ffbh_u32_e32 v20, v14
+; GFX942-SDAG-NEXT:    v_add_u32_e32 v20, 32, v20
+; GFX942-SDAG-NEXT:    v_ffbh_u32_e32 v21, v15
+; GFX942-SDAG-NEXT:    v_min_u32_e32 v24, v20, v21
+; GFX942-SDAG-NEXT:    v_ffbh_u32_e32 v20, v12
+; GFX942-SDAG-NEXT:    v_add_u32_e32 v20, 32, v20
+; GFX942-SDAG-NEXT:    v_ffbh_u32_e32 v22, v13
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v21, 0
+; GFX942-SDAG-NEXT:    v_min_u32_e32 v20, v20, v22
+; GFX942-SDAG-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-SDAG-NEXT:    v_lshl_add_u64 v[22:23], v[20:21], 0, 64
+; GFX942-SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[14:15]
+; GFX942-SDAG-NEXT:    v_ffbh_u32_e32 v20, v6
+; GFX942-SDAG-NEXT:    v_add_u32_e32 v20, 32, v20
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e32 v24, v22, v24, vcc
+; GFX942-SDAG-NEXT:    v_ffbh_u32_e32 v22, v7
+; GFX942-SDAG-NEXT:    v_min_u32_e32 v26, v20, v22
+; GFX942-SDAG-NEXT:    v_ffbh_u32_e32 v20, v4
+; GFX942-SDAG-NEXT:    v_add_u32_e32 v20, 32, v20
+; GFX942-SDAG-NEXT:    v_ffbh_u32_e32 v22, v5
+; GFX942-SDAG-NEXT:    v_min_u32_e32 v20, v20, v22
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v25, v23, 0, vcc
+; GFX942-SDAG-NEXT:    v_lshl_add_u64 v[22:23], v[20:21], 0, 64
+; GFX942-SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[6:7]
+; GFX942-SDAG-NEXT:    s_mov_b64 s[2:3], 0x7f
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e32 v22, v22, v26, vcc
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v20, v23, 0, vcc
+; GFX942-SDAG-NEXT:    v_sub_co_u32_e32 v24, vcc, v24, v22
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_subb_co_u32_e32 v25, vcc, v25, v20, vcc
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_subb_co_u32_e32 v26, vcc, 0, v21, vcc
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_subb_co_u32_e32 v27, vcc, 0, v21, vcc
+; GFX942-SDAG-NEXT:    v_cmp_lt_u64_e32 vcc, s[2:3], v[24:25]
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v20, 0, 1, vcc
+; GFX942-SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[26:27]
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v21, 0, 1, vcc
+; GFX942-SDAG-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[26:27]
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e32 v20, v21, v20, vcc
+; GFX942-SDAG-NEXT:    v_and_b32_e32 v20, 1, v20
+; GFX942-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v20
+; GFX942-SDAG-NEXT:    v_xor_b32_e32 v20, 0x7f, v24
+; GFX942-SDAG-NEXT:    s_or_b64 s[0:1], s[0:1], vcc
+; GFX942-SDAG-NEXT:    v_or_b32_e32 v21, v25, v27
+; GFX942-SDAG-NEXT:    v_or_b32_e32 v20, v20, v26
+; GFX942-SDAG-NEXT:    s_xor_b64 s[2:3], s[0:1], -1
+; GFX942-SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[20:21]
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v23, v7, 0, s[0:1]
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v22, v6, 0, s[0:1]
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v21, v5, 0, s[0:1]
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v20, v4, 0, s[0:1]
+; GFX942-SDAG-NEXT:    s_and_b64 s[0:1], s[2:3], vcc
+; GFX942-SDAG-NEXT:    s_and_saveexec_b64 s[4:5], s[0:1]
+; GFX942-SDAG-NEXT:    s_cbranch_execz .LBB3_12
+; GFX942-SDAG-NEXT:  ; %bb.7: ; %udiv-bb1
+; GFX942-SDAG-NEXT:    v_add_co_u32_e32 v20, vcc, 1, v24
+; GFX942-SDAG-NEXT:    v_sub_u32_e32 v30, 0x7f, v24
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_addc_co_u32_e32 v21, vcc, 0, v25, vcc
+; GFX942-SDAG-NEXT:    v_sub_u32_e32 v25, 64, v30
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_addc_co_u32_e32 v22, vcc, 0, v26, vcc
+; GFX942-SDAG-NEXT:    v_or_b32_e32 v26, v20, v22
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_addc_co_u32_e32 v23, vcc, 0, v27, vcc
+; GFX942-SDAG-NEXT:    v_or_b32_e32 v27, v21, v23
+; GFX942-SDAG-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[26:27]
+; GFX942-SDAG-NEXT:    v_lshlrev_b64 v[26:27], v30, v[6:7]
+; GFX942-SDAG-NEXT:    v_lshrrev_b64 v[28:29], v25, v[4:5]
+; GFX942-SDAG-NEXT:    v_sub_u32_e32 v24, 63, v24
+; GFX942-SDAG-NEXT:    v_or_b32_e32 v27, v27, v29
+; GFX942-SDAG-NEXT:    v_or_b32_e32 v26, v26, v28
+; GFX942-SDAG-NEXT:    v_lshlrev_b64 v[24:25], v24, v[4:5]
+; GFX942-SDAG-NEXT:    v_cmp_gt_u32_e64 s[0:1], 64, v30
+; GFX942-SDAG-NEXT:    v_cmp_eq_u32_e64 s[2:3], 0, v30
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[28:29], 0
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v25, v25, v27, s[0:1]
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v24, v24, v26, s[0:1]
+; GFX942-SDAG-NEXT:    v_lshlrev_b64 v[26:27], v30, v[4:5]
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v25, v25, v7, s[2:3]
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v24, v24, v6, s[2:3]
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v27, 0, v27, s[0:1]
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v26, 0, v26, s[0:1]
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[30:31], 0
+; GFX942-SDAG-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942-SDAG-NEXT:    s_xor_b64 s[2:3], exec, s[0:1]
+; GFX942-SDAG-NEXT:    s_cbranch_execz .LBB3_11
+; GFX942-SDAG-NEXT:  ; %bb.8: ; %udiv-preheader
+; GFX942-SDAG-NEXT:    v_sub_u32_e32 v30, 64, v20
+; GFX942-SDAG-NEXT:    v_lshrrev_b64 v[28:29], v20, v[4:5]
+; GFX942-SDAG-NEXT:    v_lshlrev_b64 v[30:31], v30, v[6:7]
+; GFX942-SDAG-NEXT:    v_or_b32_e32 v30, v28, v30
+; GFX942-SDAG-NEXT:    v_subrev_u32_e32 v28, 64, v20
+; GFX942-SDAG-NEXT:    v_or_b32_e32 v31, v29, v31
+; GFX942-SDAG-NEXT:    v_lshrrev_b64 v[28:29], v28, v[6:7]
+; GFX942-SDAG-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v20
+; GFX942-SDAG-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v20
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[48:49], 0
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e32 v29, v29, v31, vcc
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e32 v28, v28, v30, vcc
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v37, v29, v5, s[0:1]
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v36, v28, v4, s[0:1]
+; GFX942-SDAG-NEXT:    v_lshrrev_b64 v[28:29], v20, v[6:7]
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e32 v39, 0, v29, vcc
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e32 v38, 0, v28, vcc
+; GFX942-SDAG-NEXT:    v_add_co_u32_e32 v32, vcc, -1, v12
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[30:31], 0
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_addc_co_u32_e32 v33, vcc, -1, v13, vcc
+; GFX942-SDAG-NEXT:    s_mov_b64 s[0:1], 0
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_addc_co_u32_e32 v34, vcc, -1, v14, vcc
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v29, 0
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_addc_co_u32_e32 v35, vcc, -1, v15, vcc
+; GFX942-SDAG-NEXT:  .LBB3_9: ; %udiv-do-while
+; GFX942-SDAG-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-SDAG-NEXT:    v_lshrrev_b32_e32 v28, 31, v37
+; GFX942-SDAG-NEXT:    v_lshlrev_b64 v[36:37], 1, v[36:37]
+; GFX942-SDAG-NEXT:    v_lshrrev_b32_e32 v52, 31, v25
+; GFX942-SDAG-NEXT:    v_lshlrev_b64 v[38:39], 1, v[38:39]
+; GFX942-SDAG-NEXT:    v_or_b32_e32 v36, v36, v52
+; GFX942-SDAG-NEXT:    v_or_b32_e32 v38, v38, v28
+; GFX942-SDAG-NEXT:    v_sub_co_u32_e32 v28, vcc, v32, v36
+; GFX942-SDAG-NEXT:    v_lshlrev_b64 v[50:51], 1, v[26:27]
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_subb_co_u32_e32 v28, vcc, v33, v37, vcc
+; GFX942-SDAG-NEXT:    v_lshrrev_b32_e32 v26, 31, v27
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_subb_co_u32_e32 v28, vcc, v34, v38, vcc
+; GFX942-SDAG-NEXT:    v_lshlrev_b64 v[24:25], 1, v[24:25]
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_subb_co_u32_e32 v28, vcc, v35, v39, vcc
+; GFX942-SDAG-NEXT:    v_or3_b32 v24, v24, v26, v30
+; GFX942-SDAG-NEXT:    v_or_b32_e32 v26, v48, v50
+; GFX942-SDAG-NEXT:    v_ashrrev_i32_e32 v48, 31, v28
+; GFX942-SDAG-NEXT:    v_or_b32_e32 v27, v49, v51
+; GFX942-SDAG-NEXT:    v_and_b32_e32 v28, 1, v48
+; GFX942-SDAG-NEXT:    v_and_b32_e32 v49, v48, v15
+; GFX942-SDAG-NEXT:    v_and_b32_e32 v50, v48, v14
+; GFX942-SDAG-NEXT:    v_and_b32_e32 v51, v48, v13
+; GFX942-SDAG-NEXT:    v_and_b32_e32 v48, v48, v12
+; GFX942-SDAG-NEXT:    v_sub_co_u32_e32 v36, vcc, v36, v48
+; GFX942-SDAG-NEXT:    v_or3_b32 v25, v25, 0, v31
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_subb_co_u32_e32 v37, vcc, v37, v51, vcc
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_subb_co_u32_e32 v38, vcc, v38, v50, vcc
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_subb_co_u32_e32 v39, vcc, v39, v49, vcc
+; GFX942-SDAG-NEXT:    v_add_co_u32_e32 v20, vcc, -1, v20
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_addc_co_u32_e32 v21, vcc, -1, v21, vcc
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_addc_co_u32_e32 v22, vcc, -1, v22, vcc
+; GFX942-SDAG-NEXT:    v_or_b32_e32 v48, v20, v22
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_addc_co_u32_e32 v23, vcc, -1, v23, vcc
+; GFX942-SDAG-NEXT:    v_or_b32_e32 v49, v21, v23
+; GFX942-SDAG-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[48:49]
+; GFX942-SDAG-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[48:49], v[28:29]
+; GFX942-SDAG-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; GFX942-SDAG-NEXT:    s_cbranch_execnz .LBB3_9
+; GFX942-SDAG-NEXT:  ; %bb.10: ; %Flow
+; GFX942-SDAG-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX942-SDAG-NEXT:  .LBB3_11: ; %Flow11
+; GFX942-SDAG-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX942-SDAG-NEXT:    v_lshlrev_b64 v[20:21], 1, v[26:27]
+; GFX942-SDAG-NEXT:    v_lshrrev_b32_e32 v26, 31, v27
+; GFX942-SDAG-NEXT:    v_lshlrev_b64 v[22:23], 1, v[24:25]
+; GFX942-SDAG-NEXT:    v_or3_b32 v23, v23, 0, v31
+; GFX942-SDAG-NEXT:    v_or3_b32 v22, v22, v26, v30
+; GFX942-SDAG-NEXT:    v_or_b32_e32 v21, v29, v21
+; GFX942-SDAG-NEXT:    v_or_b32_e32 v20, v28, v20
+; GFX942-SDAG-NEXT:  .LBB3_12: ; %Flow12
+; GFX942-SDAG-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX942-SDAG-NEXT:    v_mul_lo_u32 v24, v17, v10
+; GFX942-SDAG-NEXT:    v_mul_lo_u32 v25, v16, v11
+; GFX942-SDAG-NEXT:    v_mad_u64_u32 v[10:11], s[0:1], v16, v10, 0
+; GFX942-SDAG-NEXT:    v_add3_u32 v11, v11, v25, v24
+; GFX942-SDAG-NEXT:    v_mad_u64_u32 v[10:11], s[0:1], v18, v8, v[10:11]
+; GFX942-SDAG-NEXT:    v_mul_lo_u32 v18, v18, v9
+; GFX942-SDAG-NEXT:    v_mul_lo_u32 v19, v19, v8
+; GFX942-SDAG-NEXT:    v_add3_u32 v11, v19, v11, v18
+; GFX942-SDAG-NEXT:    v_mad_u64_u32 v[18:19], s[0:1], v8, v16, 0
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v25, 0
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v24, v19
+; GFX942-SDAG-NEXT:    v_mad_u64_u32 v[26:27], s[0:1], v9, v16, v[24:25]
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v24, v27
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v27, v25
+; GFX942-SDAG-NEXT:    v_mad_u64_u32 v[26:27], s[0:1], v8, v17, v[26:27]
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v28, v27
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v29, v25
+; GFX942-SDAG-NEXT:    v_lshl_add_u64 v[28:29], v[24:25], 0, v[28:29]
+; GFX942-SDAG-NEXT:    v_mad_u64_u32 v[8:9], s[0:1], v9, v17, v[28:29]
+; GFX942-SDAG-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v18
+; GFX942-SDAG-NEXT:    v_lshl_add_u64 v[8:9], v[8:9], 0, v[10:11]
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v26, vcc
+; GFX942-SDAG-NEXT:    v_mul_lo_u32 v10, v21, v14
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_subb_co_u32_e32 v2, vcc, v2, v8, vcc
+; GFX942-SDAG-NEXT:    v_mul_lo_u32 v11, v20, v15
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_subb_co_u32_e32 v3, vcc, v3, v9, vcc
+; GFX942-SDAG-NEXT:    v_mad_u64_u32 v[8:9], s[0:1], v20, v14, 0
+; GFX942-SDAG-NEXT:    v_add3_u32 v9, v9, v11, v10
+; GFX942-SDAG-NEXT:    v_mad_u64_u32 v[8:9], s[0:1], v22, v12, v[8:9]
+; GFX942-SDAG-NEXT:    v_mul_lo_u32 v10, v22, v13
+; GFX942-SDAG-NEXT:    v_mul_lo_u32 v11, v23, v12
+; GFX942-SDAG-NEXT:    v_add3_u32 v9, v11, v9, v10
+; GFX942-SDAG-NEXT:    v_mad_u64_u32 v[10:11], s[0:1], v12, v20, 0
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v24, v11
+; GFX942-SDAG-NEXT:    v_mad_u64_u32 v[14:15], s[0:1], v13, v20, v[24:25]
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v24, v15
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v15, v25
+; GFX942-SDAG-NEXT:    v_mad_u64_u32 v[14:15], s[0:1], v12, v21, v[14:15]
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v16, v15
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v17, v25
+; GFX942-SDAG-NEXT:    v_lshl_add_u64 v[16:17], v[24:25], 0, v[16:17]
+; GFX942-SDAG-NEXT:    v_mad_u64_u32 v[12:13], s[0:1], v13, v21, v[16:17]
+; GFX942-SDAG-NEXT:    v_sub_co_u32_e32 v4, vcc, v4, v10
+; GFX942-SDAG-NEXT:    v_lshl_add_u64 v[8:9], v[12:13], 0, v[8:9]
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_subb_co_u32_e32 v5, vcc, v5, v14, vcc
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_subb_co_u32_e32 v6, vcc, v6, v8, vcc
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_subb_co_u32_e32 v7, vcc, v7, v9, vcc
+; GFX942-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-GISEL-LABEL: v_urem_v2i128_vv:
+; GFX942-GISEL:       ; %bb.0: ; %_udiv-special-cases_udiv-special-cases
+; GFX942-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v16, v8, v10
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v17, v9, v11
+; GFX942-GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[16:17]
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v16, v0, v2
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v17, v1, v3
+; GFX942-GISEL-NEXT:    v_cmp_eq_u64_e64 s[0:1], 0, v[16:17]
+; GFX942-GISEL-NEXT:    v_ffbh_u32_e32 v17, v8
+; GFX942-GISEL-NEXT:    v_ffbh_u32_e32 v16, v9
+; GFX942-GISEL-NEXT:    v_add_u32_e32 v17, 32, v17
+; GFX942-GISEL-NEXT:    v_ffbh_u32_e32 v18, v10
+; GFX942-GISEL-NEXT:    v_min_u32_e32 v16, v16, v17
+; GFX942-GISEL-NEXT:    v_ffbh_u32_e32 v17, v11
+; GFX942-GISEL-NEXT:    v_add_u32_e32 v18, 32, v18
+; GFX942-GISEL-NEXT:    v_add_u32_e32 v16, 64, v16
+; GFX942-GISEL-NEXT:    v_min_u32_e32 v17, v17, v18
+; GFX942-GISEL-NEXT:    v_cmp_eq_u64_e64 s[2:3], 0, v[10:11]
+; GFX942-GISEL-NEXT:    v_ffbh_u32_e32 v18, v0
+; GFX942-GISEL-NEXT:    v_add_u32_e32 v18, 32, v18
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v16, v17, v16, s[2:3]
+; GFX942-GISEL-NEXT:    v_ffbh_u32_e32 v17, v1
+; GFX942-GISEL-NEXT:    v_ffbh_u32_e32 v19, v2
+; GFX942-GISEL-NEXT:    v_min_u32_e32 v17, v17, v18
+; GFX942-GISEL-NEXT:    v_ffbh_u32_e32 v18, v3
+; GFX942-GISEL-NEXT:    v_add_u32_e32 v19, 32, v19
+; GFX942-GISEL-NEXT:    v_add_u32_e32 v17, 64, v17
+; GFX942-GISEL-NEXT:    v_min_u32_e32 v18, v18, v19
+; GFX942-GISEL-NEXT:    v_cmp_eq_u64_e64 s[2:3], 0, v[2:3]
+; GFX942-GISEL-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-GISEL-NEXT:    s_mov_b64 s[4:5], 0
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v17, v18, v17, s[2:3]
+; GFX942-GISEL-NEXT:    v_sub_co_u32_e64 v20, s[2:3], v16, v17
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[16:17], 0x7f
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_subb_co_u32_e64 v21, s[2:3], 0, 0, s[2:3]
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    v_subb_co_u32_e64 v18, s[2:3], 0, 0, s[2:3]
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    v_subb_co_u32_e64 v19, s[2:3], 0, 0, s[2:3]
+; GFX942-GISEL-NEXT:    v_cmp_gt_u64_e64 s[2:3], v[20:21], v[16:17]
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v23, v21, v19
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v16, 0, 1, s[2:3]
+; GFX942-GISEL-NEXT:    v_cmp_lt_u64_e64 s[2:3], 0, v[18:19]
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v17, 0, 1, s[2:3]
+; GFX942-GISEL-NEXT:    v_cmp_eq_u64_e64 s[2:3], 0, v[18:19]
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v16, v17, v16, s[2:3]
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v17, 0, 1, s[0:1]
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v24, v17, v16
+; GFX942-GISEL-NEXT:    v_xor_b32_e32 v16, 0x7f, v20
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v22, v16, v18
+; GFX942-GISEL-NEXT:    v_and_b32_e32 v16, 1, v24
+; GFX942-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v16
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v16, v0, 0, vcc
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v17, v1, 0, vcc
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v26, v2, 0, vcc
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v27, v3, 0, vcc
+; GFX942-GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[22:23]
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v22, 0, 1, vcc
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v22, v24, v22
+; GFX942-GISEL-NEXT:    v_and_b32_e32 v22, 1, v22
+; GFX942-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v22
+; GFX942-GISEL-NEXT:    s_xor_b64 s[0:1], vcc, -1
+; GFX942-GISEL-NEXT:    s_and_saveexec_b64 s[2:3], s[0:1]
+; GFX942-GISEL-NEXT:    s_cbranch_execz .LBB3_6
+; GFX942-GISEL-NEXT:  ; %bb.1: ; %udiv-bb15
+; GFX942-GISEL-NEXT:    v_add_co_u32_e32 v16, vcc, 1, v20
+; GFX942-GISEL-NEXT:    s_mov_b64 s[6:7], s[4:5]
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e32 v17, vcc, 0, v21, vcc
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e32 v18, vcc, 0, v18, vcc
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e32 v19, vcc, 0, v19, vcc
+; GFX942-GISEL-NEXT:    s_xor_b64 s[0:1], vcc, -1
+; GFX942-GISEL-NEXT:    v_sub_co_u32_e32 v26, vcc, 0x7f, v20
+; GFX942-GISEL-NEXT:    v_sub_u32_e32 v22, 64, v26
+; GFX942-GISEL-NEXT:    v_add_u32_e32 v27, 0xffffffc0, v26
+; GFX942-GISEL-NEXT:    v_lshrrev_b64 v[22:23], v22, v[0:1]
+; GFX942-GISEL-NEXT:    v_lshlrev_b64 v[24:25], v26, v[2:3]
+; GFX942-GISEL-NEXT:    v_lshlrev_b64 v[20:21], v26, v[0:1]
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v28, v22, v24
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v29, v23, v25
+; GFX942-GISEL-NEXT:    v_lshlrev_b64 v[22:23], v27, v[0:1]
+; GFX942-GISEL-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v26
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e32 v24, 0, v20, vcc
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e32 v25, 0, v21, vcc
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e32 v20, v22, v28, vcc
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e32 v21, v23, v29, vcc
+; GFX942-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v26
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e32 v26, v20, v2, vcc
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e32 v27, v21, v3, vcc
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[22:23], s[6:7]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[20:21], s[4:5]
+; GFX942-GISEL-NEXT:    s_and_saveexec_b64 s[4:5], s[0:1]
+; GFX942-GISEL-NEXT:    s_xor_b64 s[8:9], exec, s[4:5]
+; GFX942-GISEL-NEXT:    s_cbranch_execz .LBB3_5
+; GFX942-GISEL-NEXT:  ; %bb.2: ; %udiv-preheader4
+; GFX942-GISEL-NEXT:    v_sub_u32_e32 v28, 64, v16
+; GFX942-GISEL-NEXT:    v_add_u32_e32 v30, 0xffffffc0, v16
+; GFX942-GISEL-NEXT:    v_lshrrev_b64 v[22:23], v16, v[0:1]
+; GFX942-GISEL-NEXT:    v_lshlrev_b64 v[28:29], v28, v[2:3]
+; GFX942-GISEL-NEXT:    v_lshrrev_b64 v[20:21], v16, v[2:3]
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v28, v22, v28
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v29, v23, v29
+; GFX942-GISEL-NEXT:    v_lshrrev_b64 v[22:23], v30, v[2:3]
+; GFX942-GISEL-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v16
+; GFX942-GISEL-NEXT:    s_mov_b64 s[4:5], 0
+; GFX942-GISEL-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v16
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e32 v22, v22, v28, vcc
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e32 v23, v23, v29, vcc
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e32 v32, 0, v20, vcc
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e32 v33, 0, v21, vcc
+; GFX942-GISEL-NEXT:    v_add_co_u32_e32 v34, vcc, -1, v8
+; GFX942-GISEL-NEXT:    s_mov_b64 s[6:7], s[4:5]
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e32 v35, vcc, -1, v9, vcc
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v30, v22, v0, s[0:1]
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v31, v23, v1, s[0:1]
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e32 v36, vcc, -1, v10, vcc
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[22:23], s[6:7]
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e32 v37, vcc, -1, v11, vcc
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v29, 0
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[20:21], s[4:5]
+; GFX942-GISEL-NEXT:  .LBB3_3: ; %udiv-do-while3
+; GFX942-GISEL-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-GISEL-NEXT:    v_lshlrev_b64 v[32:33], 1, v[32:33]
+; GFX942-GISEL-NEXT:    v_lshrrev_b32_e32 v28, 31, v31
+; GFX942-GISEL-NEXT:    v_lshlrev_b64 v[22:23], 1, v[30:31]
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v32, v32, v28
+; GFX942-GISEL-NEXT:    v_lshrrev_b32_e32 v28, 31, v27
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v22, v22, v28
+; GFX942-GISEL-NEXT:    v_lshlrev_b64 v[30:31], 1, v[24:25]
+; GFX942-GISEL-NEXT:    v_lshlrev_b64 v[26:27], 1, v[26:27]
+; GFX942-GISEL-NEXT:    v_lshrrev_b32_e32 v24, 31, v25
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v26, v26, v24
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v24, v20, v30
+; GFX942-GISEL-NEXT:    v_sub_co_u32_e32 v20, vcc, v34, v22
+; GFX942-GISEL-NEXT:    v_add_co_u32_e64 v16, s[0:1], -1, v16
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_subb_co_u32_e32 v20, vcc, v35, v23, vcc
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e64 v17, s[0:1], -1, v17, s[0:1]
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_subb_co_u32_e32 v20, vcc, v36, v32, vcc
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v25, v21, v31
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_subb_co_u32_e32 v20, vcc, v37, v33, vcc
+; GFX942-GISEL-NEXT:    v_ashrrev_i32_e32 v30, 31, v20
+; GFX942-GISEL-NEXT:    v_and_b32_e32 v28, 1, v30
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[20:21], v[28:29]
+; GFX942-GISEL-NEXT:    v_and_b32_e32 v28, v30, v8
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e64 v18, s[0:1], -1, v18, s[0:1]
+; GFX942-GISEL-NEXT:    v_and_b32_e32 v31, v30, v9
+; GFX942-GISEL-NEXT:    v_and_b32_e32 v38, v30, v10
+; GFX942-GISEL-NEXT:    v_and_b32_e32 v39, v30, v11
+; GFX942-GISEL-NEXT:    v_sub_co_u32_e32 v30, vcc, v22, v28
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e64 v19, s[0:1], -1, v19, s[0:1]
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_subb_co_u32_e32 v31, vcc, v23, v31, vcc
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v22, v16, v18
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v23, v17, v19
+; GFX942-GISEL-NEXT:    v_cmp_eq_u64_e64 s[0:1], 0, v[22:23]
+; GFX942-GISEL-NEXT:    v_subb_co_u32_e32 v32, vcc, v32, v38, vcc
+; GFX942-GISEL-NEXT:    s_or_b64 s[4:5], s[0:1], s[4:5]
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_subb_co_u32_e32 v33, vcc, v33, v39, vcc
+; GFX942-GISEL-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX942-GISEL-NEXT:    s_cbranch_execnz .LBB3_3
+; GFX942-GISEL-NEXT:  ; %bb.4: ; %Flow13
+; GFX942-GISEL-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX942-GISEL-NEXT:  .LBB3_5: ; %Flow14
+; GFX942-GISEL-NEXT:    s_or_b64 exec, exec, s[8:9]
+; GFX942-GISEL-NEXT:    v_lshlrev_b64 v[16:17], 1, v[24:25]
+; GFX942-GISEL-NEXT:    v_lshlrev_b64 v[26:27], 1, v[26:27]
+; GFX942-GISEL-NEXT:    v_lshrrev_b32_e32 v18, 31, v25
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v26, v26, v18
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v16, v20, v16
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v17, v21, v17
+; GFX942-GISEL-NEXT:  .LBB3_6: ; %Flow16
+; GFX942-GISEL-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v18, v12, v14
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v19, v13, v15
+; GFX942-GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[18:19]
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v18, v4, v6
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v19, v5, v7
+; GFX942-GISEL-NEXT:    v_cmp_eq_u64_e64 s[0:1], 0, v[18:19]
+; GFX942-GISEL-NEXT:    v_ffbh_u32_e32 v19, v12
+; GFX942-GISEL-NEXT:    v_ffbh_u32_e32 v18, v13
+; GFX942-GISEL-NEXT:    v_add_u32_e32 v19, 32, v19
+; GFX942-GISEL-NEXT:    v_ffbh_u32_e32 v20, v14
+; GFX942-GISEL-NEXT:    v_min_u32_e32 v18, v18, v19
+; GFX942-GISEL-NEXT:    v_ffbh_u32_e32 v19, v15
+; GFX942-GISEL-NEXT:    v_add_u32_e32 v20, 32, v20
+; GFX942-GISEL-NEXT:    v_add_u32_e32 v18, 64, v18
+; GFX942-GISEL-NEXT:    v_min_u32_e32 v19, v19, v20
+; GFX942-GISEL-NEXT:    v_cmp_eq_u64_e64 s[2:3], 0, v[14:15]
+; GFX942-GISEL-NEXT:    v_ffbh_u32_e32 v20, v4
+; GFX942-GISEL-NEXT:    v_add_u32_e32 v20, 32, v20
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v18, v19, v18, s[2:3]
+; GFX942-GISEL-NEXT:    v_ffbh_u32_e32 v19, v5
+; GFX942-GISEL-NEXT:    v_ffbh_u32_e32 v21, v6
+; GFX942-GISEL-NEXT:    v_min_u32_e32 v19, v19, v20
+; GFX942-GISEL-NEXT:    v_ffbh_u32_e32 v20, v7
+; GFX942-GISEL-NEXT:    v_add_u32_e32 v21, 32, v21
+; GFX942-GISEL-NEXT:    v_add_u32_e32 v19, 64, v19
+; GFX942-GISEL-NEXT:    v_min_u32_e32 v20, v20, v21
+; GFX942-GISEL-NEXT:    v_cmp_eq_u64_e64 s[2:3], 0, v[6:7]
+; GFX942-GISEL-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-GISEL-NEXT:    s_mov_b64 s[4:5], 0
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v19, v20, v19, s[2:3]
+; GFX942-GISEL-NEXT:    v_sub_co_u32_e64 v22, s[2:3], v18, v19
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[18:19], 0x7f
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_subb_co_u32_e64 v23, s[2:3], 0, 0, s[2:3]
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    v_subb_co_u32_e64 v24, s[2:3], 0, 0, s[2:3]
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    v_subb_co_u32_e64 v25, s[2:3], 0, 0, s[2:3]
+; GFX942-GISEL-NEXT:    v_cmp_gt_u64_e64 s[2:3], v[22:23], v[18:19]
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v29, v23, v25
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v18, 0, 1, s[2:3]
+; GFX942-GISEL-NEXT:    v_cmp_lt_u64_e64 s[2:3], 0, v[24:25]
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v19, 0, 1, s[2:3]
+; GFX942-GISEL-NEXT:    v_cmp_eq_u64_e64 s[2:3], 0, v[24:25]
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v18, v19, v18, s[2:3]
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v19, 0, 1, s[0:1]
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v30, v19, v18
+; GFX942-GISEL-NEXT:    v_xor_b32_e32 v18, 0x7f, v22
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v28, v18, v24
+; GFX942-GISEL-NEXT:    v_and_b32_e32 v18, 1, v30
+; GFX942-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v18
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v18, v4, 0, vcc
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v19, v5, 0, vcc
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v20, v6, 0, vcc
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v21, v7, 0, vcc
+; GFX942-GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[28:29]
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v28, 0, 1, vcc
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v28, v30, v28
+; GFX942-GISEL-NEXT:    v_and_b32_e32 v28, 1, v28
+; GFX942-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v28
+; GFX942-GISEL-NEXT:    s_xor_b64 s[0:1], vcc, -1
+; GFX942-GISEL-NEXT:    s_and_saveexec_b64 s[2:3], s[0:1]
+; GFX942-GISEL-NEXT:    s_cbranch_execz .LBB3_12
+; GFX942-GISEL-NEXT:  ; %bb.7: ; %udiv-bb1
+; GFX942-GISEL-NEXT:    v_add_co_u32_e32 v18, vcc, 1, v22
+; GFX942-GISEL-NEXT:    s_mov_b64 s[6:7], s[4:5]
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e32 v19, vcc, 0, v23, vcc
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e32 v20, vcc, 0, v24, vcc
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e32 v21, vcc, 0, v25, vcc
+; GFX942-GISEL-NEXT:    s_xor_b64 s[0:1], vcc, -1
+; GFX942-GISEL-NEXT:    v_sub_co_u32_e32 v30, vcc, 0x7f, v22
+; GFX942-GISEL-NEXT:    v_sub_u32_e32 v24, 64, v30
+; GFX942-GISEL-NEXT:    v_add_u32_e32 v31, 0xffffffc0, v30
+; GFX942-GISEL-NEXT:    v_lshrrev_b64 v[24:25], v24, v[4:5]
+; GFX942-GISEL-NEXT:    v_lshlrev_b64 v[28:29], v30, v[6:7]
+; GFX942-GISEL-NEXT:    v_lshlrev_b64 v[22:23], v30, v[4:5]
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v32, v24, v28
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v33, v25, v29
+; GFX942-GISEL-NEXT:    v_lshlrev_b64 v[24:25], v31, v[4:5]
+; GFX942-GISEL-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v30
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e32 v28, 0, v22, vcc
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e32 v29, 0, v23, vcc
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e32 v22, v24, v32, vcc
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e32 v23, v25, v33, vcc
+; GFX942-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v30
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e32 v30, v22, v6, vcc
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e32 v31, v23, v7, vcc
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[24:25], s[6:7]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[22:23], s[4:5]
+; GFX942-GISEL-NEXT:    s_and_saveexec_b64 s[4:5], s[0:1]
+; GFX942-GISEL-NEXT:    s_xor_b64 s[8:9], exec, s[4:5]
+; GFX942-GISEL-NEXT:    s_cbranch_execz .LBB3_11
+; GFX942-GISEL-NEXT:  ; %bb.8: ; %udiv-preheader
+; GFX942-GISEL-NEXT:    v_sub_u32_e32 v32, 64, v18
+; GFX942-GISEL-NEXT:    v_add_u32_e32 v34, 0xffffffc0, v18
+; GFX942-GISEL-NEXT:    v_lshrrev_b64 v[24:25], v18, v[4:5]
+; GFX942-GISEL-NEXT:    v_lshlrev_b64 v[32:33], v32, v[6:7]
+; GFX942-GISEL-NEXT:    v_lshrrev_b64 v[22:23], v18, v[6:7]
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v32, v24, v32
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v33, v25, v33
+; GFX942-GISEL-NEXT:    v_lshrrev_b64 v[24:25], v34, v[6:7]
+; GFX942-GISEL-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v18
+; GFX942-GISEL-NEXT:    s_mov_b64 s[4:5], 0
+; GFX942-GISEL-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v18
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e32 v24, v24, v32, vcc
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e32 v25, v25, v33, vcc
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e32 v36, 0, v22, vcc
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e32 v37, 0, v23, vcc
+; GFX942-GISEL-NEXT:    v_add_co_u32_e32 v38, vcc, -1, v12
+; GFX942-GISEL-NEXT:    s_mov_b64 s[6:7], s[4:5]
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e32 v39, vcc, -1, v13, vcc
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v34, v24, v4, s[0:1]
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v35, v25, v5, s[0:1]
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e32 v48, vcc, -1, v14, vcc
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[24:25], s[6:7]
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e32 v49, vcc, -1, v15, vcc
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v33, 0
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[22:23], s[4:5]
+; GFX942-GISEL-NEXT:  .LBB3_9: ; %udiv-do-while
+; GFX942-GISEL-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-GISEL-NEXT:    v_lshlrev_b64 v[36:37], 1, v[36:37]
+; GFX942-GISEL-NEXT:    v_lshrrev_b32_e32 v32, 31, v35
+; GFX942-GISEL-NEXT:    v_lshlrev_b64 v[24:25], 1, v[34:35]
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v36, v36, v32
+; GFX942-GISEL-NEXT:    v_lshrrev_b32_e32 v32, 31, v31
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v24, v24, v32
+; GFX942-GISEL-NEXT:    v_lshlrev_b64 v[34:35], 1, v[28:29]
+; GFX942-GISEL-NEXT:    v_lshlrev_b64 v[30:31], 1, v[30:31]
+; GFX942-GISEL-NEXT:    v_lshrrev_b32_e32 v28, 31, v29
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v30, v30, v28
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v28, v22, v34
+; GFX942-GISEL-NEXT:    v_sub_co_u32_e32 v22, vcc, v38, v24
+; GFX942-GISEL-NEXT:    v_add_co_u32_e64 v18, s[0:1], -1, v18
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_subb_co_u32_e32 v22, vcc, v39, v25, vcc
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e64 v19, s[0:1], -1, v19, s[0:1]
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_subb_co_u32_e32 v22, vcc, v48, v36, vcc
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v29, v23, v35
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_subb_co_u32_e32 v22, vcc, v49, v37, vcc
+; GFX942-GISEL-NEXT:    v_ashrrev_i32_e32 v34, 31, v22
+; GFX942-GISEL-NEXT:    v_and_b32_e32 v32, 1, v34
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[22:23], v[32:33]
+; GFX942-GISEL-NEXT:    v_and_b32_e32 v32, v34, v12
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e64 v20, s[0:1], -1, v20, s[0:1]
+; GFX942-GISEL-NEXT:    v_and_b32_e32 v35, v34, v13
+; GFX942-GISEL-NEXT:    v_and_b32_e32 v50, v34, v14
+; GFX942-GISEL-NEXT:    v_and_b32_e32 v51, v34, v15
+; GFX942-GISEL-NEXT:    v_sub_co_u32_e32 v34, vcc, v24, v32
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e64 v21, s[0:1], -1, v21, s[0:1]
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_subb_co_u32_e32 v35, vcc, v25, v35, vcc
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v24, v18, v20
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v25, v19, v21
+; GFX942-GISEL-NEXT:    v_cmp_eq_u64_e64 s[0:1], 0, v[24:25]
+; GFX942-GISEL-NEXT:    v_subb_co_u32_e32 v36, vcc, v36, v50, vcc
+; GFX942-GISEL-NEXT:    s_or_b64 s[4:5], s[0:1], s[4:5]
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_subb_co_u32_e32 v37, vcc, v37, v51, vcc
+; GFX942-GISEL-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX942-GISEL-NEXT:    s_cbranch_execnz .LBB3_9
+; GFX942-GISEL-NEXT:  ; %bb.10: ; %Flow
+; GFX942-GISEL-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX942-GISEL-NEXT:  .LBB3_11: ; %Flow11
+; GFX942-GISEL-NEXT:    s_or_b64 exec, exec, s[8:9]
+; GFX942-GISEL-NEXT:    v_lshlrev_b64 v[18:19], 1, v[28:29]
+; GFX942-GISEL-NEXT:    v_lshlrev_b64 v[20:21], 1, v[30:31]
+; GFX942-GISEL-NEXT:    v_lshrrev_b32_e32 v24, 31, v29
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v20, v20, v24
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v18, v22, v18
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v19, v23, v19
+; GFX942-GISEL-NEXT:  .LBB3_12: ; %Flow12
+; GFX942-GISEL-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX942-GISEL-NEXT:    v_mad_u64_u32 v[24:25], s[0:1], v8, v26, 0
+; GFX942-GISEL-NEXT:    v_mad_u64_u32 v[28:29], s[0:1], v8, v17, 0
+; GFX942-GISEL-NEXT:    v_mad_u64_u32 v[22:23], s[0:1], v8, v16, 0
+; GFX942-GISEL-NEXT:    v_mad_u64_u32 v[24:25], s[0:1], v9, v17, v[24:25]
+; GFX942-GISEL-NEXT:    v_mul_lo_u32 v27, v8, v27
+; GFX942-GISEL-NEXT:    v_mul_lo_u32 v26, v9, v26
+; GFX942-GISEL-NEXT:    v_mad_u64_u32 v[8:9], vcc, v9, v16, v[28:29]
+; GFX942-GISEL-NEXT:    v_mad_u64_u32 v[24:25], s[0:1], v10, v16, v[24:25]
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e32 v26, vcc, v27, v26, vcc
+; GFX942-GISEL-NEXT:    v_add_co_u32_e32 v23, vcc, v23, v8
+; GFX942-GISEL-NEXT:    v_mul_lo_u32 v21, v12, v21
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e32 v24, vcc, v24, v9, vcc
+; GFX942-GISEL-NEXT:    v_mad_u64_u32 v[8:9], s[0:1], v10, v17, v[26:27]
+; GFX942-GISEL-NEXT:    v_mad_u64_u32 v[8:9], s[0:1], v11, v16, v[8:9]
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e32 v8, vcc, v25, v8, vcc
+; GFX942-GISEL-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v22
+; GFX942-GISEL-NEXT:    v_mad_u64_u32 v[10:11], s[0:1], v12, v20, 0
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v23, vcc
+; GFX942-GISEL-NEXT:    v_mad_u64_u32 v[16:17], s[0:1], v12, v19, 0
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_subb_co_u32_e32 v2, vcc, v2, v24, vcc
+; GFX942-GISEL-NEXT:    v_mad_u64_u32 v[10:11], s[0:1], v13, v19, v[10:11]
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_subb_co_u32_e32 v3, vcc, v3, v8, vcc
+; GFX942-GISEL-NEXT:    v_mad_u64_u32 v[8:9], s[0:1], v12, v18, 0
+; GFX942-GISEL-NEXT:    v_mul_lo_u32 v20, v13, v20
+; GFX942-GISEL-NEXT:    v_mad_u64_u32 v[12:13], vcc, v13, v18, v[16:17]
+; GFX942-GISEL-NEXT:    v_mad_u64_u32 v[10:11], s[0:1], v14, v18, v[10:11]
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e32 v16, vcc, v21, v20, vcc
+; GFX942-GISEL-NEXT:    v_add_co_u32_e32 v9, vcc, v9, v12
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e32 v10, vcc, v10, v13, vcc
+; GFX942-GISEL-NEXT:    v_mad_u64_u32 v[12:13], s[0:1], v14, v19, v[16:17]
+; GFX942-GISEL-NEXT:    v_mad_u64_u32 v[12:13], s[0:1], v15, v18, v[12:13]
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e32 v11, vcc, v11, v12, vcc
+; GFX942-GISEL-NEXT:    v_sub_co_u32_e32 v4, vcc, v4, v8
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    v_subb_co_u32_e32 v5, vcc, v5, v9, vcc
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    v_subb_co_u32_e32 v6, vcc, v6, v10, vcc
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    v_subb_co_u32_e32 v7, vcc, v7, v11, vcc
+; GFX942-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %shl = urem <2 x i128> %lhs, %rhs
   ret <2 x i128> %shl
 }
diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll
index c69b0cce3d208..5fcdf741e3295 100644
--- a/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll
+++ b/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc -mtriple=amdgcn -mcpu=fiji < %s | FileCheck -check-prefixes=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck -check-prefixes=GFX942 %s
 
 define amdgpu_kernel void @float4_extelt(ptr addrspace(1) %out, i32 %sel) {
 ; GCN-LABEL: float4_extelt:
@@ -20,6 +21,24 @@ define amdgpu_kernel void @float4_extelt(ptr addrspace(1) %out, i32 %sel) {
 ; GCN-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN-NEXT:    flat_store_dword v[0:1], v2
 ; GCN-NEXT:    s_endpgm
+;
+; GFX942-LABEL: float4_extelt:
+; GFX942:       ; %bb.0: ; %entry
+; GFX942-NEXT:    s_load_dword s6, s[4:5], 0x2c
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_cmp_eq_u32 s6, 1
+; GFX942-NEXT:    s_cselect_b64 s[2:3], -1, 0
+; GFX942-NEXT:    s_cmp_lg_u32 s6, 2
+; GFX942-NEXT:    v_cndmask_b32_e64 v1, 0, 1.0, s[2:3]
+; GFX942-NEXT:    s_cselect_b64 vcc, -1, 0
+; GFX942-NEXT:    s_cmp_lg_u32 s6, 3
+; GFX942-NEXT:    v_cndmask_b32_e32 v1, 2.0, v1, vcc
+; GFX942-NEXT:    s_cselect_b64 vcc, -1, 0
+; GFX942-NEXT:    v_cndmask_b32_e32 v1, 4.0, v1, vcc
+; GFX942-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX942-NEXT:    s_endpgm
 entry:
   %ext = extractelement <4 x float> <float 0.0, float 1.0, float 2.0, float 4.0>, i32 %sel
   store float %ext, ptr addrspace(1) %out
@@ -43,6 +62,22 @@ define amdgpu_kernel void @int4_extelt(ptr addrspace(1) %out, i32 %sel) {
 ; GCN-NEXT:    v_mov_b32_e32 v2, s2
 ; GCN-NEXT:    flat_store_dword v[0:1], v2
 ; GCN-NEXT:    s_endpgm
+;
+; GFX942-LABEL: int4_extelt:
+; GFX942:       ; %bb.0: ; %entry
+; GFX942-NEXT:    s_load_dword s2, s[4:5], 0x2c
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_cmp_eq_u32 s2, 1
+; GFX942-NEXT:    s_cselect_b32 s3, 1, 0
+; GFX942-NEXT:    s_cmp_lg_u32 s2, 2
+; GFX942-NEXT:    s_cselect_b32 s3, s3, 2
+; GFX942-NEXT:    s_cmp_lg_u32 s2, 3
+; GFX942-NEXT:    s_cselect_b32 s2, s3, 4
+; GFX942-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX942-NEXT:    s_endpgm
 entry:
   %ext = extractelement <4 x i32> <i32 0, i32 1, i32 2, i32 4>, i32 %sel
   store i32 %ext, ptr addrspace(1) %out
@@ -72,6 +107,28 @@ define amdgpu_kernel void @double4_extelt(ptr addrspace(1) %out, i32 %sel) {
 ; GCN-NEXT:    v_mov_b32_e32 v2, s0
 ; GCN-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN-NEXT:    s_endpgm
+;
+; GFX942-LABEL: double4_extelt:
+; GFX942:       ; %bb.0: ; %entry
+; GFX942-NEXT:    s_load_dword s2, s[4:5], 0x2c
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    s_mov_b32 s3, 0x3ff028f5
+; GFX942-NEXT:    s_mov_b32 s4, 0xc28f5c29
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_cmp_eq_u32 s2, 1
+; GFX942-NEXT:    s_cselect_b32 s3, s3, 0x3f847ae1
+; GFX942-NEXT:    s_cselect_b32 s4, s4, 0x47ae147b
+; GFX942-NEXT:    s_cmp_eq_u32 s2, 2
+; GFX942-NEXT:    s_cselect_b32 s4, 0xe147ae14, s4
+; GFX942-NEXT:    s_cselect_b32 s3, 0x4000147a, s3
+; GFX942-NEXT:    s_cmp_eq_u32 s2, 3
+; GFX942-NEXT:    s_cselect_b32 s2, 0x40100a3d, s3
+; GFX942-NEXT:    s_cselect_b32 s3, 0x70a3d70a, s4
+; GFX942-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-NEXT:    v_mov_b32_e32 v3, s2
+; GFX942-NEXT:    global_store_dwordx2 v0, v[2:3], s[0:1]
+; GFX942-NEXT:    s_endpgm
 entry:
   %ext = extractelement <4 x double> <double 0.01, double 1.01, double 2.01, double 4.01>, i32 %sel
   store double %ext, ptr addrspace(1) %out
@@ -109,6 +166,36 @@ define amdgpu_kernel void @double5_extelt(ptr addrspace(1) %out, i32 %sel) {
 ; GCN-NEXT:    v_mov_b32_e32 v2, s0
 ; GCN-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN-NEXT:    s_endpgm
+;
+; GFX942-LABEL: double5_extelt:
+; GFX942:       ; %bb.0: ; %entry
+; GFX942-NEXT:    s_load_dword s6, s[4:5], 0x2c
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    s_mov_b32 s2, 0x3ff028f5
+; GFX942-NEXT:    s_mov_b32 s3, 0xc28f5c29
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_cmp_eq_u32 s6, 1
+; GFX942-NEXT:    s_cselect_b32 s2, s2, 0x3f847ae1
+; GFX942-NEXT:    s_cselect_b32 s3, s3, 0x47ae147b
+; GFX942-NEXT:    s_cmp_eq_u32 s6, 2
+; GFX942-NEXT:    s_cselect_b32 s8, 0xe147ae14, s3
+; GFX942-NEXT:    s_cselect_b32 s7, 0x4000147a, s2
+; GFX942-NEXT:    s_cmp_eq_u32 s6, 3
+; GFX942-NEXT:    s_cselect_b64 s[2:3], -1, 0
+; GFX942-NEXT:    s_and_b64 s[4:5], s[2:3], exec
+; GFX942-NEXT:    s_cselect_b32 s9, 0x40100a3d, s7
+; GFX942-NEXT:    s_cmp_eq_u32 s6, 4
+; GFX942-NEXT:    s_cselect_b64 s[4:5], -1, 0
+; GFX942-NEXT:    s_and_b64 s[6:7], s[4:5], exec
+; GFX942-NEXT:    s_cselect_b32 s6, 0x40140a3d, s9
+; GFX942-NEXT:    s_or_b64 s[2:3], s[4:5], s[2:3]
+; GFX942-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GFX942-NEXT:    s_cselect_b32 s2, 0x70a3d70a, s8
+; GFX942-NEXT:    v_mov_b32_e32 v2, s2
+; GFX942-NEXT:    v_mov_b32_e32 v3, s6
+; GFX942-NEXT:    global_store_dwordx2 v0, v[2:3], s[0:1]
+; GFX942-NEXT:    s_endpgm
 entry:
   %ext = extractelement <5 x double> <double 0.01, double 1.01, double 2.01, double 4.01, double 5.01>, i32 %sel
   store double %ext, ptr addrspace(1) %out
@@ -130,6 +217,20 @@ define amdgpu_kernel void @half4_extelt(ptr addrspace(1) %out, i32 %sel) {
 ; GCN-NEXT:    v_mov_b32_e32 v2, s2
 ; GCN-NEXT:    flat_store_short v[0:1], v2
 ; GCN-NEXT:    s_endpgm
+;
+; GFX942-LABEL: half4_extelt:
+; GFX942:       ; %bb.0: ; %entry
+; GFX942-NEXT:    s_load_dword s6, s[4:5], 0x2c
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    s_mov_b32 s2, 0x40003c00
+; GFX942-NEXT:    s_mov_b32 s3, 0x44004200
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_lshl_b32 s4, s6, 4
+; GFX942-NEXT:    s_lshr_b64 s[2:3], s[2:3], s4
+; GFX942-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX942-NEXT:    s_endpgm
 entry:
   %ext = extractelement <4 x half> <half 1.0, half 2.0, half 3.0, half 4.0>, i32 %sel
   store half %ext, ptr addrspace(1) %out
@@ -149,6 +250,18 @@ define amdgpu_kernel void @float2_extelt(ptr addrspace(1) %out, i32 %sel) {
 ; GCN-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN-NEXT:    flat_store_dword v[0:1], v2
 ; GCN-NEXT:    s_endpgm
+;
+; GFX942-LABEL: float2_extelt:
+; GFX942:       ; %bb.0: ; %entry
+; GFX942-NEXT:    s_load_dword s2, s[4:5], 0x2c
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_cmp_eq_u32 s2, 1
+; GFX942-NEXT:    s_cselect_b64 s[2:3], -1, 0
+; GFX942-NEXT:    v_cndmask_b32_e64 v1, 0, 1.0, s[2:3]
+; GFX942-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX942-NEXT:    s_endpgm
 entry:
   %ext = extractelement <2 x float> <float 0.0, float 1.0>, i32 %sel
   store float %ext, ptr addrspace(1) %out
@@ -172,6 +285,22 @@ define amdgpu_kernel void @double2_extelt(ptr addrspace(1) %out, i32 %sel) {
 ; GCN-NEXT:    v_mov_b32_e32 v2, s0
 ; GCN-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GCN-NEXT:    s_endpgm
+;
+; GFX942-LABEL: double2_extelt:
+; GFX942:       ; %bb.0: ; %entry
+; GFX942-NEXT:    s_load_dword s2, s[4:5], 0x2c
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    s_mov_b32 s3, 0x3ff028f5
+; GFX942-NEXT:    s_mov_b32 s4, 0xc28f5c29
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_cmp_eq_u32 s2, 1
+; GFX942-NEXT:    s_cselect_b32 s2, s3, 0x3f847ae1
+; GFX942-NEXT:    s_cselect_b32 s3, s4, 0x47ae147b
+; GFX942-NEXT:    v_mov_b32_e32 v2, s3
+; GFX942-NEXT:    v_mov_b32_e32 v3, s2
+; GFX942-NEXT:    global_store_dwordx2 v0, v[2:3], s[0:1]
+; GFX942-NEXT:    s_endpgm
 entry:
   %ext = extractelement <2 x double> <double 0.01, double 1.01>, i32 %sel
   store double %ext, ptr addrspace(1) %out
@@ -217,6 +346,44 @@ define amdgpu_kernel void @half8_extelt(ptr addrspace(1) %out, i32 %sel) {
 ; GCN-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN-NEXT:    flat_store_short v[0:1], v2
 ; GCN-NEXT:    s_endpgm
+;
+; GFX942-LABEL: half8_extelt:
+; GFX942:       ; %bb.0: ; %entry
+; GFX942-NEXT:    s_load_dword s2, s[4:5], 0x2c
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v1, 0x3c00
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0x4000
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_cmp_eq_u32 s2, 1
+; GFX942-NEXT:    s_cselect_b64 vcc, -1, 0
+; GFX942-NEXT:    s_cmp_lg_u32 s2, 2
+; GFX942-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0x4200
+; GFX942-NEXT:    s_cselect_b64 vcc, -1, 0
+; GFX942-NEXT:    s_cmp_lg_u32 s2, 3
+; GFX942-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0x4400
+; GFX942-NEXT:    s_cselect_b64 vcc, -1, 0
+; GFX942-NEXT:    s_cmp_lg_u32 s2, 4
+; GFX942-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0x4500
+; GFX942-NEXT:    s_cselect_b64 vcc, -1, 0
+; GFX942-NEXT:    s_cmp_lg_u32 s2, 5
+; GFX942-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0x4600
+; GFX942-NEXT:    s_cselect_b64 vcc, -1, 0
+; GFX942-NEXT:    s_cmp_lg_u32 s2, 6
+; GFX942-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0x4700
+; GFX942-NEXT:    s_cselect_b64 vcc, -1, 0
+; GFX942-NEXT:    s_cmp_lg_u32 s2, 7
+; GFX942-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0x4800
+; GFX942-NEXT:    s_cselect_b64 vcc, -1, 0
+; GFX942-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX942-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX942-NEXT:    s_endpgm
 entry:
   %ext = extractelement <8 x half> <half 1.0, half 2.0, half 3.0, half 4.0, half 5.0, half 6.0, half 7.0, half 8.0>, i32 %sel
   store half %ext, ptr addrspace(1) %out
@@ -248,6 +415,30 @@ define amdgpu_kernel void @short8_extelt(ptr addrspace(1) %out, i32 %sel) {
 ; GCN-NEXT:    v_mov_b32_e32 v2, s2
 ; GCN-NEXT:    flat_store_short v[0:1], v2
 ; GCN-NEXT:    s_endpgm
+;
+; GFX942-LABEL: short8_extelt:
+; GFX942:       ; %bb.0: ; %entry
+; GFX942-NEXT:    s_load_dword s2, s[4:5], 0x2c
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_cmp_eq_u32 s2, 1
+; GFX942-NEXT:    s_cselect_b32 s3, 2, 1
+; GFX942-NEXT:    s_cmp_lg_u32 s2, 2
+; GFX942-NEXT:    s_cselect_b32 s3, s3, 3
+; GFX942-NEXT:    s_cmp_lg_u32 s2, 3
+; GFX942-NEXT:    s_cselect_b32 s3, s3, 4
+; GFX942-NEXT:    s_cmp_lg_u32 s2, 4
+; GFX942-NEXT:    s_cselect_b32 s3, s3, 5
+; GFX942-NEXT:    s_cmp_lg_u32 s2, 5
+; GFX942-NEXT:    s_cselect_b32 s3, s3, 6
+; GFX942-NEXT:    s_cmp_lg_u32 s2, 6
+; GFX942-NEXT:    s_cselect_b32 s3, s3, 7
+; GFX942-NEXT:    s_cmp_lg_u32 s2, 7
+; GFX942-NEXT:    s_cselect_b32 s2, s3, 8
+; GFX942-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX942-NEXT:    s_endpgm
 entry:
   %ext = extractelement <8 x i16> <i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8>, i32 %sel
   store i16 %ext, ptr addrspace(1) %out
@@ -274,6 +465,41 @@ define amdgpu_kernel void @float8_extelt(ptr addrspace(1) %out, i32 %sel) {
 ; GCN-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN-NEXT:    flat_store_dword v[0:1], v2
 ; GCN-NEXT:    s_endpgm
+;
+; GFX942-LABEL: float8_extelt:
+; GFX942:       ; %bb.0: ; %entry
+; GFX942-NEXT:    s_load_dword s6, s[4:5], 0x2c
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0x40400000
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_cmp_eq_u32 s6, 1
+; GFX942-NEXT:    s_cselect_b64 s[2:3], -1, 0
+; GFX942-NEXT:    s_cmp_lg_u32 s6, 2
+; GFX942-NEXT:    v_cndmask_b32_e64 v1, 1.0, 2.0, s[2:3]
+; GFX942-NEXT:    s_cselect_b64 vcc, -1, 0
+; GFX942-NEXT:    s_cmp_lg_u32 s6, 3
+; GFX942-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX942-NEXT:    s_cselect_b64 vcc, -1, 0
+; GFX942-NEXT:    s_cmp_lg_u32 s6, 4
+; GFX942-NEXT:    v_cndmask_b32_e32 v1, 4.0, v1, vcc
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0x40a00000
+; GFX942-NEXT:    s_cselect_b64 vcc, -1, 0
+; GFX942-NEXT:    s_cmp_lg_u32 s6, 5
+; GFX942-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0x40c00000
+; GFX942-NEXT:    s_cselect_b64 vcc, -1, 0
+; GFX942-NEXT:    s_cmp_lg_u32 s6, 6
+; GFX942-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0x40e00000
+; GFX942-NEXT:    s_cselect_b64 vcc, -1, 0
+; GFX942-NEXT:    s_cmp_lg_u32 s6, 7
+; GFX942-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0x41000000
+; GFX942-NEXT:    s_cselect_b64 vcc, -1, 0
+; GFX942-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX942-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX942-NEXT:    s_endpgm
 entry:
   %ext = extractelement <8 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0>, i32 %sel
   store float %ext, ptr addrspace(1) %out
@@ -325,6 +551,44 @@ define amdgpu_kernel void @double8_extelt(ptr addrspace(1) %out, i32 %sel) {
 ; GCN-NEXT:    v_mov_b32_e32 v1, s17
 ; GCN-NEXT:    flat_store_dwordx2 v[0:1], v[15:16]
 ; GCN-NEXT:    s_endpgm
+;
+; GFX942-LABEL: double8_extelt:
+; GFX942:       ; %bb.0: ; %entry
+; GFX942-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x24
+; GFX942-NEXT:    s_load_dword s18, s[4:5], 0x2c
+; GFX942-NEXT:    s_mov_b32 s1, 0x3ff00000
+; GFX942-NEXT:    s_mov_b32 s0, 0
+; GFX942-NEXT:    s_mov_b32 s15, 0x40200000
+; GFX942-NEXT:    s_mov_b32 s13, 0x401c0000
+; GFX942-NEXT:    s_mov_b32 s11, 0x40180000
+; GFX942-NEXT:    s_mov_b32 s9, 0x40140000
+; GFX942-NEXT:    s_mov_b32 s7, 0x40100000
+; GFX942-NEXT:    s_mov_b32 s5, 0x40080000
+; GFX942-NEXT:    s_mov_b32 s3, 2.0
+; GFX942-NEXT:    s_mov_b32 s2, s0
+; GFX942-NEXT:    s_mov_b32 s4, s0
+; GFX942-NEXT:    s_mov_b32 s6, s0
+; GFX942-NEXT:    s_mov_b32 s8, s0
+; GFX942-NEXT:    s_mov_b32 s10, s0
+; GFX942-NEXT:    s_mov_b32 s12, s0
+; GFX942-NEXT:    s_mov_b32 s14, s0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_lshl_b32 s18, s18, 1
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-NEXT:    v_mov_b32_e32 v18, 0
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], s[4:5]
+; GFX942-NEXT:    v_mov_b64_e32 v[6:7], s[6:7]
+; GFX942-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
+; GFX942-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
+; GFX942-NEXT:    v_mov_b64_e32 v[12:13], s[12:13]
+; GFX942-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
+; GFX942-NEXT:    s_set_gpr_idx_on s18, gpr_idx(SRC0)
+; GFX942-NEXT:    v_mov_b32_e32 v17, v1
+; GFX942-NEXT:    v_mov_b32_e32 v16, v0
+; GFX942-NEXT:    s_set_gpr_idx_off
+; GFX942-NEXT:    global_store_dwordx2 v18, v[16:17], s[16:17]
+; GFX942-NEXT:    s_endpgm
 entry:
   %ext = extractelement <8 x double> <double 1.0, double 2.0, double 3.0, double 4.0, double 5.0, double 6.0, double 7.0, double 8.0>, i32 %sel
   store double %ext, ptr addrspace(1) %out
@@ -374,6 +638,42 @@ define amdgpu_kernel void @double7_extelt(ptr addrspace(1) %out, i32 %sel) {
 ; GCN-NEXT:    v_mov_b32_e32 v1, s15
 ; GCN-NEXT:    flat_store_dwordx2 v[0:1], v[15:16]
 ; GCN-NEXT:    s_endpgm
+;
+; GFX942-LABEL: double7_extelt:
+; GFX942:       ; %bb.0: ; %entry
+; GFX942-NEXT:    s_load_dwordx2 s[14:15], s[4:5], 0x24
+; GFX942-NEXT:    s_load_dword s16, s[4:5], 0x2c
+; GFX942-NEXT:    s_mov_b32 s1, 0x3ff00000
+; GFX942-NEXT:    s_mov_b32 s0, 0
+; GFX942-NEXT:    s_mov_b32 s13, 0x401c0000
+; GFX942-NEXT:    s_mov_b32 s11, 0x40180000
+; GFX942-NEXT:    s_mov_b32 s9, 0x40140000
+; GFX942-NEXT:    s_mov_b32 s7, 0x40100000
+; GFX942-NEXT:    s_mov_b32 s5, 0x40080000
+; GFX942-NEXT:    s_mov_b32 s3, 2.0
+; GFX942-NEXT:    s_mov_b32 s2, s0
+; GFX942-NEXT:    s_mov_b32 s4, s0
+; GFX942-NEXT:    s_mov_b32 s6, s0
+; GFX942-NEXT:    s_mov_b32 s8, s0
+; GFX942-NEXT:    s_mov_b32 s10, s0
+; GFX942-NEXT:    s_mov_b32 s12, s0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_lshl_b32 s16, s16, 1
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-NEXT:    v_mov_b32_e32 v18, 0
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], s[4:5]
+; GFX942-NEXT:    v_mov_b64_e32 v[6:7], s[6:7]
+; GFX942-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
+; GFX942-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
+; GFX942-NEXT:    v_mov_b64_e32 v[12:13], s[12:13]
+; GFX942-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
+; GFX942-NEXT:    s_set_gpr_idx_on s16, gpr_idx(SRC0)
+; GFX942-NEXT:    v_mov_b32_e32 v17, v1
+; GFX942-NEXT:    v_mov_b32_e32 v16, v0
+; GFX942-NEXT:    s_set_gpr_idx_off
+; GFX942-NEXT:    global_store_dwordx2 v18, v[16:17], s[14:15]
+; GFX942-NEXT:    s_endpgm
 entry:
   %ext = extractelement <7 x double> <double 1.0, double 2.0, double 3.0, double 4.0, double 5.0, double 6.0, double 7.0>, i32 %sel
   store double %ext, ptr addrspace(1) %out
@@ -408,6 +708,34 @@ define amdgpu_kernel void @float16_extelt(ptr addrspace(1) %out, i32 %sel) {
 ; GCN-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN-NEXT:    flat_store_dword v[0:1], v2
 ; GCN-NEXT:    s_endpgm
+;
+; GFX942-LABEL: float16_extelt:
+; GFX942:       ; %bb.0: ; %entry
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    s_load_dword s2, s[4:5], 0x2c
+; GFX942-NEXT:    v_mov_b32_e32 v16, 0
+; GFX942-NEXT:    v_mov_b32_e32 v0, 1.0
+; GFX942-NEXT:    v_mov_b32_e32 v1, 2.0
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0x40400000
+; GFX942-NEXT:    v_mov_b32_e32 v3, 4.0
+; GFX942-NEXT:    v_mov_b32_e32 v4, 0x40a00000
+; GFX942-NEXT:    v_mov_b32_e32 v5, 0x40c00000
+; GFX942-NEXT:    v_mov_b32_e32 v6, 0x40e00000
+; GFX942-NEXT:    v_mov_b32_e32 v7, 0x41000000
+; GFX942-NEXT:    v_mov_b32_e32 v8, 0x41100000
+; GFX942-NEXT:    v_mov_b32_e32 v9, 0x41200000
+; GFX942-NEXT:    v_mov_b32_e32 v10, 0x41300000
+; GFX942-NEXT:    v_mov_b32_e32 v11, 0x41400000
+; GFX942-NEXT:    v_mov_b32_e32 v12, 0x41500000
+; GFX942-NEXT:    v_mov_b32_e32 v13, 0x41600000
+; GFX942-NEXT:    v_mov_b32_e32 v14, 0x41700000
+; GFX942-NEXT:    v_mov_b32_e32 v15, 0x41800000
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_set_gpr_idx_on s2, gpr_idx(SRC0)
+; GFX942-NEXT:    v_mov_b32_e32 v0, v0
+; GFX942-NEXT:    s_set_gpr_idx_off
+; GFX942-NEXT:    global_store_dword v16, v0, s[0:1]
+; GFX942-NEXT:    s_endpgm
 entry:
   %ext = extractelement <16 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0, float 16.0>, i32 %sel
   store float %ext, ptr addrspace(1) %out
@@ -489,6 +817,66 @@ define amdgpu_kernel void @double15_extelt(ptr addrspace(1) %out, i32 %sel) {
 ; GCN-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN-NEXT:    flat_store_dwordx2 v[0:1], v[31:32]
 ; GCN-NEXT:    s_endpgm
+;
+; GFX942-LABEL: double15_extelt:
+; GFX942:       ; %bb.0: ; %entry
+; GFX942-NEXT:    s_load_dwordx2 s[30:31], s[4:5], 0x24
+; GFX942-NEXT:    s_load_dword s33, s[4:5], 0x2c
+; GFX942-NEXT:    s_mov_b32 s1, 0x3ff00000
+; GFX942-NEXT:    s_mov_b32 s0, 0
+; GFX942-NEXT:    s_mov_b32 s29, 0x402e0000
+; GFX942-NEXT:    s_mov_b32 s27, 0x402c0000
+; GFX942-NEXT:    s_mov_b32 s25, 0x402a0000
+; GFX942-NEXT:    s_mov_b32 s23, 0x40280000
+; GFX942-NEXT:    s_mov_b32 s21, 0x40260000
+; GFX942-NEXT:    s_mov_b32 s19, 0x40240000
+; GFX942-NEXT:    s_mov_b32 s17, 0x40220000
+; GFX942-NEXT:    s_mov_b32 s15, 0x40200000
+; GFX942-NEXT:    s_mov_b32 s13, 0x401c0000
+; GFX942-NEXT:    s_mov_b32 s11, 0x40180000
+; GFX942-NEXT:    s_mov_b32 s9, 0x40140000
+; GFX942-NEXT:    s_mov_b32 s7, 0x40100000
+; GFX942-NEXT:    s_mov_b32 s5, 0x40080000
+; GFX942-NEXT:    s_mov_b32 s3, 2.0
+; GFX942-NEXT:    s_mov_b32 s2, s0
+; GFX942-NEXT:    s_mov_b32 s4, s0
+; GFX942-NEXT:    s_mov_b32 s6, s0
+; GFX942-NEXT:    s_mov_b32 s8, s0
+; GFX942-NEXT:    s_mov_b32 s10, s0
+; GFX942-NEXT:    s_mov_b32 s12, s0
+; GFX942-NEXT:    s_mov_b32 s14, s0
+; GFX942-NEXT:    s_mov_b32 s16, s0
+; GFX942-NEXT:    s_mov_b32 s18, s0
+; GFX942-NEXT:    s_mov_b32 s20, s0
+; GFX942-NEXT:    s_mov_b32 s22, s0
+; GFX942-NEXT:    s_mov_b32 s24, s0
+; GFX942-NEXT:    s_mov_b32 s26, s0
+; GFX942-NEXT:    s_mov_b32 s28, s0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_lshl_b32 s33, s33, 1
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-NEXT:    v_mov_b32_e32 v34, 0
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], s[4:5]
+; GFX942-NEXT:    v_mov_b64_e32 v[6:7], s[6:7]
+; GFX942-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
+; GFX942-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
+; GFX942-NEXT:    v_mov_b64_e32 v[12:13], s[12:13]
+; GFX942-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
+; GFX942-NEXT:    v_mov_b64_e32 v[16:17], s[16:17]
+; GFX942-NEXT:    v_mov_b64_e32 v[18:19], s[18:19]
+; GFX942-NEXT:    v_mov_b64_e32 v[20:21], s[20:21]
+; GFX942-NEXT:    v_mov_b64_e32 v[22:23], s[22:23]
+; GFX942-NEXT:    v_mov_b64_e32 v[24:25], s[24:25]
+; GFX942-NEXT:    v_mov_b64_e32 v[26:27], s[26:27]
+; GFX942-NEXT:    v_mov_b64_e32 v[28:29], s[28:29]
+; GFX942-NEXT:    v_mov_b64_e32 v[30:31], s[30:31]
+; GFX942-NEXT:    s_set_gpr_idx_on s33, gpr_idx(SRC0)
+; GFX942-NEXT:    v_mov_b32_e32 v33, v1
+; GFX942-NEXT:    v_mov_b32_e32 v32, v0
+; GFX942-NEXT:    s_set_gpr_idx_off
+; GFX942-NEXT:    global_store_dwordx2 v34, v[32:33], s[30:31]
+; GFX942-NEXT:    s_endpgm
 entry:
   %ext = extractelement <15 x double> <double 1.0, double 2.0, double 3.0, double 4.0, double 5.0, double 6.0, double 7.0, double 8.0, double 9.0, double 10.0, double 11.0, double 12.0, double 13.0, double 14.0, double 15.0>, i32 %sel
   store double %ext, ptr addrspace(1) %out
@@ -572,6 +960,68 @@ define amdgpu_kernel void @double16_extelt(ptr addrspace(1) %out, i32 %sel) {
 ; GCN-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN-NEXT:    flat_store_dwordx2 v[0:1], v[31:32]
 ; GCN-NEXT:    s_endpgm
+;
+; GFX942-LABEL: double16_extelt:
+; GFX942:       ; %bb.0: ; %entry
+; GFX942-NEXT:    s_load_dwordx2 s[34:35], s[4:5], 0x24
+; GFX942-NEXT:    s_load_dword s33, s[4:5], 0x2c
+; GFX942-NEXT:    s_mov_b32 s1, 0x3ff00000
+; GFX942-NEXT:    s_mov_b32 s0, 0
+; GFX942-NEXT:    s_mov_b32 s31, 0x40300000
+; GFX942-NEXT:    s_mov_b32 s29, 0x402e0000
+; GFX942-NEXT:    s_mov_b32 s27, 0x402c0000
+; GFX942-NEXT:    s_mov_b32 s25, 0x402a0000
+; GFX942-NEXT:    s_mov_b32 s23, 0x40280000
+; GFX942-NEXT:    s_mov_b32 s21, 0x40260000
+; GFX942-NEXT:    s_mov_b32 s19, 0x40240000
+; GFX942-NEXT:    s_mov_b32 s17, 0x40220000
+; GFX942-NEXT:    s_mov_b32 s15, 0x40200000
+; GFX942-NEXT:    s_mov_b32 s13, 0x401c0000
+; GFX942-NEXT:    s_mov_b32 s11, 0x40180000
+; GFX942-NEXT:    s_mov_b32 s9, 0x40140000
+; GFX942-NEXT:    s_mov_b32 s7, 0x40100000
+; GFX942-NEXT:    s_mov_b32 s5, 0x40080000
+; GFX942-NEXT:    s_mov_b32 s3, 2.0
+; GFX942-NEXT:    s_mov_b32 s2, s0
+; GFX942-NEXT:    s_mov_b32 s4, s0
+; GFX942-NEXT:    s_mov_b32 s6, s0
+; GFX942-NEXT:    s_mov_b32 s8, s0
+; GFX942-NEXT:    s_mov_b32 s10, s0
+; GFX942-NEXT:    s_mov_b32 s12, s0
+; GFX942-NEXT:    s_mov_b32 s14, s0
+; GFX942-NEXT:    s_mov_b32 s16, s0
+; GFX942-NEXT:    s_mov_b32 s18, s0
+; GFX942-NEXT:    s_mov_b32 s20, s0
+; GFX942-NEXT:    s_mov_b32 s22, s0
+; GFX942-NEXT:    s_mov_b32 s24, s0
+; GFX942-NEXT:    s_mov_b32 s26, s0
+; GFX942-NEXT:    s_mov_b32 s28, s0
+; GFX942-NEXT:    s_mov_b32 s30, s0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_lshl_b32 s33, s33, 1
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-NEXT:    v_mov_b32_e32 v34, 0
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], s[4:5]
+; GFX942-NEXT:    v_mov_b64_e32 v[6:7], s[6:7]
+; GFX942-NEXT:    v_mov_b64_e32 v[8:9], s[8:9]
+; GFX942-NEXT:    v_mov_b64_e32 v[10:11], s[10:11]
+; GFX942-NEXT:    v_mov_b64_e32 v[12:13], s[12:13]
+; GFX942-NEXT:    v_mov_b64_e32 v[14:15], s[14:15]
+; GFX942-NEXT:    v_mov_b64_e32 v[16:17], s[16:17]
+; GFX942-NEXT:    v_mov_b64_e32 v[18:19], s[18:19]
+; GFX942-NEXT:    v_mov_b64_e32 v[20:21], s[20:21]
+; GFX942-NEXT:    v_mov_b64_e32 v[22:23], s[22:23]
+; GFX942-NEXT:    v_mov_b64_e32 v[24:25], s[24:25]
+; GFX942-NEXT:    v_mov_b64_e32 v[26:27], s[26:27]
+; GFX942-NEXT:    v_mov_b64_e32 v[28:29], s[28:29]
+; GFX942-NEXT:    v_mov_b64_e32 v[30:31], s[30:31]
+; GFX942-NEXT:    s_set_gpr_idx_on s33, gpr_idx(SRC0)
+; GFX942-NEXT:    v_mov_b32_e32 v33, v1
+; GFX942-NEXT:    v_mov_b32_e32 v32, v0
+; GFX942-NEXT:    s_set_gpr_idx_off
+; GFX942-NEXT:    global_store_dwordx2 v34, v[32:33], s[34:35]
+; GFX942-NEXT:    s_endpgm
 entry:
   %ext = extractelement <16 x double> <double 1.0, double 2.0, double 3.0, double 4.0, double 5.0, double 6.0, double 7.0, double 8.0, double 9.0, double 10.0, double 11.0, double 12.0, double 13.0, double 14.0, double 15.0, double 16.0>, i32 %sel
   store double %ext, ptr addrspace(1) %out
@@ -622,6 +1072,50 @@ define amdgpu_kernel void @float32_extelt(ptr addrspace(1) %out, i32 %sel) {
 ; GCN-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN-NEXT:    flat_store_dword v[0:1], v2
 ; GCN-NEXT:    s_endpgm
+;
+; GFX942-LABEL: float32_extelt:
+; GFX942:       ; %bb.0: ; %entry
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    s_load_dword s2, s[4:5], 0x2c
+; GFX942-NEXT:    v_mov_b32_e32 v32, 0
+; GFX942-NEXT:    v_mov_b32_e32 v0, 1.0
+; GFX942-NEXT:    v_mov_b32_e32 v1, 2.0
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0x40400000
+; GFX942-NEXT:    v_mov_b32_e32 v3, 4.0
+; GFX942-NEXT:    v_mov_b32_e32 v4, 0x40a00000
+; GFX942-NEXT:    v_mov_b32_e32 v5, 0x40c00000
+; GFX942-NEXT:    v_mov_b32_e32 v6, 0x40e00000
+; GFX942-NEXT:    v_mov_b32_e32 v7, 0x41000000
+; GFX942-NEXT:    v_mov_b32_e32 v8, 0x41100000
+; GFX942-NEXT:    v_mov_b32_e32 v9, 0x41200000
+; GFX942-NEXT:    v_mov_b32_e32 v10, 0x41300000
+; GFX942-NEXT:    v_mov_b32_e32 v11, 0x41400000
+; GFX942-NEXT:    v_mov_b32_e32 v12, 0x41500000
+; GFX942-NEXT:    v_mov_b32_e32 v13, 0x41600000
+; GFX942-NEXT:    v_mov_b32_e32 v14, 0x41700000
+; GFX942-NEXT:    v_mov_b32_e32 v15, 0x41800000
+; GFX942-NEXT:    v_mov_b32_e32 v16, 0x41880000
+; GFX942-NEXT:    v_mov_b32_e32 v17, 0x41900000
+; GFX942-NEXT:    v_mov_b32_e32 v18, 0x41980000
+; GFX942-NEXT:    v_mov_b32_e32 v19, 0x41a00000
+; GFX942-NEXT:    v_mov_b32_e32 v20, 0x41a80000
+; GFX942-NEXT:    v_mov_b32_e32 v21, 0x41b00000
+; GFX942-NEXT:    v_mov_b32_e32 v22, 0x41b80000
+; GFX942-NEXT:    v_mov_b32_e32 v23, 0x41c00000
+; GFX942-NEXT:    v_mov_b32_e32 v24, 0x41c80000
+; GFX942-NEXT:    v_mov_b32_e32 v25, 0x41d00000
+; GFX942-NEXT:    v_mov_b32_e32 v26, 0x41d80000
+; GFX942-NEXT:    v_mov_b32_e32 v27, 0x41e00000
+; GFX942-NEXT:    v_mov_b32_e32 v28, 0x41e80000
+; GFX942-NEXT:    v_mov_b32_e32 v29, 0x41f00000
+; GFX942-NEXT:    v_mov_b32_e32 v30, 0x41f80000
+; GFX942-NEXT:    v_mov_b32_e32 v31, 0x42000000
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_set_gpr_idx_on s2, gpr_idx(SRC0)
+; GFX942-NEXT:    v_mov_b32_e32 v0, v0
+; GFX942-NEXT:    s_set_gpr_idx_off
+; GFX942-NEXT:    global_store_dword v32, v0, s[0:1]
+; GFX942-NEXT:    s_endpgm
 entry:
   %ext = extractelement <32 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0, float 16.0, float 17.0, float 18.0, float 19.0, float 20.0, float 21.0, float 22.0, float 23.0, float 24.0, float 25.0, float 26.0, float 27.0, float 28.0, float 29.0, float 30.0, float 31.0, float 32.0>, i32 %sel
   store float %ext, ptr addrspace(1) %out
@@ -643,6 +1137,20 @@ define amdgpu_kernel void @byte8_extelt(ptr addrspace(1) %out, i32 %sel) {
 ; GCN-NEXT:    v_mov_b32_e32 v2, s2
 ; GCN-NEXT:    flat_store_byte v[0:1], v2
 ; GCN-NEXT:    s_endpgm
+;
+; GFX942-LABEL: byte8_extelt:
+; GFX942:       ; %bb.0: ; %entry
+; GFX942-NEXT:    s_load_dword s6, s[4:5], 0x2c
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    s_mov_b32 s2, 0x4030201
+; GFX942-NEXT:    s_mov_b32 s3, 0x8070605
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_lshl_b32 s4, s6, 3
+; GFX942-NEXT:    s_lshr_b64 s[2:3], s[2:3], s4
+; GFX942-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-NEXT:    global_store_byte v0, v1, s[0:1]
+; GFX942-NEXT:    s_endpgm
 entry:
   %ext = extractelement <8 x i8> <i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8>, i32 %sel
   store i8 %ext, ptr addrspace(1) %out
@@ -690,6 +1198,46 @@ define amdgpu_kernel void @byte16_extelt(ptr addrspace(1) %out, i32 %sel) {
 ; GCN-NEXT:    v_mov_b32_e32 v2, s2
 ; GCN-NEXT:    flat_store_byte v[0:1], v2
 ; GCN-NEXT:    s_endpgm
+;
+; GFX942-LABEL: byte16_extelt:
+; GFX942:       ; %bb.0: ; %entry
+; GFX942-NEXT:    s_load_dword s2, s[4:5], 0x2c
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_cmp_eq_u32 s2, 1
+; GFX942-NEXT:    s_cselect_b32 s3, 2, 1
+; GFX942-NEXT:    s_cmp_lg_u32 s2, 2
+; GFX942-NEXT:    s_cselect_b32 s3, s3, 3
+; GFX942-NEXT:    s_cmp_lg_u32 s2, 3
+; GFX942-NEXT:    s_cselect_b32 s3, s3, 4
+; GFX942-NEXT:    s_cmp_lg_u32 s2, 4
+; GFX942-NEXT:    s_cselect_b32 s3, s3, 5
+; GFX942-NEXT:    s_cmp_lg_u32 s2, 5
+; GFX942-NEXT:    s_cselect_b32 s3, s3, 6
+; GFX942-NEXT:    s_cmp_lg_u32 s2, 6
+; GFX942-NEXT:    s_cselect_b32 s3, s3, 7
+; GFX942-NEXT:    s_cmp_lg_u32 s2, 7
+; GFX942-NEXT:    s_cselect_b32 s3, s3, 8
+; GFX942-NEXT:    s_cmp_lg_u32 s2, 8
+; GFX942-NEXT:    s_cselect_b32 s3, s3, 9
+; GFX942-NEXT:    s_cmp_lg_u32 s2, 9
+; GFX942-NEXT:    s_cselect_b32 s3, s3, 10
+; GFX942-NEXT:    s_cmp_lg_u32 s2, 10
+; GFX942-NEXT:    s_cselect_b32 s3, s3, 11
+; GFX942-NEXT:    s_cmp_lg_u32 s2, 11
+; GFX942-NEXT:    s_cselect_b32 s3, s3, 12
+; GFX942-NEXT:    s_cmp_lg_u32 s2, 12
+; GFX942-NEXT:    s_cselect_b32 s3, s3, 13
+; GFX942-NEXT:    s_cmp_lg_u32 s2, 13
+; GFX942-NEXT:    s_cselect_b32 s3, s3, 14
+; GFX942-NEXT:    s_cmp_lg_u32 s2, 14
+; GFX942-NEXT:    s_cselect_b32 s3, s3, 15
+; GFX942-NEXT:    s_cmp_lg_u32 s2, 15
+; GFX942-NEXT:    s_cselect_b32 s2, s3, 16
+; GFX942-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-NEXT:    global_store_byte v0, v1, s[0:1]
+; GFX942-NEXT:    s_endpgm
 entry:
   %ext = extractelement <16 x i8> <i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16>, i32 %sel
   store i8 %ext, ptr addrspace(1) %out
@@ -710,6 +1258,19 @@ define amdgpu_kernel void @bit4_extelt(ptr addrspace(1) %out, i32 %sel) {
 ; GCN-NEXT:    v_mov_b32_e32 v2, s2
 ; GCN-NEXT:    flat_store_dword v[0:1], v2
 ; GCN-NEXT:    s_endpgm
+;
+; GFX942-LABEL: bit4_extelt:
+; GFX942:       ; %bb.0: ; %entry
+; GFX942-NEXT:    s_load_dword s2, s[4:5], 0x2c
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_lshl_b32 s2, s2, 3
+; GFX942-NEXT:    s_lshr_b32 s2, 0x1000100, s2
+; GFX942-NEXT:    s_and_b32 s2, s2, 1
+; GFX942-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX942-NEXT:    s_endpgm
 entry:
   %ext = extractelement <4 x i1> <i1 0, i1 1, i1 0, i1 1>, i32 %sel
   %zext = zext i1 %ext to i32
@@ -985,6 +1546,273 @@ define amdgpu_kernel void @bit128_extelt(ptr addrspace(1) %out, i32 %sel) {
 ; GCN-NEXT:    v_mov_b32_e32 v2, s2
 ; GCN-NEXT:    flat_store_dword v[0:1], v2
 ; GCN-NEXT:    s_endpgm
+;
+; GFX942-LABEL: bit128_extelt:
+; GFX942:       ; %bb.0: ; %entry
+; GFX942-NEXT:    s_load_dword s2, s[4:5], 0x2c
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_cmp_lg_u32 s2, 1
+; GFX942-NEXT:    s_cselect_b64 s[4:5], -1, 0
+; GFX942-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[4:5]
+; GFX942-NEXT:    s_cmp_lg_u32 s2, 2
+; GFX942-NEXT:    v_readfirstlane_b32 s3, v1
+; GFX942-NEXT:    s_cselect_b32 s3, s3, 1
+; GFX942-NEXT:    s_cmp_lg_u32 s2, 3
+; GFX942-NEXT:    s_cselect_b32 s3, s3, 0
+; GFX942-NEXT:    s_cmp_lg_u32 s2, 4
+; GFX942-NEXT:    s_cselect_b32 s3, s3, 1
+; GFX942-NEXT:    s_cmp_lg_u32 s2, 5
+; GFX942-NEXT:    s_cselect_b32 s3, s3, 0
+; GFX942-NEXT:    s_cmp_lg_u32 s2, 6
+; GFX942-NEXT:    s_cselect_b32 s3, s3, 1
+; GFX942-NEXT:    s_cmp_lg_u32 s2, 7
+; GFX942-NEXT:    s_cselect_b32 s3, s3, 0
+; GFX942-NEXT:    s_cmp_lg_u32 s2, 8
+; GFX942-NEXT:    s_cselect_b32 s3, s3, 1
+; GFX942-NEXT:    s_cmp_lg_u32 s2, 9
+; GFX942-NEXT:    s_cselect_b32 s3, s3, 0
+; GFX942-NEXT:    s_cmp_lg_u32 s2, 10
+; GFX942-NEXT:    s_cselect_b32 s3, s3, 1
+; GFX942-NEXT:    s_cmp_lg_u32 s2, 11
+; GFX942-NEXT:    s_cselect_b32 s3, s3, 0
+; GFX942-NEXT:    s_cmp_lg_u32 s2, 12
+; GFX942-NEXT:    s_cselect_b32 s3, s3, 1
+; GFX942-NEXT:    s_cmp_lg_u32 s2, 13
+; GFX942-NEXT:    s_cselect_b32 s3, s3, 0
+; GFX942-NEXT:    s_cmp_lg_u32 s2, 14
+; GFX942-NEXT:    s_cselect_b32 s3, s3, 1
+; GFX942-NEXT:    s_cmp_lg_u32 s2, 15
+; GFX942-NEXT:    s_cselect_b32 s3, s3, 0
+; GFX942-NEXT:    s_cmp_lg_u32 s2, 16
+; GFX942-NEXT:    s_cselect_b32 s3, s3, 1
+; GFX942-NEXT:    s_cmp_lg_u32 s2, 17
+; GFX942-NEXT:    s_cselect_b32 s3, s3, 0
+; GFX942-NEXT:    s_cmp_lg_u32 s2, 18
+; GFX942-NEXT:    s_cselect_b32 s3, s3, 1
+; GFX942-NEXT:    s_cmp_lg_u32 s2, 19
+; GFX942-NEXT:    s_cselect_b32 s3, s3, 0
+; GFX942-NEXT:    s_cmp_lg_u32 s2, 20
+; GFX942-NEXT:    s_cselect_b32 s3, s3, 1
+; GFX942-NEXT:    s_cmp_lg_u32 s2, 21
+; GFX942-NEXT:    s_cselect_b32 s3, s3, 0
+; GFX942-NEXT:    s_cmp_lg_u32 s2, 22
+; GFX942-NEXT:    s_cselect_b32 s3, s3, 1
+; GFX942-NEXT:    s_cmp_lg_u32 s2, 23
+; GFX942-NEXT:    s_cselect_b32 s3, s3, 0
+; GFX942-NEXT:    s_cmp_lg_u32 s2, 24
+; GFX942-NEXT:    s_cselect_b32 s3, s3, 1
+; GFX942-NEXT:    s_cmp_lg_u32 s2, 25
+; GFX942-NEXT:    s_cselect_b32 s3, s3, 0
+; GFX942-NEXT:    s_cmp_lg_u32 s2, 26
+; GFX942-NEXT:    s_cselect_b32 s3, s3, 1
+; GFX942-NEXT:    s_cmp_lg_u32 s2, 27
+; GFX942-NEXT:    s_cselect_b32 s3, s3, 0
+; GFX942-NEXT:    s_cmp_lg_u32 s2, 28
+; GFX942-NEXT:    s_cselect_b32 s3, s3, 1
+; GFX942-NEXT:    s_cmp_lg_u32 s2, 29
+; GFX942-NEXT:    s_cselect_b32 s3, s3, 0
+; GFX942-NEXT:    s_cmp_lg_u32 s2, 30
+; GFX942-NEXT:    s_cselect_b32 s3, s3, 1
+; GFX942-NEXT:    s_cmp_lg_u32 s2, 31
+; GFX942-NEXT:    s_cselect_b32 s3, s3, 0
+; GFX942-NEXT:    s_cmp_lg_u32 s2, 32
+; GFX942-NEXT:    s_cselect_b32 s3, s3, 1
+; GFX942-NEXT:    s_cmp_lg_u32 s2, 33
+; GFX942-NEXT:    s_cselect_b32 s3, s3, 0
+; GFX942-NEXT:    s_cmp_lg_u32 s2, 34
+; GFX942-NEXT:    s_cselect_b32 s3, s3, 1
+; GFX942-NEXT:    s_cmp_lg_u32 s2, 35
+; GFX942-NEXT:    s_cselect_b32 s3, s3, 0
+; GFX942-NEXT:    s_cmp_lg_u32 s2, 36
+; GFX942-NEXT:    s_cselect_b32 s3, s3, 1
+; GFX942-NEXT:    s_cmp_lg_u32 s2, 37
+; GFX942-NEXT:    s_cselect_b32 s3, s3, 0
+; GFX942-NEXT:    s_cmp_lg_u32 s2, 38
+; GFX942-NEXT:    s_cselect_b32 s3, s3, 1
+; GFX942-NEXT:    s_cmp_lg_u32 s2, 39
+; GFX942-NEXT:    s_cselect_b32 s3, s3, 0
+; GFX942-NEXT:    s_cmp_lg_u32 s2, 40
+; GFX942-NEXT:    s_cselect_b32 s3, s3, 1
+; GFX942-NEXT:    s_cmp_lg_u32 s2, 41
+; GFX942-NEXT:    s_cselect_b32 s3, s3, 0
+; GFX942-NEXT:    s_cmp_lg_u32 s2, 42
+; GFX942-NEXT:    s_cselect_b32 s3, s3, 1
+; GFX942-NEXT:    s_cmp_lg_u32 s2, 43
+; GFX942-NEXT:    s_cselect_b32 s3, s3, 0
+; GFX942-NEXT:    s_cmp_lg_u32 s2, 44
+; GFX942-NEXT:    s_cselect_b32 s3, s3, 1
+; GFX942-NEXT:    s_cmp_lg_u32 s2, 45
+; GFX942-NEXT:    s_cselect_b32 s3, s3, 0
+; GFX942-NEXT:    s_cmp_lg_u32 s2, 46
+; GFX942-NEXT:    s_cselect_b32 s3, s3, 1
+; GFX942-NEXT:    s_cmp_lg_u32 s2, 47
+; GFX942-NEXT:    s_cselect_b32 s3, s3, 0
+; GFX942-NEXT:    s_cmp_lg_u32 s2, 48
+; GFX942-NEXT:    s_cselect_b32 s3, s3, 1
+; GFX942-NEXT:    s_cmp_lg_u32 s2, 49
+; GFX942-NEXT:    s_cselect_b32 s3, s3, 0
+; GFX942-NEXT:    s_cmp_lg_u32 s2, 50
+; GFX942-NEXT:    s_cselect_b32 s3, s3, 1
+; GFX942-NEXT:    s_cmp_lg_u32 s2, 51
+; GFX942-NEXT:    s_cselect_b32 s3, s3, 0
+; GFX942-NEXT:    s_cmp_lg_u32 s2, 52
+; GFX942-NEXT:    s_cselect_b32 s3, s3, 1
+; GFX942-NEXT:    s_cmp_lg_u32 s2, 53
+; GFX942-NEXT:    s_cselect_b32 s3, s3, 0
+; GFX942-NEXT:    s_cmp_lg_u32 s2, 54
+; GFX942-NEXT:    s_cselect_b32 s3, s3, 1
+; GFX942-NEXT:    s_cmp_lg_u32 s2, 55
+; GFX942-NEXT:    s_cselect_b32 s3, s3, 0
+; GFX942-NEXT:    s_cmp_lg_u32 s2, 56
+; GFX942-NEXT:    s_cselect_b32 s3, s3, 1
+; GFX942-NEXT:    s_cmp_lg_u32 s2, 57
+; GFX942-NEXT:    s_cselect_b32 s3, s3, 0
+; GFX942-NEXT:    s_cmp_lg_u32 s2, 58
+; GFX942-NEXT:    s_cselect_b32 s3, s3, 1
+; GFX942-NEXT:    s_cmp_lg_u32 s2, 59
+; GFX942-NEXT:    s_cselect_b32 s3, s3, 0
+; GFX942-NEXT:    s_cmp_lg_u32 s2, 60
+; GFX942-NEXT:    s_cselect_b32 s3, s3, 1
+; GFX942-NEXT:    s_cmp_lg_u32 s2, 61
+; GFX942-NEXT:    s_cselect_b32 s3, s3, 0
+; GFX942-NEXT:    s_cmp_lg_u32 s2, 62
+; GFX942-NEXT:    s_cselect_b32 s3, s3, 1
+; GFX942-NEXT:    s_cmp_lg_u32 s2, 63
+; GFX942-NEXT:    s_cselect_b32 s3, s3, 0
+; GFX942-NEXT:    s_cmp_lg_u32 s2, 64
+; GFX942-NEXT:    s_cselect_b32 s3, s3, 1
+; GFX942-NEXT:    s_cmpk_lg_i32 s2, 0x41
+; GFX942-NEXT:    s_cselect_b32 s3, s3, 0
+; GFX942-NEXT:    s_cmpk_lg_i32 s2, 0x42
+; GFX942-NEXT:    s_cselect_b32 s3, s3, 1
+; GFX942-NEXT:    s_cmpk_lg_i32 s2, 0x43
+; GFX942-NEXT:    s_cselect_b32 s3, s3, 0
+; GFX942-NEXT:    s_cmpk_lg_i32 s2, 0x44
+; GFX942-NEXT:    s_cselect_b32 s3, s3, 1
+; GFX942-NEXT:    s_cmpk_lg_i32 s2, 0x45
+; GFX942-NEXT:    s_cselect_b32 s3, s3, 0
+; GFX942-NEXT:    s_cmpk_lg_i32 s2, 0x46
+; GFX942-NEXT:    s_cselect_b32 s3, s3, 1
+; GFX942-NEXT:    s_cmpk_lg_i32 s2, 0x47
+; GFX942-NEXT:    s_cselect_b32 s3, s3, 0
+; GFX942-NEXT:    s_cmpk_lg_i32 s2, 0x48
+; GFX942-NEXT:    s_cselect_b32 s3, s3, 1
+; GFX942-NEXT:    s_cmpk_lg_i32 s2, 0x49
+; GFX942-NEXT:    s_cselect_b32 s3, s3, 0
+; GFX942-NEXT:    s_cmpk_lg_i32 s2, 0x4a
+; GFX942-NEXT:    s_cselect_b32 s3, s3, 1
+; GFX942-NEXT:    s_cmpk_lg_i32 s2, 0x4b
+; GFX942-NEXT:    s_cselect_b32 s3, s3, 0
+; GFX942-NEXT:    s_cmpk_lg_i32 s2, 0x4c
+; GFX942-NEXT:    s_cselect_b32 s3, s3, 1
+; GFX942-NEXT:    s_cmpk_lg_i32 s2, 0x4d
+; GFX942-NEXT:    s_cselect_b32 s3, s3, 0
+; GFX942-NEXT:    s_cmpk_lg_i32 s2, 0x4e
+; GFX942-NEXT:    s_cselect_b32 s3, s3, 1
+; GFX942-NEXT:    s_cmpk_lg_i32 s2, 0x4f
+; GFX942-NEXT:    s_cselect_b32 s3, s3, 0
+; GFX942-NEXT:    s_cmpk_lg_i32 s2, 0x50
+; GFX942-NEXT:    s_cselect_b32 s3, s3, 1
+; GFX942-NEXT:    s_cmpk_lg_i32 s2, 0x51
+; GFX942-NEXT:    s_cselect_b32 s3, s3, 0
+; GFX942-NEXT:    s_cmpk_lg_i32 s2, 0x52
+; GFX942-NEXT:    s_cselect_b32 s3, s3, 1
+; GFX942-NEXT:    s_cmpk_lg_i32 s2, 0x53
+; GFX942-NEXT:    s_cselect_b32 s3, s3, 0
+; GFX942-NEXT:    s_cmpk_lg_i32 s2, 0x54
+; GFX942-NEXT:    s_cselect_b32 s3, s3, 1
+; GFX942-NEXT:    s_cmpk_lg_i32 s2, 0x55
+; GFX942-NEXT:    s_cselect_b32 s3, s3, 0
+; GFX942-NEXT:    s_cmpk_lg_i32 s2, 0x56
+; GFX942-NEXT:    s_cselect_b32 s3, s3, 1
+; GFX942-NEXT:    s_cmpk_lg_i32 s2, 0x57
+; GFX942-NEXT:    s_cselect_b32 s3, s3, 0
+; GFX942-NEXT:    s_cmpk_lg_i32 s2, 0x58
+; GFX942-NEXT:    s_cselect_b32 s3, s3, 1
+; GFX942-NEXT:    s_cmpk_lg_i32 s2, 0x59
+; GFX942-NEXT:    s_cselect_b32 s3, s3, 0
+; GFX942-NEXT:    s_cmpk_lg_i32 s2, 0x5a
+; GFX942-NEXT:    s_cselect_b32 s3, s3, 1
+; GFX942-NEXT:    s_cmpk_lg_i32 s2, 0x5b
+; GFX942-NEXT:    s_cselect_b32 s3, s3, 0
+; GFX942-NEXT:    s_cmpk_lg_i32 s2, 0x5c
+; GFX942-NEXT:    s_cselect_b32 s3, s3, 1
+; GFX942-NEXT:    s_cmpk_lg_i32 s2, 0x5d
+; GFX942-NEXT:    s_cselect_b32 s3, s3, 0
+; GFX942-NEXT:    s_cmpk_lg_i32 s2, 0x5e
+; GFX942-NEXT:    s_cselect_b32 s3, s3, 1
+; GFX942-NEXT:    s_cmpk_lg_i32 s2, 0x5f
+; GFX942-NEXT:    s_cselect_b32 s3, s3, 0
+; GFX942-NEXT:    s_cmpk_lg_i32 s2, 0x60
+; GFX942-NEXT:    s_cselect_b32 s3, s3, 1
+; GFX942-NEXT:    s_cmpk_lg_i32 s2, 0x61
+; GFX942-NEXT:    s_cselect_b32 s3, s3, 0
+; GFX942-NEXT:    s_cmpk_lg_i32 s2, 0x62
+; GFX942-NEXT:    s_cselect_b32 s3, s3, 1
+; GFX942-NEXT:    s_cmpk_lg_i32 s2, 0x63
+; GFX942-NEXT:    s_cselect_b32 s3, s3, 0
+; GFX942-NEXT:    s_cmpk_lg_i32 s2, 0x64
+; GFX942-NEXT:    s_cselect_b32 s3, s3, 1
+; GFX942-NEXT:    s_cmpk_lg_i32 s2, 0x65
+; GFX942-NEXT:    s_cselect_b32 s3, s3, 0
+; GFX942-NEXT:    s_cmpk_lg_i32 s2, 0x66
+; GFX942-NEXT:    s_cselect_b32 s3, s3, 1
+; GFX942-NEXT:    s_cmpk_lg_i32 s2, 0x67
+; GFX942-NEXT:    s_cselect_b32 s3, s3, 0
+; GFX942-NEXT:    s_cmpk_lg_i32 s2, 0x68
+; GFX942-NEXT:    s_cselect_b32 s3, s3, 1
+; GFX942-NEXT:    s_cmpk_lg_i32 s2, 0x69
+; GFX942-NEXT:    s_cselect_b32 s3, s3, 0
+; GFX942-NEXT:    s_cmpk_lg_i32 s2, 0x6a
+; GFX942-NEXT:    s_cselect_b32 s3, s3, 1
+; GFX942-NEXT:    s_cmpk_lg_i32 s2, 0x6b
+; GFX942-NEXT:    s_cselect_b32 s3, s3, 0
+; GFX942-NEXT:    s_cmpk_lg_i32 s2, 0x6c
+; GFX942-NEXT:    s_cselect_b32 s3, s3, 1
+; GFX942-NEXT:    s_cmpk_lg_i32 s2, 0x6d
+; GFX942-NEXT:    s_cselect_b32 s3, s3, 0
+; GFX942-NEXT:    s_cmpk_lg_i32 s2, 0x6e
+; GFX942-NEXT:    s_cselect_b32 s3, s3, 1
+; GFX942-NEXT:    s_cmpk_lg_i32 s2, 0x6f
+; GFX942-NEXT:    s_cselect_b32 s3, s3, 0
+; GFX942-NEXT:    s_cmpk_lg_i32 s2, 0x70
+; GFX942-NEXT:    s_cselect_b32 s3, s3, 1
+; GFX942-NEXT:    s_cmpk_lg_i32 s2, 0x71
+; GFX942-NEXT:    s_cselect_b32 s3, s3, 0
+; GFX942-NEXT:    s_cmpk_lg_i32 s2, 0x72
+; GFX942-NEXT:    s_cselect_b32 s3, s3, 1
+; GFX942-NEXT:    s_cmpk_lg_i32 s2, 0x73
+; GFX942-NEXT:    s_cselect_b32 s3, s3, 0
+; GFX942-NEXT:    s_cmpk_lg_i32 s2, 0x74
+; GFX942-NEXT:    s_cselect_b32 s3, s3, 1
+; GFX942-NEXT:    s_cmpk_lg_i32 s2, 0x75
+; GFX942-NEXT:    s_cselect_b32 s3, s3, 0
+; GFX942-NEXT:    s_cmpk_lg_i32 s2, 0x76
+; GFX942-NEXT:    s_cselect_b32 s3, s3, 1
+; GFX942-NEXT:    s_cmpk_lg_i32 s2, 0x77
+; GFX942-NEXT:    s_cselect_b32 s3, s3, 0
+; GFX942-NEXT:    s_cmpk_lg_i32 s2, 0x78
+; GFX942-NEXT:    s_cselect_b32 s3, s3, 1
+; GFX942-NEXT:    s_cmpk_lg_i32 s2, 0x79
+; GFX942-NEXT:    s_cselect_b32 s3, s3, 0
+; GFX942-NEXT:    s_cmpk_lg_i32 s2, 0x7a
+; GFX942-NEXT:    s_cselect_b32 s3, s3, 1
+; GFX942-NEXT:    s_cmpk_lg_i32 s2, 0x7b
+; GFX942-NEXT:    s_cselect_b32 s3, s3, 0
+; GFX942-NEXT:    s_cmpk_lg_i32 s2, 0x7c
+; GFX942-NEXT:    s_cselect_b32 s3, s3, 1
+; GFX942-NEXT:    s_cmpk_lg_i32 s2, 0x7d
+; GFX942-NEXT:    s_cselect_b32 s3, s3, 0
+; GFX942-NEXT:    s_cmpk_lg_i32 s2, 0x7e
+; GFX942-NEXT:    s_cselect_b32 s3, s3, 1
+; GFX942-NEXT:    s_cmpk_lg_i32 s2, 0x7f
+; GFX942-NEXT:    s_cselect_b32 s2, s3, 0
+; GFX942-NEXT:    s_and_b32 s2, s2, 1
+; GFX942-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX942-NEXT:    s_endpgm
 entry:
   %ext = extractelement <128 x i1> <i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0, i1 1, i1 0>, i32 %sel
   %zext = zext i1 %ext to i32
@@ -1088,6 +1916,133 @@ define float @float32_extelt_vec(i32 %sel) {
 ; GCN-NEXT:    v_cmp_ne_u32_e32 vcc, 31, v0
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: float32_extelt_vec:
+; GFX942:       ; %bb.0: ; %entry
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0x40400000
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_cndmask_b32_e64 v1, 1.0, 2.0, vcc
+; GFX942-NEXT:    v_cmp_ne_u32_e32 vcc, 2, v0
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX942-NEXT:    v_cmp_ne_u32_e32 vcc, 3, v0
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0x40a00000
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_cndmask_b32_e32 v1, 4.0, v1, vcc
+; GFX942-NEXT:    v_cmp_ne_u32_e32 vcc, 4, v0
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0x40c00000
+; GFX942-NEXT:    v_cmp_ne_u32_e32 vcc, 5, v0
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0x40e00000
+; GFX942-NEXT:    v_cmp_ne_u32_e32 vcc, 6, v0
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0x41000000
+; GFX942-NEXT:    v_cmp_ne_u32_e32 vcc, 7, v0
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0x41100000
+; GFX942-NEXT:    v_cmp_ne_u32_e32 vcc, 8, v0
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0x41200000
+; GFX942-NEXT:    v_cmp_ne_u32_e32 vcc, 9, v0
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0x41300000
+; GFX942-NEXT:    v_cmp_ne_u32_e32 vcc, 10, v0
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0x41400000
+; GFX942-NEXT:    v_cmp_ne_u32_e32 vcc, 11, v0
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0x41500000
+; GFX942-NEXT:    v_cmp_ne_u32_e32 vcc, 12, v0
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0x41600000
+; GFX942-NEXT:    v_cmp_ne_u32_e32 vcc, 13, v0
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0x41700000
+; GFX942-NEXT:    v_cmp_ne_u32_e32 vcc, 14, v0
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0x41800000
+; GFX942-NEXT:    v_cmp_ne_u32_e32 vcc, 15, v0
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0x41880000
+; GFX942-NEXT:    v_cmp_ne_u32_e32 vcc, 16, v0
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0x41900000
+; GFX942-NEXT:    v_cmp_ne_u32_e32 vcc, 17, v0
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0x41980000
+; GFX942-NEXT:    v_cmp_ne_u32_e32 vcc, 18, v0
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0x41a00000
+; GFX942-NEXT:    v_cmp_ne_u32_e32 vcc, 19, v0
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0x41a80000
+; GFX942-NEXT:    v_cmp_ne_u32_e32 vcc, 20, v0
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0x41b00000
+; GFX942-NEXT:    v_cmp_ne_u32_e32 vcc, 21, v0
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0x41b80000
+; GFX942-NEXT:    v_cmp_ne_u32_e32 vcc, 22, v0
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0x41c00000
+; GFX942-NEXT:    v_cmp_ne_u32_e32 vcc, 23, v0
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0x41c80000
+; GFX942-NEXT:    v_cmp_ne_u32_e32 vcc, 24, v0
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0x41d00000
+; GFX942-NEXT:    v_cmp_ne_u32_e32 vcc, 25, v0
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0x41d80000
+; GFX942-NEXT:    v_cmp_ne_u32_e32 vcc, 26, v0
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0x41e00000
+; GFX942-NEXT:    v_cmp_ne_u32_e32 vcc, 27, v0
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0x41e80000
+; GFX942-NEXT:    v_cmp_ne_u32_e32 vcc, 28, v0
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0x41f00000
+; GFX942-NEXT:    v_cmp_ne_u32_e32 vcc, 29, v0
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0x41f80000
+; GFX942-NEXT:    v_cmp_ne_u32_e32 vcc, 30, v0
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0x42000000
+; GFX942-NEXT:    v_cmp_ne_u32_e32 vcc, 31, v0
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %ext = extractelement <32 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0, float 9.0, float 10.0, float 11.0, float 12.0, float 13.0, float 14.0, float 15.0, float 16.0, float 17.0, float 18.0, float 19.0, float 20.0, float 21.0, float 22.0, float 23.0, float 24.0, float 25.0, float 26.0, float 27.0, float 28.0, float 29.0, float 30.0, float 31.0, float 32.0>, i32 %sel
   ret float %ext
@@ -1163,6 +2118,85 @@ define double @double16_extelt_vec(i32 %sel) {
 ; GCN-NEXT:    v_mov_b32_e32 v1, 0x40301999
 ; GCN-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: double16_extelt_vec:
+; GFX942:       ; %bb.0: ; %entry
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b32_e32 v3, 0x3ff19999
+; GFX942-NEXT:    v_mov_b32_e32 v4, 0x4000cccc
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX942-NEXT:    v_cmp_eq_u32_e64 s[0:1], 2, v0
+; GFX942-NEXT:    v_mov_b32_e32 v1, 0x9999999a
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0xcccccccd
+; GFX942-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
+; GFX942-NEXT:    v_mov_b32_e32 v4, 0x4008cccc
+; GFX942-NEXT:    s_or_b64 vcc, s[0:1], vcc
+; GFX942-NEXT:    v_cndmask_b32_e64 v3, v3, v4, s[0:1]
+; GFX942-NEXT:    v_cndmask_b32_e32 v2, v1, v2, vcc
+; GFX942-NEXT:    v_mov_b32_e32 v4, 0x40106666
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, 3, v0
+; GFX942-NEXT:    v_cmp_eq_u32_e64 s[0:1], 4, v0
+; GFX942-NEXT:    v_mov_b32_e32 v5, 0x401c6666
+; GFX942-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
+; GFX942-NEXT:    v_mov_b32_e32 v4, 0x40146666
+; GFX942-NEXT:    v_cndmask_b32_e64 v3, v3, v4, s[0:1]
+; GFX942-NEXT:    s_or_b64 s[0:1], s[0:1], vcc
+; GFX942-NEXT:    v_mov_b32_e32 v4, 0x40186666
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, 5, v0
+; GFX942-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, 6, v0
+; GFX942-NEXT:    v_mov_b32_e32 v4, 0x66666666
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
+; GFX942-NEXT:    s_or_b64 vcc, vcc, s[0:1]
+; GFX942-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
+; GFX942-NEXT:    v_mov_b32_e32 v4, 0x40203333
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, 7, v0
+; GFX942-NEXT:    v_cmp_eq_u32_e64 s[0:1], 8, v0
+; GFX942-NEXT:    v_mov_b32_e32 v5, 0x402e3333
+; GFX942-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
+; GFX942-NEXT:    v_mov_b32_e32 v4, 0x40223333
+; GFX942-NEXT:    v_cndmask_b32_e64 v3, v3, v4, s[0:1]
+; GFX942-NEXT:    s_or_b64 s[0:1], s[0:1], vcc
+; GFX942-NEXT:    v_mov_b32_e32 v4, 0x40243333
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, 9, v0
+; GFX942-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
+; GFX942-NEXT:    v_mov_b32_e32 v4, 0x40263333
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, 10, v0
+; GFX942-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
+; GFX942-NEXT:    v_mov_b32_e32 v4, 0x40283333
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, 11, v0
+; GFX942-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
+; GFX942-NEXT:    v_mov_b32_e32 v4, 0x402a3333
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, 12, v0
+; GFX942-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
+; GFX942-NEXT:    v_mov_b32_e32 v4, 0x402c3333
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, 13, v0
+; GFX942-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, 14, v0
+; GFX942-NEXT:    v_mov_b32_e32 v4, 0x33333333
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
+; GFX942-NEXT:    s_or_b64 vcc, vcc, s[0:1]
+; GFX942-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, 15, v0
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX942-NEXT:    v_mov_b32_e32 v1, 0x40301999
+; GFX942-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %ext = extractelement <16 x double> <double 1.1, double 2.1, double 3.1, double 4.1, double 5.1, double 6.1, double 7.1, double 8.1, double 9.1, double 10.1, double 11.1, double 12.1, double 13.1, double 14.1, double 15.1, double 16.1>, i32 %sel
   ret double %ext
diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll
index d32b528d13276..8d31b13765342 100644
--- a/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll
@@ -2,6 +2,7 @@
 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii < %s | FileCheck -check-prefixes=GFX678,GFX6 %s
 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amd-amdhsa -mcpu=fiji < %s | FileCheck -check-prefixes=GFX678,GFX8 %s
 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9 %s
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 < %s | FileCheck -check-prefixes=GFX942 %s
 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX12,GFX12-TRUE16 %s
@@ -49,6 +50,17 @@ define amdgpu_kernel void @v_test_canonicalize_var_f32(ptr addrspace(1) %out) #1
 ; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
+; GFX942-LABEL: v_test_canonicalize_var_f32:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX942-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX942-NEXT:    s_endpgm
+;
 ; GFX11-LABEL: v_test_canonicalize_var_f32:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
@@ -115,6 +127,16 @@ define amdgpu_kernel void @s_test_canonicalize_var_f32(ptr addrspace(1) %out, fl
 ; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
+; GFX942-LABEL: s_test_canonicalize_var_f32:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    v_max_f32_e64 v1, s2, s2
+; GFX942-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX942-NEXT:    s_endpgm
+;
 ; GFX11-LABEL: s_test_canonicalize_var_f32:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_clause 0x1
@@ -166,6 +188,17 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_f32(ptr addrspace(1) %ou
 ; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
+; GFX942-LABEL: v_test_canonicalize_fabs_var_f32:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    v_max_f32_e64 v1, |v1|, |v1|
+; GFX942-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX942-NEXT:    s_endpgm
+;
 ; GFX11-LABEL: v_test_canonicalize_fabs_var_f32:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
@@ -221,6 +254,17 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f32(ptr addrspace(1
 ; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
+; GFX942-LABEL: v_test_canonicalize_fneg_fabs_var_f32:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    v_max_f32_e64 v1, -|v1|, -|v1|
+; GFX942-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX942-NEXT:    s_endpgm
+;
 ; GFX11-LABEL: v_test_canonicalize_fneg_fabs_var_f32:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
@@ -277,6 +321,17 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_f32(ptr addrspace(1) %ou
 ; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
+; GFX942-LABEL: v_test_canonicalize_fneg_var_f32:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    v_max_f32_e64 v1, -v1, -v1
+; GFX942-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX942-NEXT:    s_endpgm
+;
 ; GFX11-LABEL: v_test_canonicalize_fneg_var_f32:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
@@ -327,6 +382,14 @@ define amdgpu_kernel void @test_fold_canonicalize_undef_f32(ptr addrspace(1) %ou
 ; GFX9-NEXT:    global_store_dword v0, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
+; GFX942-LABEL: test_fold_canonicalize_undef_f32:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    global_store_dword v0, v0, s[0:1]
+; GFX942-NEXT:    s_endpgm
+;
 ; GFX11-LABEL: test_fold_canonicalize_undef_f32:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
@@ -369,6 +432,14 @@ define amdgpu_kernel void @test_fold_canonicalize_p0_f32(ptr addrspace(1) %out)
 ; GFX9-NEXT:    global_store_dword v0, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
+; GFX942-LABEL: test_fold_canonicalize_p0_f32:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    global_store_dword v0, v0, s[0:1]
+; GFX942-NEXT:    s_endpgm
+;
 ; GFX11-LABEL: test_fold_canonicalize_p0_f32:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
@@ -412,6 +483,15 @@ define amdgpu_kernel void @test_fold_canonicalize_n0_f32(ptr addrspace(1) %out)
 ; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
+; GFX942-LABEL: test_fold_canonicalize_n0_f32:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    v_bfrev_b32_e32 v1, 1
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX942-NEXT:    s_endpgm
+;
 ; GFX11-LABEL: test_fold_canonicalize_n0_f32:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
@@ -457,6 +537,15 @@ define amdgpu_kernel void @test_fold_canonicalize_p1_f32(ptr addrspace(1) %out)
 ; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
+; GFX942-LABEL: test_fold_canonicalize_p1_f32:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    v_mov_b32_e32 v1, 1.0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX942-NEXT:    s_endpgm
+;
 ; GFX11-LABEL: test_fold_canonicalize_p1_f32:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
@@ -500,6 +589,15 @@ define amdgpu_kernel void @test_fold_canonicalize_n1_f32(ptr addrspace(1) %out)
 ; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
+; GFX942-LABEL: test_fold_canonicalize_n1_f32:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    v_mov_b32_e32 v1, -1.0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX942-NEXT:    s_endpgm
+;
 ; GFX11-LABEL: test_fold_canonicalize_n1_f32:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
@@ -543,6 +641,15 @@ define amdgpu_kernel void @test_fold_canonicalize_literal_f32(ptr addrspace(1) %
 ; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
+; GFX942-LABEL: test_fold_canonicalize_literal_f32:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    v_mov_b32_e32 v1, 0x41800000
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX942-NEXT:    s_endpgm
+;
 ; GFX11-LABEL: test_fold_canonicalize_literal_f32:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
@@ -585,6 +692,14 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32(ptr
 ; GFX9-NEXT:    global_store_dword v0, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
+; GFX942-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    global_store_dword v0, v0, s[0:1]
+; GFX942-NEXT:    s_endpgm
+;
 ; GFX11-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
@@ -630,6 +745,16 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32_dyn
 ; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
+; GFX942-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32_dynamic:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NEXT:    s_mov_b32 s2, 0x7fffff
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    v_max_f32_e64 v1, s2, s2
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX942-NEXT:    s_endpgm
+;
 ; GFX11-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32_dynamic:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
@@ -677,6 +802,16 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32_dyn
 ; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
+; GFX942-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32_dynamic_out:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NEXT:    s_mov_b32 s2, 0x7fffff
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    v_max_f32_e64 v1, s2, s2
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX942-NEXT:    s_endpgm
+;
 ; GFX11-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32_dynamic_out:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
@@ -724,6 +859,16 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32_dyn
 ; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
+; GFX942-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32_dynamic_in:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NEXT:    s_mov_b32 s2, 0x7fffff
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    v_max_f32_e64 v1, s2, s2
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX942-NEXT:    s_endpgm
+;
 ; GFX11-LABEL: test_no_denormals_fold_canonicalize_denormal0_f32_dynamic_in:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
@@ -769,6 +914,15 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f32(ptr ad
 ; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
+; GFX942-LABEL: test_denormals_fold_canonicalize_denormal0_f32:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    v_mov_b32_e32 v1, 0x7fffff
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX942-NEXT:    s_endpgm
+;
 ; GFX11-LABEL: test_denormals_fold_canonicalize_denormal0_f32:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
@@ -812,6 +966,15 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_f32(ptr
 ; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
+; GFX942-LABEL: test_no_denormals_fold_canonicalize_denormal1_f32:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    v_bfrev_b32_e32 v1, 1
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX942-NEXT:    s_endpgm
+;
 ; GFX11-LABEL: test_no_denormals_fold_canonicalize_denormal1_f32:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
@@ -857,6 +1020,15 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f32(ptr ad
 ; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
+; GFX942-LABEL: test_denormals_fold_canonicalize_denormal1_f32:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    v_mov_b32_e32 v1, 0x807fffff
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX942-NEXT:    s_endpgm
+;
 ; GFX11-LABEL: test_denormals_fold_canonicalize_denormal1_f32:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
@@ -900,6 +1072,15 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_f32(ptr addrspace(1) %out
 ; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
+; GFX942-LABEL: test_fold_canonicalize_qnan_f32:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    v_mov_b32_e32 v1, 0x7fc00000
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX942-NEXT:    s_endpgm
+;
 ; GFX11-LABEL: test_fold_canonicalize_qnan_f32:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
@@ -943,6 +1124,15 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f32(ptr addrsp
 ; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
+; GFX942-LABEL: test_fold_canonicalize_qnan_value_neg1_f32:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    v_mov_b32_e32 v1, 0x7fc00000
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX942-NEXT:    s_endpgm
+;
 ; GFX11-LABEL: test_fold_canonicalize_qnan_value_neg1_f32:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
@@ -986,6 +1176,15 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f32(ptr addrsp
 ; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
+; GFX942-LABEL: test_fold_canonicalize_qnan_value_neg2_f32:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    v_mov_b32_e32 v1, 0x7fc00000
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX942-NEXT:    s_endpgm
+;
 ; GFX11-LABEL: test_fold_canonicalize_qnan_value_neg2_f32:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
@@ -1029,6 +1228,15 @@ define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f32(ptr addrspace(
 ; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
+; GFX942-LABEL: test_fold_canonicalize_snan0_value_f32:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    v_mov_b32_e32 v1, 0x7fc00000
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX942-NEXT:    s_endpgm
+;
 ; GFX11-LABEL: test_fold_canonicalize_snan0_value_f32:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
@@ -1072,6 +1280,15 @@ define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f32(ptr addrspace(
 ; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
+; GFX942-LABEL: test_fold_canonicalize_snan1_value_f32:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    v_mov_b32_e32 v1, 0x7fc00000
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX942-NEXT:    s_endpgm
+;
 ; GFX11-LABEL: test_fold_canonicalize_snan1_value_f32:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
@@ -1115,6 +1332,15 @@ define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f32(ptr addrspace(
 ; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
+; GFX942-LABEL: test_fold_canonicalize_snan2_value_f32:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    v_mov_b32_e32 v1, 0x7fc00000
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX942-NEXT:    s_endpgm
+;
 ; GFX11-LABEL: test_fold_canonicalize_snan2_value_f32:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
@@ -1158,6 +1384,15 @@ define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f32(ptr addrspace(
 ; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
+; GFX942-LABEL: test_fold_canonicalize_snan3_value_f32:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    v_mov_b32_e32 v1, 0x7fc00000
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX942-NEXT:    s_endpgm
+;
 ; GFX11-LABEL: test_fold_canonicalize_snan3_value_f32:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
@@ -1205,6 +1440,17 @@ define amdgpu_kernel void @v_test_canonicalize_var_f64(ptr addrspace(1) %out) #1
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
+; GFX942-LABEL: v_test_canonicalize_var_f64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    global_load_dwordx2 v[0:1], v2, s[0:1]
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
+; GFX942-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX942-NEXT:    s_endpgm
+;
 ; GFX11-LABEL: v_test_canonicalize_var_f64:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
@@ -1268,6 +1514,15 @@ define amdgpu_kernel void @s_test_canonicalize_var_f64(ptr addrspace(1) %out, do
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
+; GFX942-LABEL: s_test_canonicalize_var_f64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    v_max_f64 v[0:1], s[2:3], s[2:3]
+; GFX942-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX942-NEXT:    s_endpgm
+;
 ; GFX11-LABEL: s_test_canonicalize_var_f64:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
@@ -1317,6 +1572,17 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_f64(ptr addrspace(1) %ou
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
+; GFX942-LABEL: v_test_canonicalize_fabs_var_f64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    global_load_dwordx2 v[0:1], v2, s[0:1]
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    v_max_f64 v[0:1], |v[0:1]|, |v[0:1]|
+; GFX942-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX942-NEXT:    s_endpgm
+;
 ; GFX11-LABEL: v_test_canonicalize_fabs_var_f64:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
@@ -1372,6 +1638,17 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f64(ptr addrspace(1
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
+; GFX942-LABEL: v_test_canonicalize_fneg_fabs_var_f64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    global_load_dwordx2 v[0:1], v2, s[0:1]
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    v_max_f64 v[0:1], -|v[0:1]|, -|v[0:1]|
+; GFX942-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX942-NEXT:    s_endpgm
+;
 ; GFX11-LABEL: v_test_canonicalize_fneg_fabs_var_f64:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
@@ -1428,6 +1705,17 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_f64(ptr addrspace(1) %ou
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
+; GFX942-LABEL: v_test_canonicalize_fneg_var_f64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    global_load_dwordx2 v[0:1], v2, s[0:1]
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    v_max_f64 v[0:1], -v[0:1], -v[0:1]
+; GFX942-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX942-NEXT:    s_endpgm
+;
 ; GFX11-LABEL: v_test_canonicalize_fneg_var_f64:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
@@ -1480,6 +1768,15 @@ define amdgpu_kernel void @test_fold_canonicalize_p0_f64(ptr addrspace(1) %out)
 ; GFX9-NEXT:    global_store_dwordx2 v0, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
+; GFX942-LABEL: test_fold_canonicalize_p0_f64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    v_mov_b32_e32 v1, v0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    global_store_dwordx2 v0, v[0:1], s[0:1]
+; GFX942-NEXT:    s_endpgm
+;
 ; GFX11-LABEL: test_fold_canonicalize_p0_f64:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
@@ -1528,6 +1825,15 @@ define amdgpu_kernel void @test_fold_canonicalize_n0_f64(ptr addrspace(1) %out)
 ; GFX9-NEXT:    global_store_dwordx2 v0, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
+; GFX942-LABEL: test_fold_canonicalize_n0_f64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    v_bfrev_b32_e32 v1, 1
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    global_store_dwordx2 v0, v[0:1], s[0:1]
+; GFX942-NEXT:    s_endpgm
+;
 ; GFX11-LABEL: test_fold_canonicalize_n0_f64:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
@@ -1574,6 +1880,15 @@ define amdgpu_kernel void @test_fold_canonicalize_p1_f64(ptr addrspace(1) %out)
 ; GFX9-NEXT:    global_store_dwordx2 v0, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
+; GFX942-LABEL: test_fold_canonicalize_p1_f64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    v_mov_b32_e32 v1, 0x3ff00000
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    global_store_dwordx2 v0, v[0:1], s[0:1]
+; GFX942-NEXT:    s_endpgm
+;
 ; GFX11-LABEL: test_fold_canonicalize_p1_f64:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
@@ -1618,6 +1933,15 @@ define amdgpu_kernel void @test_fold_canonicalize_n1_f64(ptr addrspace(1) %out)
 ; GFX9-NEXT:    global_store_dwordx2 v0, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
+; GFX942-LABEL: test_fold_canonicalize_n1_f64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    v_mov_b32_e32 v1, 0xbff00000
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    global_store_dwordx2 v0, v[0:1], s[0:1]
+; GFX942-NEXT:    s_endpgm
+;
 ; GFX11-LABEL: test_fold_canonicalize_n1_f64:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
@@ -1662,6 +1986,15 @@ define amdgpu_kernel void @test_fold_canonicalize_literal_f64(ptr addrspace(1) %
 ; GFX9-NEXT:    global_store_dwordx2 v0, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
+; GFX942-LABEL: test_fold_canonicalize_literal_f64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    v_mov_b32_e32 v1, 0x40300000
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    global_store_dwordx2 v0, v[0:1], s[0:1]
+; GFX942-NEXT:    s_endpgm
+;
 ; GFX11-LABEL: test_fold_canonicalize_literal_f64:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
@@ -1706,6 +2039,15 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f64(ptr
 ; GFX9-NEXT:    global_store_dwordx2 v0, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
+; GFX942-LABEL: test_no_denormals_fold_canonicalize_denormal0_f64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    v_mov_b32_e32 v1, v0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    global_store_dwordx2 v0, v[0:1], s[0:1]
+; GFX942-NEXT:    s_endpgm
+;
 ; GFX11-LABEL: test_no_denormals_fold_canonicalize_denormal0_f64:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
@@ -1755,6 +2097,16 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f64(ptr ad
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
+; GFX942-LABEL: test_denormals_fold_canonicalize_denormal0_f64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-NEXT:    v_mov_b32_e32 v0, -1
+; GFX942-NEXT:    v_mov_b32_e32 v1, 0xfffff
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX942-NEXT:    s_endpgm
+;
 ; GFX11-LABEL: test_denormals_fold_canonicalize_denormal0_f64:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
@@ -1801,6 +2153,15 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_f64(ptr
 ; GFX9-NEXT:    global_store_dwordx2 v0, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
+; GFX942-LABEL: test_no_denormals_fold_canonicalize_denormal1_f64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    v_bfrev_b32_e32 v1, 1
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    global_store_dwordx2 v0, v[0:1], s[0:1]
+; GFX942-NEXT:    s_endpgm
+;
 ; GFX11-LABEL: test_no_denormals_fold_canonicalize_denormal1_f64:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
@@ -1848,6 +2209,16 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f64(ptr ad
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
+; GFX942-LABEL: test_denormals_fold_canonicalize_denormal1_f64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-NEXT:    v_mov_b32_e32 v0, -1
+; GFX942-NEXT:    v_mov_b32_e32 v1, 0x800fffff
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX942-NEXT:    s_endpgm
+;
 ; GFX11-LABEL: test_denormals_fold_canonicalize_denormal1_f64:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
@@ -1894,6 +2265,15 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_f64(ptr addrspace(1) %out
 ; GFX9-NEXT:    global_store_dwordx2 v0, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
+; GFX942-LABEL: test_fold_canonicalize_qnan_f64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    v_mov_b32_e32 v1, 0x7ff80000
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    global_store_dwordx2 v0, v[0:1], s[0:1]
+; GFX942-NEXT:    s_endpgm
+;
 ; GFX11-LABEL: test_fold_canonicalize_qnan_f64:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
@@ -1938,6 +2318,15 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f64(ptr addrsp
 ; GFX9-NEXT:    global_store_dwordx2 v0, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
+; GFX942-LABEL: test_fold_canonicalize_qnan_value_neg1_f64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    v_mov_b32_e32 v1, 0x7ff80000
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    global_store_dwordx2 v0, v[0:1], s[0:1]
+; GFX942-NEXT:    s_endpgm
+;
 ; GFX11-LABEL: test_fold_canonicalize_qnan_value_neg1_f64:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
@@ -1982,6 +2371,15 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f64(ptr addrsp
 ; GFX9-NEXT:    global_store_dwordx2 v0, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
+; GFX942-LABEL: test_fold_canonicalize_qnan_value_neg2_f64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    v_mov_b32_e32 v1, 0x7ff80000
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    global_store_dwordx2 v0, v[0:1], s[0:1]
+; GFX942-NEXT:    s_endpgm
+;
 ; GFX11-LABEL: test_fold_canonicalize_qnan_value_neg2_f64:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
@@ -2026,6 +2424,15 @@ define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f64(ptr addrspace(
 ; GFX9-NEXT:    global_store_dwordx2 v0, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
+; GFX942-LABEL: test_fold_canonicalize_snan0_value_f64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    v_mov_b32_e32 v1, 0x7ff80000
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    global_store_dwordx2 v0, v[0:1], s[0:1]
+; GFX942-NEXT:    s_endpgm
+;
 ; GFX11-LABEL: test_fold_canonicalize_snan0_value_f64:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
@@ -2070,6 +2477,15 @@ define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f64(ptr addrspace(
 ; GFX9-NEXT:    global_store_dwordx2 v0, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
+; GFX942-LABEL: test_fold_canonicalize_snan1_value_f64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    v_mov_b32_e32 v1, 0x7ff80000
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    global_store_dwordx2 v0, v[0:1], s[0:1]
+; GFX942-NEXT:    s_endpgm
+;
 ; GFX11-LABEL: test_fold_canonicalize_snan1_value_f64:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
@@ -2114,6 +2530,15 @@ define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f64(ptr addrspace(
 ; GFX9-NEXT:    global_store_dwordx2 v0, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
+; GFX942-LABEL: test_fold_canonicalize_snan2_value_f64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    v_mov_b32_e32 v1, 0x7ff80000
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    global_store_dwordx2 v0, v[0:1], s[0:1]
+; GFX942-NEXT:    s_endpgm
+;
 ; GFX11-LABEL: test_fold_canonicalize_snan2_value_f64:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
@@ -2158,6 +2583,15 @@ define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f64(ptr addrspace(
 ; GFX9-NEXT:    global_store_dwordx2 v0, v[0:1], s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
+; GFX942-LABEL: test_fold_canonicalize_snan3_value_f64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    v_mov_b32_e32 v1, 0x7ff80000
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    global_store_dwordx2 v0, v[0:1], s[0:1]
+; GFX942-NEXT:    s_endpgm
+;
 ; GFX11-LABEL: test_fold_canonicalize_snan3_value_f64:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
@@ -2230,6 +2664,18 @@ define amdgpu_kernel void @test_canonicalize_value_f64_flush(ptr addrspace(1) %a
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
 ; GFX9-NEXT:    s_endpgm
 ;
+; GFX942-LABEL: test_canonicalize_value_f64_flush:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX942-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX942-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    global_load_dwordx2 v[0:1], v2, s[0:1]
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
+; GFX942-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX942-NEXT:    s_endpgm
+;
 ; GFX11-LABEL: test_canonicalize_value_f64_flush:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
@@ -2316,6 +2762,18 @@ define amdgpu_kernel void @test_canonicalize_value_f32_flush(ptr addrspace(1) %a
 ; GFX9-NEXT:    global_store_dword v0, v1, s[2:3]
 ; GFX9-NEXT:    s_endpgm
 ;
+; GFX942-LABEL: test_canonicalize_value_f32_flush:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX942-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX942-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX942-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX942-NEXT:    s_endpgm
+;
 ; GFX11-LABEL: test_canonicalize_value_f32_flush:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
@@ -2403,6 +2861,18 @@ define amdgpu_kernel void @test_canonicalize_value_f16_flush(ptr addrspace(1) %a
 ; GFX9-NEXT:    global_store_short v0, v1, s[2:3]
 ; GFX9-NEXT:    s_endpgm
 ;
+; GFX942-LABEL: test_canonicalize_value_f16_flush:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX942-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX942-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    global_load_ushort v1, v0, s[0:1]
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    v_max_f16_e32 v1, v1, v1
+; GFX942-NEXT:    global_store_short v0, v1, s[2:3]
+; GFX942-NEXT:    s_endpgm
+;
 ; GFX11-TRUE16-LABEL: test_canonicalize_value_f16_flush:
 ; GFX11-TRUE16:       ; %bb.0:
 ; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
@@ -2525,6 +2995,18 @@ define amdgpu_kernel void @test_canonicalize_value_v2f16_flush(ptr addrspace(1)
 ; GFX9-NEXT:    global_store_dword v0, v1, s[2:3]
 ; GFX9-NEXT:    s_endpgm
 ;
+; GFX942-LABEL: test_canonicalize_value_v2f16_flush:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX942-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX942-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    v_pk_max_f16 v1, v1, v1
+; GFX942-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX942-NEXT:    s_endpgm
+;
 ; GFX11-LABEL: test_canonicalize_value_v2f16_flush:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
@@ -2611,6 +3093,18 @@ define amdgpu_kernel void @test_canonicalize_value_f64_denorm(ptr addrspace(1) %
 ; GFX9-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
 ; GFX9-NEXT:    s_endpgm
 ;
+; GFX942-LABEL: test_canonicalize_value_f64_denorm:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX942-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX942-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    global_load_dwordx2 v[0:1], v2, s[0:1]
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
+; GFX942-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX942-NEXT:    s_endpgm
+;
 ; GFX11-LABEL: test_canonicalize_value_f64_denorm:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
@@ -2697,6 +3191,18 @@ define amdgpu_kernel void @test_canonicalize_value_f32_denorm(ptr addrspace(1) %
 ; GFX9-NEXT:    global_store_dword v0, v1, s[2:3]
 ; GFX9-NEXT:    s_endpgm
 ;
+; GFX942-LABEL: test_canonicalize_value_f32_denorm:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX942-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX942-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX942-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX942-NEXT:    s_endpgm
+;
 ; GFX11-LABEL: test_canonicalize_value_f32_denorm:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
@@ -2785,6 +3291,18 @@ define amdgpu_kernel void @test_canonicalize_value_f16_denorm(ptr addrspace(1) %
 ; GFX9-NEXT:    global_store_short v0, v1, s[2:3]
 ; GFX9-NEXT:    s_endpgm
 ;
+; GFX942-LABEL: test_canonicalize_value_f16_denorm:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX942-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX942-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    global_load_ushort v1, v0, s[0:1]
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    v_max_f16_e32 v1, v1, v1
+; GFX942-NEXT:    global_store_short v0, v1, s[2:3]
+; GFX942-NEXT:    s_endpgm
+;
 ; GFX11-TRUE16-LABEL: test_canonicalize_value_f16_denorm:
 ; GFX11-TRUE16:       ; %bb.0:
 ; GFX11-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
@@ -2907,6 +3425,18 @@ define amdgpu_kernel void @test_canonicalize_value_v2f16_denorm(ptr addrspace(1)
 ; GFX9-NEXT:    global_store_dword v0, v1, s[2:3]
 ; GFX9-NEXT:    s_endpgm
 ;
+; GFX942-LABEL: test_canonicalize_value_v2f16_denorm:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX942-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX942-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    global_load_dword v1, v0, s[0:1]
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    v_pk_max_f16 v1, v1, v1
+; GFX942-NEXT:    global_store_dword v0, v1, s[2:3]
+; GFX942-NEXT:    s_endpgm
+;
 ; GFX11-LABEL: test_canonicalize_value_v2f16_denorm:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
@@ -2995,6 +3525,20 @@ define amdgpu_kernel void @v_test_canonicalize_var_v2f64(ptr addrspace(1) %out)
 ; GFX9-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
+; GFX942-LABEL: v_test_canonicalize_var_v2f64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX942-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
+; GFX942-NEXT:    v_mov_b32_e32 v4, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    global_load_dwordx4 v[0:3], v0, s[0:1]
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX942-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
+; GFX942-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX942-NEXT:    s_endpgm
+;
 ; GFX11-LABEL: v_test_canonicalize_var_v2f64:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
@@ -3048,6 +3592,13 @@ define <2 x float> @v_test_canonicalize_v2f32_flush(<2 x float> %arg) #1 {
 ; GFX9-NEXT:    v_max_f32_e32 v1, v1, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX942-LABEL: v_test_canonicalize_v2f32_flush:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX942-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX11-LABEL: v_test_canonicalize_v2f32_flush:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3085,6 +3636,14 @@ define <3 x float> @v_test_canonicalize_v3f32_flush(<3 x float> %arg) #1 {
 ; GFX9-NEXT:    v_max_f32_e32 v2, v2, v2
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX942-LABEL: v_test_canonicalize_v3f32_flush:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX942-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX942-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX11-LABEL: v_test_canonicalize_v3f32_flush:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3126,6 +3685,15 @@ define <4 x float> @v_test_canonicalize_v4f32_flush(<4 x float> %arg) #1 {
 ; GFX9-NEXT:    v_max_f32_e32 v3, v3, v3
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX942-LABEL: v_test_canonicalize_v4f32_flush:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX942-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX942-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX942-NEXT:    v_max_f32_e32 v3, v3, v3
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX11-LABEL: v_test_canonicalize_v4f32_flush:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3175,6 +3743,19 @@ define <8 x float> @v_test_canonicalize_v8f32_flush(<8 x float> %arg) #1 {
 ; GFX9-NEXT:    v_max_f32_e32 v7, v7, v7
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX942-LABEL: v_test_canonicalize_v8f32_flush:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_max_f32_e32 v0, v0, v0
+; GFX942-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX942-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX942-NEXT:    v_max_f32_e32 v3, v3, v3
+; GFX942-NEXT:    v_max_f32_e32 v4, v4, v4
+; GFX942-NEXT:    v_max_f32_e32 v5, v5, v5
+; GFX942-NEXT:    v_max_f32_e32 v6, v6, v6
+; GFX942-NEXT:    v_max_f32_e32 v7, v7, v7
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX11-LABEL: v_test_canonicalize_v8f32_flush:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3215,6 +3796,13 @@ define <2 x double> @v_test_canonicalize_v2f64(<2 x double> %arg) #1 {
 ; GFX9-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX942-LABEL: v_test_canonicalize_v2f64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
+; GFX942-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX11-LABEL: v_test_canonicalize_v2f64:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3253,6 +3841,14 @@ define <3 x double> @v_test_canonicalize_v3f64(<3 x double> %arg) #1 {
 ; GFX9-NEXT:    v_max_f64 v[4:5], v[4:5], v[4:5]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX942-LABEL: v_test_canonicalize_v3f64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
+; GFX942-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX942-NEXT:    v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX11-LABEL: v_test_canonicalize_v3f64:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3295,6 +3891,15 @@ define <4 x double> @v_test_canonicalize_v4f64(<4 x double> %arg) #1 {
 ; GFX9-NEXT:    v_max_f64 v[6:7], v[6:7], v[6:7]
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX942-LABEL: v_test_canonicalize_v4f64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_max_f64 v[0:1], v[0:1], v[0:1]
+; GFX942-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX942-NEXT:    v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX942-NEXT:    v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX11-LABEL: v_test_canonicalize_v4f64:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/fceil64.ll b/llvm/test/CodeGen/AMDGPU/fceil64.ll
index bd1f98a39c252..931d2f54edec4 100644
--- a/llvm/test/CodeGen/AMDGPU/fceil64.ll
+++ b/llvm/test/CodeGen/AMDGPU/fceil64.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc -mtriple=amdgcn < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=GFX942 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=bonaire < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=CI -check-prefix=FUNC %s
 ; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=CI -check-prefix=FUNC %s
 
@@ -44,6 +45,15 @@ define amdgpu_kernel void @fceil_f64(ptr addrspace(1) %out, double %x) {
 ; SI-NEXT:    v_add_f64 v[0:1], s[8:9], v[0:1]
 ; SI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
+;
+; GFX942-LABEL: fceil_f64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    v_ceil_f64_e32 v[0:1], s[2:3]
+; GFX942-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX942-NEXT:    s_endpgm
   %y = call double @llvm.ceil.f64(double %x) nounwind readnone
   store double %y, ptr addrspace(1) %out
   ret void
@@ -102,6 +112,17 @@ define amdgpu_kernel void @fceil_v2f64(ptr addrspace(1) %out, <2 x double> %x) {
 ; SI-NEXT:    v_add_f64 v[0:1], s[6:7], v[0:1]
 ; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; SI-NEXT:    s_endpgm
+;
+; GFX942-LABEL: fceil_v2f64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x34
+; GFX942-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v4, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    v_ceil_f64_e32 v[2:3], s[2:3]
+; GFX942-NEXT:    v_ceil_f64_e32 v[0:1], s[0:1]
+; GFX942-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7]
+; GFX942-NEXT:    s_endpgm
   %y = call <2 x double> @llvm.ceil.v2f64(<2 x double> %x) nounwind readnone
   store <2 x double> %y, ptr addrspace(1) %out
   ret void
@@ -211,6 +232,20 @@ define amdgpu_kernel void @fceil_v4f64(ptr addrspace(1) %out, <4 x double> %x) {
 ; SI-NEXT:    buffer_store_dwordx4 v[6:9], off, s[8:11], 0 offset:16
 ; SI-NEXT:    buffer_store_dwordx4 v[2:5], off, s[8:11], 0
 ; SI-NEXT:    s_endpgm
+;
+; GFX942-LABEL: fceil_v4f64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x44
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v8, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    v_ceil_f64_e32 v[6:7], s[14:15]
+; GFX942-NEXT:    v_ceil_f64_e32 v[4:5], s[12:13]
+; GFX942-NEXT:    v_ceil_f64_e32 v[2:3], s[10:11]
+; GFX942-NEXT:    v_ceil_f64_e32 v[0:1], s[8:9]
+; GFX942-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
+; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]
+; GFX942-NEXT:    s_endpgm
   %y = call <4 x double> @llvm.ceil.v4f64(<4 x double> %x) nounwind readnone
   store <4 x double> %y, ptr addrspace(1) %out
   ret void
@@ -394,6 +429,26 @@ define amdgpu_kernel void @fceil_v8f64(ptr addrspace(1) %out, <8 x double> %x) {
 ; SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[16:19], 0 offset:48
 ; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[16:19], 0
 ; SI-NEXT:    s_endpgm
+;
+; GFX942-LABEL: fceil_v8f64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x64
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v16, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    v_ceil_f64_e32 v[14:15], s[22:23]
+; GFX942-NEXT:    v_ceil_f64_e32 v[12:13], s[20:21]
+; GFX942-NEXT:    v_ceil_f64_e32 v[2:3], s[10:11]
+; GFX942-NEXT:    v_ceil_f64_e32 v[0:1], s[8:9]
+; GFX942-NEXT:    v_ceil_f64_e32 v[6:7], s[14:15]
+; GFX942-NEXT:    v_ceil_f64_e32 v[4:5], s[12:13]
+; GFX942-NEXT:    v_ceil_f64_e32 v[10:11], s[18:19]
+; GFX942-NEXT:    v_ceil_f64_e32 v[8:9], s[16:17]
+; GFX942-NEXT:    global_store_dwordx4 v16, v[12:15], s[0:1] offset:48
+; GFX942-NEXT:    global_store_dwordx4 v16, v[8:11], s[0:1] offset:32
+; GFX942-NEXT:    global_store_dwordx4 v16, v[4:7], s[0:1] offset:16
+; GFX942-NEXT:    global_store_dwordx4 v16, v[0:3], s[0:1]
+; GFX942-NEXT:    s_endpgm
   %y = call <8 x double> @llvm.ceil.v8f64(<8 x double> %x) nounwind readnone
   store <8 x double> %y, ptr addrspace(1) %out
   ret void
@@ -747,6 +802,39 @@ define amdgpu_kernel void @fceil_v16f64(ptr addrspace(1) %out, <16 x double> %x)
 ; SI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[24:27], 0 offset:112
 ; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[24:27], 0
 ; SI-NEXT:    s_endpgm
+;
+; GFX942-LABEL: fceil_v16f64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0xe4
+; GFX942-NEXT:    s_load_dwordx16 s[36:51], s[4:5], 0xa4
+; GFX942-NEXT:    v_mov_b32_e32 v32, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    v_ceil_f64_e32 v[30:31], s[22:23]
+; GFX942-NEXT:    v_ceil_f64_e32 v[28:29], s[20:21]
+; GFX942-NEXT:    v_ceil_f64_e32 v[2:3], s[38:39]
+; GFX942-NEXT:    v_ceil_f64_e32 v[0:1], s[36:37]
+; GFX942-NEXT:    v_ceil_f64_e32 v[6:7], s[42:43]
+; GFX942-NEXT:    v_ceil_f64_e32 v[4:5], s[40:41]
+; GFX942-NEXT:    v_ceil_f64_e32 v[10:11], s[46:47]
+; GFX942-NEXT:    v_ceil_f64_e32 v[8:9], s[44:45]
+; GFX942-NEXT:    v_ceil_f64_e32 v[14:15], s[50:51]
+; GFX942-NEXT:    v_ceil_f64_e32 v[12:13], s[48:49]
+; GFX942-NEXT:    v_ceil_f64_e32 v[18:19], s[10:11]
+; GFX942-NEXT:    v_ceil_f64_e32 v[16:17], s[8:9]
+; GFX942-NEXT:    v_ceil_f64_e32 v[22:23], s[14:15]
+; GFX942-NEXT:    v_ceil_f64_e32 v[20:21], s[12:13]
+; GFX942-NEXT:    v_ceil_f64_e32 v[26:27], s[18:19]
+; GFX942-NEXT:    v_ceil_f64_e32 v[24:25], s[16:17]
+; GFX942-NEXT:    global_store_dwordx4 v32, v[28:31], s[0:1] offset:112
+; GFX942-NEXT:    global_store_dwordx4 v32, v[24:27], s[0:1] offset:96
+; GFX942-NEXT:    global_store_dwordx4 v32, v[20:23], s[0:1] offset:80
+; GFX942-NEXT:    global_store_dwordx4 v32, v[16:19], s[0:1] offset:64
+; GFX942-NEXT:    global_store_dwordx4 v32, v[12:15], s[0:1] offset:48
+; GFX942-NEXT:    global_store_dwordx4 v32, v[8:11], s[0:1] offset:32
+; GFX942-NEXT:    global_store_dwordx4 v32, v[4:7], s[0:1] offset:16
+; GFX942-NEXT:    global_store_dwordx4 v32, v[0:3], s[0:1]
+; GFX942-NEXT:    s_endpgm
   %y = call <16 x double> @llvm.ceil.v16f64(<16 x double> %x) nounwind readnone
   store <16 x double> %y, ptr addrspace(1) %out
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/fence-lds-read2-write2.ll b/llvm/test/CodeGen/AMDGPU/fence-lds-read2-write2.ll
index 607ed85274e40..bf87b01d84736 100644
--- a/llvm/test/CodeGen/AMDGPU/fence-lds-read2-write2.ll
+++ b/llvm/test/CodeGen/AMDGPU/fence-lds-read2-write2.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 < %s | FileCheck -check-prefixes=GCN,GFX942 %s
 
 @lds = internal addrspace(3) global [576 x double] poison, align 16
 
@@ -7,28 +8,50 @@
 ; block. When sorted by offset, the merges would fail. We should form
 ; two groupings of ds_write2_b64 on either side of the fence.
 define amdgpu_kernel void @same_address_fence_merge_write2() #0 {
-; GCN-LABEL: same_address_fence_merge_write2:
-; GCN:       ; %bb.0: ; %bb
-; GCN-NEXT:    s_mov_b32 s0, 0
-; GCN-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
-; GCN-NEXT:    s_mov_b32 s1, 0x40100000
-; GCN-NEXT:    v_mov_b32_e32 v0, s0
-; GCN-NEXT:    v_mov_b32_e32 v1, s1
-; GCN-NEXT:    v_add_u32_e32 v3, 0x800, v2
-; GCN-NEXT:    ds_write2_b64 v2, v[0:1], v[0:1] offset1:66
-; GCN-NEXT:    ds_write2_b64 v2, v[0:1], v[0:1] offset0:132 offset1:198
-; GCN-NEXT:    ds_write2_b64 v3, v[0:1], v[0:1] offset0:8 offset1:74
-; GCN-NEXT:    ds_write2_b64 v3, v[0:1], v[0:1] offset0:140 offset1:206
-; GCN-NEXT:    s_mov_b32 s1, 0x3ff00000
-; GCN-NEXT:    v_mov_b32_e32 v0, s0
-; GCN-NEXT:    v_mov_b32_e32 v1, s1
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_barrier
-; GCN-NEXT:    ds_write2_b64 v2, v[0:1], v[0:1] offset1:66
-; GCN-NEXT:    ds_write2_b64 v2, v[0:1], v[0:1] offset0:132 offset1:198
-; GCN-NEXT:    ds_write2_b64 v3, v[0:1], v[0:1] offset0:8 offset1:74
-; GCN-NEXT:    ds_write2_b64 v3, v[0:1], v[0:1] offset0:140 offset1:206
-; GCN-NEXT:    s_endpgm
+; GFX9-LABEL: same_address_fence_merge_write2:
+; GFX9:       ; %bb.0: ; %bb
+; GFX9-NEXT:    s_mov_b32 s0, 0
+; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
+; GFX9-NEXT:    s_mov_b32 s1, 0x40100000
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    v_add_u32_e32 v3, 0x800, v2
+; GFX9-NEXT:    ds_write2_b64 v2, v[0:1], v[0:1] offset1:66
+; GFX9-NEXT:    ds_write2_b64 v2, v[0:1], v[0:1] offset0:132 offset1:198
+; GFX9-NEXT:    ds_write2_b64 v3, v[0:1], v[0:1] offset0:8 offset1:74
+; GFX9-NEXT:    ds_write2_b64 v3, v[0:1], v[0:1] offset0:140 offset1:206
+; GFX9-NEXT:    s_mov_b32 s1, 0x3ff00000
+; GFX9-NEXT:    v_mov_b32_e32 v0, s0
+; GFX9-NEXT:    v_mov_b32_e32 v1, s1
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_barrier
+; GFX9-NEXT:    ds_write2_b64 v2, v[0:1], v[0:1] offset1:66
+; GFX9-NEXT:    ds_write2_b64 v2, v[0:1], v[0:1] offset0:132 offset1:198
+; GFX9-NEXT:    ds_write2_b64 v3, v[0:1], v[0:1] offset0:8 offset1:74
+; GFX9-NEXT:    ds_write2_b64 v3, v[0:1], v[0:1] offset0:140 offset1:206
+; GFX9-NEXT:    s_endpgm
+;
+; GFX942-LABEL: same_address_fence_merge_write2:
+; GFX942:       ; %bb.0: ; %bb
+; GFX942-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
+; GFX942-NEXT:    v_and_b32_e32 v2, 0x1ff8, v0
+; GFX942-NEXT:    s_mov_b32 s1, 0x40100000
+; GFX942-NEXT:    s_mov_b32 s0, 0
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-NEXT:    v_add_u32_e32 v3, 0x800, v2
+; GFX942-NEXT:    s_mov_b32 s1, 0x3ff00000
+; GFX942-NEXT:    ds_write2_b64 v2, v[0:1], v[0:1] offset1:66
+; GFX942-NEXT:    ds_write2_b64 v2, v[0:1], v[0:1] offset0:132 offset1:198
+; GFX942-NEXT:    ds_write2_b64 v3, v[0:1], v[0:1] offset0:8 offset1:74
+; GFX942-NEXT:    ds_write2_b64 v3, v[0:1], v[0:1] offset0:140 offset1:206
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_barrier
+; GFX942-NEXT:    ds_write2_b64 v2, v[0:1], v[0:1] offset1:66
+; GFX942-NEXT:    ds_write2_b64 v2, v[0:1], v[0:1] offset0:132 offset1:198
+; GFX942-NEXT:    ds_write2_b64 v3, v[0:1], v[0:1] offset0:8 offset1:74
+; GFX942-NEXT:    ds_write2_b64 v3, v[0:1], v[0:1] offset0:140 offset1:206
+; GFX942-NEXT:    s_endpgm
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x(), !range !0
   %tmp1 = getelementptr inbounds [576 x double], ptr addrspace(3) @lds, i32 0, i32 %tmp
@@ -68,3 +91,5 @@ attributes #0 = { nounwind readnone speculatable }
 attributes #1 = { convergent nounwind }
 
 !0 = !{i32 0, i32 1024}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GCN: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll b/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll
index 189b897793381..b258ac1edcbea 100644
--- a/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll
@@ -1,6 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
 ; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,SDAG %s
 ; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GISEL %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 < %s | FileCheck -check-prefixes=GFX942,GFX942-SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 < %s | FileCheck -check-prefixes=GFX942,GFX942-GISEL %s
 
 define i128 @fptosi_f64_to_i128(double %x) {
 ; SDAG-LABEL: fptosi_f64_to_i128:
@@ -362,6 +364,388 @@ define i128 @fptosi_f64_to_i128(double %x) {
 ; GISEL-NEXT:  .LBB0_10: ; %fp-to-i-cleanup
 ; GISEL-NEXT:    s_or_b64 exec, exec, s[12:13]
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-SDAG-LABEL: fptosi_f64_to_i128:
+; GFX942-SDAG:       ; %bb.0: ; %fp-to-i-entry
+; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v5, v1
+; GFX942-SDAG-NEXT:    v_bfe_u32 v8, v5, 20, 11
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v9, 0
+; GFX942-SDAG-NEXT:    s_mov_b64 s[0:1], 0x3fe
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v4, v0
+; GFX942-SDAG-NEXT:    v_cmp_lt_u64_e32 vcc, s[0:1], v[8:9]
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[0:1], 0
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[2:3], 0
+; GFX942-SDAG-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX942-SDAG-NEXT:    s_cbranch_execz .LBB0_10
+; GFX942-SDAG-NEXT:  ; %bb.1: ; %fp-to-i-if-end
+; GFX942-SDAG-NEXT:    v_add_co_u32_e32 v0, vcc, 0xfffffb81, v8
+; GFX942-SDAG-NEXT:    s_movk_i32 s2, 0xff7f
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v9, vcc
+; GFX942-SDAG-NEXT:    s_mov_b32 s3, -1
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_addc_co_u32_e32 v2, vcc, -1, v9, vcc
+; GFX942-SDAG-NEXT:    v_cmp_lt_u64_e64 s[2:3], s[2:3], v[0:1]
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_addc_co_u32_e32 v3, vcc, -1, v9, vcc
+; GFX942-SDAG-NEXT:    v_cmp_eq_u64_e32 vcc, -1, v[2:3]
+; GFX942-SDAG-NEXT:    v_cmp_lt_i64_e64 s[0:1], -1, v[4:5]
+; GFX942-SDAG-NEXT:    s_and_b64 s[2:3], vcc, s[2:3]
+; GFX942-SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; GFX942-SDAG-NEXT:    ; implicit-def: $vgpr2_vgpr3
+; GFX942-SDAG-NEXT:    s_and_saveexec_b64 s[6:7], s[2:3]
+; GFX942-SDAG-NEXT:    s_xor_b64 s[6:7], exec, s[6:7]
+; GFX942-SDAG-NEXT:    s_cbranch_execz .LBB0_7
+; GFX942-SDAG-NEXT:  ; %bb.2: ; %fp-to-i-if-end9
+; GFX942-SDAG-NEXT:    s_mov_b32 s2, 0
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-SDAG-NEXT:    v_lshl_add_u64 v[6:7], v[0:1], 0, -1
+; GFX942-SDAG-NEXT:    v_and_b32_e32 v0, 0xfffff, v5
+; GFX942-SDAG-NEXT:    s_mov_b64 s[2:3], 0x432
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v10, -1, 0, s[0:1]
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v11, -1, 1, s[0:1]
+; GFX942-SDAG-NEXT:    v_or_b32_e32 v5, 0x100000, v0
+; GFX942-SDAG-NEXT:    v_cmp_lt_u64_e32 vcc, s[2:3], v[8:9]
+; GFX942-SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; GFX942-SDAG-NEXT:    ; implicit-def: $vgpr2_vgpr3
+; GFX942-SDAG-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX942-SDAG-NEXT:    s_xor_b64 s[8:9], exec, s[2:3]
+; GFX942-SDAG-NEXT:    s_cbranch_execz .LBB0_4
+; GFX942-SDAG-NEXT:  ; %bb.3: ; %fp-to-i-if-else
+; GFX942-SDAG-NEXT:    v_add_u32_e32 v7, 0xfffffbcd, v8
+; GFX942-SDAG-NEXT:    v_sub_u32_e32 v0, 0x473, v8
+; GFX942-SDAG-NEXT:    v_add_u32_e32 v2, 0xfffffb8d, v8
+; GFX942-SDAG-NEXT:    v_lshrrev_b64 v[0:1], v0, v[4:5]
+; GFX942-SDAG-NEXT:    v_lshlrev_b64 v[2:3], v2, v[4:5]
+; GFX942-SDAG-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v7
+; GFX942-SDAG-NEXT:    v_cmp_ne_u32_e64 s[2:3], 0, v7
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v8, 0, v1, s[2:3]
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, v0, s[2:3]
+; GFX942-SDAG-NEXT:    v_lshlrev_b64 v[0:1], v7, v[4:5]
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e32 v12, 0, v1, vcc
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e32 v9, 0, v0, vcc
+; GFX942-SDAG-NEXT:    v_mul_lo_u32 v4, v10, v2
+; GFX942-SDAG-NEXT:    v_mul_lo_u32 v5, v11, v8
+; GFX942-SDAG-NEXT:    v_mad_u64_u32 v[0:1], s[2:3], v11, v2, 0
+; GFX942-SDAG-NEXT:    v_add3_u32 v1, v1, v5, v4
+; GFX942-SDAG-NEXT:    v_mad_u64_u32 v[4:5], s[2:3], v6, v9, v[0:1]
+; GFX942-SDAG-NEXT:    v_mul_lo_u32 v0, v6, v12
+; GFX942-SDAG-NEXT:    v_mul_lo_u32 v1, v6, v9
+; GFX942-SDAG-NEXT:    v_add3_u32 v5, v1, v5, v0
+; GFX942-SDAG-NEXT:    v_mad_u64_u32 v[0:1], s[2:3], v9, v11, 0
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v3, 0
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v2, v1
+; GFX942-SDAG-NEXT:    v_mad_u64_u32 v[6:7], s[2:3], v12, v11, v[2:3]
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v2, v7
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v7, v3
+; GFX942-SDAG-NEXT:    v_mad_u64_u32 v[6:7], s[2:3], v9, v10, v[6:7]
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v8, v7
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v9, v3
+; GFX942-SDAG-NEXT:    v_lshl_add_u64 v[2:3], v[2:3], 0, v[8:9]
+; GFX942-SDAG-NEXT:    v_mad_u64_u32 v[2:3], s[2:3], v12, v10, v[2:3]
+; GFX942-SDAG-NEXT:    v_lshl_add_u64 v[2:3], v[2:3], 0, v[4:5]
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v1, v6
+; GFX942-SDAG-NEXT:    ; implicit-def: $vgpr8_vgpr9
+; GFX942-SDAG-NEXT:    ; implicit-def: $vgpr4_vgpr5
+; GFX942-SDAG-NEXT:    ; implicit-def: $vgpr11
+; GFX942-SDAG-NEXT:    ; implicit-def: $vgpr10
+; GFX942-SDAG-NEXT:    ; implicit-def: $vgpr6_vgpr7
+; GFX942-SDAG-NEXT:  .LBB0_4: ; %Flow
+; GFX942-SDAG-NEXT:    s_andn2_saveexec_b64 s[8:9], s[8:9]
+; GFX942-SDAG-NEXT:    s_cbranch_execz .LBB0_6
+; GFX942-SDAG-NEXT:  ; %bb.5: ; %fp-to-i-if-then12
+; GFX942-SDAG-NEXT:    v_sub_u32_e32 v2, 0x433, v8
+; GFX942-SDAG-NEXT:    v_lshrrev_b64 v[0:1], v2, v[4:5]
+; GFX942-SDAG-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v2
+; GFX942-SDAG-NEXT:    v_cmp_eq_u32_e64 s[2:3], 0, v2
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v3, 0
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v12, v0, v4, s[2:3]
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v7, v1, v5, s[2:3]
+; GFX942-SDAG-NEXT:    v_mad_u64_u32 v[0:1], s[2:3], v12, v11, 0
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v2, v1
+; GFX942-SDAG-NEXT:    v_mad_u64_u32 v[4:5], s[2:3], v7, v11, v[2:3]
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v2, v5
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v5, v3
+; GFX942-SDAG-NEXT:    v_mad_u64_u32 v[4:5], s[2:3], v12, v10, v[4:5]
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v8, v5
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v9, v3
+; GFX942-SDAG-NEXT:    v_lshl_add_u64 v[2:3], v[2:3], 0, v[8:9]
+; GFX942-SDAG-NEXT:    v_mad_u64_u32 v[2:3], s[2:3], v7, v10, v[2:3]
+; GFX942-SDAG-NEXT:    v_mad_u64_u32 v[2:3], s[2:3], v6, v12, v[2:3]
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v8, v3
+; GFX942-SDAG-NEXT:    v_mad_u64_u32 v[8:9], s[2:3], v6, v12, v[8:9]
+; GFX942-SDAG-NEXT:    v_mad_i32_i24 v3, v6, v7, v8
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v1, v4
+; GFX942-SDAG-NEXT:  .LBB0_6: ; %Flow1
+; GFX942-SDAG-NEXT:    s_or_b64 exec, exec, s[8:9]
+; GFX942-SDAG-NEXT:  .LBB0_7: ; %Flow2
+; GFX942-SDAG-NEXT:    s_andn2_saveexec_b64 s[2:3], s[6:7]
+; GFX942-SDAG-NEXT:  ; %bb.8: ; %fp-to-i-if-then5
+; GFX942-SDAG-NEXT:    v_bfrev_b32_e32 v0, 1
+; GFX942-SDAG-NEXT:    v_bfrev_b32_e32 v1, -2
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, -1, s[0:1]
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v3, v0, v1, s[0:1]
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v0, v2
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v1, v2
+; GFX942-SDAG-NEXT:  ; %bb.9: ; %Flow3
+; GFX942-SDAG-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX942-SDAG-NEXT:  .LBB0_10: ; %fp-to-i-cleanup
+; GFX942-SDAG-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX942-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-GISEL-LABEL: fptosi_f64_to_i128:
+; GFX942-GISEL:       ; %bb.0: ; %fp-to-i-entry
+; GFX942-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v5, v1
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v4, v0
+; GFX942-GISEL-NEXT:    v_lshrrev_b32_e32 v0, 20, v5
+; GFX942-GISEL-NEXT:    s_mov_b64 s[0:1], 0
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v7, 0
+; GFX942-GISEL-NEXT:    v_and_b32_e32 v6, 0x7ff, v0
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[0:1], 0x3ff
+; GFX942-GISEL-NEXT:    v_cmp_ge_u64_e32 vcc, v[6:7], v[0:1]
+; GFX942-GISEL-NEXT:    s_mov_b64 s[2:3], s[0:1]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
+; GFX942-GISEL-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX942-GISEL-NEXT:    s_cbranch_execz .LBB0_10
+; GFX942-GISEL-NEXT:  ; %bb.1: ; %fp-to-i-if-end
+; GFX942-GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, 0xfffffb81, v6
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v8, 0xffffff80
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e64 v1, s[4:5], 0, -1, vcc
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v9, -1
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e64 v2, s[4:5], 0, -1, s[4:5]
+; GFX942-GISEL-NEXT:    v_cmp_ge_u64_e32 vcc, v[0:1], v[8:9]
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e64 v3, s[4:5], 0, -1, s[4:5]
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX942-GISEL-NEXT:    v_cmp_le_u64_e32 vcc, -1, v[2:3]
+; GFX942-GISEL-NEXT:    v_cmp_lt_i64_e64 s[0:1], -1, v[4:5]
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX942-GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, -1, v[2:3]
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX942-GISEL-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX942-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
+; GFX942-GISEL-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
+; GFX942-GISEL-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX942-GISEL-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
+; GFX942-GISEL-NEXT:    s_cbranch_execz .LBB0_7
+; GFX942-GISEL-NEXT:  ; %bb.2: ; %fp-to-i-if-end9
+; GFX942-GISEL-NEXT:    s_xor_b64 s[6:7], s[0:1], -1
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[6:7]
+; GFX942-GISEL-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX942-GISEL-NEXT:    v_lshlrev_b16_e32 v2, 1, v0
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[6:7]
+; GFX942-GISEL-NEXT:    v_lshlrev_b16_e32 v3, 2, v0
+; GFX942-GISEL-NEXT:    v_lshlrev_b16_e32 v8, 3, v0
+; GFX942-GISEL-NEXT:    v_lshlrev_b16_e32 v9, 4, v0
+; GFX942-GISEL-NEXT:    v_lshlrev_b16_e32 v10, 5, v0
+; GFX942-GISEL-NEXT:    v_lshlrev_b16_e32 v11, 6, v0
+; GFX942-GISEL-NEXT:    v_lshlrev_b16_e32 v12, 7, v0
+; GFX942-GISEL-NEXT:    v_lshlrev_b16_e32 v13, 8, v0
+; GFX942-GISEL-NEXT:    v_lshlrev_b16_e32 v14, 9, v0
+; GFX942-GISEL-NEXT:    v_lshlrev_b16_e32 v15, 10, v0
+; GFX942-GISEL-NEXT:    v_lshlrev_b16_e32 v16, 11, v0
+; GFX942-GISEL-NEXT:    v_lshlrev_b16_e32 v17, 12, v0
+; GFX942-GISEL-NEXT:    v_lshlrev_b16_e32 v18, 13, v0
+; GFX942-GISEL-NEXT:    v_lshlrev_b16_e32 v19, 14, v0
+; GFX942-GISEL-NEXT:    v_lshlrev_b16_e32 v20, 15, v0
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v0, v0, v3
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v1, v1, v3
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v0, v0, v8
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v1, v1, v8
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v0, v0, v9
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v1, v1, v9
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v0, v0, v10
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v1, v1, v10
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v0, v0, v11
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v1, v1, v11
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v0, v0, v12
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v1, v1, v12
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v0, v0, v13
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v1, v1, v13
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v0, v0, v14
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v1, v1, v14
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v0, v0, v15
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v1, v1, v15
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v0, v0, v16
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v1, v1, v16
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v0, v0, v17
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v1, v1, v17
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v0, v0, v18
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v1, v1, v18
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v0, v0, v19
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v1, v1, v19
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v0, v0, v20
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v1, v1, v20
+; GFX942-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX942-GISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
+; GFX942-GISEL-NEXT:    v_lshl_or_b32 v9, v0, 16, v0
+; GFX942-GISEL-NEXT:    v_and_b32_e32 v0, 0xfffff, v5
+; GFX942-GISEL-NEXT:    v_or3_b32 v8, v1, v2, 1
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v5, 0x100000, v0
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[0:1], 0x433
+; GFX942-GISEL-NEXT:    v_cmp_ge_u64_e32 vcc, v[6:7], v[0:1]
+; GFX942-GISEL-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
+; GFX942-GISEL-NEXT:    s_and_saveexec_b64 s[6:7], vcc
+; GFX942-GISEL-NEXT:    s_xor_b64 s[6:7], exec, s[6:7]
+; GFX942-GISEL-NEXT:    s_cbranch_execz .LBB0_4
+; GFX942-GISEL-NEXT:  ; %bb.3: ; %fp-to-i-if-else
+; GFX942-GISEL-NEXT:    v_add_u32_e32 v7, 0xfffffbcd, v6
+; GFX942-GISEL-NEXT:    v_add_u32_e32 v6, 0xfffffb8d, v6
+; GFX942-GISEL-NEXT:    v_sub_u32_e32 v2, 64, v7
+; GFX942-GISEL-NEXT:    v_lshlrev_b64 v[0:1], v7, v[4:5]
+; GFX942-GISEL-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v7
+; GFX942-GISEL-NEXT:    v_lshrrev_b64 v[2:3], v2, v[4:5]
+; GFX942-GISEL-NEXT:    v_lshlrev_b64 v[4:5], v6, v[4:5]
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e32 v6, 0, v0, vcc
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e32 v10, 0, v1, vcc
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e32 v1, v5, v3, vcc
+; GFX942-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v7
+; GFX942-GISEL-NEXT:    v_mad_u64_u32 v[2:3], s[8:9], v6, v9, 0
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v7, v0, 0, vcc
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v11, v1, 0, vcc
+; GFX942-GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[8:9], v6, v8, 0
+; GFX942-GISEL-NEXT:    v_mad_u64_u32 v[4:5], s[8:9], v10, v9, v[2:3]
+; GFX942-GISEL-NEXT:    v_mul_lo_u32 v6, v6, v9
+; GFX942-GISEL-NEXT:    v_mul_lo_u32 v12, v10, v9
+; GFX942-GISEL-NEXT:    v_mad_u64_u32 v[2:3], vcc, v10, v8, v[2:3]
+; GFX942-GISEL-NEXT:    v_mad_u64_u32 v[4:5], s[8:9], v7, v8, v[4:5]
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e32 v6, vcc, v6, v12, vcc
+; GFX942-GISEL-NEXT:    v_add_co_u32_e32 v1, vcc, v1, v2
+; GFX942-GISEL-NEXT:    v_mad_u64_u32 v[6:7], s[8:9], v7, v9, v[6:7]
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e32 v2, vcc, v4, v3, vcc
+; GFX942-GISEL-NEXT:    v_mad_u64_u32 v[6:7], s[8:9], v11, v8, v[6:7]
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e32 v3, vcc, v5, v6, vcc
+; GFX942-GISEL-NEXT:    ; implicit-def: $vgpr6
+; GFX942-GISEL-NEXT:    ; implicit-def: $vgpr4_vgpr5
+; GFX942-GISEL-NEXT:    ; implicit-def: $vgpr8
+; GFX942-GISEL-NEXT:    ; implicit-def: $vgpr9
+; GFX942-GISEL-NEXT:  .LBB0_4: ; %Flow
+; GFX942-GISEL-NEXT:    s_andn2_saveexec_b64 s[6:7], s[6:7]
+; GFX942-GISEL-NEXT:    s_cbranch_execz .LBB0_6
+; GFX942-GISEL-NEXT:  ; %bb.5: ; %fp-to-i-if-then12
+; GFX942-GISEL-NEXT:    v_sub_co_u32_e32 v2, vcc, 0x433, v6
+; GFX942-GISEL-NEXT:    v_lshrrev_b64 v[0:1], v2, v[4:5]
+; GFX942-GISEL-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v2
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX942-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e32 v6, v0, v4, vcc
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e32 v7, v1, v5, vcc
+; GFX942-GISEL-NEXT:    v_mad_u64_u32 v[2:3], s[8:9], v6, v9, 0
+; GFX942-GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[8:9], v6, v8, 0
+; GFX942-GISEL-NEXT:    v_mad_u64_u32 v[4:5], s[8:9], v7, v9, v[2:3]
+; GFX942-GISEL-NEXT:    v_mul_lo_u32 v6, v6, v9
+; GFX942-GISEL-NEXT:    v_mul_lo_u32 v9, v7, v9
+; GFX942-GISEL-NEXT:    v_mad_u64_u32 v[2:3], vcc, v7, v8, v[2:3]
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e32 v6, vcc, v6, v9, vcc
+; GFX942-GISEL-NEXT:    v_add_co_u32_e32 v1, vcc, v1, v2
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e32 v2, vcc, v4, v3, vcc
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e32 v3, vcc, v5, v6, vcc
+; GFX942-GISEL-NEXT:  .LBB0_6: ; %Flow1
+; GFX942-GISEL-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GFX942-GISEL-NEXT:  .LBB0_7: ; %Flow2
+; GFX942-GISEL-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX942-GISEL-NEXT:    s_cbranch_execz .LBB0_9
+; GFX942-GISEL-NEXT:  ; %bb.8: ; %fp-to-i-if-then5
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, -1, s[0:1]
+; GFX942-GISEL-NEXT:    v_and_b32_e32 v1, 1, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 1, v1
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 2, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 3, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 4, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 5, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v7, 6, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v8, 7, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v9, 8, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v10, 9, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v11, 10, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v12, 11, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v13, 12, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v14, 13, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v15, 14, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v16, 15, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v17, 16, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v18, 17, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v19, 18, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v20, 19, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v21, 20, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v22, 21, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v23, 22, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v24, 23, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v25, 24, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v26, 25, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v27, 26, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v28, 27, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v29, 28, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v30, 29, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v31, 30, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v32, 31, v1
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX942-GISEL-NEXT:    v_or3_b32 v1, v1, v3, v4
+; GFX942-GISEL-NEXT:    v_or3_b32 v0, v0, v3, v4
+; GFX942-GISEL-NEXT:    v_or3_b32 v1, v1, v5, v6
+; GFX942-GISEL-NEXT:    v_or3_b32 v0, v0, v5, v6
+; GFX942-GISEL-NEXT:    v_or3_b32 v1, v1, v7, v8
+; GFX942-GISEL-NEXT:    v_or3_b32 v0, v0, v7, v8
+; GFX942-GISEL-NEXT:    v_or3_b32 v1, v1, v9, v10
+; GFX942-GISEL-NEXT:    v_or3_b32 v0, v0, v9, v10
+; GFX942-GISEL-NEXT:    v_or3_b32 v1, v1, v11, v12
+; GFX942-GISEL-NEXT:    v_or3_b32 v0, v0, v11, v12
+; GFX942-GISEL-NEXT:    v_or3_b32 v1, v1, v13, v14
+; GFX942-GISEL-NEXT:    v_or3_b32 v0, v0, v13, v14
+; GFX942-GISEL-NEXT:    v_or3_b32 v1, v1, v15, v16
+; GFX942-GISEL-NEXT:    v_or3_b32 v0, v0, v15, v16
+; GFX942-GISEL-NEXT:    v_or3_b32 v1, v1, v17, v18
+; GFX942-GISEL-NEXT:    v_or3_b32 v0, v0, v17, v18
+; GFX942-GISEL-NEXT:    v_or3_b32 v1, v1, v19, v20
+; GFX942-GISEL-NEXT:    v_or3_b32 v0, v0, v19, v20
+; GFX942-GISEL-NEXT:    v_or3_b32 v1, v1, v21, v22
+; GFX942-GISEL-NEXT:    v_or3_b32 v0, v0, v21, v22
+; GFX942-GISEL-NEXT:    v_or3_b32 v1, v1, v23, v24
+; GFX942-GISEL-NEXT:    v_or3_b32 v0, v0, v23, v24
+; GFX942-GISEL-NEXT:    v_or3_b32 v1, v1, v25, v26
+; GFX942-GISEL-NEXT:    v_or3_b32 v0, v0, v25, v26
+; GFX942-GISEL-NEXT:    v_or3_b32 v1, v1, v27, v28
+; GFX942-GISEL-NEXT:    v_or3_b32 v0, v0, v27, v28
+; GFX942-GISEL-NEXT:    v_or3_b32 v1, v1, v29, v30
+; GFX942-GISEL-NEXT:    v_or3_b32 v0, v0, v29, v30
+; GFX942-GISEL-NEXT:    v_or3_b32 v1, v1, v31, v32
+; GFX942-GISEL-NEXT:    v_or3_b32 v0, v0, v31, v32
+; GFX942-GISEL-NEXT:    v_add_u32_e32 v3, 0x80000000, v1
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v2, v1
+; GFX942-GISEL-NEXT:  .LBB0_9: ; %Flow3
+; GFX942-GISEL-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX942-GISEL-NEXT:  .LBB0_10: ; %fp-to-i-cleanup
+; GFX942-GISEL-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX942-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %cvt = fptosi double %x to i128
   ret i128 %cvt
 }
@@ -726,6 +1110,388 @@ define i128 @fptoui_f64_to_i128(double %x) {
 ; GISEL-NEXT:  .LBB1_10: ; %fp-to-i-cleanup
 ; GISEL-NEXT:    s_or_b64 exec, exec, s[12:13]
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-SDAG-LABEL: fptoui_f64_to_i128:
+; GFX942-SDAG:       ; %bb.0: ; %fp-to-i-entry
+; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v5, v1
+; GFX942-SDAG-NEXT:    v_bfe_u32 v8, v5, 20, 11
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v9, 0
+; GFX942-SDAG-NEXT:    s_mov_b64 s[0:1], 0x3fe
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v4, v0
+; GFX942-SDAG-NEXT:    v_cmp_lt_u64_e32 vcc, s[0:1], v[8:9]
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[0:1], 0
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[2:3], 0
+; GFX942-SDAG-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX942-SDAG-NEXT:    s_cbranch_execz .LBB1_10
+; GFX942-SDAG-NEXT:  ; %bb.1: ; %fp-to-i-if-end
+; GFX942-SDAG-NEXT:    v_add_co_u32_e32 v0, vcc, 0xfffffb81, v8
+; GFX942-SDAG-NEXT:    s_movk_i32 s2, 0xff7f
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v9, vcc
+; GFX942-SDAG-NEXT:    s_mov_b32 s3, -1
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_addc_co_u32_e32 v2, vcc, -1, v9, vcc
+; GFX942-SDAG-NEXT:    v_cmp_lt_u64_e64 s[2:3], s[2:3], v[0:1]
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_addc_co_u32_e32 v3, vcc, -1, v9, vcc
+; GFX942-SDAG-NEXT:    v_cmp_eq_u64_e32 vcc, -1, v[2:3]
+; GFX942-SDAG-NEXT:    v_cmp_lt_i64_e64 s[0:1], -1, v[4:5]
+; GFX942-SDAG-NEXT:    s_and_b64 s[2:3], vcc, s[2:3]
+; GFX942-SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; GFX942-SDAG-NEXT:    ; implicit-def: $vgpr2_vgpr3
+; GFX942-SDAG-NEXT:    s_and_saveexec_b64 s[6:7], s[2:3]
+; GFX942-SDAG-NEXT:    s_xor_b64 s[6:7], exec, s[6:7]
+; GFX942-SDAG-NEXT:    s_cbranch_execz .LBB1_7
+; GFX942-SDAG-NEXT:  ; %bb.2: ; %fp-to-i-if-end9
+; GFX942-SDAG-NEXT:    s_mov_b32 s2, 0
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-SDAG-NEXT:    v_lshl_add_u64 v[6:7], v[0:1], 0, -1
+; GFX942-SDAG-NEXT:    v_and_b32_e32 v0, 0xfffff, v5
+; GFX942-SDAG-NEXT:    s_mov_b64 s[2:3], 0x432
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v10, -1, 0, s[0:1]
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v11, -1, 1, s[0:1]
+; GFX942-SDAG-NEXT:    v_or_b32_e32 v5, 0x100000, v0
+; GFX942-SDAG-NEXT:    v_cmp_lt_u64_e32 vcc, s[2:3], v[8:9]
+; GFX942-SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; GFX942-SDAG-NEXT:    ; implicit-def: $vgpr2_vgpr3
+; GFX942-SDAG-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX942-SDAG-NEXT:    s_xor_b64 s[8:9], exec, s[2:3]
+; GFX942-SDAG-NEXT:    s_cbranch_execz .LBB1_4
+; GFX942-SDAG-NEXT:  ; %bb.3: ; %fp-to-i-if-else
+; GFX942-SDAG-NEXT:    v_add_u32_e32 v7, 0xfffffbcd, v8
+; GFX942-SDAG-NEXT:    v_sub_u32_e32 v0, 0x473, v8
+; GFX942-SDAG-NEXT:    v_add_u32_e32 v2, 0xfffffb8d, v8
+; GFX942-SDAG-NEXT:    v_lshrrev_b64 v[0:1], v0, v[4:5]
+; GFX942-SDAG-NEXT:    v_lshlrev_b64 v[2:3], v2, v[4:5]
+; GFX942-SDAG-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v7
+; GFX942-SDAG-NEXT:    v_cmp_ne_u32_e64 s[2:3], 0, v7
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v8, 0, v1, s[2:3]
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, v0, s[2:3]
+; GFX942-SDAG-NEXT:    v_lshlrev_b64 v[0:1], v7, v[4:5]
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e32 v12, 0, v1, vcc
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e32 v9, 0, v0, vcc
+; GFX942-SDAG-NEXT:    v_mul_lo_u32 v4, v10, v2
+; GFX942-SDAG-NEXT:    v_mul_lo_u32 v5, v11, v8
+; GFX942-SDAG-NEXT:    v_mad_u64_u32 v[0:1], s[2:3], v11, v2, 0
+; GFX942-SDAG-NEXT:    v_add3_u32 v1, v1, v5, v4
+; GFX942-SDAG-NEXT:    v_mad_u64_u32 v[4:5], s[2:3], v6, v9, v[0:1]
+; GFX942-SDAG-NEXT:    v_mul_lo_u32 v0, v6, v12
+; GFX942-SDAG-NEXT:    v_mul_lo_u32 v1, v6, v9
+; GFX942-SDAG-NEXT:    v_add3_u32 v5, v1, v5, v0
+; GFX942-SDAG-NEXT:    v_mad_u64_u32 v[0:1], s[2:3], v9, v11, 0
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v3, 0
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v2, v1
+; GFX942-SDAG-NEXT:    v_mad_u64_u32 v[6:7], s[2:3], v12, v11, v[2:3]
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v2, v7
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v7, v3
+; GFX942-SDAG-NEXT:    v_mad_u64_u32 v[6:7], s[2:3], v9, v10, v[6:7]
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v8, v7
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v9, v3
+; GFX942-SDAG-NEXT:    v_lshl_add_u64 v[2:3], v[2:3], 0, v[8:9]
+; GFX942-SDAG-NEXT:    v_mad_u64_u32 v[2:3], s[2:3], v12, v10, v[2:3]
+; GFX942-SDAG-NEXT:    v_lshl_add_u64 v[2:3], v[2:3], 0, v[4:5]
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v1, v6
+; GFX942-SDAG-NEXT:    ; implicit-def: $vgpr8_vgpr9
+; GFX942-SDAG-NEXT:    ; implicit-def: $vgpr4_vgpr5
+; GFX942-SDAG-NEXT:    ; implicit-def: $vgpr11
+; GFX942-SDAG-NEXT:    ; implicit-def: $vgpr10
+; GFX942-SDAG-NEXT:    ; implicit-def: $vgpr6_vgpr7
+; GFX942-SDAG-NEXT:  .LBB1_4: ; %Flow
+; GFX942-SDAG-NEXT:    s_andn2_saveexec_b64 s[8:9], s[8:9]
+; GFX942-SDAG-NEXT:    s_cbranch_execz .LBB1_6
+; GFX942-SDAG-NEXT:  ; %bb.5: ; %fp-to-i-if-then12
+; GFX942-SDAG-NEXT:    v_sub_u32_e32 v2, 0x433, v8
+; GFX942-SDAG-NEXT:    v_lshrrev_b64 v[0:1], v2, v[4:5]
+; GFX942-SDAG-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v2
+; GFX942-SDAG-NEXT:    v_cmp_eq_u32_e64 s[2:3], 0, v2
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v3, 0
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v12, v0, v4, s[2:3]
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v7, v1, v5, s[2:3]
+; GFX942-SDAG-NEXT:    v_mad_u64_u32 v[0:1], s[2:3], v12, v11, 0
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v2, v1
+; GFX942-SDAG-NEXT:    v_mad_u64_u32 v[4:5], s[2:3], v7, v11, v[2:3]
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v2, v5
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v5, v3
+; GFX942-SDAG-NEXT:    v_mad_u64_u32 v[4:5], s[2:3], v12, v10, v[4:5]
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v8, v5
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v9, v3
+; GFX942-SDAG-NEXT:    v_lshl_add_u64 v[2:3], v[2:3], 0, v[8:9]
+; GFX942-SDAG-NEXT:    v_mad_u64_u32 v[2:3], s[2:3], v7, v10, v[2:3]
+; GFX942-SDAG-NEXT:    v_mad_u64_u32 v[2:3], s[2:3], v6, v12, v[2:3]
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v8, v3
+; GFX942-SDAG-NEXT:    v_mad_u64_u32 v[8:9], s[2:3], v6, v12, v[8:9]
+; GFX942-SDAG-NEXT:    v_mad_i32_i24 v3, v6, v7, v8
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v1, v4
+; GFX942-SDAG-NEXT:  .LBB1_6: ; %Flow1
+; GFX942-SDAG-NEXT:    s_or_b64 exec, exec, s[8:9]
+; GFX942-SDAG-NEXT:  .LBB1_7: ; %Flow2
+; GFX942-SDAG-NEXT:    s_andn2_saveexec_b64 s[2:3], s[6:7]
+; GFX942-SDAG-NEXT:  ; %bb.8: ; %fp-to-i-if-then5
+; GFX942-SDAG-NEXT:    v_bfrev_b32_e32 v0, 1
+; GFX942-SDAG-NEXT:    v_bfrev_b32_e32 v1, -2
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, -1, s[0:1]
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v3, v0, v1, s[0:1]
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v0, v2
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v1, v2
+; GFX942-SDAG-NEXT:  ; %bb.9: ; %Flow3
+; GFX942-SDAG-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX942-SDAG-NEXT:  .LBB1_10: ; %fp-to-i-cleanup
+; GFX942-SDAG-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX942-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-GISEL-LABEL: fptoui_f64_to_i128:
+; GFX942-GISEL:       ; %bb.0: ; %fp-to-i-entry
+; GFX942-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v5, v1
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v4, v0
+; GFX942-GISEL-NEXT:    v_lshrrev_b32_e32 v0, 20, v5
+; GFX942-GISEL-NEXT:    s_mov_b64 s[0:1], 0
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v7, 0
+; GFX942-GISEL-NEXT:    v_and_b32_e32 v6, 0x7ff, v0
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[0:1], 0x3ff
+; GFX942-GISEL-NEXT:    v_cmp_ge_u64_e32 vcc, v[6:7], v[0:1]
+; GFX942-GISEL-NEXT:    s_mov_b64 s[2:3], s[0:1]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
+; GFX942-GISEL-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX942-GISEL-NEXT:    s_cbranch_execz .LBB1_10
+; GFX942-GISEL-NEXT:  ; %bb.1: ; %fp-to-i-if-end
+; GFX942-GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, 0xfffffb81, v6
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v8, 0xffffff80
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e64 v1, s[4:5], 0, -1, vcc
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v9, -1
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e64 v2, s[4:5], 0, -1, s[4:5]
+; GFX942-GISEL-NEXT:    v_cmp_ge_u64_e32 vcc, v[0:1], v[8:9]
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e64 v3, s[4:5], 0, -1, s[4:5]
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX942-GISEL-NEXT:    v_cmp_le_u64_e32 vcc, -1, v[2:3]
+; GFX942-GISEL-NEXT:    v_cmp_lt_i64_e64 s[0:1], -1, v[4:5]
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX942-GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, -1, v[2:3]
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX942-GISEL-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX942-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
+; GFX942-GISEL-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
+; GFX942-GISEL-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX942-GISEL-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
+; GFX942-GISEL-NEXT:    s_cbranch_execz .LBB1_7
+; GFX942-GISEL-NEXT:  ; %bb.2: ; %fp-to-i-if-end9
+; GFX942-GISEL-NEXT:    s_xor_b64 s[6:7], s[0:1], -1
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[6:7]
+; GFX942-GISEL-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX942-GISEL-NEXT:    v_lshlrev_b16_e32 v2, 1, v0
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[6:7]
+; GFX942-GISEL-NEXT:    v_lshlrev_b16_e32 v3, 2, v0
+; GFX942-GISEL-NEXT:    v_lshlrev_b16_e32 v8, 3, v0
+; GFX942-GISEL-NEXT:    v_lshlrev_b16_e32 v9, 4, v0
+; GFX942-GISEL-NEXT:    v_lshlrev_b16_e32 v10, 5, v0
+; GFX942-GISEL-NEXT:    v_lshlrev_b16_e32 v11, 6, v0
+; GFX942-GISEL-NEXT:    v_lshlrev_b16_e32 v12, 7, v0
+; GFX942-GISEL-NEXT:    v_lshlrev_b16_e32 v13, 8, v0
+; GFX942-GISEL-NEXT:    v_lshlrev_b16_e32 v14, 9, v0
+; GFX942-GISEL-NEXT:    v_lshlrev_b16_e32 v15, 10, v0
+; GFX942-GISEL-NEXT:    v_lshlrev_b16_e32 v16, 11, v0
+; GFX942-GISEL-NEXT:    v_lshlrev_b16_e32 v17, 12, v0
+; GFX942-GISEL-NEXT:    v_lshlrev_b16_e32 v18, 13, v0
+; GFX942-GISEL-NEXT:    v_lshlrev_b16_e32 v19, 14, v0
+; GFX942-GISEL-NEXT:    v_lshlrev_b16_e32 v20, 15, v0
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v0, v0, v3
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v1, v1, v3
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v0, v0, v8
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v1, v1, v8
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v0, v0, v9
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v1, v1, v9
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v0, v0, v10
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v1, v1, v10
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v0, v0, v11
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v1, v1, v11
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v0, v0, v12
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v1, v1, v12
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v0, v0, v13
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v1, v1, v13
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v0, v0, v14
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v1, v1, v14
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v0, v0, v15
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v1, v1, v15
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v0, v0, v16
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v1, v1, v16
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v0, v0, v17
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v1, v1, v17
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v0, v0, v18
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v1, v1, v18
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v0, v0, v19
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v1, v1, v19
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v0, v0, v20
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v1, v1, v20
+; GFX942-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX942-GISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
+; GFX942-GISEL-NEXT:    v_lshl_or_b32 v9, v0, 16, v0
+; GFX942-GISEL-NEXT:    v_and_b32_e32 v0, 0xfffff, v5
+; GFX942-GISEL-NEXT:    v_or3_b32 v8, v1, v2, 1
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v5, 0x100000, v0
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[0:1], 0x433
+; GFX942-GISEL-NEXT:    v_cmp_ge_u64_e32 vcc, v[6:7], v[0:1]
+; GFX942-GISEL-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
+; GFX942-GISEL-NEXT:    s_and_saveexec_b64 s[6:7], vcc
+; GFX942-GISEL-NEXT:    s_xor_b64 s[6:7], exec, s[6:7]
+; GFX942-GISEL-NEXT:    s_cbranch_execz .LBB1_4
+; GFX942-GISEL-NEXT:  ; %bb.3: ; %fp-to-i-if-else
+; GFX942-GISEL-NEXT:    v_add_u32_e32 v7, 0xfffffbcd, v6
+; GFX942-GISEL-NEXT:    v_add_u32_e32 v6, 0xfffffb8d, v6
+; GFX942-GISEL-NEXT:    v_sub_u32_e32 v2, 64, v7
+; GFX942-GISEL-NEXT:    v_lshlrev_b64 v[0:1], v7, v[4:5]
+; GFX942-GISEL-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v7
+; GFX942-GISEL-NEXT:    v_lshrrev_b64 v[2:3], v2, v[4:5]
+; GFX942-GISEL-NEXT:    v_lshlrev_b64 v[4:5], v6, v[4:5]
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e32 v6, 0, v0, vcc
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e32 v10, 0, v1, vcc
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e32 v1, v5, v3, vcc
+; GFX942-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v7
+; GFX942-GISEL-NEXT:    v_mad_u64_u32 v[2:3], s[8:9], v6, v9, 0
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v7, v0, 0, vcc
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v11, v1, 0, vcc
+; GFX942-GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[8:9], v6, v8, 0
+; GFX942-GISEL-NEXT:    v_mad_u64_u32 v[4:5], s[8:9], v10, v9, v[2:3]
+; GFX942-GISEL-NEXT:    v_mul_lo_u32 v6, v6, v9
+; GFX942-GISEL-NEXT:    v_mul_lo_u32 v12, v10, v9
+; GFX942-GISEL-NEXT:    v_mad_u64_u32 v[2:3], vcc, v10, v8, v[2:3]
+; GFX942-GISEL-NEXT:    v_mad_u64_u32 v[4:5], s[8:9], v7, v8, v[4:5]
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e32 v6, vcc, v6, v12, vcc
+; GFX942-GISEL-NEXT:    v_add_co_u32_e32 v1, vcc, v1, v2
+; GFX942-GISEL-NEXT:    v_mad_u64_u32 v[6:7], s[8:9], v7, v9, v[6:7]
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e32 v2, vcc, v4, v3, vcc
+; GFX942-GISEL-NEXT:    v_mad_u64_u32 v[6:7], s[8:9], v11, v8, v[6:7]
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e32 v3, vcc, v5, v6, vcc
+; GFX942-GISEL-NEXT:    ; implicit-def: $vgpr6
+; GFX942-GISEL-NEXT:    ; implicit-def: $vgpr4_vgpr5
+; GFX942-GISEL-NEXT:    ; implicit-def: $vgpr8
+; GFX942-GISEL-NEXT:    ; implicit-def: $vgpr9
+; GFX942-GISEL-NEXT:  .LBB1_4: ; %Flow
+; GFX942-GISEL-NEXT:    s_andn2_saveexec_b64 s[6:7], s[6:7]
+; GFX942-GISEL-NEXT:    s_cbranch_execz .LBB1_6
+; GFX942-GISEL-NEXT:  ; %bb.5: ; %fp-to-i-if-then12
+; GFX942-GISEL-NEXT:    v_sub_co_u32_e32 v2, vcc, 0x433, v6
+; GFX942-GISEL-NEXT:    v_lshrrev_b64 v[0:1], v2, v[4:5]
+; GFX942-GISEL-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v2
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX942-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e32 v6, v0, v4, vcc
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e32 v7, v1, v5, vcc
+; GFX942-GISEL-NEXT:    v_mad_u64_u32 v[2:3], s[8:9], v6, v9, 0
+; GFX942-GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[8:9], v6, v8, 0
+; GFX942-GISEL-NEXT:    v_mad_u64_u32 v[4:5], s[8:9], v7, v9, v[2:3]
+; GFX942-GISEL-NEXT:    v_mul_lo_u32 v6, v6, v9
+; GFX942-GISEL-NEXT:    v_mul_lo_u32 v9, v7, v9
+; GFX942-GISEL-NEXT:    v_mad_u64_u32 v[2:3], vcc, v7, v8, v[2:3]
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e32 v6, vcc, v6, v9, vcc
+; GFX942-GISEL-NEXT:    v_add_co_u32_e32 v1, vcc, v1, v2
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e32 v2, vcc, v4, v3, vcc
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e32 v3, vcc, v5, v6, vcc
+; GFX942-GISEL-NEXT:  .LBB1_6: ; %Flow1
+; GFX942-GISEL-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GFX942-GISEL-NEXT:  .LBB1_7: ; %Flow2
+; GFX942-GISEL-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX942-GISEL-NEXT:    s_cbranch_execz .LBB1_9
+; GFX942-GISEL-NEXT:  ; %bb.8: ; %fp-to-i-if-then5
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, -1, s[0:1]
+; GFX942-GISEL-NEXT:    v_and_b32_e32 v1, 1, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 1, v1
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 2, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 3, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 4, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 5, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v7, 6, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v8, 7, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v9, 8, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v10, 9, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v11, 10, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v12, 11, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v13, 12, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v14, 13, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v15, 14, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v16, 15, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v17, 16, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v18, 17, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v19, 18, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v20, 19, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v21, 20, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v22, 21, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v23, 22, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v24, 23, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v25, 24, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v26, 25, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v27, 26, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v28, 27, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v29, 28, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v30, 29, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v31, 30, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v32, 31, v1
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX942-GISEL-NEXT:    v_or3_b32 v1, v1, v3, v4
+; GFX942-GISEL-NEXT:    v_or3_b32 v0, v0, v3, v4
+; GFX942-GISEL-NEXT:    v_or3_b32 v1, v1, v5, v6
+; GFX942-GISEL-NEXT:    v_or3_b32 v0, v0, v5, v6
+; GFX942-GISEL-NEXT:    v_or3_b32 v1, v1, v7, v8
+; GFX942-GISEL-NEXT:    v_or3_b32 v0, v0, v7, v8
+; GFX942-GISEL-NEXT:    v_or3_b32 v1, v1, v9, v10
+; GFX942-GISEL-NEXT:    v_or3_b32 v0, v0, v9, v10
+; GFX942-GISEL-NEXT:    v_or3_b32 v1, v1, v11, v12
+; GFX942-GISEL-NEXT:    v_or3_b32 v0, v0, v11, v12
+; GFX942-GISEL-NEXT:    v_or3_b32 v1, v1, v13, v14
+; GFX942-GISEL-NEXT:    v_or3_b32 v0, v0, v13, v14
+; GFX942-GISEL-NEXT:    v_or3_b32 v1, v1, v15, v16
+; GFX942-GISEL-NEXT:    v_or3_b32 v0, v0, v15, v16
+; GFX942-GISEL-NEXT:    v_or3_b32 v1, v1, v17, v18
+; GFX942-GISEL-NEXT:    v_or3_b32 v0, v0, v17, v18
+; GFX942-GISEL-NEXT:    v_or3_b32 v1, v1, v19, v20
+; GFX942-GISEL-NEXT:    v_or3_b32 v0, v0, v19, v20
+; GFX942-GISEL-NEXT:    v_or3_b32 v1, v1, v21, v22
+; GFX942-GISEL-NEXT:    v_or3_b32 v0, v0, v21, v22
+; GFX942-GISEL-NEXT:    v_or3_b32 v1, v1, v23, v24
+; GFX942-GISEL-NEXT:    v_or3_b32 v0, v0, v23, v24
+; GFX942-GISEL-NEXT:    v_or3_b32 v1, v1, v25, v26
+; GFX942-GISEL-NEXT:    v_or3_b32 v0, v0, v25, v26
+; GFX942-GISEL-NEXT:    v_or3_b32 v1, v1, v27, v28
+; GFX942-GISEL-NEXT:    v_or3_b32 v0, v0, v27, v28
+; GFX942-GISEL-NEXT:    v_or3_b32 v1, v1, v29, v30
+; GFX942-GISEL-NEXT:    v_or3_b32 v0, v0, v29, v30
+; GFX942-GISEL-NEXT:    v_or3_b32 v1, v1, v31, v32
+; GFX942-GISEL-NEXT:    v_or3_b32 v0, v0, v31, v32
+; GFX942-GISEL-NEXT:    v_add_u32_e32 v3, 0x80000000, v1
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v2, v1
+; GFX942-GISEL-NEXT:  .LBB1_9: ; %Flow3
+; GFX942-GISEL-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX942-GISEL-NEXT:  .LBB1_10: ; %fp-to-i-cleanup
+; GFX942-GISEL-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX942-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %cvt = fptoui double %x to i128
   ret i128 %cvt
 }
@@ -1077,6 +1843,370 @@ define i128 @fptosi_f32_to_i128(float %x) {
 ; GISEL-NEXT:  .LBB2_10: ; %fp-to-i-cleanup
 ; GISEL-NEXT:    s_or_b64 exec, exec, s[12:13]
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-SDAG-LABEL: fptosi_f32_to_i128:
+; GFX942-SDAG:       ; %bb.0: ; %fp-to-i-entry
+; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v4, v0
+; GFX942-SDAG-NEXT:    v_bfe_u32 v6, v4, 23, 8
+; GFX942-SDAG-NEXT:    s_movk_i32 s0, 0x7e
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v7, 0
+; GFX942-SDAG-NEXT:    v_cmp_lt_u32_e32 vcc, s0, v6
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[0:1], 0
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[2:3], 0
+; GFX942-SDAG-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX942-SDAG-NEXT:    s_cbranch_execz .LBB2_10
+; GFX942-SDAG-NEXT:  ; %bb.1: ; %fp-to-i-if-end
+; GFX942-SDAG-NEXT:    v_add_co_u32_e32 v0, vcc, 0xffffff01, v6
+; GFX942-SDAG-NEXT:    s_movk_i32 s2, 0xff7f
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v7, vcc
+; GFX942-SDAG-NEXT:    s_mov_b32 s3, -1
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_addc_co_u32_e32 v2, vcc, -1, v7, vcc
+; GFX942-SDAG-NEXT:    v_cmp_lt_u64_e64 s[2:3], s[2:3], v[0:1]
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_addc_co_u32_e32 v3, vcc, -1, v7, vcc
+; GFX942-SDAG-NEXT:    v_cmp_eq_u64_e32 vcc, -1, v[2:3]
+; GFX942-SDAG-NEXT:    v_cmp_lt_i32_e64 s[0:1], -1, v4
+; GFX942-SDAG-NEXT:    s_and_b64 s[2:3], vcc, s[2:3]
+; GFX942-SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; GFX942-SDAG-NEXT:    ; implicit-def: $vgpr2_vgpr3
+; GFX942-SDAG-NEXT:    s_and_saveexec_b64 s[6:7], s[2:3]
+; GFX942-SDAG-NEXT:    s_xor_b64 s[6:7], exec, s[6:7]
+; GFX942-SDAG-NEXT:    s_cbranch_execz .LBB2_7
+; GFX942-SDAG-NEXT:  ; %bb.2: ; %fp-to-i-if-end9
+; GFX942-SDAG-NEXT:    s_mov_b32 s2, 0
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-SDAG-NEXT:    v_lshl_add_u64 v[8:9], v[0:1], 0, -1
+; GFX942-SDAG-NEXT:    v_and_b32_e32 v0, 0x7fffff, v4
+; GFX942-SDAG-NEXT:    s_mov_b64 s[2:3], 0x95
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v5, 0
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v10, -1, 0, s[0:1]
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v11, -1, 1, s[0:1]
+; GFX942-SDAG-NEXT:    v_or_b32_e32 v4, 0x800000, v0
+; GFX942-SDAG-NEXT:    v_cmp_lt_u64_e32 vcc, s[2:3], v[6:7]
+; GFX942-SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; GFX942-SDAG-NEXT:    ; implicit-def: $vgpr2_vgpr3
+; GFX942-SDAG-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX942-SDAG-NEXT:    s_xor_b64 s[8:9], exec, s[2:3]
+; GFX942-SDAG-NEXT:    s_cbranch_execz .LBB2_4
+; GFX942-SDAG-NEXT:  ; %bb.3: ; %fp-to-i-if-else
+; GFX942-SDAG-NEXT:    v_add_u32_e32 v7, 0xffffff6a, v6
+; GFX942-SDAG-NEXT:    v_sub_u32_e32 v0, 0xd6, v6
+; GFX942-SDAG-NEXT:    v_add_u32_e32 v2, 0xffffff2a, v6
+; GFX942-SDAG-NEXT:    v_lshrrev_b64 v[0:1], v0, v[4:5]
+; GFX942-SDAG-NEXT:    v_lshlrev_b64 v[2:3], v2, v[4:5]
+; GFX942-SDAG-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v7
+; GFX942-SDAG-NEXT:    v_cmp_ne_u32_e64 s[2:3], 0, v7
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v3, 0, v1, s[2:3]
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, v0, s[2:3]
+; GFX942-SDAG-NEXT:    v_lshlrev_b64 v[0:1], v7, v[4:5]
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e32 v12, 0, v1, vcc
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e32 v9, 0, v0, vcc
+; GFX942-SDAG-NEXT:    v_mul_lo_u32 v4, v10, v2
+; GFX942-SDAG-NEXT:    v_mul_lo_u32 v3, v11, v3
+; GFX942-SDAG-NEXT:    v_mad_u64_u32 v[0:1], s[2:3], v11, v2, 0
+; GFX942-SDAG-NEXT:    v_add3_u32 v1, v1, v3, v4
+; GFX942-SDAG-NEXT:    v_mad_u64_u32 v[2:3], s[2:3], v8, v9, v[0:1]
+; GFX942-SDAG-NEXT:    v_mul_lo_u32 v0, v8, v12
+; GFX942-SDAG-NEXT:    v_mul_lo_u32 v1, v8, v9
+; GFX942-SDAG-NEXT:    v_add3_u32 v3, v1, v3, v0
+; GFX942-SDAG-NEXT:    v_mad_u64_u32 v[0:1], s[2:3], v9, v11, 0
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v4, v1
+; GFX942-SDAG-NEXT:    v_mad_u64_u32 v[6:7], s[2:3], v12, v11, v[4:5]
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v4, v7
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v7, v5
+; GFX942-SDAG-NEXT:    v_mad_u64_u32 v[6:7], s[2:3], v9, v10, v[6:7]
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v8, v7
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v9, v5
+; GFX942-SDAG-NEXT:    v_lshl_add_u64 v[4:5], v[4:5], 0, v[8:9]
+; GFX942-SDAG-NEXT:    v_mad_u64_u32 v[4:5], s[2:3], v12, v10, v[4:5]
+; GFX942-SDAG-NEXT:    v_lshl_add_u64 v[2:3], v[4:5], 0, v[2:3]
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v1, v6
+; GFX942-SDAG-NEXT:    ; implicit-def: $vgpr6_vgpr7
+; GFX942-SDAG-NEXT:    ; implicit-def: $vgpr4_vgpr5
+; GFX942-SDAG-NEXT:    ; implicit-def: $vgpr11
+; GFX942-SDAG-NEXT:    ; implicit-def: $vgpr10
+; GFX942-SDAG-NEXT:    ; implicit-def: $vgpr8_vgpr9
+; GFX942-SDAG-NEXT:  .LBB2_4: ; %Flow
+; GFX942-SDAG-NEXT:    s_andn2_saveexec_b64 s[2:3], s[8:9]
+; GFX942-SDAG-NEXT:    s_cbranch_execz .LBB2_6
+; GFX942-SDAG-NEXT:  ; %bb.5: ; %fp-to-i-if-then12
+; GFX942-SDAG-NEXT:    v_sub_u32_e32 v2, 0x96, v6
+; GFX942-SDAG-NEXT:    v_lshrrev_b64 v[0:1], v2, v[4:5]
+; GFX942-SDAG-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v2
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v3, 0
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
+; GFX942-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e32 v6, v0, v4, vcc
+; GFX942-SDAG-NEXT:    v_mad_u64_u32 v[0:1], s[8:9], v6, v11, 0
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v2, v1
+; GFX942-SDAG-NEXT:    v_mad_u64_u32 v[4:5], s[8:9], v6, v10, v[2:3]
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v2, v5
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v1, v4
+; GFX942-SDAG-NEXT:    v_mad_i64_i32 v[2:3], s[8:9], v8, v6, v[2:3]
+; GFX942-SDAG-NEXT:  .LBB2_6: ; %Flow1
+; GFX942-SDAG-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX942-SDAG-NEXT:  .LBB2_7: ; %Flow2
+; GFX942-SDAG-NEXT:    s_andn2_saveexec_b64 s[2:3], s[6:7]
+; GFX942-SDAG-NEXT:  ; %bb.8: ; %fp-to-i-if-then5
+; GFX942-SDAG-NEXT:    v_bfrev_b32_e32 v0, 1
+; GFX942-SDAG-NEXT:    v_bfrev_b32_e32 v1, -2
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, -1, s[0:1]
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v3, v0, v1, s[0:1]
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v0, v2
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v1, v2
+; GFX942-SDAG-NEXT:  ; %bb.9: ; %Flow3
+; GFX942-SDAG-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX942-SDAG-NEXT:  .LBB2_10: ; %fp-to-i-cleanup
+; GFX942-SDAG-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX942-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-GISEL-LABEL: fptosi_f32_to_i128:
+; GFX942-GISEL:       ; %bb.0: ; %fp-to-i-entry
+; GFX942-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v4, v0
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v5, 0
+; GFX942-GISEL-NEXT:    v_lshrrev_b64 v[0:1], 23, v[4:5]
+; GFX942-GISEL-NEXT:    s_mov_b64 s[0:1], 0
+; GFX942-GISEL-NEXT:    v_bfe_u32 v6, v0, 0, 8
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[0:1], 0x7f
+; GFX942-GISEL-NEXT:    v_cmp_ge_u64_e32 vcc, v[6:7], v[0:1]
+; GFX942-GISEL-NEXT:    s_mov_b64 s[2:3], s[0:1]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
+; GFX942-GISEL-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX942-GISEL-NEXT:    s_cbranch_execz .LBB2_10
+; GFX942-GISEL-NEXT:  ; %bb.1: ; %fp-to-i-if-end
+; GFX942-GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, 0xffffff01, v6
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v8, 0xffffff80
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e64 v1, s[4:5], 0, -1, vcc
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v9, -1
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e64 v2, s[4:5], 0, -1, s[4:5]
+; GFX942-GISEL-NEXT:    v_cmp_ge_u64_e32 vcc, v[0:1], v[8:9]
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e64 v3, s[4:5], 0, -1, s[4:5]
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX942-GISEL-NEXT:    v_cmp_le_u64_e32 vcc, -1, v[2:3]
+; GFX942-GISEL-NEXT:    v_cmp_lt_i32_e64 s[0:1], -1, v4
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX942-GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, -1, v[2:3]
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX942-GISEL-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX942-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
+; GFX942-GISEL-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
+; GFX942-GISEL-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX942-GISEL-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
+; GFX942-GISEL-NEXT:    s_cbranch_execz .LBB2_7
+; GFX942-GISEL-NEXT:  ; %bb.2: ; %fp-to-i-if-end9
+; GFX942-GISEL-NEXT:    s_xor_b64 s[6:7], s[0:1], -1
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[6:7]
+; GFX942-GISEL-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX942-GISEL-NEXT:    v_lshlrev_b16_e32 v2, 1, v0
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[6:7]
+; GFX942-GISEL-NEXT:    v_lshlrev_b16_e32 v3, 2, v0
+; GFX942-GISEL-NEXT:    v_lshlrev_b16_e32 v8, 3, v0
+; GFX942-GISEL-NEXT:    v_lshlrev_b16_e32 v9, 4, v0
+; GFX942-GISEL-NEXT:    v_lshlrev_b16_e32 v10, 5, v0
+; GFX942-GISEL-NEXT:    v_lshlrev_b16_e32 v11, 6, v0
+; GFX942-GISEL-NEXT:    v_lshlrev_b16_e32 v12, 7, v0
+; GFX942-GISEL-NEXT:    v_lshlrev_b16_e32 v13, 8, v0
+; GFX942-GISEL-NEXT:    v_lshlrev_b16_e32 v14, 9, v0
+; GFX942-GISEL-NEXT:    v_lshlrev_b16_e32 v15, 10, v0
+; GFX942-GISEL-NEXT:    v_lshlrev_b16_e32 v16, 11, v0
+; GFX942-GISEL-NEXT:    v_lshlrev_b16_e32 v17, 12, v0
+; GFX942-GISEL-NEXT:    v_lshlrev_b16_e32 v18, 13, v0
+; GFX942-GISEL-NEXT:    v_lshlrev_b16_e32 v19, 14, v0
+; GFX942-GISEL-NEXT:    v_lshlrev_b16_e32 v20, 15, v0
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v0, v0, v3
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v1, v1, v3
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v0, v0, v8
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v1, v1, v8
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v0, v0, v9
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v1, v1, v9
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v0, v0, v10
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v1, v1, v10
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v0, v0, v11
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v1, v1, v11
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v0, v0, v12
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v1, v1, v12
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v0, v0, v13
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v1, v1, v13
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v0, v0, v14
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v1, v1, v14
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v0, v0, v15
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v1, v1, v15
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v0, v0, v16
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v1, v1, v16
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v0, v0, v17
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v1, v1, v17
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v0, v0, v18
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v1, v1, v18
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v0, v0, v19
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v1, v1, v19
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v0, v0, v20
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v1, v1, v20
+; GFX942-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX942-GISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
+; GFX942-GISEL-NEXT:    v_lshl_or_b32 v9, v0, 16, v0
+; GFX942-GISEL-NEXT:    v_and_b32_e32 v0, 0x7fffff, v4
+; GFX942-GISEL-NEXT:    v_or3_b32 v8, v1, v2, 1
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v4, 0x800000, v0
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[0:1], 0x96
+; GFX942-GISEL-NEXT:    v_cmp_ge_u64_e32 vcc, v[6:7], v[0:1]
+; GFX942-GISEL-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
+; GFX942-GISEL-NEXT:    s_and_saveexec_b64 s[6:7], vcc
+; GFX942-GISEL-NEXT:    s_xor_b64 s[6:7], exec, s[6:7]
+; GFX942-GISEL-NEXT:    s_cbranch_execz .LBB2_4
+; GFX942-GISEL-NEXT:  ; %bb.3: ; %fp-to-i-if-else
+; GFX942-GISEL-NEXT:    v_add_u32_e32 v7, 0xffffff6a, v6
+; GFX942-GISEL-NEXT:    v_add_u32_e32 v6, 0xffffff2a, v6
+; GFX942-GISEL-NEXT:    v_sub_u32_e32 v2, 64, v7
+; GFX942-GISEL-NEXT:    v_lshlrev_b64 v[0:1], v7, v[4:5]
+; GFX942-GISEL-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v7
+; GFX942-GISEL-NEXT:    v_lshrrev_b64 v[2:3], v2, v[4:5]
+; GFX942-GISEL-NEXT:    v_lshlrev_b64 v[4:5], v6, v[4:5]
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e32 v6, 0, v0, vcc
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e32 v10, 0, v1, vcc
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e32 v1, v5, v3, vcc
+; GFX942-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v7
+; GFX942-GISEL-NEXT:    v_mad_u64_u32 v[2:3], s[8:9], v6, v9, 0
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v7, v0, 0, vcc
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v11, v1, 0, vcc
+; GFX942-GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[8:9], v6, v8, 0
+; GFX942-GISEL-NEXT:    v_mad_u64_u32 v[4:5], s[8:9], v10, v9, v[2:3]
+; GFX942-GISEL-NEXT:    v_mul_lo_u32 v6, v6, v9
+; GFX942-GISEL-NEXT:    v_mul_lo_u32 v12, v10, v9
+; GFX942-GISEL-NEXT:    v_mad_u64_u32 v[2:3], vcc, v10, v8, v[2:3]
+; GFX942-GISEL-NEXT:    v_mad_u64_u32 v[4:5], s[8:9], v7, v8, v[4:5]
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e32 v6, vcc, v6, v12, vcc
+; GFX942-GISEL-NEXT:    v_add_co_u32_e32 v1, vcc, v1, v2
+; GFX942-GISEL-NEXT:    v_mad_u64_u32 v[6:7], s[8:9], v7, v9, v[6:7]
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e32 v2, vcc, v4, v3, vcc
+; GFX942-GISEL-NEXT:    v_mad_u64_u32 v[6:7], s[8:9], v11, v8, v[6:7]
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e32 v3, vcc, v5, v6, vcc
+; GFX942-GISEL-NEXT:    ; implicit-def: $vgpr6
+; GFX942-GISEL-NEXT:    ; implicit-def: $vgpr4_vgpr5
+; GFX942-GISEL-NEXT:    ; implicit-def: $vgpr8
+; GFX942-GISEL-NEXT:    ; implicit-def: $vgpr9
+; GFX942-GISEL-NEXT:  .LBB2_4: ; %Flow
+; GFX942-GISEL-NEXT:    s_andn2_saveexec_b64 s[6:7], s[6:7]
+; GFX942-GISEL-NEXT:    s_cbranch_execz .LBB2_6
+; GFX942-GISEL-NEXT:  ; %bb.5: ; %fp-to-i-if-then12
+; GFX942-GISEL-NEXT:    v_sub_co_u32_e32 v2, vcc, 0x96, v6
+; GFX942-GISEL-NEXT:    v_lshrrev_b64 v[0:1], v2, v[4:5]
+; GFX942-GISEL-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v2
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
+; GFX942-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e32 v2, v0, v4, vcc
+; GFX942-GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[8:9], v2, v8, 0
+; GFX942-GISEL-NEXT:    v_mad_u64_u32 v[4:5], s[8:9], v2, v9, 0
+; GFX942-GISEL-NEXT:    v_add_co_u32_e32 v1, vcc, v1, v4
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e32 v2, vcc, v4, v5, vcc
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e32 v3, vcc, v5, v4, vcc
+; GFX942-GISEL-NEXT:  .LBB2_6: ; %Flow1
+; GFX942-GISEL-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GFX942-GISEL-NEXT:  .LBB2_7: ; %Flow2
+; GFX942-GISEL-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX942-GISEL-NEXT:    s_cbranch_execz .LBB2_9
+; GFX942-GISEL-NEXT:  ; %bb.8: ; %fp-to-i-if-then5
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, -1, s[0:1]
+; GFX942-GISEL-NEXT:    v_and_b32_e32 v1, 1, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 1, v1
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 2, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 3, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 4, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 5, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v7, 6, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v8, 7, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v9, 8, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v10, 9, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v11, 10, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v12, 11, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v13, 12, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v14, 13, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v15, 14, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v16, 15, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v17, 16, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v18, 17, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v19, 18, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v20, 19, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v21, 20, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v22, 21, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v23, 22, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v24, 23, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v25, 24, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v26, 25, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v27, 26, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v28, 27, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v29, 28, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v30, 29, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v31, 30, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v32, 31, v1
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX942-GISEL-NEXT:    v_or3_b32 v1, v1, v3, v4
+; GFX942-GISEL-NEXT:    v_or3_b32 v0, v0, v3, v4
+; GFX942-GISEL-NEXT:    v_or3_b32 v1, v1, v5, v6
+; GFX942-GISEL-NEXT:    v_or3_b32 v0, v0, v5, v6
+; GFX942-GISEL-NEXT:    v_or3_b32 v1, v1, v7, v8
+; GFX942-GISEL-NEXT:    v_or3_b32 v0, v0, v7, v8
+; GFX942-GISEL-NEXT:    v_or3_b32 v1, v1, v9, v10
+; GFX942-GISEL-NEXT:    v_or3_b32 v0, v0, v9, v10
+; GFX942-GISEL-NEXT:    v_or3_b32 v1, v1, v11, v12
+; GFX942-GISEL-NEXT:    v_or3_b32 v0, v0, v11, v12
+; GFX942-GISEL-NEXT:    v_or3_b32 v1, v1, v13, v14
+; GFX942-GISEL-NEXT:    v_or3_b32 v0, v0, v13, v14
+; GFX942-GISEL-NEXT:    v_or3_b32 v1, v1, v15, v16
+; GFX942-GISEL-NEXT:    v_or3_b32 v0, v0, v15, v16
+; GFX942-GISEL-NEXT:    v_or3_b32 v1, v1, v17, v18
+; GFX942-GISEL-NEXT:    v_or3_b32 v0, v0, v17, v18
+; GFX942-GISEL-NEXT:    v_or3_b32 v1, v1, v19, v20
+; GFX942-GISEL-NEXT:    v_or3_b32 v0, v0, v19, v20
+; GFX942-GISEL-NEXT:    v_or3_b32 v1, v1, v21, v22
+; GFX942-GISEL-NEXT:    v_or3_b32 v0, v0, v21, v22
+; GFX942-GISEL-NEXT:    v_or3_b32 v1, v1, v23, v24
+; GFX942-GISEL-NEXT:    v_or3_b32 v0, v0, v23, v24
+; GFX942-GISEL-NEXT:    v_or3_b32 v1, v1, v25, v26
+; GFX942-GISEL-NEXT:    v_or3_b32 v0, v0, v25, v26
+; GFX942-GISEL-NEXT:    v_or3_b32 v1, v1, v27, v28
+; GFX942-GISEL-NEXT:    v_or3_b32 v0, v0, v27, v28
+; GFX942-GISEL-NEXT:    v_or3_b32 v1, v1, v29, v30
+; GFX942-GISEL-NEXT:    v_or3_b32 v0, v0, v29, v30
+; GFX942-GISEL-NEXT:    v_or3_b32 v1, v1, v31, v32
+; GFX942-GISEL-NEXT:    v_or3_b32 v0, v0, v31, v32
+; GFX942-GISEL-NEXT:    v_add_u32_e32 v3, 0x80000000, v1
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v2, v1
+; GFX942-GISEL-NEXT:  .LBB2_9: ; %Flow3
+; GFX942-GISEL-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX942-GISEL-NEXT:  .LBB2_10: ; %fp-to-i-cleanup
+; GFX942-GISEL-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX942-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %cvt = fptosi float %x to i128
   ret i128 %cvt
 }
@@ -1428,6 +2558,370 @@ define i128 @fptoui_f32_to_i128(float %x) {
 ; GISEL-NEXT:  .LBB3_10: ; %fp-to-i-cleanup
 ; GISEL-NEXT:    s_or_b64 exec, exec, s[12:13]
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-SDAG-LABEL: fptoui_f32_to_i128:
+; GFX942-SDAG:       ; %bb.0: ; %fp-to-i-entry
+; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v4, v0
+; GFX942-SDAG-NEXT:    v_bfe_u32 v6, v4, 23, 8
+; GFX942-SDAG-NEXT:    s_movk_i32 s0, 0x7e
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v7, 0
+; GFX942-SDAG-NEXT:    v_cmp_lt_u32_e32 vcc, s0, v6
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[0:1], 0
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[2:3], 0
+; GFX942-SDAG-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX942-SDAG-NEXT:    s_cbranch_execz .LBB3_10
+; GFX942-SDAG-NEXT:  ; %bb.1: ; %fp-to-i-if-end
+; GFX942-SDAG-NEXT:    v_add_co_u32_e32 v0, vcc, 0xffffff01, v6
+; GFX942-SDAG-NEXT:    s_movk_i32 s2, 0xff7f
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v7, vcc
+; GFX942-SDAG-NEXT:    s_mov_b32 s3, -1
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_addc_co_u32_e32 v2, vcc, -1, v7, vcc
+; GFX942-SDAG-NEXT:    v_cmp_lt_u64_e64 s[2:3], s[2:3], v[0:1]
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_addc_co_u32_e32 v3, vcc, -1, v7, vcc
+; GFX942-SDAG-NEXT:    v_cmp_eq_u64_e32 vcc, -1, v[2:3]
+; GFX942-SDAG-NEXT:    v_cmp_lt_i32_e64 s[0:1], -1, v4
+; GFX942-SDAG-NEXT:    s_and_b64 s[2:3], vcc, s[2:3]
+; GFX942-SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; GFX942-SDAG-NEXT:    ; implicit-def: $vgpr2_vgpr3
+; GFX942-SDAG-NEXT:    s_and_saveexec_b64 s[6:7], s[2:3]
+; GFX942-SDAG-NEXT:    s_xor_b64 s[6:7], exec, s[6:7]
+; GFX942-SDAG-NEXT:    s_cbranch_execz .LBB3_7
+; GFX942-SDAG-NEXT:  ; %bb.2: ; %fp-to-i-if-end9
+; GFX942-SDAG-NEXT:    s_mov_b32 s2, 0
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-SDAG-NEXT:    v_lshl_add_u64 v[8:9], v[0:1], 0, -1
+; GFX942-SDAG-NEXT:    v_and_b32_e32 v0, 0x7fffff, v4
+; GFX942-SDAG-NEXT:    s_mov_b64 s[2:3], 0x95
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v5, 0
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v10, -1, 0, s[0:1]
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v11, -1, 1, s[0:1]
+; GFX942-SDAG-NEXT:    v_or_b32_e32 v4, 0x800000, v0
+; GFX942-SDAG-NEXT:    v_cmp_lt_u64_e32 vcc, s[2:3], v[6:7]
+; GFX942-SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; GFX942-SDAG-NEXT:    ; implicit-def: $vgpr2_vgpr3
+; GFX942-SDAG-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX942-SDAG-NEXT:    s_xor_b64 s[8:9], exec, s[2:3]
+; GFX942-SDAG-NEXT:    s_cbranch_execz .LBB3_4
+; GFX942-SDAG-NEXT:  ; %bb.3: ; %fp-to-i-if-else
+; GFX942-SDAG-NEXT:    v_add_u32_e32 v7, 0xffffff6a, v6
+; GFX942-SDAG-NEXT:    v_sub_u32_e32 v0, 0xd6, v6
+; GFX942-SDAG-NEXT:    v_add_u32_e32 v2, 0xffffff2a, v6
+; GFX942-SDAG-NEXT:    v_lshrrev_b64 v[0:1], v0, v[4:5]
+; GFX942-SDAG-NEXT:    v_lshlrev_b64 v[2:3], v2, v[4:5]
+; GFX942-SDAG-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v7
+; GFX942-SDAG-NEXT:    v_cmp_ne_u32_e64 s[2:3], 0, v7
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e32 v1, v3, v1, vcc
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v3, 0, v1, s[2:3]
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, v0, s[2:3]
+; GFX942-SDAG-NEXT:    v_lshlrev_b64 v[0:1], v7, v[4:5]
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e32 v12, 0, v1, vcc
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e32 v9, 0, v0, vcc
+; GFX942-SDAG-NEXT:    v_mul_lo_u32 v4, v10, v2
+; GFX942-SDAG-NEXT:    v_mul_lo_u32 v3, v11, v3
+; GFX942-SDAG-NEXT:    v_mad_u64_u32 v[0:1], s[2:3], v11, v2, 0
+; GFX942-SDAG-NEXT:    v_add3_u32 v1, v1, v3, v4
+; GFX942-SDAG-NEXT:    v_mad_u64_u32 v[2:3], s[2:3], v8, v9, v[0:1]
+; GFX942-SDAG-NEXT:    v_mul_lo_u32 v0, v8, v12
+; GFX942-SDAG-NEXT:    v_mul_lo_u32 v1, v8, v9
+; GFX942-SDAG-NEXT:    v_add3_u32 v3, v1, v3, v0
+; GFX942-SDAG-NEXT:    v_mad_u64_u32 v[0:1], s[2:3], v9, v11, 0
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v4, v1
+; GFX942-SDAG-NEXT:    v_mad_u64_u32 v[6:7], s[2:3], v12, v11, v[4:5]
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v4, v7
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v7, v5
+; GFX942-SDAG-NEXT:    v_mad_u64_u32 v[6:7], s[2:3], v9, v10, v[6:7]
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v8, v7
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v9, v5
+; GFX942-SDAG-NEXT:    v_lshl_add_u64 v[4:5], v[4:5], 0, v[8:9]
+; GFX942-SDAG-NEXT:    v_mad_u64_u32 v[4:5], s[2:3], v12, v10, v[4:5]
+; GFX942-SDAG-NEXT:    v_lshl_add_u64 v[2:3], v[4:5], 0, v[2:3]
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v1, v6
+; GFX942-SDAG-NEXT:    ; implicit-def: $vgpr6_vgpr7
+; GFX942-SDAG-NEXT:    ; implicit-def: $vgpr4_vgpr5
+; GFX942-SDAG-NEXT:    ; implicit-def: $vgpr11
+; GFX942-SDAG-NEXT:    ; implicit-def: $vgpr10
+; GFX942-SDAG-NEXT:    ; implicit-def: $vgpr8_vgpr9
+; GFX942-SDAG-NEXT:  .LBB3_4: ; %Flow
+; GFX942-SDAG-NEXT:    s_andn2_saveexec_b64 s[2:3], s[8:9]
+; GFX942-SDAG-NEXT:    s_cbranch_execz .LBB3_6
+; GFX942-SDAG-NEXT:  ; %bb.5: ; %fp-to-i-if-then12
+; GFX942-SDAG-NEXT:    v_sub_u32_e32 v2, 0x96, v6
+; GFX942-SDAG-NEXT:    v_lshrrev_b64 v[0:1], v2, v[4:5]
+; GFX942-SDAG-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v2
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v3, 0
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
+; GFX942-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e32 v6, v0, v4, vcc
+; GFX942-SDAG-NEXT:    v_mad_u64_u32 v[0:1], s[8:9], v6, v11, 0
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v2, v1
+; GFX942-SDAG-NEXT:    v_mad_u64_u32 v[4:5], s[8:9], v6, v10, v[2:3]
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v2, v5
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v1, v4
+; GFX942-SDAG-NEXT:    v_mad_i64_i32 v[2:3], s[8:9], v8, v6, v[2:3]
+; GFX942-SDAG-NEXT:  .LBB3_6: ; %Flow1
+; GFX942-SDAG-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX942-SDAG-NEXT:  .LBB3_7: ; %Flow2
+; GFX942-SDAG-NEXT:    s_andn2_saveexec_b64 s[2:3], s[6:7]
+; GFX942-SDAG-NEXT:  ; %bb.8: ; %fp-to-i-if-then5
+; GFX942-SDAG-NEXT:    v_bfrev_b32_e32 v0, 1
+; GFX942-SDAG-NEXT:    v_bfrev_b32_e32 v1, -2
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, -1, s[0:1]
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v3, v0, v1, s[0:1]
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v0, v2
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v1, v2
+; GFX942-SDAG-NEXT:  ; %bb.9: ; %Flow3
+; GFX942-SDAG-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX942-SDAG-NEXT:  .LBB3_10: ; %fp-to-i-cleanup
+; GFX942-SDAG-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX942-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-GISEL-LABEL: fptoui_f32_to_i128:
+; GFX942-GISEL:       ; %bb.0: ; %fp-to-i-entry
+; GFX942-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v4, v0
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v5, 0
+; GFX942-GISEL-NEXT:    v_lshrrev_b64 v[0:1], 23, v[4:5]
+; GFX942-GISEL-NEXT:    s_mov_b64 s[0:1], 0
+; GFX942-GISEL-NEXT:    v_bfe_u32 v6, v0, 0, 8
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v7, v5
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[0:1], 0x7f
+; GFX942-GISEL-NEXT:    v_cmp_ge_u64_e32 vcc, v[6:7], v[0:1]
+; GFX942-GISEL-NEXT:    s_mov_b64 s[2:3], s[0:1]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
+; GFX942-GISEL-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX942-GISEL-NEXT:    s_cbranch_execz .LBB3_10
+; GFX942-GISEL-NEXT:  ; %bb.1: ; %fp-to-i-if-end
+; GFX942-GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, 0xffffff01, v6
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v8, 0xffffff80
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e64 v1, s[4:5], 0, -1, vcc
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v9, -1
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e64 v2, s[4:5], 0, -1, s[4:5]
+; GFX942-GISEL-NEXT:    v_cmp_ge_u64_e32 vcc, v[0:1], v[8:9]
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e64 v3, s[4:5], 0, -1, s[4:5]
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX942-GISEL-NEXT:    v_cmp_le_u64_e32 vcc, -1, v[2:3]
+; GFX942-GISEL-NEXT:    v_cmp_lt_i32_e64 s[0:1], -1, v4
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX942-GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, -1, v[2:3]
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX942-GISEL-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX942-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
+; GFX942-GISEL-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
+; GFX942-GISEL-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX942-GISEL-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
+; GFX942-GISEL-NEXT:    s_cbranch_execz .LBB3_7
+; GFX942-GISEL-NEXT:  ; %bb.2: ; %fp-to-i-if-end9
+; GFX942-GISEL-NEXT:    s_xor_b64 s[6:7], s[0:1], -1
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[6:7]
+; GFX942-GISEL-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX942-GISEL-NEXT:    v_lshlrev_b16_e32 v2, 1, v0
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[6:7]
+; GFX942-GISEL-NEXT:    v_lshlrev_b16_e32 v3, 2, v0
+; GFX942-GISEL-NEXT:    v_lshlrev_b16_e32 v8, 3, v0
+; GFX942-GISEL-NEXT:    v_lshlrev_b16_e32 v9, 4, v0
+; GFX942-GISEL-NEXT:    v_lshlrev_b16_e32 v10, 5, v0
+; GFX942-GISEL-NEXT:    v_lshlrev_b16_e32 v11, 6, v0
+; GFX942-GISEL-NEXT:    v_lshlrev_b16_e32 v12, 7, v0
+; GFX942-GISEL-NEXT:    v_lshlrev_b16_e32 v13, 8, v0
+; GFX942-GISEL-NEXT:    v_lshlrev_b16_e32 v14, 9, v0
+; GFX942-GISEL-NEXT:    v_lshlrev_b16_e32 v15, 10, v0
+; GFX942-GISEL-NEXT:    v_lshlrev_b16_e32 v16, 11, v0
+; GFX942-GISEL-NEXT:    v_lshlrev_b16_e32 v17, 12, v0
+; GFX942-GISEL-NEXT:    v_lshlrev_b16_e32 v18, 13, v0
+; GFX942-GISEL-NEXT:    v_lshlrev_b16_e32 v19, 14, v0
+; GFX942-GISEL-NEXT:    v_lshlrev_b16_e32 v20, 15, v0
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v0, v0, v3
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v1, v1, v3
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v0, v0, v8
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v1, v1, v8
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v0, v0, v9
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v1, v1, v9
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v0, v0, v10
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v1, v1, v10
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v0, v0, v11
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v1, v1, v11
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v0, v0, v12
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v1, v1, v12
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v0, v0, v13
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v1, v1, v13
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v0, v0, v14
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v1, v1, v14
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v0, v0, v15
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v1, v1, v15
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v0, v0, v16
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v1, v1, v16
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v0, v0, v17
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v1, v1, v17
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v0, v0, v18
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v1, v1, v18
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v0, v0, v19
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v1, v1, v19
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v0, v0, v20
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v1, v1, v20
+; GFX942-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX942-GISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
+; GFX942-GISEL-NEXT:    v_lshl_or_b32 v9, v0, 16, v0
+; GFX942-GISEL-NEXT:    v_and_b32_e32 v0, 0x7fffff, v4
+; GFX942-GISEL-NEXT:    v_or3_b32 v8, v1, v2, 1
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v4, 0x800000, v0
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[0:1], 0x96
+; GFX942-GISEL-NEXT:    v_cmp_ge_u64_e32 vcc, v[6:7], v[0:1]
+; GFX942-GISEL-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
+; GFX942-GISEL-NEXT:    s_and_saveexec_b64 s[6:7], vcc
+; GFX942-GISEL-NEXT:    s_xor_b64 s[6:7], exec, s[6:7]
+; GFX942-GISEL-NEXT:    s_cbranch_execz .LBB3_4
+; GFX942-GISEL-NEXT:  ; %bb.3: ; %fp-to-i-if-else
+; GFX942-GISEL-NEXT:    v_add_u32_e32 v7, 0xffffff6a, v6
+; GFX942-GISEL-NEXT:    v_add_u32_e32 v6, 0xffffff2a, v6
+; GFX942-GISEL-NEXT:    v_sub_u32_e32 v2, 64, v7
+; GFX942-GISEL-NEXT:    v_lshlrev_b64 v[0:1], v7, v[4:5]
+; GFX942-GISEL-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v7
+; GFX942-GISEL-NEXT:    v_lshrrev_b64 v[2:3], v2, v[4:5]
+; GFX942-GISEL-NEXT:    v_lshlrev_b64 v[4:5], v6, v[4:5]
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e32 v6, 0, v0, vcc
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e32 v10, 0, v1, vcc
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e32 v1, v5, v3, vcc
+; GFX942-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v7
+; GFX942-GISEL-NEXT:    v_mad_u64_u32 v[2:3], s[8:9], v6, v9, 0
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v7, v0, 0, vcc
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v11, v1, 0, vcc
+; GFX942-GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[8:9], v6, v8, 0
+; GFX942-GISEL-NEXT:    v_mad_u64_u32 v[4:5], s[8:9], v10, v9, v[2:3]
+; GFX942-GISEL-NEXT:    v_mul_lo_u32 v6, v6, v9
+; GFX942-GISEL-NEXT:    v_mul_lo_u32 v12, v10, v9
+; GFX942-GISEL-NEXT:    v_mad_u64_u32 v[2:3], vcc, v10, v8, v[2:3]
+; GFX942-GISEL-NEXT:    v_mad_u64_u32 v[4:5], s[8:9], v7, v8, v[4:5]
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e32 v6, vcc, v6, v12, vcc
+; GFX942-GISEL-NEXT:    v_add_co_u32_e32 v1, vcc, v1, v2
+; GFX942-GISEL-NEXT:    v_mad_u64_u32 v[6:7], s[8:9], v7, v9, v[6:7]
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e32 v2, vcc, v4, v3, vcc
+; GFX942-GISEL-NEXT:    v_mad_u64_u32 v[6:7], s[8:9], v11, v8, v[6:7]
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e32 v3, vcc, v5, v6, vcc
+; GFX942-GISEL-NEXT:    ; implicit-def: $vgpr6
+; GFX942-GISEL-NEXT:    ; implicit-def: $vgpr4_vgpr5
+; GFX942-GISEL-NEXT:    ; implicit-def: $vgpr8
+; GFX942-GISEL-NEXT:    ; implicit-def: $vgpr9
+; GFX942-GISEL-NEXT:  .LBB3_4: ; %Flow
+; GFX942-GISEL-NEXT:    s_andn2_saveexec_b64 s[6:7], s[6:7]
+; GFX942-GISEL-NEXT:    s_cbranch_execz .LBB3_6
+; GFX942-GISEL-NEXT:  ; %bb.5: ; %fp-to-i-if-then12
+; GFX942-GISEL-NEXT:    v_sub_co_u32_e32 v2, vcc, 0x96, v6
+; GFX942-GISEL-NEXT:    v_lshrrev_b64 v[0:1], v2, v[4:5]
+; GFX942-GISEL-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v2
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
+; GFX942-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e32 v2, v0, v4, vcc
+; GFX942-GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[8:9], v2, v8, 0
+; GFX942-GISEL-NEXT:    v_mad_u64_u32 v[4:5], s[8:9], v2, v9, 0
+; GFX942-GISEL-NEXT:    v_add_co_u32_e32 v1, vcc, v1, v4
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e32 v2, vcc, v4, v5, vcc
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e32 v3, vcc, v5, v4, vcc
+; GFX942-GISEL-NEXT:  .LBB3_6: ; %Flow1
+; GFX942-GISEL-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GFX942-GISEL-NEXT:  .LBB3_7: ; %Flow2
+; GFX942-GISEL-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX942-GISEL-NEXT:    s_cbranch_execz .LBB3_9
+; GFX942-GISEL-NEXT:  ; %bb.8: ; %fp-to-i-if-then5
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, -1, s[0:1]
+; GFX942-GISEL-NEXT:    v_and_b32_e32 v1, 1, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 1, v1
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 2, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 3, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 4, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 5, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v7, 6, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v8, 7, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v9, 8, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v10, 9, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v11, 10, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v12, 11, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v13, 12, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v14, 13, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v15, 14, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v16, 15, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v17, 16, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v18, 17, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v19, 18, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v20, 19, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v21, 20, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v22, 21, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v23, 22, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v24, 23, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v25, 24, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v26, 25, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v27, 26, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v28, 27, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v29, 28, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v30, 29, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v31, 30, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v32, 31, v1
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX942-GISEL-NEXT:    v_or3_b32 v1, v1, v3, v4
+; GFX942-GISEL-NEXT:    v_or3_b32 v0, v0, v3, v4
+; GFX942-GISEL-NEXT:    v_or3_b32 v1, v1, v5, v6
+; GFX942-GISEL-NEXT:    v_or3_b32 v0, v0, v5, v6
+; GFX942-GISEL-NEXT:    v_or3_b32 v1, v1, v7, v8
+; GFX942-GISEL-NEXT:    v_or3_b32 v0, v0, v7, v8
+; GFX942-GISEL-NEXT:    v_or3_b32 v1, v1, v9, v10
+; GFX942-GISEL-NEXT:    v_or3_b32 v0, v0, v9, v10
+; GFX942-GISEL-NEXT:    v_or3_b32 v1, v1, v11, v12
+; GFX942-GISEL-NEXT:    v_or3_b32 v0, v0, v11, v12
+; GFX942-GISEL-NEXT:    v_or3_b32 v1, v1, v13, v14
+; GFX942-GISEL-NEXT:    v_or3_b32 v0, v0, v13, v14
+; GFX942-GISEL-NEXT:    v_or3_b32 v1, v1, v15, v16
+; GFX942-GISEL-NEXT:    v_or3_b32 v0, v0, v15, v16
+; GFX942-GISEL-NEXT:    v_or3_b32 v1, v1, v17, v18
+; GFX942-GISEL-NEXT:    v_or3_b32 v0, v0, v17, v18
+; GFX942-GISEL-NEXT:    v_or3_b32 v1, v1, v19, v20
+; GFX942-GISEL-NEXT:    v_or3_b32 v0, v0, v19, v20
+; GFX942-GISEL-NEXT:    v_or3_b32 v1, v1, v21, v22
+; GFX942-GISEL-NEXT:    v_or3_b32 v0, v0, v21, v22
+; GFX942-GISEL-NEXT:    v_or3_b32 v1, v1, v23, v24
+; GFX942-GISEL-NEXT:    v_or3_b32 v0, v0, v23, v24
+; GFX942-GISEL-NEXT:    v_or3_b32 v1, v1, v25, v26
+; GFX942-GISEL-NEXT:    v_or3_b32 v0, v0, v25, v26
+; GFX942-GISEL-NEXT:    v_or3_b32 v1, v1, v27, v28
+; GFX942-GISEL-NEXT:    v_or3_b32 v0, v0, v27, v28
+; GFX942-GISEL-NEXT:    v_or3_b32 v1, v1, v29, v30
+; GFX942-GISEL-NEXT:    v_or3_b32 v0, v0, v29, v30
+; GFX942-GISEL-NEXT:    v_or3_b32 v1, v1, v31, v32
+; GFX942-GISEL-NEXT:    v_or3_b32 v0, v0, v31, v32
+; GFX942-GISEL-NEXT:    v_add_u32_e32 v3, 0x80000000, v1
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v2, v1
+; GFX942-GISEL-NEXT:  .LBB3_9: ; %Flow3
+; GFX942-GISEL-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX942-GISEL-NEXT:  .LBB3_10: ; %fp-to-i-cleanup
+; GFX942-GISEL-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX942-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %cvt = fptoui float %x to i128
   ret i128 %cvt
 }
@@ -1452,6 +2946,26 @@ define i128 @fptosi_f16_to_i128(half %x) {
 ; GISEL-NEXT:    v_mov_b32_e32 v2, v1
 ; GISEL-NEXT:    v_mov_b32_e32 v3, v1
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-SDAG-LABEL: fptosi_f16_to_i128:
+; GFX942-SDAG:       ; %bb.0:
+; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX942-SDAG-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; GFX942-SDAG-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; GFX942-SDAG-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v3, v2
+; GFX942-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-GISEL-LABEL: fptosi_f16_to_i128:
+; GFX942-GISEL:       ; %bb.0:
+; GFX942-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX942-GISEL-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; GFX942-GISEL-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v2, v1
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %cvt = fptosi half %x to i128
   ret i128 %cvt
 }
@@ -1466,6 +2980,16 @@ define i128 @fptoui_f16_to_i128(half %x) {
 ; GCN-NEXT:    v_mov_b32_e32 v3, 0
 ; GCN-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: fptoui_f16_to_i128:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX942-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-NEXT:    v_mov_b32_e32 v3, 0
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %cvt = fptoui half %x to i128
   ret i128 %cvt
 }
@@ -1813,6 +3337,362 @@ define i128 @fptosi_bf16_to_i128(bfloat %x) {
 ; GISEL-NEXT:  .LBB6_10: ; %fp-to-i-cleanup
 ; GISEL-NEXT:    s_or_b64 exec, exec, s[12:13]
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-SDAG-LABEL: fptosi_bf16_to_i128:
+; GFX942-SDAG:       ; %bb.0: ; %fp-to-i-entry
+; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v4, v0
+; GFX942-SDAG-NEXT:    v_bfe_u32 v6, v4, 7, 8
+; GFX942-SDAG-NEXT:    s_movk_i32 s0, 0x7e
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v7, 0
+; GFX942-SDAG-NEXT:    v_cmp_lt_u32_e32 vcc, s0, v6
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[0:1], 0
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[2:3], 0
+; GFX942-SDAG-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX942-SDAG-NEXT:    s_cbranch_execz .LBB6_10
+; GFX942-SDAG-NEXT:  ; %bb.1: ; %fp-to-i-if-end
+; GFX942-SDAG-NEXT:    v_add_co_u32_e32 v0, vcc, 0xffffff01, v6
+; GFX942-SDAG-NEXT:    s_movk_i32 s2, 0xff7f
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v7, vcc
+; GFX942-SDAG-NEXT:    s_mov_b32 s3, -1
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_addc_co_u32_e32 v2, vcc, -1, v7, vcc
+; GFX942-SDAG-NEXT:    v_cmp_lt_u64_e64 s[2:3], s[2:3], v[0:1]
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_addc_co_u32_e32 v3, vcc, -1, v7, vcc
+; GFX942-SDAG-NEXT:    v_cmp_eq_u64_e32 vcc, -1, v[2:3]
+; GFX942-SDAG-NEXT:    v_cmp_lt_i16_e64 s[0:1], -1, v4
+; GFX942-SDAG-NEXT:    s_and_b64 s[2:3], vcc, s[2:3]
+; GFX942-SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; GFX942-SDAG-NEXT:    ; implicit-def: $vgpr2_vgpr3
+; GFX942-SDAG-NEXT:    s_and_saveexec_b64 s[6:7], s[2:3]
+; GFX942-SDAG-NEXT:    s_xor_b64 s[6:7], exec, s[6:7]
+; GFX942-SDAG-NEXT:    s_cbranch_execz .LBB6_7
+; GFX942-SDAG-NEXT:  ; %bb.2: ; %fp-to-i-if-end9
+; GFX942-SDAG-NEXT:    s_movk_i32 s3, 0x7f
+; GFX942-SDAG-NEXT:    v_and_b32_sdwa v0, v4, s3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX942-SDAG-NEXT:    s_mov_b64 s[8:9], 0x85
+; GFX942-SDAG-NEXT:    s_mov_b32 s2, 0
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v5, 0
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v9, -1, 0, s[0:1]
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v8, -1, 1, s[0:1]
+; GFX942-SDAG-NEXT:    v_or_b32_e32 v4, 0x80, v0
+; GFX942-SDAG-NEXT:    v_cmp_lt_u64_e32 vcc, s[8:9], v[6:7]
+; GFX942-SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; GFX942-SDAG-NEXT:    ; implicit-def: $vgpr2_vgpr3
+; GFX942-SDAG-NEXT:    s_and_saveexec_b64 s[8:9], vcc
+; GFX942-SDAG-NEXT:    s_xor_b64 s[8:9], exec, s[8:9]
+; GFX942-SDAG-NEXT:    s_cbranch_execz .LBB6_4
+; GFX942-SDAG-NEXT:  ; %bb.3: ; %fp-to-i-if-else
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-SDAG-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, -1
+; GFX942-SDAG-NEXT:    v_add_u32_e32 v1, 0xffffff7a, v6
+; GFX942-SDAG-NEXT:    v_sub_u32_e32 v2, 0xc6, v6
+; GFX942-SDAG-NEXT:    v_add_u32_e32 v6, 0xffffff3a, v6
+; GFX942-SDAG-NEXT:    v_lshrrev_b64 v[2:3], v2, v[4:5]
+; GFX942-SDAG-NEXT:    v_lshlrev_b64 v[6:7], v6, v[4:5]
+; GFX942-SDAG-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v1
+; GFX942-SDAG-NEXT:    v_cmp_ne_u32_e64 s[2:3], 0, v1
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v11, v5
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e32 v2, v6, v2, vcc
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v7, 0, v3, s[2:3]
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v6, 0, v2, s[2:3]
+; GFX942-SDAG-NEXT:    v_lshlrev_b64 v[2:3], v1, v[4:5]
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e32 v12, 0, v3, vcc
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e32 v10, 0, v2, vcc
+; GFX942-SDAG-NEXT:    v_mul_lo_u32 v1, v9, v6
+; GFX942-SDAG-NEXT:    v_mul_lo_u32 v4, v8, v7
+; GFX942-SDAG-NEXT:    v_mad_u64_u32 v[2:3], s[2:3], v8, v6, 0
+; GFX942-SDAG-NEXT:    v_add3_u32 v3, v3, v4, v1
+; GFX942-SDAG-NEXT:    v_mad_u64_u32 v[2:3], s[2:3], v0, v10, v[2:3]
+; GFX942-SDAG-NEXT:    v_mul_lo_u32 v1, v0, v12
+; GFX942-SDAG-NEXT:    v_mul_lo_u32 v0, v0, v10
+; GFX942-SDAG-NEXT:    v_add3_u32 v3, v0, v3, v1
+; GFX942-SDAG-NEXT:    v_mad_u64_u32 v[0:1], s[2:3], v10, v8, 0
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v4, v1
+; GFX942-SDAG-NEXT:    v_mad_u64_u32 v[6:7], s[2:3], v12, v8, v[4:5]
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v4, v7
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v7, v5
+; GFX942-SDAG-NEXT:    v_mad_u64_u32 v[6:7], s[2:3], v10, v9, v[6:7]
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v10, v7
+; GFX942-SDAG-NEXT:    v_lshl_add_u64 v[4:5], v[4:5], 0, v[10:11]
+; GFX942-SDAG-NEXT:    v_mad_u64_u32 v[4:5], s[2:3], v12, v9, v[4:5]
+; GFX942-SDAG-NEXT:    v_lshl_add_u64 v[2:3], v[4:5], 0, v[2:3]
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v1, v6
+; GFX942-SDAG-NEXT:    ; implicit-def: $vgpr6_vgpr7
+; GFX942-SDAG-NEXT:    ; implicit-def: $vgpr4_vgpr5
+; GFX942-SDAG-NEXT:    ; implicit-def: $vgpr8
+; GFX942-SDAG-NEXT:  .LBB6_4: ; %Flow
+; GFX942-SDAG-NEXT:    s_andn2_saveexec_b64 s[2:3], s[8:9]
+; GFX942-SDAG-NEXT:    s_cbranch_execz .LBB6_6
+; GFX942-SDAG-NEXT:  ; %bb.5: ; %fp-to-i-if-then12
+; GFX942-SDAG-NEXT:    v_sub_u32_e32 v2, 0x86, v6
+; GFX942-SDAG-NEXT:    v_lshrrev_b64 v[0:1], v2, v[4:5]
+; GFX942-SDAG-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v2
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
+; GFX942-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
+; GFX942-SDAG-NEXT:    v_mul_hi_i32_i24_e32 v1, v0, v8
+; GFX942-SDAG-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
+; GFX942-SDAG-NEXT:    v_mul_i32_i24_e32 v0, v0, v8
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v3, v2
+; GFX942-SDAG-NEXT:  .LBB6_6: ; %Flow1
+; GFX942-SDAG-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX942-SDAG-NEXT:  .LBB6_7: ; %Flow2
+; GFX942-SDAG-NEXT:    s_andn2_saveexec_b64 s[2:3], s[6:7]
+; GFX942-SDAG-NEXT:  ; %bb.8: ; %fp-to-i-if-then5
+; GFX942-SDAG-NEXT:    v_bfrev_b32_e32 v0, 1
+; GFX942-SDAG-NEXT:    v_bfrev_b32_e32 v1, -2
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, -1, s[0:1]
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v3, v0, v1, s[0:1]
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v0, v2
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v1, v2
+; GFX942-SDAG-NEXT:  ; %bb.9: ; %Flow3
+; GFX942-SDAG-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX942-SDAG-NEXT:  .LBB6_10: ; %fp-to-i-cleanup
+; GFX942-SDAG-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX942-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-GISEL-LABEL: fptosi_bf16_to_i128:
+; GFX942-GISEL:       ; %bb.0: ; %fp-to-i-entry
+; GFX942-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v4, v0
+; GFX942-GISEL-NEXT:    v_and_b32_e32 v6, 0xffff, v4
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v7, 0
+; GFX942-GISEL-NEXT:    v_lshrrev_b64 v[0:1], 7, v[6:7]
+; GFX942-GISEL-NEXT:    s_mov_b64 s[0:1], 0
+; GFX942-GISEL-NEXT:    v_bfe_u32 v6, v0, 0, 8
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[0:1], 0x7f
+; GFX942-GISEL-NEXT:    v_cmp_ge_u64_e32 vcc, v[6:7], v[0:1]
+; GFX942-GISEL-NEXT:    s_mov_b64 s[2:3], s[0:1]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
+; GFX942-GISEL-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX942-GISEL-NEXT:    s_cbranch_execz .LBB6_10
+; GFX942-GISEL-NEXT:  ; %bb.1: ; %fp-to-i-if-end
+; GFX942-GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, 0xffffff01, v6
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v8, 0xffffff80
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e64 v1, s[4:5], 0, -1, vcc
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v9, -1
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e64 v2, s[4:5], 0, -1, s[4:5]
+; GFX942-GISEL-NEXT:    v_cmp_ge_u64_e32 vcc, v[0:1], v[8:9]
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e64 v3, s[4:5], 0, -1, s[4:5]
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX942-GISEL-NEXT:    v_cmp_le_u64_e32 vcc, -1, v[2:3]
+; GFX942-GISEL-NEXT:    v_cmp_lt_i16_e64 s[0:1], -1, v4
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX942-GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, -1, v[2:3]
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX942-GISEL-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX942-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
+; GFX942-GISEL-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
+; GFX942-GISEL-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX942-GISEL-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
+; GFX942-GISEL-NEXT:    s_cbranch_execz .LBB6_7
+; GFX942-GISEL-NEXT:  ; %bb.2: ; %fp-to-i-if-end9
+; GFX942-GISEL-NEXT:    s_xor_b64 s[6:7], s[0:1], -1
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[6:7]
+; GFX942-GISEL-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX942-GISEL-NEXT:    v_lshlrev_b16_e32 v2, 1, v0
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[6:7]
+; GFX942-GISEL-NEXT:    v_lshlrev_b16_e32 v3, 2, v0
+; GFX942-GISEL-NEXT:    v_lshlrev_b16_e32 v5, 3, v0
+; GFX942-GISEL-NEXT:    v_lshlrev_b16_e32 v8, 4, v0
+; GFX942-GISEL-NEXT:    v_lshlrev_b16_e32 v9, 5, v0
+; GFX942-GISEL-NEXT:    v_lshlrev_b16_e32 v10, 6, v0
+; GFX942-GISEL-NEXT:    v_lshlrev_b16_e32 v11, 7, v0
+; GFX942-GISEL-NEXT:    v_lshlrev_b16_e32 v12, 8, v0
+; GFX942-GISEL-NEXT:    v_lshlrev_b16_e32 v13, 9, v0
+; GFX942-GISEL-NEXT:    v_lshlrev_b16_e32 v14, 10, v0
+; GFX942-GISEL-NEXT:    v_lshlrev_b16_e32 v15, 11, v0
+; GFX942-GISEL-NEXT:    v_lshlrev_b16_e32 v16, 12, v0
+; GFX942-GISEL-NEXT:    v_lshlrev_b16_e32 v17, 13, v0
+; GFX942-GISEL-NEXT:    v_lshlrev_b16_e32 v18, 14, v0
+; GFX942-GISEL-NEXT:    v_lshlrev_b16_e32 v19, 15, v0
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v0, v0, v3
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v1, v1, v3
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v0, v0, v5
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v1, v1, v5
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v0, v0, v8
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v1, v1, v8
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v0, v0, v9
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v1, v1, v9
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v0, v0, v10
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v1, v1, v10
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v0, v0, v11
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v1, v1, v11
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v0, v0, v12
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v1, v1, v12
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v0, v0, v13
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v1, v1, v13
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v0, v0, v14
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v1, v1, v14
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v0, v0, v15
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v1, v1, v15
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v0, v0, v16
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v1, v1, v16
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v0, v0, v17
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v1, v1, v17
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v0, v0, v18
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v1, v1, v18
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v0, v0, v19
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v1, v1, v19
+; GFX942-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX942-GISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
+; GFX942-GISEL-NEXT:    v_lshl_or_b32 v9, v0, 16, v0
+; GFX942-GISEL-NEXT:    v_and_b32_e32 v0, 0x7f, v4
+; GFX942-GISEL-NEXT:    v_or3_b32 v8, v1, v2, 1
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v4, 0x80, v0
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[0:1], 0x86
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v5, 0
+; GFX942-GISEL-NEXT:    v_cmp_ge_u64_e32 vcc, v[6:7], v[0:1]
+; GFX942-GISEL-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
+; GFX942-GISEL-NEXT:    s_and_saveexec_b64 s[6:7], vcc
+; GFX942-GISEL-NEXT:    s_xor_b64 s[6:7], exec, s[6:7]
+; GFX942-GISEL-NEXT:    s_cbranch_execz .LBB6_4
+; GFX942-GISEL-NEXT:  ; %bb.3: ; %fp-to-i-if-else
+; GFX942-GISEL-NEXT:    v_add_u32_e32 v7, 0xffffff7a, v6
+; GFX942-GISEL-NEXT:    v_add_u32_e32 v6, 0xffffff3a, v6
+; GFX942-GISEL-NEXT:    v_sub_u32_e32 v2, 64, v7
+; GFX942-GISEL-NEXT:    v_lshlrev_b64 v[0:1], v7, v[4:5]
+; GFX942-GISEL-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v7
+; GFX942-GISEL-NEXT:    v_lshrrev_b64 v[2:3], v2, v[4:5]
+; GFX942-GISEL-NEXT:    v_lshlrev_b64 v[4:5], v6, v[4:5]
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e32 v6, 0, v0, vcc
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e32 v10, 0, v1, vcc
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e32 v1, v5, v3, vcc
+; GFX942-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v7
+; GFX942-GISEL-NEXT:    v_mad_u64_u32 v[2:3], s[8:9], v6, v9, 0
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v7, v0, 0, vcc
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v11, v1, 0, vcc
+; GFX942-GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[8:9], v6, v8, 0
+; GFX942-GISEL-NEXT:    v_mad_u64_u32 v[4:5], s[8:9], v10, v9, v[2:3]
+; GFX942-GISEL-NEXT:    v_mul_lo_u32 v6, v6, v9
+; GFX942-GISEL-NEXT:    v_mul_lo_u32 v12, v10, v9
+; GFX942-GISEL-NEXT:    v_mad_u64_u32 v[2:3], vcc, v10, v8, v[2:3]
+; GFX942-GISEL-NEXT:    v_mad_u64_u32 v[4:5], s[8:9], v7, v8, v[4:5]
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e32 v6, vcc, v6, v12, vcc
+; GFX942-GISEL-NEXT:    v_add_co_u32_e32 v1, vcc, v1, v2
+; GFX942-GISEL-NEXT:    v_mad_u64_u32 v[6:7], s[8:9], v7, v9, v[6:7]
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e32 v2, vcc, v4, v3, vcc
+; GFX942-GISEL-NEXT:    v_mad_u64_u32 v[6:7], s[8:9], v11, v8, v[6:7]
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e32 v3, vcc, v5, v6, vcc
+; GFX942-GISEL-NEXT:    ; implicit-def: $vgpr6
+; GFX942-GISEL-NEXT:    ; implicit-def: $vgpr4_vgpr5
+; GFX942-GISEL-NEXT:    ; implicit-def: $vgpr8
+; GFX942-GISEL-NEXT:  .LBB6_4: ; %Flow
+; GFX942-GISEL-NEXT:    s_andn2_saveexec_b64 s[6:7], s[6:7]
+; GFX942-GISEL-NEXT:    s_cbranch_execz .LBB6_6
+; GFX942-GISEL-NEXT:  ; %bb.5: ; %fp-to-i-if-then12
+; GFX942-GISEL-NEXT:    v_sub_co_u32_e32 v2, vcc, 0x86, v6
+; GFX942-GISEL-NEXT:    v_lshrrev_b64 v[0:1], v2, v[4:5]
+; GFX942-GISEL-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v2
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
+; GFX942-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
+; GFX942-GISEL-NEXT:    v_mul_hi_i32_i24_e32 v1, v0, v8
+; GFX942-GISEL-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
+; GFX942-GISEL-NEXT:    v_mul_i32_i24_e32 v0, v0, v8
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v3, v2
+; GFX942-GISEL-NEXT:  .LBB6_6: ; %Flow1
+; GFX942-GISEL-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GFX942-GISEL-NEXT:  .LBB6_7: ; %Flow2
+; GFX942-GISEL-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX942-GISEL-NEXT:    s_cbranch_execz .LBB6_9
+; GFX942-GISEL-NEXT:  ; %bb.8: ; %fp-to-i-if-then5
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, -1, s[0:1]
+; GFX942-GISEL-NEXT:    v_and_b32_e32 v1, 1, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 1, v1
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 2, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 3, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 4, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 5, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v7, 6, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v8, 7, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v9, 8, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v10, 9, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v11, 10, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v12, 11, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v13, 12, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v14, 13, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v15, 14, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v16, 15, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v17, 16, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v18, 17, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v19, 18, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v20, 19, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v21, 20, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v22, 21, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v23, 22, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v24, 23, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v25, 24, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v26, 25, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v27, 26, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v28, 27, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v29, 28, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v30, 29, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v31, 30, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v32, 31, v1
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX942-GISEL-NEXT:    v_or3_b32 v1, v1, v3, v4
+; GFX942-GISEL-NEXT:    v_or3_b32 v0, v0, v3, v4
+; GFX942-GISEL-NEXT:    v_or3_b32 v1, v1, v5, v6
+; GFX942-GISEL-NEXT:    v_or3_b32 v0, v0, v5, v6
+; GFX942-GISEL-NEXT:    v_or3_b32 v1, v1, v7, v8
+; GFX942-GISEL-NEXT:    v_or3_b32 v0, v0, v7, v8
+; GFX942-GISEL-NEXT:    v_or3_b32 v1, v1, v9, v10
+; GFX942-GISEL-NEXT:    v_or3_b32 v0, v0, v9, v10
+; GFX942-GISEL-NEXT:    v_or3_b32 v1, v1, v11, v12
+; GFX942-GISEL-NEXT:    v_or3_b32 v0, v0, v11, v12
+; GFX942-GISEL-NEXT:    v_or3_b32 v1, v1, v13, v14
+; GFX942-GISEL-NEXT:    v_or3_b32 v0, v0, v13, v14
+; GFX942-GISEL-NEXT:    v_or3_b32 v1, v1, v15, v16
+; GFX942-GISEL-NEXT:    v_or3_b32 v0, v0, v15, v16
+; GFX942-GISEL-NEXT:    v_or3_b32 v1, v1, v17, v18
+; GFX942-GISEL-NEXT:    v_or3_b32 v0, v0, v17, v18
+; GFX942-GISEL-NEXT:    v_or3_b32 v1, v1, v19, v20
+; GFX942-GISEL-NEXT:    v_or3_b32 v0, v0, v19, v20
+; GFX942-GISEL-NEXT:    v_or3_b32 v1, v1, v21, v22
+; GFX942-GISEL-NEXT:    v_or3_b32 v0, v0, v21, v22
+; GFX942-GISEL-NEXT:    v_or3_b32 v1, v1, v23, v24
+; GFX942-GISEL-NEXT:    v_or3_b32 v0, v0, v23, v24
+; GFX942-GISEL-NEXT:    v_or3_b32 v1, v1, v25, v26
+; GFX942-GISEL-NEXT:    v_or3_b32 v0, v0, v25, v26
+; GFX942-GISEL-NEXT:    v_or3_b32 v1, v1, v27, v28
+; GFX942-GISEL-NEXT:    v_or3_b32 v0, v0, v27, v28
+; GFX942-GISEL-NEXT:    v_or3_b32 v1, v1, v29, v30
+; GFX942-GISEL-NEXT:    v_or3_b32 v0, v0, v29, v30
+; GFX942-GISEL-NEXT:    v_or3_b32 v1, v1, v31, v32
+; GFX942-GISEL-NEXT:    v_or3_b32 v0, v0, v31, v32
+; GFX942-GISEL-NEXT:    v_add_u32_e32 v3, 0x80000000, v1
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v2, v1
+; GFX942-GISEL-NEXT:  .LBB6_9: ; %Flow3
+; GFX942-GISEL-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX942-GISEL-NEXT:  .LBB6_10: ; %fp-to-i-cleanup
+; GFX942-GISEL-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX942-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %cvt = fptosi bfloat %x to i128
   ret i128 %cvt
 }
@@ -2160,6 +4040,362 @@ define i128 @fptoui_bf16_to_i128(bfloat %x) {
 ; GISEL-NEXT:  .LBB7_10: ; %fp-to-i-cleanup
 ; GISEL-NEXT:    s_or_b64 exec, exec, s[12:13]
 ; GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-SDAG-LABEL: fptoui_bf16_to_i128:
+; GFX942-SDAG:       ; %bb.0: ; %fp-to-i-entry
+; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v4, v0
+; GFX942-SDAG-NEXT:    v_bfe_u32 v6, v4, 7, 8
+; GFX942-SDAG-NEXT:    s_movk_i32 s0, 0x7e
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v7, 0
+; GFX942-SDAG-NEXT:    v_cmp_lt_u32_e32 vcc, s0, v6
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[0:1], 0
+; GFX942-SDAG-NEXT:    v_mov_b64_e32 v[2:3], 0
+; GFX942-SDAG-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX942-SDAG-NEXT:    s_cbranch_execz .LBB7_10
+; GFX942-SDAG-NEXT:  ; %bb.1: ; %fp-to-i-if-end
+; GFX942-SDAG-NEXT:    v_add_co_u32_e32 v0, vcc, 0xffffff01, v6
+; GFX942-SDAG-NEXT:    s_movk_i32 s2, 0xff7f
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_addc_co_u32_e32 v1, vcc, -1, v7, vcc
+; GFX942-SDAG-NEXT:    s_mov_b32 s3, -1
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_addc_co_u32_e32 v2, vcc, -1, v7, vcc
+; GFX942-SDAG-NEXT:    v_cmp_lt_u64_e64 s[2:3], s[2:3], v[0:1]
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_addc_co_u32_e32 v3, vcc, -1, v7, vcc
+; GFX942-SDAG-NEXT:    v_cmp_eq_u64_e32 vcc, -1, v[2:3]
+; GFX942-SDAG-NEXT:    v_cmp_lt_i16_e64 s[0:1], -1, v4
+; GFX942-SDAG-NEXT:    s_and_b64 s[2:3], vcc, s[2:3]
+; GFX942-SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; GFX942-SDAG-NEXT:    ; implicit-def: $vgpr2_vgpr3
+; GFX942-SDAG-NEXT:    s_and_saveexec_b64 s[6:7], s[2:3]
+; GFX942-SDAG-NEXT:    s_xor_b64 s[6:7], exec, s[6:7]
+; GFX942-SDAG-NEXT:    s_cbranch_execz .LBB7_7
+; GFX942-SDAG-NEXT:  ; %bb.2: ; %fp-to-i-if-end9
+; GFX942-SDAG-NEXT:    s_movk_i32 s3, 0x7f
+; GFX942-SDAG-NEXT:    v_and_b32_sdwa v0, v4, s3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX942-SDAG-NEXT:    s_mov_b64 s[8:9], 0x85
+; GFX942-SDAG-NEXT:    s_mov_b32 s2, 0
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v5, 0
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v9, -1, 0, s[0:1]
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v8, -1, 1, s[0:1]
+; GFX942-SDAG-NEXT:    v_or_b32_e32 v4, 0x80, v0
+; GFX942-SDAG-NEXT:    v_cmp_lt_u64_e32 vcc, s[8:9], v[6:7]
+; GFX942-SDAG-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; GFX942-SDAG-NEXT:    ; implicit-def: $vgpr2_vgpr3
+; GFX942-SDAG-NEXT:    s_and_saveexec_b64 s[8:9], vcc
+; GFX942-SDAG-NEXT:    s_xor_b64 s[8:9], exec, s[8:9]
+; GFX942-SDAG-NEXT:    s_cbranch_execz .LBB7_4
+; GFX942-SDAG-NEXT:  ; %bb.3: ; %fp-to-i-if-else
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-SDAG-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, -1
+; GFX942-SDAG-NEXT:    v_add_u32_e32 v1, 0xffffff7a, v6
+; GFX942-SDAG-NEXT:    v_sub_u32_e32 v2, 0xc6, v6
+; GFX942-SDAG-NEXT:    v_add_u32_e32 v6, 0xffffff3a, v6
+; GFX942-SDAG-NEXT:    v_lshrrev_b64 v[2:3], v2, v[4:5]
+; GFX942-SDAG-NEXT:    v_lshlrev_b64 v[6:7], v6, v[4:5]
+; GFX942-SDAG-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v1
+; GFX942-SDAG-NEXT:    v_cmp_ne_u32_e64 s[2:3], 0, v1
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v11, v5
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e32 v3, v7, v3, vcc
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e32 v2, v6, v2, vcc
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v7, 0, v3, s[2:3]
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v6, 0, v2, s[2:3]
+; GFX942-SDAG-NEXT:    v_lshlrev_b64 v[2:3], v1, v[4:5]
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e32 v12, 0, v3, vcc
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e32 v10, 0, v2, vcc
+; GFX942-SDAG-NEXT:    v_mul_lo_u32 v1, v9, v6
+; GFX942-SDAG-NEXT:    v_mul_lo_u32 v4, v8, v7
+; GFX942-SDAG-NEXT:    v_mad_u64_u32 v[2:3], s[2:3], v8, v6, 0
+; GFX942-SDAG-NEXT:    v_add3_u32 v3, v3, v4, v1
+; GFX942-SDAG-NEXT:    v_mad_u64_u32 v[2:3], s[2:3], v0, v10, v[2:3]
+; GFX942-SDAG-NEXT:    v_mul_lo_u32 v1, v0, v12
+; GFX942-SDAG-NEXT:    v_mul_lo_u32 v0, v0, v10
+; GFX942-SDAG-NEXT:    v_add3_u32 v3, v0, v3, v1
+; GFX942-SDAG-NEXT:    v_mad_u64_u32 v[0:1], s[2:3], v10, v8, 0
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v4, v1
+; GFX942-SDAG-NEXT:    v_mad_u64_u32 v[6:7], s[2:3], v12, v8, v[4:5]
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v4, v7
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v7, v5
+; GFX942-SDAG-NEXT:    v_mad_u64_u32 v[6:7], s[2:3], v10, v9, v[6:7]
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v10, v7
+; GFX942-SDAG-NEXT:    v_lshl_add_u64 v[4:5], v[4:5], 0, v[10:11]
+; GFX942-SDAG-NEXT:    v_mad_u64_u32 v[4:5], s[2:3], v12, v9, v[4:5]
+; GFX942-SDAG-NEXT:    v_lshl_add_u64 v[2:3], v[4:5], 0, v[2:3]
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v1, v6
+; GFX942-SDAG-NEXT:    ; implicit-def: $vgpr6_vgpr7
+; GFX942-SDAG-NEXT:    ; implicit-def: $vgpr4_vgpr5
+; GFX942-SDAG-NEXT:    ; implicit-def: $vgpr8
+; GFX942-SDAG-NEXT:  .LBB7_4: ; %Flow
+; GFX942-SDAG-NEXT:    s_andn2_saveexec_b64 s[2:3], s[8:9]
+; GFX942-SDAG-NEXT:    s_cbranch_execz .LBB7_6
+; GFX942-SDAG-NEXT:  ; %bb.5: ; %fp-to-i-if-then12
+; GFX942-SDAG-NEXT:    v_sub_u32_e32 v2, 0x86, v6
+; GFX942-SDAG-NEXT:    v_lshrrev_b64 v[0:1], v2, v[4:5]
+; GFX942-SDAG-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v2
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
+; GFX942-SDAG-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
+; GFX942-SDAG-NEXT:    v_mul_hi_i32_i24_e32 v1, v0, v8
+; GFX942-SDAG-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
+; GFX942-SDAG-NEXT:    v_mul_i32_i24_e32 v0, v0, v8
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v3, v2
+; GFX942-SDAG-NEXT:  .LBB7_6: ; %Flow1
+; GFX942-SDAG-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX942-SDAG-NEXT:  .LBB7_7: ; %Flow2
+; GFX942-SDAG-NEXT:    s_andn2_saveexec_b64 s[2:3], s[6:7]
+; GFX942-SDAG-NEXT:  ; %bb.8: ; %fp-to-i-if-then5
+; GFX942-SDAG-NEXT:    v_bfrev_b32_e32 v0, 1
+; GFX942-SDAG-NEXT:    v_bfrev_b32_e32 v1, -2
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, -1, s[0:1]
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v3, v0, v1, s[0:1]
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v0, v2
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v1, v2
+; GFX942-SDAG-NEXT:  ; %bb.9: ; %Flow3
+; GFX942-SDAG-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX942-SDAG-NEXT:  .LBB7_10: ; %fp-to-i-cleanup
+; GFX942-SDAG-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX942-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-GISEL-LABEL: fptoui_bf16_to_i128:
+; GFX942-GISEL:       ; %bb.0: ; %fp-to-i-entry
+; GFX942-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v4, v0
+; GFX942-GISEL-NEXT:    v_and_b32_e32 v6, 0xffff, v4
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v7, 0
+; GFX942-GISEL-NEXT:    v_lshrrev_b64 v[0:1], 7, v[6:7]
+; GFX942-GISEL-NEXT:    s_mov_b64 s[0:1], 0
+; GFX942-GISEL-NEXT:    v_bfe_u32 v6, v0, 0, 8
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[0:1], 0x7f
+; GFX942-GISEL-NEXT:    v_cmp_ge_u64_e32 vcc, v[6:7], v[0:1]
+; GFX942-GISEL-NEXT:    s_mov_b64 s[2:3], s[0:1]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
+; GFX942-GISEL-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX942-GISEL-NEXT:    s_cbranch_execz .LBB7_10
+; GFX942-GISEL-NEXT:  ; %bb.1: ; %fp-to-i-if-end
+; GFX942-GISEL-NEXT:    v_add_co_u32_e32 v0, vcc, 0xffffff01, v6
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v8, 0xffffff80
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e64 v1, s[4:5], 0, -1, vcc
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v9, -1
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e64 v2, s[4:5], 0, -1, s[4:5]
+; GFX942-GISEL-NEXT:    v_cmp_ge_u64_e32 vcc, v[0:1], v[8:9]
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e64 v3, s[4:5], 0, -1, s[4:5]
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc
+; GFX942-GISEL-NEXT:    v_cmp_le_u64_e32 vcc, -1, v[2:3]
+; GFX942-GISEL-NEXT:    v_cmp_lt_i16_e64 s[0:1], -1, v4
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
+; GFX942-GISEL-NEXT:    v_cmp_eq_u64_e32 vcc, -1, v[2:3]
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
+; GFX942-GISEL-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX942-GISEL-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
+; GFX942-GISEL-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
+; GFX942-GISEL-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; GFX942-GISEL-NEXT:    s_xor_b64 s[4:5], exec, s[4:5]
+; GFX942-GISEL-NEXT:    s_cbranch_execz .LBB7_7
+; GFX942-GISEL-NEXT:  ; %bb.2: ; %fp-to-i-if-end9
+; GFX942-GISEL-NEXT:    s_xor_b64 s[6:7], s[0:1], -1
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[6:7]
+; GFX942-GISEL-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX942-GISEL-NEXT:    v_lshlrev_b16_e32 v2, 1, v0
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[6:7]
+; GFX942-GISEL-NEXT:    v_lshlrev_b16_e32 v3, 2, v0
+; GFX942-GISEL-NEXT:    v_lshlrev_b16_e32 v5, 3, v0
+; GFX942-GISEL-NEXT:    v_lshlrev_b16_e32 v8, 4, v0
+; GFX942-GISEL-NEXT:    v_lshlrev_b16_e32 v9, 5, v0
+; GFX942-GISEL-NEXT:    v_lshlrev_b16_e32 v10, 6, v0
+; GFX942-GISEL-NEXT:    v_lshlrev_b16_e32 v11, 7, v0
+; GFX942-GISEL-NEXT:    v_lshlrev_b16_e32 v12, 8, v0
+; GFX942-GISEL-NEXT:    v_lshlrev_b16_e32 v13, 9, v0
+; GFX942-GISEL-NEXT:    v_lshlrev_b16_e32 v14, 10, v0
+; GFX942-GISEL-NEXT:    v_lshlrev_b16_e32 v15, 11, v0
+; GFX942-GISEL-NEXT:    v_lshlrev_b16_e32 v16, 12, v0
+; GFX942-GISEL-NEXT:    v_lshlrev_b16_e32 v17, 13, v0
+; GFX942-GISEL-NEXT:    v_lshlrev_b16_e32 v18, 14, v0
+; GFX942-GISEL-NEXT:    v_lshlrev_b16_e32 v19, 15, v0
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v0, v0, v3
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v1, v1, v3
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v0, v0, v5
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v1, v1, v5
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v0, v0, v8
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v1, v1, v8
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v0, v0, v9
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v1, v1, v9
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v0, v0, v10
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v1, v1, v10
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v0, v0, v11
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v1, v1, v11
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v0, v0, v12
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v1, v1, v12
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v0, v0, v13
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v1, v1, v13
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v0, v0, v14
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v1, v1, v14
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v0, v0, v15
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v1, v1, v15
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v0, v0, v16
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v1, v1, v16
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v0, v0, v17
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v1, v1, v17
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v0, v0, v18
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v1, v1, v18
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v0, v0, v19
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v1, v1, v19
+; GFX942-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX942-GISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v0
+; GFX942-GISEL-NEXT:    v_lshl_or_b32 v9, v0, 16, v0
+; GFX942-GISEL-NEXT:    v_and_b32_e32 v0, 0x7f, v4
+; GFX942-GISEL-NEXT:    v_or3_b32 v8, v1, v2, 1
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v4, 0x80, v0
+; GFX942-GISEL-NEXT:    v_mov_b64_e32 v[0:1], 0x86
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v5, 0
+; GFX942-GISEL-NEXT:    v_cmp_ge_u64_e32 vcc, v[6:7], v[0:1]
+; GFX942-GISEL-NEXT:    ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
+; GFX942-GISEL-NEXT:    s_and_saveexec_b64 s[6:7], vcc
+; GFX942-GISEL-NEXT:    s_xor_b64 s[6:7], exec, s[6:7]
+; GFX942-GISEL-NEXT:    s_cbranch_execz .LBB7_4
+; GFX942-GISEL-NEXT:  ; %bb.3: ; %fp-to-i-if-else
+; GFX942-GISEL-NEXT:    v_add_u32_e32 v7, 0xffffff7a, v6
+; GFX942-GISEL-NEXT:    v_add_u32_e32 v6, 0xffffff3a, v6
+; GFX942-GISEL-NEXT:    v_sub_u32_e32 v2, 64, v7
+; GFX942-GISEL-NEXT:    v_lshlrev_b64 v[0:1], v7, v[4:5]
+; GFX942-GISEL-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v7
+; GFX942-GISEL-NEXT:    v_lshrrev_b64 v[2:3], v2, v[4:5]
+; GFX942-GISEL-NEXT:    v_lshlrev_b64 v[4:5], v6, v[4:5]
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e32 v6, 0, v0, vcc
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e32 v10, 0, v1, vcc
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e32 v0, v4, v2, vcc
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e32 v1, v5, v3, vcc
+; GFX942-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v7
+; GFX942-GISEL-NEXT:    v_mad_u64_u32 v[2:3], s[8:9], v6, v9, 0
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v7, v0, 0, vcc
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v11, v1, 0, vcc
+; GFX942-GISEL-NEXT:    v_mad_u64_u32 v[0:1], s[8:9], v6, v8, 0
+; GFX942-GISEL-NEXT:    v_mad_u64_u32 v[4:5], s[8:9], v10, v9, v[2:3]
+; GFX942-GISEL-NEXT:    v_mul_lo_u32 v6, v6, v9
+; GFX942-GISEL-NEXT:    v_mul_lo_u32 v12, v10, v9
+; GFX942-GISEL-NEXT:    v_mad_u64_u32 v[2:3], vcc, v10, v8, v[2:3]
+; GFX942-GISEL-NEXT:    v_mad_u64_u32 v[4:5], s[8:9], v7, v8, v[4:5]
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e32 v6, vcc, v6, v12, vcc
+; GFX942-GISEL-NEXT:    v_add_co_u32_e32 v1, vcc, v1, v2
+; GFX942-GISEL-NEXT:    v_mad_u64_u32 v[6:7], s[8:9], v7, v9, v[6:7]
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e32 v2, vcc, v4, v3, vcc
+; GFX942-GISEL-NEXT:    v_mad_u64_u32 v[6:7], s[8:9], v11, v8, v[6:7]
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_addc_co_u32_e32 v3, vcc, v5, v6, vcc
+; GFX942-GISEL-NEXT:    ; implicit-def: $vgpr6
+; GFX942-GISEL-NEXT:    ; implicit-def: $vgpr4_vgpr5
+; GFX942-GISEL-NEXT:    ; implicit-def: $vgpr8
+; GFX942-GISEL-NEXT:  .LBB7_4: ; %Flow
+; GFX942-GISEL-NEXT:    s_andn2_saveexec_b64 s[6:7], s[6:7]
+; GFX942-GISEL-NEXT:    s_cbranch_execz .LBB7_6
+; GFX942-GISEL-NEXT:  ; %bb.5: ; %fp-to-i-if-then12
+; GFX942-GISEL-NEXT:    v_sub_co_u32_e32 v2, vcc, 0x86, v6
+; GFX942-GISEL-NEXT:    v_lshrrev_b64 v[0:1], v2, v[4:5]
+; GFX942-GISEL-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v2
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
+; GFX942-GISEL-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
+; GFX942-GISEL-NEXT:    v_mul_hi_i32_i24_e32 v1, v0, v8
+; GFX942-GISEL-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
+; GFX942-GISEL-NEXT:    v_mul_i32_i24_e32 v0, v0, v8
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v3, v2
+; GFX942-GISEL-NEXT:  .LBB7_6: ; %Flow1
+; GFX942-GISEL-NEXT:    s_or_b64 exec, exec, s[6:7]
+; GFX942-GISEL-NEXT:  .LBB7_7: ; %Flow2
+; GFX942-GISEL-NEXT:    s_andn2_saveexec_b64 s[4:5], s[4:5]
+; GFX942-GISEL-NEXT:    s_cbranch_execz .LBB7_9
+; GFX942-GISEL-NEXT:  ; %bb.8: ; %fp-to-i-if-then5
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v1, 0, -1, s[0:1]
+; GFX942-GISEL-NEXT:    v_and_b32_e32 v1, 1, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 1, v1
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 2, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 3, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 4, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 5, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v7, 6, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v8, 7, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v9, 8, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v10, 9, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v11, 10, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v12, 11, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v13, 12, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v14, 13, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v15, 14, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v16, 15, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v17, 16, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v18, 17, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v19, 18, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v20, 19, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v21, 20, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v22, 21, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v23, 22, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v24, 23, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v25, 24, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v26, 25, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v27, 26, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v28, 27, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v29, 28, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v30, 29, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v31, 30, v1
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v32, 31, v1
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v1, v1, v2
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v0, v0, v2
+; GFX942-GISEL-NEXT:    v_or3_b32 v1, v1, v3, v4
+; GFX942-GISEL-NEXT:    v_or3_b32 v0, v0, v3, v4
+; GFX942-GISEL-NEXT:    v_or3_b32 v1, v1, v5, v6
+; GFX942-GISEL-NEXT:    v_or3_b32 v0, v0, v5, v6
+; GFX942-GISEL-NEXT:    v_or3_b32 v1, v1, v7, v8
+; GFX942-GISEL-NEXT:    v_or3_b32 v0, v0, v7, v8
+; GFX942-GISEL-NEXT:    v_or3_b32 v1, v1, v9, v10
+; GFX942-GISEL-NEXT:    v_or3_b32 v0, v0, v9, v10
+; GFX942-GISEL-NEXT:    v_or3_b32 v1, v1, v11, v12
+; GFX942-GISEL-NEXT:    v_or3_b32 v0, v0, v11, v12
+; GFX942-GISEL-NEXT:    v_or3_b32 v1, v1, v13, v14
+; GFX942-GISEL-NEXT:    v_or3_b32 v0, v0, v13, v14
+; GFX942-GISEL-NEXT:    v_or3_b32 v1, v1, v15, v16
+; GFX942-GISEL-NEXT:    v_or3_b32 v0, v0, v15, v16
+; GFX942-GISEL-NEXT:    v_or3_b32 v1, v1, v17, v18
+; GFX942-GISEL-NEXT:    v_or3_b32 v0, v0, v17, v18
+; GFX942-GISEL-NEXT:    v_or3_b32 v1, v1, v19, v20
+; GFX942-GISEL-NEXT:    v_or3_b32 v0, v0, v19, v20
+; GFX942-GISEL-NEXT:    v_or3_b32 v1, v1, v21, v22
+; GFX942-GISEL-NEXT:    v_or3_b32 v0, v0, v21, v22
+; GFX942-GISEL-NEXT:    v_or3_b32 v1, v1, v23, v24
+; GFX942-GISEL-NEXT:    v_or3_b32 v0, v0, v23, v24
+; GFX942-GISEL-NEXT:    v_or3_b32 v1, v1, v25, v26
+; GFX942-GISEL-NEXT:    v_or3_b32 v0, v0, v25, v26
+; GFX942-GISEL-NEXT:    v_or3_b32 v1, v1, v27, v28
+; GFX942-GISEL-NEXT:    v_or3_b32 v0, v0, v27, v28
+; GFX942-GISEL-NEXT:    v_or3_b32 v1, v1, v29, v30
+; GFX942-GISEL-NEXT:    v_or3_b32 v0, v0, v29, v30
+; GFX942-GISEL-NEXT:    v_or3_b32 v1, v1, v31, v32
+; GFX942-GISEL-NEXT:    v_or3_b32 v0, v0, v31, v32
+; GFX942-GISEL-NEXT:    v_add_u32_e32 v3, 0x80000000, v1
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v2, v1
+; GFX942-GISEL-NEXT:  .LBB7_9: ; %Flow3
+; GFX942-GISEL-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX942-GISEL-NEXT:  .LBB7_10: ; %fp-to-i-cleanup
+; GFX942-GISEL-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX942-GISEL-NEXT:    s_setpc_b64 s[30:31]
   %cvt = fptoui bfloat %x to i128
   ret i128 %cvt
 }
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
index 37756d15861be..fa0116f263558 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
@@ -1,12 +1,14 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefix=GFX7LESS %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX942 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132 %s
 ; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefix=GFX7LESS-DPP %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX9-DPP %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX942-DPP %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064-DPP %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032-DPP %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164-DPP %s
@@ -84,6 +86,25 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_
 ; GFX9-NEXT:  .LBB0_3:
 ; GFX9-NEXT:    s_endpgm
 ;
+; GFX942-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_mov_b64 s[0:1], exec
+; GFX942-NEXT:    v_mbcnt_lo_u32_b32 v0, s0, 0
+; GFX942-NEXT:    v_mbcnt_hi_u32_b32 v0, s1, v0
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX942-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX942-NEXT:    s_cbranch_execz .LBB0_2
+; GFX942-NEXT:  ; %bb.1:
+; GFX942-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GFX942-NEXT:    s_bcnt1_i32_b64 s0, s[0:1]
+; GFX942-NEXT:    v_cvt_f32_ubyte0_e32 v1, s0
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    v_mul_f32_e32 v1, 4.0, v1
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    global_atomic_add_f32 v0, v1, s[2:3]
+; GFX942-NEXT:  .LBB0_2:
+; GFX942-NEXT:    s_endpgm
+;
 ; GFX1064-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe:
 ; GFX1064:       ; %bb.0:
 ; GFX1064-NEXT:    s_mov_b64 s[2:3], exec
@@ -255,6 +276,25 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_
 ; GFX9-DPP-NEXT:  .LBB0_3:
 ; GFX9-DPP-NEXT:    s_endpgm
 ;
+; GFX942-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe:
+; GFX942-DPP:       ; %bb.0:
+; GFX942-DPP-NEXT:    s_mov_b64 s[0:1], exec
+; GFX942-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, s0, 0
+; GFX942-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, s1, v0
+; GFX942-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX942-DPP-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX942-DPP-NEXT:    s_cbranch_execz .LBB0_2
+; GFX942-DPP-NEXT:  ; %bb.1:
+; GFX942-DPP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GFX942-DPP-NEXT:    s_bcnt1_i32_b64 s0, s[0:1]
+; GFX942-DPP-NEXT:    v_cvt_f32_ubyte0_e32 v1, s0
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-DPP-NEXT:    v_mul_f32_e32 v1, 4.0, v1
+; GFX942-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-DPP-NEXT:    global_atomic_add_f32 v0, v1, s[2:3]
+; GFX942-DPP-NEXT:  .LBB0_2:
+; GFX942-DPP-NEXT:    s_endpgm
+;
 ; GFX1064-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe:
 ; GFX1064-DPP:       ; %bb.0:
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], exec
@@ -502,6 +542,51 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ; GFX9-NEXT:  .LBB1_5:
 ; GFX9-NEXT:    s_endpgm
 ;
+; GFX942-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_align4_unsafe:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX942-NEXT:    s_mov_b32 s12, s8
+; GFX942-NEXT:    s_add_u32 s8, s34, 44
+; GFX942-NEXT:    s_mov_b32 s13, s9
+; GFX942-NEXT:    s_addc_u32 s9, s35, 0
+; GFX942-NEXT:    s_getpc_b64 s[4:5]
+; GFX942-NEXT:    s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
+; GFX942-NEXT:    s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
+; GFX942-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX942-NEXT:    s_mov_b32 s14, s10
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX942-NEXT:    v_mov_b32_e32 v31, v0
+; GFX942-NEXT:    s_mov_b32 s32, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX942-NEXT:    s_mov_b64 s[0:1], exec
+; GFX942-NEXT:    v_bfrev_b32_e32 v1, 1
+; GFX942-NEXT:  .LBB1_1: ; %ComputeLoop
+; GFX942-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-NEXT:    s_ff1_i32_b64 s2, s[0:1]
+; GFX942-NEXT:    v_readlane_b32 s4, v0, s2
+; GFX942-NEXT:    s_lshl_b64 s[2:3], 1, s2
+; GFX942-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX942-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX942-NEXT:    v_add_f32_e32 v1, s4, v1
+; GFX942-NEXT:    s_cbranch_scc1 .LBB1_1
+; GFX942-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX942-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX942-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX942-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
+; GFX942-NEXT:    s_cbranch_execz .LBB1_4
+; GFX942-NEXT:  ; %bb.3:
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    global_atomic_add_f32 v0, v1, s[0:1]
+; GFX942-NEXT:  .LBB1_4:
+; GFX942-NEXT:    s_endpgm
+;
 ; GFX1064-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_align4_unsafe:
 ; GFX1064:       ; %bb.0:
 ; GFX1064-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
@@ -863,6 +948,69 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ; GFX9-DPP-NEXT:  .LBB1_3:
 ; GFX9-DPP-NEXT:    s_endpgm
 ;
+; GFX942-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_align4_unsafe:
+; GFX942-DPP:       ; %bb.0:
+; GFX942-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX942-DPP-NEXT:    s_mov_b32 s12, s8
+; GFX942-DPP-NEXT:    s_add_u32 s8, s34, 44
+; GFX942-DPP-NEXT:    s_mov_b32 s13, s9
+; GFX942-DPP-NEXT:    s_addc_u32 s9, s35, 0
+; GFX942-DPP-NEXT:    s_getpc_b64 s[4:5]
+; GFX942-DPP-NEXT:    s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
+; GFX942-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
+; GFX942-DPP-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX942-DPP-NEXT:    s_mov_b32 s14, s10
+; GFX942-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX942-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v31, v0
+; GFX942-DPP-NEXT:    s_mov_b32 s32, 0
+; GFX942-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX942-DPP-NEXT:    v_mbcnt_lo_u32_b32 v4, exec_lo, 0
+; GFX942-DPP-NEXT:    v_mbcnt_hi_u32_b32 v4, exec_hi, v4
+; GFX942-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX942-DPP-NEXT:    v_bfrev_b32_e32 v1, 1
+; GFX942-DPP-NEXT:    v_cndmask_b32_e64 v2, v1, v0, s[0:1]
+; GFX942-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX942-DPP-NEXT:    s_nop 1
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v3, v2 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_add_f32_e32 v2, v2, v3
+; GFX942-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX942-DPP-NEXT:    s_nop 1
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v3, v2 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_add_f32_e32 v2, v2, v3
+; GFX942-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX942-DPP-NEXT:    s_nop 1
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v3, v2 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_add_f32_e32 v2, v2, v3
+; GFX942-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX942-DPP-NEXT:    s_nop 1
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v3, v2 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_add_f32_e32 v2, v2, v3
+; GFX942-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX942-DPP-NEXT:    s_nop 1
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v3, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX942-DPP-NEXT:    v_add_f32_e32 v2, v2, v3
+; GFX942-DPP-NEXT:    s_nop 1
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX942-DPP-NEXT:    v_add_f32_e32 v1, v2, v1
+; GFX942-DPP-NEXT:    s_nop 0
+; GFX942-DPP-NEXT:    v_readlane_b32 s2, v1, 63
+; GFX942-DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX942-DPP-NEXT:    s_mov_b32 s0, s2
+; GFX942-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
+; GFX942-DPP-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX942-DPP-NEXT:    s_cbranch_execz .LBB1_2
+; GFX942-DPP-NEXT:  ; %bb.1:
+; GFX942-DPP-NEXT:    s_load_dwordx2 s[2:3], s[34:35], 0x24
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v4, s0
+; GFX942-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-DPP-NEXT:    global_atomic_add_f32 v0, v4, s[2:3]
+; GFX942-DPP-NEXT:  .LBB1_2:
+; GFX942-DPP-NEXT:    s_endpgm
+;
 ; GFX1064-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_align4_unsafe:
 ; GFX1064-DPP:       ; %bb.0:
 ; GFX1064-DPP-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
@@ -1223,6 +1371,33 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope
 ; GFX9-NEXT:  .LBB2_3:
 ; GFX9-NEXT:    s_endpgm
 ;
+; GFX942-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_bcnt1_i32_b64 s0, exec
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0x43300000
+; GFX942-NEXT:    scratch_store_dword off, v0, off offset:4
+; GFX942-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-NEXT:    scratch_store_dword off, v0, off
+; GFX942-NEXT:    scratch_load_dwordx2 v[0:1], off, off
+; GFX942-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX942-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v2
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX942-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942-NEXT:    s_cbranch_execz .LBB2_2
+; GFX942-NEXT:  ; %bb.1:
+; GFX942-NEXT:    s_mov_b32 s0, 0
+; GFX942-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GFX942-NEXT:    s_mov_b32 s1, 0xc3300000
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    v_add_f64 v[0:1], v[0:1], s[0:1]
+; GFX942-NEXT:    v_cvt_f32_f64_e32 v0, v[0:1]
+; GFX942-NEXT:    v_mul_f32_e32 v0, 4.0, v0
+; GFX942-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    global_atomic_add_f32 v1, v0, s[2:3] sc1
+; GFX942-NEXT:  .LBB2_2:
+; GFX942-NEXT:    s_endpgm
+;
 ; GFX1064-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp:
 ; GFX1064:       ; %bb.0:
 ; GFX1064-NEXT:    s_mov_b64 s[0:1], exec
@@ -1443,6 +1618,33 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_one_as_scope
 ; GFX9-DPP-NEXT:  .LBB2_3:
 ; GFX9-DPP-NEXT:    s_endpgm
 ;
+; GFX942-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp:
+; GFX942-DPP:       ; %bb.0:
+; GFX942-DPP-NEXT:    s_bcnt1_i32_b64 s0, exec
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v0, 0x43300000
+; GFX942-DPP-NEXT:    scratch_store_dword off, v0, off offset:4
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-DPP-NEXT:    scratch_store_dword off, v0, off
+; GFX942-DPP-NEXT:    scratch_load_dwordx2 v[0:1], off, off
+; GFX942-DPP-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX942-DPP-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v2
+; GFX942-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX942-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942-DPP-NEXT:    s_cbranch_execz .LBB2_2
+; GFX942-DPP-NEXT:  ; %bb.1:
+; GFX942-DPP-NEXT:    s_mov_b32 s0, 0
+; GFX942-DPP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GFX942-DPP-NEXT:    s_mov_b32 s1, 0xc3300000
+; GFX942-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-DPP-NEXT:    v_add_f64 v[0:1], v[0:1], s[0:1]
+; GFX942-DPP-NEXT:    v_cvt_f32_f64_e32 v0, v[0:1]
+; GFX942-DPP-NEXT:    v_mul_f32_e32 v0, 4.0, v0
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-DPP-NEXT:    global_atomic_add_f32 v1, v0, s[2:3] sc1
+; GFX942-DPP-NEXT:  .LBB2_2:
+; GFX942-DPP-NEXT:    s_endpgm
+;
 ; GFX1064-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_one_as_scope_unsafe_structfp:
 ; GFX1064-DPP:       ; %bb.0:
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[0:1], exec
@@ -1720,6 +1922,51 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope
 ; GFX9-NEXT:  .LBB3_5:
 ; GFX9-NEXT:    s_endpgm
 ;
+; GFX942-LABEL: global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_structfp:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX942-NEXT:    s_mov_b32 s12, s8
+; GFX942-NEXT:    s_add_u32 s8, s34, 44
+; GFX942-NEXT:    s_mov_b32 s13, s9
+; GFX942-NEXT:    s_addc_u32 s9, s35, 0
+; GFX942-NEXT:    s_getpc_b64 s[4:5]
+; GFX942-NEXT:    s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
+; GFX942-NEXT:    s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
+; GFX942-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX942-NEXT:    s_mov_b32 s14, s10
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX942-NEXT:    v_mov_b32_e32 v31, v0
+; GFX942-NEXT:    s_mov_b32 s32, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX942-NEXT:    s_mov_b64 s[0:1], exec
+; GFX942-NEXT:    v_bfrev_b32_e32 v1, 1
+; GFX942-NEXT:  .LBB3_1: ; %ComputeLoop
+; GFX942-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-NEXT:    s_ff1_i32_b64 s2, s[0:1]
+; GFX942-NEXT:    v_readlane_b32 s4, v0, s2
+; GFX942-NEXT:    s_lshl_b64 s[2:3], 1, s2
+; GFX942-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX942-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX942-NEXT:    v_add_f32_e32 v1, s4, v1
+; GFX942-NEXT:    s_cbranch_scc1 .LBB3_1
+; GFX942-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX942-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX942-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX942-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
+; GFX942-NEXT:    s_cbranch_execz .LBB3_4
+; GFX942-NEXT:  ; %bb.3:
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    global_atomic_add_f32 v0, v1, s[0:1] sc1
+; GFX942-NEXT:  .LBB3_4:
+; GFX942-NEXT:    s_endpgm
+;
 ; GFX1064-LABEL: global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_structfp:
 ; GFX1064:       ; %bb.0:
 ; GFX1064-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
@@ -2081,6 +2328,69 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_one_as_scope
 ; GFX9-DPP-NEXT:  .LBB3_3:
 ; GFX9-DPP-NEXT:    s_endpgm
 ;
+; GFX942-DPP-LABEL: global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_structfp:
+; GFX942-DPP:       ; %bb.0:
+; GFX942-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX942-DPP-NEXT:    s_mov_b32 s12, s8
+; GFX942-DPP-NEXT:    s_add_u32 s8, s34, 44
+; GFX942-DPP-NEXT:    s_mov_b32 s13, s9
+; GFX942-DPP-NEXT:    s_addc_u32 s9, s35, 0
+; GFX942-DPP-NEXT:    s_getpc_b64 s[4:5]
+; GFX942-DPP-NEXT:    s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
+; GFX942-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
+; GFX942-DPP-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX942-DPP-NEXT:    s_mov_b32 s14, s10
+; GFX942-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX942-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v31, v0
+; GFX942-DPP-NEXT:    s_mov_b32 s32, 0
+; GFX942-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX942-DPP-NEXT:    v_mbcnt_lo_u32_b32 v4, exec_lo, 0
+; GFX942-DPP-NEXT:    v_mbcnt_hi_u32_b32 v4, exec_hi, v4
+; GFX942-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX942-DPP-NEXT:    v_bfrev_b32_e32 v1, 1
+; GFX942-DPP-NEXT:    v_cndmask_b32_e64 v2, v1, v0, s[0:1]
+; GFX942-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX942-DPP-NEXT:    s_nop 1
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v3, v2 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_add_f32_e32 v2, v2, v3
+; GFX942-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX942-DPP-NEXT:    s_nop 1
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v3, v2 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_add_f32_e32 v2, v2, v3
+; GFX942-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX942-DPP-NEXT:    s_nop 1
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v3, v2 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_add_f32_e32 v2, v2, v3
+; GFX942-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX942-DPP-NEXT:    s_nop 1
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v3, v2 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_add_f32_e32 v2, v2, v3
+; GFX942-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX942-DPP-NEXT:    s_nop 1
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v3, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX942-DPP-NEXT:    v_add_f32_e32 v2, v2, v3
+; GFX942-DPP-NEXT:    s_nop 1
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX942-DPP-NEXT:    v_add_f32_e32 v1, v2, v1
+; GFX942-DPP-NEXT:    s_nop 0
+; GFX942-DPP-NEXT:    v_readlane_b32 s2, v1, 63
+; GFX942-DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX942-DPP-NEXT:    s_mov_b32 s0, s2
+; GFX942-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
+; GFX942-DPP-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX942-DPP-NEXT:    s_cbranch_execz .LBB3_2
+; GFX942-DPP-NEXT:  ; %bb.1:
+; GFX942-DPP-NEXT:    s_load_dwordx2 s[2:3], s[34:35], 0x24
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v4, s0
+; GFX942-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-DPP-NEXT:    global_atomic_add_f32 v0, v4, s[2:3] sc1
+; GFX942-DPP-NEXT:  .LBB3_2:
+; GFX942-DPP-NEXT:    s_endpgm
+;
 ; GFX1064-DPP-LABEL: global_atomic_fadd_uni_address_div_value_one_as_scope_unsafe_structfp:
 ; GFX1064-DPP:       ; %bb.0:
 ; GFX1064-DPP-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
@@ -2441,6 +2751,33 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_
 ; GFX9-NEXT:  .LBB4_3:
 ; GFX9-NEXT:    s_endpgm
 ;
+; GFX942-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_strictfp:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_bcnt1_i32_b64 s0, exec
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0x43300000
+; GFX942-NEXT:    scratch_store_dword off, v0, off offset:4
+; GFX942-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-NEXT:    scratch_store_dword off, v0, off
+; GFX942-NEXT:    scratch_load_dwordx2 v[0:1], off, off
+; GFX942-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX942-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v2
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX942-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942-NEXT:    s_cbranch_execz .LBB4_2
+; GFX942-NEXT:  ; %bb.1:
+; GFX942-NEXT:    s_mov_b32 s0, 0
+; GFX942-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GFX942-NEXT:    s_mov_b32 s1, 0xc3300000
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    v_add_f64 v[0:1], v[0:1], s[0:1]
+; GFX942-NEXT:    v_cvt_f32_f64_e32 v0, v[0:1]
+; GFX942-NEXT:    v_mul_f32_e32 v0, 4.0, v0
+; GFX942-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    global_atomic_add_f32 v1, v0, s[2:3]
+; GFX942-NEXT:  .LBB4_2:
+; GFX942-NEXT:    s_endpgm
+;
 ; GFX1064-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_strictfp:
 ; GFX1064:       ; %bb.0:
 ; GFX1064-NEXT:    s_mov_b64 s[0:1], exec
@@ -2691,6 +3028,33 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_
 ; GFX9-DPP-NEXT:  .LBB4_3:
 ; GFX9-DPP-NEXT:    s_endpgm
 ;
+; GFX942-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_strictfp:
+; GFX942-DPP:       ; %bb.0:
+; GFX942-DPP-NEXT:    s_bcnt1_i32_b64 s0, exec
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v0, 0x43300000
+; GFX942-DPP-NEXT:    scratch_store_dword off, v0, off offset:4
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-DPP-NEXT:    scratch_store_dword off, v0, off
+; GFX942-DPP-NEXT:    scratch_load_dwordx2 v[0:1], off, off
+; GFX942-DPP-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX942-DPP-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v2
+; GFX942-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX942-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942-DPP-NEXT:    s_cbranch_execz .LBB4_2
+; GFX942-DPP-NEXT:  ; %bb.1:
+; GFX942-DPP-NEXT:    s_mov_b32 s0, 0
+; GFX942-DPP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GFX942-DPP-NEXT:    s_mov_b32 s1, 0xc3300000
+; GFX942-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-DPP-NEXT:    v_add_f64 v[0:1], v[0:1], s[0:1]
+; GFX942-DPP-NEXT:    v_cvt_f32_f64_e32 v0, v[0:1]
+; GFX942-DPP-NEXT:    v_mul_f32_e32 v0, 4.0, v0
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-DPP-NEXT:    global_atomic_add_f32 v1, v0, s[2:3]
+; GFX942-DPP-NEXT:  .LBB4_2:
+; GFX942-DPP-NEXT:    s_endpgm
+;
 ; GFX1064-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_strictfp:
 ; GFX1064-DPP:       ; %bb.0:
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[0:1], exec
@@ -2998,6 +3362,51 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ; GFX9-NEXT:  .LBB5_5:
 ; GFX9-NEXT:    s_endpgm
 ;
+; GFX942-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX942-NEXT:    s_mov_b32 s12, s8
+; GFX942-NEXT:    s_add_u32 s8, s34, 44
+; GFX942-NEXT:    s_mov_b32 s13, s9
+; GFX942-NEXT:    s_addc_u32 s9, s35, 0
+; GFX942-NEXT:    s_getpc_b64 s[4:5]
+; GFX942-NEXT:    s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
+; GFX942-NEXT:    s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
+; GFX942-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX942-NEXT:    s_mov_b32 s14, s10
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX942-NEXT:    v_mov_b32_e32 v31, v0
+; GFX942-NEXT:    s_mov_b32 s32, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX942-NEXT:    s_mov_b64 s[0:1], exec
+; GFX942-NEXT:    v_bfrev_b32_e32 v1, 1
+; GFX942-NEXT:  .LBB5_1: ; %ComputeLoop
+; GFX942-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-NEXT:    s_ff1_i32_b64 s2, s[0:1]
+; GFX942-NEXT:    v_readlane_b32 s4, v0, s2
+; GFX942-NEXT:    s_lshl_b64 s[2:3], 1, s2
+; GFX942-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX942-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX942-NEXT:    v_add_f32_e32 v1, s4, v1
+; GFX942-NEXT:    s_cbranch_scc1 .LBB5_1
+; GFX942-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX942-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX942-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX942-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
+; GFX942-NEXT:    s_cbranch_execz .LBB5_4
+; GFX942-NEXT:  ; %bb.3:
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    global_atomic_add_f32 v0, v1, s[0:1]
+; GFX942-NEXT:  .LBB5_4:
+; GFX942-NEXT:    s_endpgm
+;
 ; GFX1064-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe:
 ; GFX1064:       ; %bb.0:
 ; GFX1064-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
@@ -3359,6 +3768,69 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ; GFX9-DPP-NEXT:  .LBB5_3:
 ; GFX9-DPP-NEXT:    s_endpgm
 ;
+; GFX942-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe:
+; GFX942-DPP:       ; %bb.0:
+; GFX942-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX942-DPP-NEXT:    s_mov_b32 s12, s8
+; GFX942-DPP-NEXT:    s_add_u32 s8, s34, 44
+; GFX942-DPP-NEXT:    s_mov_b32 s13, s9
+; GFX942-DPP-NEXT:    s_addc_u32 s9, s35, 0
+; GFX942-DPP-NEXT:    s_getpc_b64 s[4:5]
+; GFX942-DPP-NEXT:    s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
+; GFX942-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
+; GFX942-DPP-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX942-DPP-NEXT:    s_mov_b32 s14, s10
+; GFX942-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX942-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v31, v0
+; GFX942-DPP-NEXT:    s_mov_b32 s32, 0
+; GFX942-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX942-DPP-NEXT:    v_mbcnt_lo_u32_b32 v4, exec_lo, 0
+; GFX942-DPP-NEXT:    v_mbcnt_hi_u32_b32 v4, exec_hi, v4
+; GFX942-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX942-DPP-NEXT:    v_bfrev_b32_e32 v1, 1
+; GFX942-DPP-NEXT:    v_cndmask_b32_e64 v2, v1, v0, s[0:1]
+; GFX942-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX942-DPP-NEXT:    s_nop 1
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v3, v2 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_add_f32_e32 v2, v2, v3
+; GFX942-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX942-DPP-NEXT:    s_nop 1
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v3, v2 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_add_f32_e32 v2, v2, v3
+; GFX942-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX942-DPP-NEXT:    s_nop 1
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v3, v2 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_add_f32_e32 v2, v2, v3
+; GFX942-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX942-DPP-NEXT:    s_nop 1
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v3, v2 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_add_f32_e32 v2, v2, v3
+; GFX942-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX942-DPP-NEXT:    s_nop 1
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v3, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX942-DPP-NEXT:    v_add_f32_e32 v2, v2, v3
+; GFX942-DPP-NEXT:    s_nop 1
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX942-DPP-NEXT:    v_add_f32_e32 v1, v2, v1
+; GFX942-DPP-NEXT:    s_nop 0
+; GFX942-DPP-NEXT:    v_readlane_b32 s2, v1, 63
+; GFX942-DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX942-DPP-NEXT:    s_mov_b32 s0, s2
+; GFX942-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
+; GFX942-DPP-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX942-DPP-NEXT:    s_cbranch_execz .LBB5_2
+; GFX942-DPP-NEXT:  ; %bb.1:
+; GFX942-DPP-NEXT:    s_load_dwordx2 s[2:3], s[34:35], 0x24
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v4, s0
+; GFX942-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-DPP-NEXT:    global_atomic_add_f32 v0, v4, s[2:3]
+; GFX942-DPP-NEXT:  .LBB5_2:
+; GFX942-DPP-NEXT:    s_endpgm
+;
 ; GFX1064-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe:
 ; GFX1064-DPP:       ; %bb.0:
 ; GFX1064-DPP-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
@@ -3772,6 +4244,51 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ; GFX9-NEXT:  .LBB6_5:
 ; GFX9-NEXT:    s_endpgm
 ;
+; GFX942-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_structfp:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX942-NEXT:    s_mov_b32 s12, s8
+; GFX942-NEXT:    s_add_u32 s8, s34, 44
+; GFX942-NEXT:    s_mov_b32 s13, s9
+; GFX942-NEXT:    s_addc_u32 s9, s35, 0
+; GFX942-NEXT:    s_getpc_b64 s[4:5]
+; GFX942-NEXT:    s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
+; GFX942-NEXT:    s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
+; GFX942-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX942-NEXT:    s_mov_b32 s14, s10
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX942-NEXT:    v_mov_b32_e32 v31, v0
+; GFX942-NEXT:    s_mov_b32 s32, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX942-NEXT:    s_mov_b64 s[0:1], exec
+; GFX942-NEXT:    v_bfrev_b32_e32 v1, 1
+; GFX942-NEXT:  .LBB6_1: ; %ComputeLoop
+; GFX942-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-NEXT:    s_ff1_i32_b64 s2, s[0:1]
+; GFX942-NEXT:    v_readlane_b32 s4, v0, s2
+; GFX942-NEXT:    s_lshl_b64 s[2:3], 1, s2
+; GFX942-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX942-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX942-NEXT:    v_add_f32_e32 v1, s4, v1
+; GFX942-NEXT:    s_cbranch_scc1 .LBB6_1
+; GFX942-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX942-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX942-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX942-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
+; GFX942-NEXT:    s_cbranch_execz .LBB6_4
+; GFX942-NEXT:  ; %bb.3:
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    global_atomic_add_f32 v0, v1, s[0:1]
+; GFX942-NEXT:  .LBB6_4:
+; GFX942-NEXT:    s_endpgm
+;
 ; GFX1064-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_structfp:
 ; GFX1064:       ; %bb.0:
 ; GFX1064-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
@@ -4133,6 +4650,69 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_agent_scope_
 ; GFX9-DPP-NEXT:  .LBB6_3:
 ; GFX9-DPP-NEXT:    s_endpgm
 ;
+; GFX942-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_structfp:
+; GFX942-DPP:       ; %bb.0:
+; GFX942-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX942-DPP-NEXT:    s_mov_b32 s12, s8
+; GFX942-DPP-NEXT:    s_add_u32 s8, s34, 44
+; GFX942-DPP-NEXT:    s_mov_b32 s13, s9
+; GFX942-DPP-NEXT:    s_addc_u32 s9, s35, 0
+; GFX942-DPP-NEXT:    s_getpc_b64 s[4:5]
+; GFX942-DPP-NEXT:    s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
+; GFX942-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
+; GFX942-DPP-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX942-DPP-NEXT:    s_mov_b32 s14, s10
+; GFX942-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX942-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v31, v0
+; GFX942-DPP-NEXT:    s_mov_b32 s32, 0
+; GFX942-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX942-DPP-NEXT:    v_mbcnt_lo_u32_b32 v4, exec_lo, 0
+; GFX942-DPP-NEXT:    v_mbcnt_hi_u32_b32 v4, exec_hi, v4
+; GFX942-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX942-DPP-NEXT:    v_bfrev_b32_e32 v1, 1
+; GFX942-DPP-NEXT:    v_cndmask_b32_e64 v2, v1, v0, s[0:1]
+; GFX942-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX942-DPP-NEXT:    s_nop 1
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v3, v2 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_add_f32_e32 v2, v2, v3
+; GFX942-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX942-DPP-NEXT:    s_nop 1
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v3, v2 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_add_f32_e32 v2, v2, v3
+; GFX942-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX942-DPP-NEXT:    s_nop 1
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v3, v2 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_add_f32_e32 v2, v2, v3
+; GFX942-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX942-DPP-NEXT:    s_nop 1
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v3, v2 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_add_f32_e32 v2, v2, v3
+; GFX942-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX942-DPP-NEXT:    s_nop 1
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v3, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX942-DPP-NEXT:    v_add_f32_e32 v2, v2, v3
+; GFX942-DPP-NEXT:    s_nop 1
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX942-DPP-NEXT:    v_add_f32_e32 v1, v2, v1
+; GFX942-DPP-NEXT:    s_nop 0
+; GFX942-DPP-NEXT:    v_readlane_b32 s2, v1, 63
+; GFX942-DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX942-DPP-NEXT:    s_mov_b32 s0, s2
+; GFX942-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
+; GFX942-DPP-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX942-DPP-NEXT:    s_cbranch_execz .LBB6_2
+; GFX942-DPP-NEXT:  ; %bb.1:
+; GFX942-DPP-NEXT:    s_load_dwordx2 s[2:3], s[34:35], 0x24
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v4, s0
+; GFX942-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-DPP-NEXT:    global_atomic_add_f32 v0, v4, s[2:3]
+; GFX942-DPP-NEXT:  .LBB6_2:
+; GFX942-DPP-NEXT:    s_endpgm
+;
 ; GFX1064-DPP-LABEL: global_atomic_fadd_uni_address_div_value_agent_scope_unsafe_structfp:
 ; GFX1064-DPP:       ; %bb.0:
 ; GFX1064-DPP-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
@@ -4493,6 +5073,46 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop
 ; GFX9-NEXT:  .LBB7_3:
 ; GFX9-NEXT:    s_endpgm
 ;
+; GFX942-LABEL: global_atomic_fadd_uni_address_uni_value_default_scope_strictfp:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_bcnt1_i32_b64 s0, exec
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0x43300000
+; GFX942-NEXT:    scratch_store_dword off, v0, off offset:4
+; GFX942-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-NEXT:    scratch_store_dword off, v0, off
+; GFX942-NEXT:    scratch_load_dwordx2 v[0:1], off, off
+; GFX942-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX942-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v2
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX942-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942-NEXT:    s_cbranch_execz .LBB7_3
+; GFX942-NEXT:  ; %bb.1:
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    s_mov_b32 s4, 0
+; GFX942-NEXT:    s_mov_b32 s5, 0xc3300000
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    v_add_f64 v[0:1], v[0:1], s[4:5]
+; GFX942-NEXT:    v_cvt_f32_f64_e32 v0, v[0:1]
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_load_dword s6, s[0:1], 0x0
+; GFX942-NEXT:    s_mov_b64 s[2:3], 0
+; GFX942-NEXT:    v_mul_f32_e32 v2, 4.0, v0
+; GFX942-NEXT:    v_mov_b32_e32 v3, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b32_e32 v1, s6
+; GFX942-NEXT:  .LBB7_2: ; %atomicrmw.start
+; GFX942-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-NEXT:    v_add_f32_e32 v0, v1, v2
+; GFX942-NEXT:    global_atomic_cmpswap v0, v3, v[0:1], s[0:1] sc0 sc1
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX942-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX942-NEXT:    v_mov_b32_e32 v1, v0
+; GFX942-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX942-NEXT:    s_cbranch_execnz .LBB7_2
+; GFX942-NEXT:  .LBB7_3:
+; GFX942-NEXT:    s_endpgm
+;
 ; GFX1064-LABEL: global_atomic_fadd_uni_address_uni_value_default_scope_strictfp:
 ; GFX1064:       ; %bb.0:
 ; GFX1064-NEXT:    s_mov_b64 s[0:1], exec
@@ -4743,6 +5363,46 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_default_scop
 ; GFX9-DPP-NEXT:  .LBB7_3:
 ; GFX9-DPP-NEXT:    s_endpgm
 ;
+; GFX942-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_default_scope_strictfp:
+; GFX942-DPP:       ; %bb.0:
+; GFX942-DPP-NEXT:    s_bcnt1_i32_b64 s0, exec
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v0, 0x43300000
+; GFX942-DPP-NEXT:    scratch_store_dword off, v0, off offset:4
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-DPP-NEXT:    scratch_store_dword off, v0, off
+; GFX942-DPP-NEXT:    scratch_load_dwordx2 v[0:1], off, off
+; GFX942-DPP-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX942-DPP-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v2
+; GFX942-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX942-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942-DPP-NEXT:    s_cbranch_execz .LBB7_3
+; GFX942-DPP-NEXT:  ; %bb.1:
+; GFX942-DPP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-DPP-NEXT:    s_mov_b32 s4, 0
+; GFX942-DPP-NEXT:    s_mov_b32 s5, 0xc3300000
+; GFX942-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-DPP-NEXT:    v_add_f64 v[0:1], v[0:1], s[4:5]
+; GFX942-DPP-NEXT:    v_cvt_f32_f64_e32 v0, v[0:1]
+; GFX942-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-DPP-NEXT:    s_load_dword s6, s[0:1], 0x0
+; GFX942-DPP-NEXT:    s_mov_b64 s[2:3], 0
+; GFX942-DPP-NEXT:    v_mul_f32_e32 v2, 4.0, v0
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v3, 0
+; GFX942-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX942-DPP-NEXT:  .LBB7_2: ; %atomicrmw.start
+; GFX942-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-DPP-NEXT:    v_add_f32_e32 v0, v1, v2
+; GFX942-DPP-NEXT:    global_atomic_cmpswap v0, v3, v[0:1], s[0:1] sc0 sc1
+; GFX942-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX942-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX942-DPP-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX942-DPP-NEXT:    s_cbranch_execnz .LBB7_2
+; GFX942-DPP-NEXT:  .LBB7_3:
+; GFX942-DPP-NEXT:    s_endpgm
+;
 ; GFX1064-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_default_scope_strictfp:
 ; GFX1064-DPP:       ; %bb.0:
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[0:1], exec
@@ -5049,6 +5709,63 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop
 ; GFX9-NEXT:  .LBB8_5:
 ; GFX9-NEXT:    s_endpgm
 ;
+; GFX942-LABEL: global_atomic_fadd_uni_address_div_value_default_scope_strictfp:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX942-NEXT:    s_mov_b32 s12, s8
+; GFX942-NEXT:    s_add_u32 s8, s34, 44
+; GFX942-NEXT:    s_mov_b32 s13, s9
+; GFX942-NEXT:    s_addc_u32 s9, s35, 0
+; GFX942-NEXT:    s_getpc_b64 s[4:5]
+; GFX942-NEXT:    s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
+; GFX942-NEXT:    s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
+; GFX942-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX942-NEXT:    s_mov_b32 s14, s10
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX942-NEXT:    v_mov_b32_e32 v31, v0
+; GFX942-NEXT:    s_mov_b32 s32, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX942-NEXT:    s_mov_b64 s[0:1], exec
+; GFX942-NEXT:    v_bfrev_b32_e32 v2, 1
+; GFX942-NEXT:  .LBB8_1: ; %ComputeLoop
+; GFX942-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-NEXT:    s_ff1_i32_b64 s2, s[0:1]
+; GFX942-NEXT:    v_readlane_b32 s4, v0, s2
+; GFX942-NEXT:    s_lshl_b64 s[2:3], 1, s2
+; GFX942-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX942-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX942-NEXT:    v_add_f32_e32 v2, s4, v2
+; GFX942-NEXT:    s_cbranch_scc1 .LBB8_1
+; GFX942-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX942-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX942-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX942-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
+; GFX942-NEXT:    s_cbranch_execz .LBB8_5
+; GFX942-NEXT:  ; %bb.3:
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v3, 0
+; GFX942-NEXT:    s_mov_b64 s[2:3], 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    global_load_dword v1, v3, s[0:1]
+; GFX942-NEXT:  .LBB8_4: ; %atomicrmw.start
+; GFX942-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    v_add_f32_e32 v0, v1, v2
+; GFX942-NEXT:    global_atomic_cmpswap v0, v3, v[0:1], s[0:1] sc0 sc1
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX942-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX942-NEXT:    v_mov_b32_e32 v1, v0
+; GFX942-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX942-NEXT:    s_cbranch_execnz .LBB8_4
+; GFX942-NEXT:  .LBB8_5:
+; GFX942-NEXT:    s_endpgm
+;
 ; GFX1064-LABEL: global_atomic_fadd_uni_address_div_value_default_scope_strictfp:
 ; GFX1064:       ; %bb.0:
 ; GFX1064-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
@@ -5436,6 +6153,79 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_default_scop
 ; GFX9-DPP-NEXT:  .LBB8_3:
 ; GFX9-DPP-NEXT:    s_endpgm
 ;
+; GFX942-DPP-LABEL: global_atomic_fadd_uni_address_div_value_default_scope_strictfp:
+; GFX942-DPP:       ; %bb.0:
+; GFX942-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX942-DPP-NEXT:    s_mov_b32 s12, s8
+; GFX942-DPP-NEXT:    s_add_u32 s8, s34, 44
+; GFX942-DPP-NEXT:    s_mov_b32 s13, s9
+; GFX942-DPP-NEXT:    s_addc_u32 s9, s35, 0
+; GFX942-DPP-NEXT:    s_getpc_b64 s[4:5]
+; GFX942-DPP-NEXT:    s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
+; GFX942-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
+; GFX942-DPP-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX942-DPP-NEXT:    s_mov_b32 s14, s10
+; GFX942-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX942-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v31, v0
+; GFX942-DPP-NEXT:    s_mov_b32 s32, 0
+; GFX942-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX942-DPP-NEXT:    v_mbcnt_lo_u32_b32 v4, exec_lo, 0
+; GFX942-DPP-NEXT:    v_mbcnt_hi_u32_b32 v4, exec_hi, v4
+; GFX942-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX942-DPP-NEXT:    v_bfrev_b32_e32 v1, 1
+; GFX942-DPP-NEXT:    v_cndmask_b32_e64 v2, v1, v0, s[0:1]
+; GFX942-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX942-DPP-NEXT:    s_nop 1
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v3, v2 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_add_f32_e32 v2, v2, v3
+; GFX942-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX942-DPP-NEXT:    s_nop 1
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v3, v2 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_add_f32_e32 v2, v2, v3
+; GFX942-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX942-DPP-NEXT:    s_nop 1
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v3, v2 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_add_f32_e32 v2, v2, v3
+; GFX942-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX942-DPP-NEXT:    s_nop 1
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v3, v2 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_add_f32_e32 v2, v2, v3
+; GFX942-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX942-DPP-NEXT:    s_nop 1
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v3, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX942-DPP-NEXT:    v_add_f32_e32 v2, v2, v3
+; GFX942-DPP-NEXT:    s_nop 1
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX942-DPP-NEXT:    v_add_f32_e32 v1, v2, v1
+; GFX942-DPP-NEXT:    s_nop 0
+; GFX942-DPP-NEXT:    v_readlane_b32 s4, v1, 63
+; GFX942-DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX942-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
+; GFX942-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942-DPP-NEXT:    s_cbranch_execz .LBB8_3
+; GFX942-DPP-NEXT:  ; %bb.1:
+; GFX942-DPP-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-DPP-NEXT:    s_mov_b64 s[2:3], 0
+; GFX942-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-DPP-NEXT:    global_load_dword v5, v0, s[0:1]
+; GFX942-DPP-NEXT:  .LBB8_2: ; %atomicrmw.start
+; GFX942-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-DPP-NEXT:    v_add_f32_e32 v4, s4, v5
+; GFX942-DPP-NEXT:    global_atomic_cmpswap v4, v0, v[4:5], s[0:1] sc0 sc1
+; GFX942-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v5
+; GFX942-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v5, v4
+; GFX942-DPP-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX942-DPP-NEXT:    s_cbranch_execnz .LBB8_2
+; GFX942-DPP-NEXT:  .LBB8_3:
+; GFX942-DPP-NEXT:    s_endpgm
+;
 ; GFX1064-DPP-LABEL: global_atomic_fadd_uni_address_div_value_default_scope_strictfp:
 ; GFX1064-DPP:       ; %bb.0:
 ; GFX1064-DPP-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
@@ -5808,6 +6598,25 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent
 ; GFX9-NEXT:  .LBB9_3:
 ; GFX9-NEXT:    s_endpgm
 ;
+; GFX942-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_mov_b64 s[0:1], exec
+; GFX942-NEXT:    v_mbcnt_lo_u32_b32 v0, s0, 0
+; GFX942-NEXT:    v_mbcnt_hi_u32_b32 v0, s1, v0
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX942-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX942-NEXT:    s_cbranch_execz .LBB9_2
+; GFX942-NEXT:  ; %bb.1:
+; GFX942-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GFX942-NEXT:    s_bcnt1_i32_b64 s0, s[0:1]
+; GFX942-NEXT:    v_cvt_f64_u32_e32 v[0:1], s0
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-NEXT:    v_mul_f64 v[0:1], v[0:1], 4.0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    global_atomic_add_f64 v2, v[0:1], s[2:3]
+; GFX942-NEXT:  .LBB9_2:
+; GFX942-NEXT:    s_endpgm
+;
 ; GFX1064-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe:
 ; GFX1064:       ; %bb.0:
 ; GFX1064-NEXT:    s_mov_b64 s[2:3], exec
@@ -6022,6 +6831,25 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent
 ; GFX9-DPP-NEXT:  .LBB9_3:
 ; GFX9-DPP-NEXT:    s_endpgm
 ;
+; GFX942-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe:
+; GFX942-DPP:       ; %bb.0:
+; GFX942-DPP-NEXT:    s_mov_b64 s[0:1], exec
+; GFX942-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, s0, 0
+; GFX942-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, s1, v0
+; GFX942-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX942-DPP-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX942-DPP-NEXT:    s_cbranch_execz .LBB9_2
+; GFX942-DPP-NEXT:  ; %bb.1:
+; GFX942-DPP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GFX942-DPP-NEXT:    s_bcnt1_i32_b64 s0, s[0:1]
+; GFX942-DPP-NEXT:    v_cvt_f64_u32_e32 v[0:1], s0
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-DPP-NEXT:    v_mul_f64 v[0:1], v[0:1], 4.0
+; GFX942-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-DPP-NEXT:    global_atomic_add_f64 v2, v[0:1], s[2:3]
+; GFX942-DPP-NEXT:  .LBB9_2:
+; GFX942-DPP-NEXT:    s_endpgm
+;
 ; GFX1064-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe:
 ; GFX1064-DPP:       ; %bb.0:
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], exec
@@ -6314,6 +7142,53 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
 ; GFX9-NEXT:  .LBB10_5:
 ; GFX9-NEXT:    s_endpgm
 ;
+; GFX942-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX942-NEXT:    s_mov_b32 s12, s8
+; GFX942-NEXT:    s_add_u32 s8, s34, 44
+; GFX942-NEXT:    s_mov_b32 s13, s9
+; GFX942-NEXT:    s_addc_u32 s9, s35, 0
+; GFX942-NEXT:    s_getpc_b64 s[4:5]
+; GFX942-NEXT:    s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
+; GFX942-NEXT:    s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
+; GFX942-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX942-NEXT:    s_mov_b32 s14, s10
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX942-NEXT:    v_mov_b32_e32 v31, v0
+; GFX942-NEXT:    s_mov_b32 s32, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-NEXT:    s_mov_b64 s[0:1], exec
+; GFX942-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX942-NEXT:  .LBB10_1: ; %ComputeLoop
+; GFX942-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-NEXT:    s_ff1_i32_b64 s4, s[0:1]
+; GFX942-NEXT:    v_readlane_b32 s3, v1, s4
+; GFX942-NEXT:    v_readlane_b32 s2, v0, s4
+; GFX942-NEXT:    s_lshl_b64 s[4:5], 1, s4
+; GFX942-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[4:5]
+; GFX942-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX942-NEXT:    v_add_f64 v[2:3], v[2:3], s[2:3]
+; GFX942-NEXT:    s_cbranch_scc1 .LBB10_1
+; GFX942-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX942-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX942-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX942-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
+; GFX942-NEXT:    s_cbranch_execz .LBB10_4
+; GFX942-NEXT:  ; %bb.3:
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    global_atomic_add_f64 v0, v[2:3], s[0:1]
+; GFX942-NEXT:  .LBB10_4:
+; GFX942-NEXT:    s_endpgm
+;
 ; GFX1064-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe:
 ; GFX1064:       ; %bb.0:
 ; GFX1064-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
@@ -6733,6 +7608,82 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
 ; GFX9-DPP-NEXT:  .LBB10_3:
 ; GFX9-DPP-NEXT:    s_endpgm
 ;
+; GFX942-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe:
+; GFX942-DPP:       ; %bb.0:
+; GFX942-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX942-DPP-NEXT:    s_mov_b32 s12, s8
+; GFX942-DPP-NEXT:    s_add_u32 s8, s34, 44
+; GFX942-DPP-NEXT:    s_mov_b32 s13, s9
+; GFX942-DPP-NEXT:    s_addc_u32 s9, s35, 0
+; GFX942-DPP-NEXT:    s_getpc_b64 s[4:5]
+; GFX942-DPP-NEXT:    s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
+; GFX942-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
+; GFX942-DPP-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX942-DPP-NEXT:    s_mov_b32 s14, s10
+; GFX942-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX942-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v31, v0
+; GFX942-DPP-NEXT:    s_mov_b32 s32, 0
+; GFX942-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v8, 0
+; GFX942-DPP-NEXT:    v_mbcnt_lo_u32_b32 v9, exec_lo, 0
+; GFX942-DPP-NEXT:    v_mbcnt_hi_u32_b32 v9, exec_hi, v9
+; GFX942-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX942-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX942-DPP-NEXT:    v_cndmask_b32_e64 v5, v3, v1, s[0:1]
+; GFX942-DPP-NEXT:    v_cndmask_b32_e64 v4, 0, v0, s[0:1]
+; GFX942-DPP-NEXT:    v_bfrev_b32_e32 v7, 1
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX942-DPP-NEXT:    v_bfrev_b32_e32 v7, 1
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942-DPP-NEXT:    s_nop 0
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX942-DPP-NEXT:    v_bfrev_b32_e32 v7, 1
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942-DPP-NEXT:    s_nop 0
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX942-DPP-NEXT:    v_bfrev_b32_e32 v7, 1
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942-DPP-NEXT:    s_nop 0
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX942-DPP-NEXT:    v_bfrev_b32_e32 v7, 1
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942-DPP-NEXT:    s_nop 0
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v6, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX942-DPP-NEXT:    v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX942-DPP-NEXT:    s_nop 1
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v3, v5 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v2, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX942-DPP-NEXT:    v_add_f64 v[2:3], v[4:5], v[2:3]
+; GFX942-DPP-NEXT:    s_nop 0
+; GFX942-DPP-NEXT:    v_readlane_b32 s3, v3, 63
+; GFX942-DPP-NEXT:    v_readlane_b32 s2, v2, 63
+; GFX942-DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX942-DPP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX942-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v9
+; GFX942-DPP-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX942-DPP-NEXT:    s_cbranch_execz .LBB10_2
+; GFX942-DPP-NEXT:  ; %bb.1:
+; GFX942-DPP-NEXT:    s_load_dwordx2 s[2:3], s[34:35], 0x24
+; GFX942-DPP-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-DPP-NEXT:    global_atomic_add_f64 v8, v[0:1], s[2:3]
+; GFX942-DPP-NEXT:  .LBB10_2:
+; GFX942-DPP-NEXT:    s_endpgm
+;
 ; GFX1064-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe:
 ; GFX1064-DPP:       ; %bb.0:
 ; GFX1064-DPP-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
@@ -7173,6 +8124,32 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a
 ; GFX9-NEXT:  .LBB11_3:
 ; GFX9-NEXT:    s_endpgm
 ;
+; GFX942-LABEL: global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_structfp:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_bcnt1_i32_b64 s0, exec
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0x43300000
+; GFX942-NEXT:    scratch_store_dword off, v0, off offset:4
+; GFX942-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-NEXT:    scratch_store_dword off, v0, off
+; GFX942-NEXT:    scratch_load_dwordx2 v[0:1], off, off
+; GFX942-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX942-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v2
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX942-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942-NEXT:    s_cbranch_execz .LBB11_2
+; GFX942-NEXT:  ; %bb.1:
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    s_mov_b32 s2, 0
+; GFX942-NEXT:    s_mov_b32 s3, 0xc3300000
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    v_add_f64 v[0:1], v[0:1], s[2:3]
+; GFX942-NEXT:    v_mul_f64 v[0:1], 4.0, v[0:1]
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    global_atomic_add_f64 v2, v[0:1], s[0:1] sc1
+; GFX942-NEXT:  .LBB11_2:
+; GFX942-NEXT:    s_endpgm
+;
 ; GFX1064-LABEL: global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_structfp:
 ; GFX1064:       ; %bb.0:
 ; GFX1064-NEXT:    s_mov_b64 s[0:1], exec
@@ -7430,6 +8407,32 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_a
 ; GFX9-DPP-NEXT:  .LBB11_3:
 ; GFX9-DPP-NEXT:    s_endpgm
 ;
+; GFX942-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_structfp:
+; GFX942-DPP:       ; %bb.0:
+; GFX942-DPP-NEXT:    s_bcnt1_i32_b64 s0, exec
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v0, 0x43300000
+; GFX942-DPP-NEXT:    scratch_store_dword off, v0, off offset:4
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-DPP-NEXT:    scratch_store_dword off, v0, off
+; GFX942-DPP-NEXT:    scratch_load_dwordx2 v[0:1], off, off
+; GFX942-DPP-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX942-DPP-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v2
+; GFX942-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX942-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942-DPP-NEXT:    s_cbranch_execz .LBB11_2
+; GFX942-DPP-NEXT:  ; %bb.1:
+; GFX942-DPP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-DPP-NEXT:    s_mov_b32 s2, 0
+; GFX942-DPP-NEXT:    s_mov_b32 s3, 0xc3300000
+; GFX942-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-DPP-NEXT:    v_add_f64 v[0:1], v[0:1], s[2:3]
+; GFX942-DPP-NEXT:    v_mul_f64 v[0:1], 4.0, v[0:1]
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-DPP-NEXT:    global_atomic_add_f64 v2, v[0:1], s[0:1] sc1
+; GFX942-DPP-NEXT:  .LBB11_2:
+; GFX942-DPP-NEXT:    s_endpgm
+;
 ; GFX1064-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_structfp:
 ; GFX1064-DPP:       ; %bb.0:
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[0:1], exec
@@ -7747,6 +8750,53 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a
 ; GFX9-NEXT:  .LBB12_5:
 ; GFX9-NEXT:    s_endpgm
 ;
+; GFX942-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_structfp:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX942-NEXT:    s_mov_b32 s12, s8
+; GFX942-NEXT:    s_add_u32 s8, s34, 44
+; GFX942-NEXT:    s_mov_b32 s13, s9
+; GFX942-NEXT:    s_addc_u32 s9, s35, 0
+; GFX942-NEXT:    s_getpc_b64 s[4:5]
+; GFX942-NEXT:    s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
+; GFX942-NEXT:    s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
+; GFX942-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX942-NEXT:    s_mov_b32 s14, s10
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX942-NEXT:    v_mov_b32_e32 v31, v0
+; GFX942-NEXT:    s_mov_b32 s32, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-NEXT:    s_mov_b64 s[0:1], exec
+; GFX942-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX942-NEXT:  .LBB12_1: ; %ComputeLoop
+; GFX942-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-NEXT:    s_ff1_i32_b64 s4, s[0:1]
+; GFX942-NEXT:    v_readlane_b32 s3, v1, s4
+; GFX942-NEXT:    v_readlane_b32 s2, v0, s4
+; GFX942-NEXT:    s_lshl_b64 s[4:5], 1, s4
+; GFX942-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[4:5]
+; GFX942-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX942-NEXT:    v_add_f64 v[2:3], v[2:3], s[2:3]
+; GFX942-NEXT:    s_cbranch_scc1 .LBB12_1
+; GFX942-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX942-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX942-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX942-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
+; GFX942-NEXT:    s_cbranch_execz .LBB12_4
+; GFX942-NEXT:  ; %bb.3:
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    global_atomic_add_f64 v0, v[2:3], s[0:1] sc1
+; GFX942-NEXT:  .LBB12_4:
+; GFX942-NEXT:    s_endpgm
+;
 ; GFX1064-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_structfp:
 ; GFX1064:       ; %bb.0:
 ; GFX1064-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
@@ -8166,6 +9216,82 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_a
 ; GFX9-DPP-NEXT:  .LBB12_3:
 ; GFX9-DPP-NEXT:    s_endpgm
 ;
+; GFX942-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_structfp:
+; GFX942-DPP:       ; %bb.0:
+; GFX942-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX942-DPP-NEXT:    s_mov_b32 s12, s8
+; GFX942-DPP-NEXT:    s_add_u32 s8, s34, 44
+; GFX942-DPP-NEXT:    s_mov_b32 s13, s9
+; GFX942-DPP-NEXT:    s_addc_u32 s9, s35, 0
+; GFX942-DPP-NEXT:    s_getpc_b64 s[4:5]
+; GFX942-DPP-NEXT:    s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
+; GFX942-DPP-NEXT:    s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
+; GFX942-DPP-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX942-DPP-NEXT:    s_mov_b32 s14, s10
+; GFX942-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX942-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v31, v0
+; GFX942-DPP-NEXT:    s_mov_b32 s32, 0
+; GFX942-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v8, 0
+; GFX942-DPP-NEXT:    v_mbcnt_lo_u32_b32 v9, exec_lo, 0
+; GFX942-DPP-NEXT:    v_mbcnt_hi_u32_b32 v9, exec_hi, v9
+; GFX942-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX942-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX942-DPP-NEXT:    v_cndmask_b32_e64 v5, v3, v1, s[0:1]
+; GFX942-DPP-NEXT:    v_cndmask_b32_e64 v4, 0, v0, s[0:1]
+; GFX942-DPP-NEXT:    v_bfrev_b32_e32 v7, 1
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX942-DPP-NEXT:    v_bfrev_b32_e32 v7, 1
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942-DPP-NEXT:    s_nop 0
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX942-DPP-NEXT:    v_bfrev_b32_e32 v7, 1
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942-DPP-NEXT:    s_nop 0
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX942-DPP-NEXT:    v_bfrev_b32_e32 v7, 1
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942-DPP-NEXT:    s_nop 0
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX942-DPP-NEXT:    v_bfrev_b32_e32 v7, 1
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942-DPP-NEXT:    s_nop 0
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v6, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX942-DPP-NEXT:    v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX942-DPP-NEXT:    s_nop 1
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v3, v5 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v2, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX942-DPP-NEXT:    v_add_f64 v[2:3], v[4:5], v[2:3]
+; GFX942-DPP-NEXT:    s_nop 0
+; GFX942-DPP-NEXT:    v_readlane_b32 s3, v3, 63
+; GFX942-DPP-NEXT:    v_readlane_b32 s2, v2, 63
+; GFX942-DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX942-DPP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX942-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v9
+; GFX942-DPP-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX942-DPP-NEXT:    s_cbranch_execz .LBB12_2
+; GFX942-DPP-NEXT:  ; %bb.1:
+; GFX942-DPP-NEXT:    s_load_dwordx2 s[2:3], s[34:35], 0x24
+; GFX942-DPP-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-DPP-NEXT:    global_atomic_add_f64 v8, v[0:1], s[2:3] sc1
+; GFX942-DPP-NEXT:  .LBB12_2:
+; GFX942-DPP-NEXT:    s_endpgm
+;
 ; GFX1064-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_structfp:
 ; GFX1064-DPP:       ; %bb.0:
 ; GFX1064-DPP-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
@@ -8606,6 +9732,32 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent
 ; GFX9-NEXT:  .LBB13_3:
 ; GFX9-NEXT:    s_endpgm
 ;
+; GFX942-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_strictfp:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_bcnt1_i32_b64 s0, exec
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0x43300000
+; GFX942-NEXT:    scratch_store_dword off, v0, off offset:4
+; GFX942-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-NEXT:    scratch_store_dword off, v0, off
+; GFX942-NEXT:    scratch_load_dwordx2 v[0:1], off, off
+; GFX942-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX942-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v2
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX942-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942-NEXT:    s_cbranch_execz .LBB13_2
+; GFX942-NEXT:  ; %bb.1:
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    s_mov_b32 s2, 0
+; GFX942-NEXT:    s_mov_b32 s3, 0xc3300000
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    v_add_f64 v[0:1], v[0:1], s[2:3]
+; GFX942-NEXT:    v_mul_f64 v[0:1], 4.0, v[0:1]
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    global_atomic_add_f64 v2, v[0:1], s[0:1]
+; GFX942-NEXT:  .LBB13_2:
+; GFX942-NEXT:    s_endpgm
+;
 ; GFX1064-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_strictfp:
 ; GFX1064:       ; %bb.0:
 ; GFX1064-NEXT:    s_mov_b64 s[0:1], exec
@@ -8863,6 +10015,32 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent
 ; GFX9-DPP-NEXT:  .LBB13_3:
 ; GFX9-DPP-NEXT:    s_endpgm
 ;
+; GFX942-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_strictfp:
+; GFX942-DPP:       ; %bb.0:
+; GFX942-DPP-NEXT:    s_bcnt1_i32_b64 s0, exec
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v0, 0x43300000
+; GFX942-DPP-NEXT:    scratch_store_dword off, v0, off offset:4
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-DPP-NEXT:    scratch_store_dword off, v0, off
+; GFX942-DPP-NEXT:    scratch_load_dwordx2 v[0:1], off, off
+; GFX942-DPP-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX942-DPP-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v2
+; GFX942-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX942-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942-DPP-NEXT:    s_cbranch_execz .LBB13_2
+; GFX942-DPP-NEXT:  ; %bb.1:
+; GFX942-DPP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-DPP-NEXT:    s_mov_b32 s2, 0
+; GFX942-DPP-NEXT:    s_mov_b32 s3, 0xc3300000
+; GFX942-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-DPP-NEXT:    v_add_f64 v[0:1], v[0:1], s[2:3]
+; GFX942-DPP-NEXT:    v_mul_f64 v[0:1], 4.0, v[0:1]
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-DPP-NEXT:    global_atomic_add_f64 v2, v[0:1], s[0:1]
+; GFX942-DPP-NEXT:  .LBB13_2:
+; GFX942-DPP-NEXT:    s_endpgm
+;
 ; GFX1064-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_strictfp:
 ; GFX1064-DPP:       ; %bb.0:
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[0:1], exec
@@ -9180,6 +10358,53 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
 ; GFX9-NEXT:  .LBB14_5:
 ; GFX9-NEXT:    s_endpgm
 ;
+; GFX942-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX942-NEXT:    s_mov_b32 s12, s8
+; GFX942-NEXT:    s_add_u32 s8, s34, 44
+; GFX942-NEXT:    s_mov_b32 s13, s9
+; GFX942-NEXT:    s_addc_u32 s9, s35, 0
+; GFX942-NEXT:    s_getpc_b64 s[4:5]
+; GFX942-NEXT:    s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
+; GFX942-NEXT:    s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
+; GFX942-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX942-NEXT:    s_mov_b32 s14, s10
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX942-NEXT:    v_mov_b32_e32 v31, v0
+; GFX942-NEXT:    s_mov_b32 s32, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-NEXT:    s_mov_b64 s[0:1], exec
+; GFX942-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX942-NEXT:  .LBB14_1: ; %ComputeLoop
+; GFX942-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-NEXT:    s_ff1_i32_b64 s4, s[0:1]
+; GFX942-NEXT:    v_readlane_b32 s3, v1, s4
+; GFX942-NEXT:    v_readlane_b32 s2, v0, s4
+; GFX942-NEXT:    s_lshl_b64 s[4:5], 1, s4
+; GFX942-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[4:5]
+; GFX942-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX942-NEXT:    v_add_f64 v[2:3], v[2:3], s[2:3]
+; GFX942-NEXT:    s_cbranch_scc1 .LBB14_1
+; GFX942-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX942-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX942-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX942-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
+; GFX942-NEXT:    s_cbranch_execz .LBB14_4
+; GFX942-NEXT:  ; %bb.3:
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    global_atomic_add_f64 v0, v[2:3], s[0:1]
+; GFX942-NEXT:  .LBB14_4:
+; GFX942-NEXT:    s_endpgm
+;
 ; GFX1064-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe:
 ; GFX1064:       ; %bb.0:
 ; GFX1064-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
@@ -9599,6 +10824,82 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
 ; GFX9-DPP-NEXT:  .LBB14_3:
 ; GFX9-DPP-NEXT:    s_endpgm
 ;
+; GFX942-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe:
+; GFX942-DPP:       ; %bb.0:
+; GFX942-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX942-DPP-NEXT:    s_mov_b32 s12, s8
+; GFX942-DPP-NEXT:    s_add_u32 s8, s34, 44
+; GFX942-DPP-NEXT:    s_mov_b32 s13, s9
+; GFX942-DPP-NEXT:    s_addc_u32 s9, s35, 0
+; GFX942-DPP-NEXT:    s_getpc_b64 s[4:5]
+; GFX942-DPP-NEXT:    s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
+; GFX942-DPP-NEXT:    s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
+; GFX942-DPP-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX942-DPP-NEXT:    s_mov_b32 s14, s10
+; GFX942-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX942-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v31, v0
+; GFX942-DPP-NEXT:    s_mov_b32 s32, 0
+; GFX942-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v8, 0
+; GFX942-DPP-NEXT:    v_mbcnt_lo_u32_b32 v9, exec_lo, 0
+; GFX942-DPP-NEXT:    v_mbcnt_hi_u32_b32 v9, exec_hi, v9
+; GFX942-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX942-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX942-DPP-NEXT:    v_cndmask_b32_e64 v5, v3, v1, s[0:1]
+; GFX942-DPP-NEXT:    v_cndmask_b32_e64 v4, 0, v0, s[0:1]
+; GFX942-DPP-NEXT:    v_bfrev_b32_e32 v7, 1
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX942-DPP-NEXT:    v_bfrev_b32_e32 v7, 1
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942-DPP-NEXT:    s_nop 0
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX942-DPP-NEXT:    v_bfrev_b32_e32 v7, 1
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942-DPP-NEXT:    s_nop 0
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX942-DPP-NEXT:    v_bfrev_b32_e32 v7, 1
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942-DPP-NEXT:    s_nop 0
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX942-DPP-NEXT:    v_bfrev_b32_e32 v7, 1
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942-DPP-NEXT:    s_nop 0
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v6, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX942-DPP-NEXT:    v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX942-DPP-NEXT:    s_nop 1
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v3, v5 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v2, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX942-DPP-NEXT:    v_add_f64 v[2:3], v[4:5], v[2:3]
+; GFX942-DPP-NEXT:    s_nop 0
+; GFX942-DPP-NEXT:    v_readlane_b32 s3, v3, 63
+; GFX942-DPP-NEXT:    v_readlane_b32 s2, v2, 63
+; GFX942-DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX942-DPP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX942-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v9
+; GFX942-DPP-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX942-DPP-NEXT:    s_cbranch_execz .LBB14_2
+; GFX942-DPP-NEXT:  ; %bb.1:
+; GFX942-DPP-NEXT:    s_load_dwordx2 s[2:3], s[34:35], 0x24
+; GFX942-DPP-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-DPP-NEXT:    global_atomic_add_f64 v8, v[0:1], s[2:3]
+; GFX942-DPP-NEXT:  .LBB14_2:
+; GFX942-DPP-NEXT:    s_endpgm
+;
 ; GFX1064-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe:
 ; GFX1064-DPP:       ; %bb.0:
 ; GFX1064-DPP-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
@@ -10095,6 +11396,53 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
 ; GFX9-NEXT:  .LBB15_5:
 ; GFX9-NEXT:    s_endpgm
 ;
+; GFX942-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_structfp:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX942-NEXT:    s_mov_b32 s12, s8
+; GFX942-NEXT:    s_add_u32 s8, s34, 44
+; GFX942-NEXT:    s_mov_b32 s13, s9
+; GFX942-NEXT:    s_addc_u32 s9, s35, 0
+; GFX942-NEXT:    s_getpc_b64 s[4:5]
+; GFX942-NEXT:    s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
+; GFX942-NEXT:    s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
+; GFX942-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX942-NEXT:    s_mov_b32 s14, s10
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX942-NEXT:    v_mov_b32_e32 v31, v0
+; GFX942-NEXT:    s_mov_b32 s32, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-NEXT:    s_mov_b64 s[0:1], exec
+; GFX942-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX942-NEXT:  .LBB15_1: ; %ComputeLoop
+; GFX942-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-NEXT:    s_ff1_i32_b64 s4, s[0:1]
+; GFX942-NEXT:    v_readlane_b32 s3, v1, s4
+; GFX942-NEXT:    v_readlane_b32 s2, v0, s4
+; GFX942-NEXT:    s_lshl_b64 s[4:5], 1, s4
+; GFX942-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[4:5]
+; GFX942-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX942-NEXT:    v_add_f64 v[2:3], v[2:3], s[2:3]
+; GFX942-NEXT:    s_cbranch_scc1 .LBB15_1
+; GFX942-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX942-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX942-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX942-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
+; GFX942-NEXT:    s_cbranch_execz .LBB15_4
+; GFX942-NEXT:  ; %bb.3:
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    global_atomic_add_f64 v0, v[2:3], s[0:1]
+; GFX942-NEXT:  .LBB15_4:
+; GFX942-NEXT:    s_endpgm
+;
 ; GFX1064-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_structfp:
 ; GFX1064:       ; %bb.0:
 ; GFX1064-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
@@ -10514,6 +11862,82 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent
 ; GFX9-DPP-NEXT:  .LBB15_3:
 ; GFX9-DPP-NEXT:    s_endpgm
 ;
+; GFX942-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_structfp:
+; GFX942-DPP:       ; %bb.0:
+; GFX942-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX942-DPP-NEXT:    s_mov_b32 s12, s8
+; GFX942-DPP-NEXT:    s_add_u32 s8, s34, 44
+; GFX942-DPP-NEXT:    s_mov_b32 s13, s9
+; GFX942-DPP-NEXT:    s_addc_u32 s9, s35, 0
+; GFX942-DPP-NEXT:    s_getpc_b64 s[4:5]
+; GFX942-DPP-NEXT:    s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
+; GFX942-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
+; GFX942-DPP-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX942-DPP-NEXT:    s_mov_b32 s14, s10
+; GFX942-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX942-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v31, v0
+; GFX942-DPP-NEXT:    s_mov_b32 s32, 0
+; GFX942-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v8, 0
+; GFX942-DPP-NEXT:    v_mbcnt_lo_u32_b32 v9, exec_lo, 0
+; GFX942-DPP-NEXT:    v_mbcnt_hi_u32_b32 v9, exec_hi, v9
+; GFX942-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX942-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX942-DPP-NEXT:    v_cndmask_b32_e64 v5, v3, v1, s[0:1]
+; GFX942-DPP-NEXT:    v_cndmask_b32_e64 v4, 0, v0, s[0:1]
+; GFX942-DPP-NEXT:    v_bfrev_b32_e32 v7, 1
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX942-DPP-NEXT:    v_bfrev_b32_e32 v7, 1
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942-DPP-NEXT:    s_nop 0
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX942-DPP-NEXT:    v_bfrev_b32_e32 v7, 1
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942-DPP-NEXT:    s_nop 0
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX942-DPP-NEXT:    v_bfrev_b32_e32 v7, 1
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942-DPP-NEXT:    s_nop 0
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX942-DPP-NEXT:    v_bfrev_b32_e32 v7, 1
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942-DPP-NEXT:    s_nop 0
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v6, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX942-DPP-NEXT:    v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX942-DPP-NEXT:    s_nop 1
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v3, v5 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v2, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX942-DPP-NEXT:    v_add_f64 v[2:3], v[4:5], v[2:3]
+; GFX942-DPP-NEXT:    s_nop 0
+; GFX942-DPP-NEXT:    v_readlane_b32 s3, v3, 63
+; GFX942-DPP-NEXT:    v_readlane_b32 s2, v2, 63
+; GFX942-DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX942-DPP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX942-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v9
+; GFX942-DPP-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX942-DPP-NEXT:    s_cbranch_execz .LBB15_2
+; GFX942-DPP-NEXT:  ; %bb.1:
+; GFX942-DPP-NEXT:    s_load_dwordx2 s[2:3], s[34:35], 0x24
+; GFX942-DPP-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-DPP-NEXT:    global_atomic_add_f64 v8, v[0:1], s[2:3]
+; GFX942-DPP-NEXT:  .LBB15_2:
+; GFX942-DPP-NEXT:    s_endpgm
+;
 ; GFX1064-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_structfp:
 ; GFX1064-DPP:       ; %bb.0:
 ; GFX1064-DPP-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
@@ -10954,6 +12378,32 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau
 ; GFX9-NEXT:  .LBB16_3:
 ; GFX9-NEXT:    s_endpgm
 ;
+; GFX942-LABEL: global_atomic_fadd_double_uni_address_uni_value_default_scope_strictfp:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_bcnt1_i32_b64 s0, exec
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0x43300000
+; GFX942-NEXT:    scratch_store_dword off, v0, off offset:4
+; GFX942-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-NEXT:    scratch_store_dword off, v0, off
+; GFX942-NEXT:    scratch_load_dwordx2 v[0:1], off, off
+; GFX942-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX942-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v2
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX942-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942-NEXT:    s_cbranch_execz .LBB16_2
+; GFX942-NEXT:  ; %bb.1:
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    s_mov_b32 s2, 0
+; GFX942-NEXT:    s_mov_b32 s3, 0xc3300000
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    v_add_f64 v[0:1], v[0:1], s[2:3]
+; GFX942-NEXT:    v_mul_f64 v[0:1], 4.0, v[0:1]
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    global_atomic_add_f64 v2, v[0:1], s[0:1] sc1
+; GFX942-NEXT:  .LBB16_2:
+; GFX942-NEXT:    s_endpgm
+;
 ; GFX1064-LABEL: global_atomic_fadd_double_uni_address_uni_value_default_scope_strictfp:
 ; GFX1064:       ; %bb.0:
 ; GFX1064-NEXT:    s_mov_b64 s[0:1], exec
@@ -11211,6 +12661,32 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defau
 ; GFX9-DPP-NEXT:  .LBB16_3:
 ; GFX9-DPP-NEXT:    s_endpgm
 ;
+; GFX942-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_default_scope_strictfp:
+; GFX942-DPP:       ; %bb.0:
+; GFX942-DPP-NEXT:    s_bcnt1_i32_b64 s0, exec
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v0, 0x43300000
+; GFX942-DPP-NEXT:    scratch_store_dword off, v0, off offset:4
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-DPP-NEXT:    scratch_store_dword off, v0, off
+; GFX942-DPP-NEXT:    scratch_load_dwordx2 v[0:1], off, off
+; GFX942-DPP-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX942-DPP-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v2
+; GFX942-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX942-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942-DPP-NEXT:    s_cbranch_execz .LBB16_2
+; GFX942-DPP-NEXT:  ; %bb.1:
+; GFX942-DPP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-DPP-NEXT:    s_mov_b32 s2, 0
+; GFX942-DPP-NEXT:    s_mov_b32 s3, 0xc3300000
+; GFX942-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-DPP-NEXT:    v_add_f64 v[0:1], v[0:1], s[2:3]
+; GFX942-DPP-NEXT:    v_mul_f64 v[0:1], 4.0, v[0:1]
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-DPP-NEXT:    global_atomic_add_f64 v2, v[0:1], s[0:1] sc1
+; GFX942-DPP-NEXT:  .LBB16_2:
+; GFX942-DPP-NEXT:    s_endpgm
+;
 ; GFX1064-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_default_scope_strictfp:
 ; GFX1064-DPP:       ; %bb.0:
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[0:1], exec
@@ -11528,6 +13004,53 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
 ; GFX9-NEXT:  .LBB17_5:
 ; GFX9-NEXT:    s_endpgm
 ;
+; GFX942-LABEL: global_atomic_fadd_double_uni_address_div_value_default_scope_strictfp:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX942-NEXT:    s_mov_b32 s12, s8
+; GFX942-NEXT:    s_add_u32 s8, s34, 44
+; GFX942-NEXT:    s_mov_b32 s13, s9
+; GFX942-NEXT:    s_addc_u32 s9, s35, 0
+; GFX942-NEXT:    s_getpc_b64 s[4:5]
+; GFX942-NEXT:    s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
+; GFX942-NEXT:    s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
+; GFX942-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX942-NEXT:    s_mov_b32 s14, s10
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX942-NEXT:    v_mov_b32_e32 v31, v0
+; GFX942-NEXT:    s_mov_b32 s32, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-NEXT:    s_mov_b64 s[0:1], exec
+; GFX942-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX942-NEXT:  .LBB17_1: ; %ComputeLoop
+; GFX942-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-NEXT:    s_ff1_i32_b64 s4, s[0:1]
+; GFX942-NEXT:    v_readlane_b32 s3, v1, s4
+; GFX942-NEXT:    v_readlane_b32 s2, v0, s4
+; GFX942-NEXT:    s_lshl_b64 s[4:5], 1, s4
+; GFX942-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[4:5]
+; GFX942-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX942-NEXT:    v_add_f64 v[2:3], v[2:3], s[2:3]
+; GFX942-NEXT:    s_cbranch_scc1 .LBB17_1
+; GFX942-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX942-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX942-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX942-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
+; GFX942-NEXT:    s_cbranch_execz .LBB17_4
+; GFX942-NEXT:  ; %bb.3:
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    global_atomic_add_f64 v0, v[2:3], s[0:1] sc1
+; GFX942-NEXT:  .LBB17_4:
+; GFX942-NEXT:    s_endpgm
+;
 ; GFX1064-LABEL: global_atomic_fadd_double_uni_address_div_value_default_scope_strictfp:
 ; GFX1064:       ; %bb.0:
 ; GFX1064-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
@@ -11947,6 +13470,82 @@ define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defau
 ; GFX9-DPP-NEXT:  .LBB17_3:
 ; GFX9-DPP-NEXT:    s_endpgm
 ;
+; GFX942-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_default_scope_strictfp:
+; GFX942-DPP:       ; %bb.0:
+; GFX942-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX942-DPP-NEXT:    s_mov_b32 s12, s8
+; GFX942-DPP-NEXT:    s_add_u32 s8, s34, 44
+; GFX942-DPP-NEXT:    s_mov_b32 s13, s9
+; GFX942-DPP-NEXT:    s_addc_u32 s9, s35, 0
+; GFX942-DPP-NEXT:    s_getpc_b64 s[4:5]
+; GFX942-DPP-NEXT:    s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
+; GFX942-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
+; GFX942-DPP-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX942-DPP-NEXT:    s_mov_b32 s14, s10
+; GFX942-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX942-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v31, v0
+; GFX942-DPP-NEXT:    s_mov_b32 s32, 0
+; GFX942-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v8, 0
+; GFX942-DPP-NEXT:    v_mbcnt_lo_u32_b32 v9, exec_lo, 0
+; GFX942-DPP-NEXT:    v_mbcnt_hi_u32_b32 v9, exec_hi, v9
+; GFX942-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX942-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX942-DPP-NEXT:    v_cndmask_b32_e64 v5, v3, v1, s[0:1]
+; GFX942-DPP-NEXT:    v_cndmask_b32_e64 v4, 0, v0, s[0:1]
+; GFX942-DPP-NEXT:    v_bfrev_b32_e32 v7, 1
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX942-DPP-NEXT:    v_bfrev_b32_e32 v7, 1
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942-DPP-NEXT:    s_nop 0
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX942-DPP-NEXT:    v_bfrev_b32_e32 v7, 1
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942-DPP-NEXT:    s_nop 0
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX942-DPP-NEXT:    v_bfrev_b32_e32 v7, 1
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942-DPP-NEXT:    s_nop 0
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX942-DPP-NEXT:    v_bfrev_b32_e32 v7, 1
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942-DPP-NEXT:    s_nop 0
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v6, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX942-DPP-NEXT:    v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX942-DPP-NEXT:    s_nop 1
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v3, v5 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v2, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX942-DPP-NEXT:    v_add_f64 v[2:3], v[4:5], v[2:3]
+; GFX942-DPP-NEXT:    s_nop 0
+; GFX942-DPP-NEXT:    v_readlane_b32 s3, v3, 63
+; GFX942-DPP-NEXT:    v_readlane_b32 s2, v2, 63
+; GFX942-DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX942-DPP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX942-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v9
+; GFX942-DPP-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX942-DPP-NEXT:    s_cbranch_execz .LBB17_2
+; GFX942-DPP-NEXT:  ; %bb.1:
+; GFX942-DPP-NEXT:    s_load_dwordx2 s[2:3], s[34:35], 0x24
+; GFX942-DPP-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-DPP-NEXT:    global_atomic_add_f64 v8, v[0:1], s[2:3] sc1
+; GFX942-DPP-NEXT:  .LBB17_2:
+; GFX942-DPP-NEXT:    s_endpgm
+;
 ; GFX1064-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_default_scope_strictfp:
 ; GFX1064-DPP:       ; %bb.0:
 ; GFX1064-DPP-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
@@ -12363,6 +13962,25 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope
 ; GFX9-NEXT:  .LBB18_3:
 ; GFX9-NEXT:    s_endpgm
 ;
+; GFX942-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_mov_b64 s[0:1], exec
+; GFX942-NEXT:    v_mbcnt_lo_u32_b32 v0, s0, 0
+; GFX942-NEXT:    v_mbcnt_hi_u32_b32 v0, s1, v0
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX942-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX942-NEXT:    s_cbranch_execz .LBB18_2
+; GFX942-NEXT:  ; %bb.1:
+; GFX942-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GFX942-NEXT:    s_bcnt1_i32_b64 s0, s[0:1]
+; GFX942-NEXT:    v_cvt_f32_ubyte0_e32 v1, s0
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    v_mul_f32_e32 v1, 4.0, v1
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    global_atomic_add_f32 v0, v1, s[2:3] sc1
+; GFX942-NEXT:  .LBB18_2:
+; GFX942-NEXT:    s_endpgm
+;
 ; GFX1064-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX1064:       ; %bb.0:
 ; GFX1064-NEXT:    s_mov_b64 s[2:3], exec
@@ -12534,6 +14152,25 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope
 ; GFX9-DPP-NEXT:  .LBB18_3:
 ; GFX9-DPP-NEXT:    s_endpgm
 ;
+; GFX942-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
+; GFX942-DPP:       ; %bb.0:
+; GFX942-DPP-NEXT:    s_mov_b64 s[0:1], exec
+; GFX942-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, s0, 0
+; GFX942-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, s1, v0
+; GFX942-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX942-DPP-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX942-DPP-NEXT:    s_cbranch_execz .LBB18_2
+; GFX942-DPP-NEXT:  ; %bb.1:
+; GFX942-DPP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GFX942-DPP-NEXT:    s_bcnt1_i32_b64 s0, s[0:1]
+; GFX942-DPP-NEXT:    v_cvt_f32_ubyte0_e32 v1, s0
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-DPP-NEXT:    v_mul_f32_e32 v1, 4.0, v1
+; GFX942-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-DPP-NEXT:    global_atomic_add_f32 v0, v1, s[2:3] sc1
+; GFX942-DPP-NEXT:  .LBB18_2:
+; GFX942-DPP-NEXT:    s_endpgm
+;
 ; GFX1064-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX1064-DPP:       ; %bb.0:
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], exec
@@ -12709,6 +14346,25 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope
 ; GFX9-NEXT:  .LBB19_3:
 ; GFX9-NEXT:    s_endpgm
 ;
+; GFX942-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_mov_b64 s[0:1], exec
+; GFX942-NEXT:    v_mbcnt_lo_u32_b32 v0, s0, 0
+; GFX942-NEXT:    v_mbcnt_hi_u32_b32 v0, s1, v0
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX942-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX942-NEXT:    s_cbranch_execz .LBB19_2
+; GFX942-NEXT:  ; %bb.1:
+; GFX942-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GFX942-NEXT:    s_bcnt1_i32_b64 s0, s[0:1]
+; GFX942-NEXT:    v_cvt_f32_ubyte0_e32 v1, s0
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    v_mul_f32_e32 v1, 4.0, v1
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    global_atomic_add_f32 v0, v1, s[2:3] sc1
+; GFX942-NEXT:  .LBB19_2:
+; GFX942-NEXT:    s_endpgm
+;
 ; GFX1064-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX1064:       ; %bb.0:
 ; GFX1064-NEXT:    s_mov_b64 s[2:3], exec
@@ -12880,6 +14536,25 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_system_scope
 ; GFX9-DPP-NEXT:  .LBB19_3:
 ; GFX9-DPP-NEXT:    s_endpgm
 ;
+; GFX942-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
+; GFX942-DPP:       ; %bb.0:
+; GFX942-DPP-NEXT:    s_mov_b64 s[0:1], exec
+; GFX942-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, s0, 0
+; GFX942-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, s1, v0
+; GFX942-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX942-DPP-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX942-DPP-NEXT:    s_cbranch_execz .LBB19_2
+; GFX942-DPP-NEXT:  ; %bb.1:
+; GFX942-DPP-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x24
+; GFX942-DPP-NEXT:    s_bcnt1_i32_b64 s0, s[0:1]
+; GFX942-DPP-NEXT:    v_cvt_f32_ubyte0_e32 v1, s0
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-DPP-NEXT:    v_mul_f32_e32 v1, 4.0, v1
+; GFX942-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-DPP-NEXT:    global_atomic_add_f32 v0, v1, s[2:3] sc1
+; GFX942-DPP-NEXT:  .LBB19_2:
+; GFX942-DPP-NEXT:    s_endpgm
+;
 ; GFX1064-DPP-LABEL: global_atomic_fadd_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX1064-DPP:       ; %bb.0:
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], exec
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
index 6351bb39e97f5..9e87dfcef6103 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
@@ -1,12 +1,14 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefix=GFX7LESS %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX942 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32, -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132 %s
 ; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefix=GFX7LESS-DPP %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX9-DPP %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX942-DPP %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064-DPP %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032-DPP %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164-DPP %s
@@ -78,6 +80,35 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_agent_scope_
 ; GFX9-NEXT:  .LBB0_3:
 ; GFX9-NEXT:    s_endpgm
 ;
+; GFX942-LABEL: global_atomic_fmax_uni_address_uni_value_agent_scope_unsafe:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX942-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX942-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942-NEXT:    s_cbranch_execz .LBB0_3
+; GFX942-NEXT:  ; %bb.1:
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    s_mov_b64 s[2:3], 0
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b32_e32 v1, s4
+; GFX942-NEXT:  .LBB0_2: ; %atomicrmw.start
+; GFX942-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX942-NEXT:    v_max_f32_e32 v0, 4.0, v0
+; GFX942-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] sc0
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX942-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX942-NEXT:    v_mov_b32_e32 v1, v0
+; GFX942-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX942-NEXT:    s_cbranch_execnz .LBB0_2
+; GFX942-NEXT:  .LBB0_3:
+; GFX942-NEXT:    s_endpgm
+;
 ; GFX1064-LABEL: global_atomic_fmax_uni_address_uni_value_agent_scope_unsafe:
 ; GFX1064:       ; %bb.0:
 ; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -203,6 +234,35 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_agent_scope_
 ; GFX9-DPP-NEXT:  .LBB0_3:
 ; GFX9-DPP-NEXT:    s_endpgm
 ;
+; GFX942-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_agent_scope_unsafe:
+; GFX942-DPP:       ; %bb.0:
+; GFX942-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX942-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX942-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX942-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942-DPP-NEXT:    s_cbranch_execz .LBB0_3
+; GFX942-DPP-NEXT:  ; %bb.1:
+; GFX942-DPP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-DPP-NEXT:    s_mov_b64 s[2:3], 0
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-DPP-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX942-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v1, s4
+; GFX942-DPP-NEXT:  .LBB0_2: ; %atomicrmw.start
+; GFX942-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX942-DPP-NEXT:    v_max_f32_e32 v0, 4.0, v0
+; GFX942-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] sc0
+; GFX942-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX942-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX942-DPP-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX942-DPP-NEXT:    s_cbranch_execnz .LBB0_2
+; GFX942-DPP-NEXT:  .LBB0_3:
+; GFX942-DPP-NEXT:    s_endpgm
+;
 ; GFX1064-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_agent_scope_unsafe:
 ; GFX1064-DPP:       ; %bb.0:
 ; GFX1064-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -418,6 +478,67 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
 ; GFX9-NEXT:  .LBB1_5:
 ; GFX9-NEXT:    s_endpgm
 ;
+; GFX942-LABEL: global_atomic_fmax_uni_address_div_value_agent_scope_unsafe:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX942-NEXT:    s_mov_b32 s12, s8
+; GFX942-NEXT:    s_add_u32 s8, s34, 44
+; GFX942-NEXT:    s_mov_b32 s13, s9
+; GFX942-NEXT:    s_addc_u32 s9, s35, 0
+; GFX942-NEXT:    s_getpc_b64 s[4:5]
+; GFX942-NEXT:    s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
+; GFX942-NEXT:    s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
+; GFX942-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX942-NEXT:    s_mov_b32 s14, s10
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX942-NEXT:    v_mov_b32_e32 v31, v0
+; GFX942-NEXT:    s_mov_b32 s32, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX942-NEXT:    s_mov_b64 s[0:1], exec
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
+; GFX942-NEXT:  .LBB1_1: ; %ComputeLoop
+; GFX942-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-NEXT:    s_ff1_i32_b64 s2, s[0:1]
+; GFX942-NEXT:    v_readlane_b32 s4, v0, s2
+; GFX942-NEXT:    s_lshl_b64 s[2:3], 1, s2
+; GFX942-NEXT:    v_max_f32_e32 v1, v2, v2
+; GFX942-NEXT:    v_max_f32_e64 v2, s4, s4
+; GFX942-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX942-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX942-NEXT:    v_max_f32_e32 v2, v1, v2
+; GFX942-NEXT:    s_cbranch_scc1 .LBB1_1
+; GFX942-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX942-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX942-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX942-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
+; GFX942-NEXT:    s_cbranch_execz .LBB1_5
+; GFX942-NEXT:  ; %bb.3:
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v3, 0
+; GFX942-NEXT:    s_mov_b64 s[2:3], 0
+; GFX942-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    global_load_dword v1, v3, s[0:1]
+; GFX942-NEXT:  .LBB1_4: ; %atomicrmw.start
+; GFX942-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX942-NEXT:    v_max_f32_e32 v0, v0, v2
+; GFX942-NEXT:    global_atomic_cmpswap v0, v3, v[0:1], s[0:1] sc0
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX942-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX942-NEXT:    v_mov_b32_e32 v1, v0
+; GFX942-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX942-NEXT:    s_cbranch_execnz .LBB1_4
+; GFX942-NEXT:  .LBB1_5:
+; GFX942-NEXT:    s_endpgm
+;
 ; GFX1064-LABEL: global_atomic_fmax_uni_address_div_value_agent_scope_unsafe:
 ; GFX1064:       ; %bb.0:
 ; GFX1064-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
@@ -776,6 +897,88 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_agent_scope_
 ; GFX9-DPP-NEXT:  .LBB1_3:
 ; GFX9-DPP-NEXT:    s_endpgm
 ;
+; GFX942-DPP-LABEL: global_atomic_fmax_uni_address_div_value_agent_scope_unsafe:
+; GFX942-DPP:       ; %bb.0:
+; GFX942-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX942-DPP-NEXT:    s_mov_b32 s12, s8
+; GFX942-DPP-NEXT:    s_add_u32 s8, s34, 44
+; GFX942-DPP-NEXT:    s_mov_b32 s13, s9
+; GFX942-DPP-NEXT:    s_addc_u32 s9, s35, 0
+; GFX942-DPP-NEXT:    s_getpc_b64 s[4:5]
+; GFX942-DPP-NEXT:    s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
+; GFX942-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
+; GFX942-DPP-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX942-DPP-NEXT:    s_mov_b32 s14, s10
+; GFX942-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX942-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v31, v0
+; GFX942-DPP-NEXT:    s_mov_b32 s32, 0
+; GFX942-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX942-DPP-NEXT:    v_mbcnt_lo_u32_b32 v4, exec_lo, 0
+; GFX942-DPP-NEXT:    v_mbcnt_hi_u32_b32 v4, exec_hi, v4
+; GFX942-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v1, 0x7fc00000
+; GFX942-DPP-NEXT:    v_cndmask_b32_e64 v2, v1, v0, s[0:1]
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
+; GFX942-DPP-NEXT:    s_nop 1
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v3, v2 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX942-DPP-NEXT:    v_max_f32_e32 v3, v3, v3
+; GFX942-DPP-NEXT:    v_max_f32_e32 v2, v2, v3
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
+; GFX942-DPP-NEXT:    s_nop 1
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v3, v2 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_max_f32_e32 v3, v3, v3
+; GFX942-DPP-NEXT:    v_max_f32_e32 v2, v2, v3
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
+; GFX942-DPP-NEXT:    s_nop 1
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v3, v2 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_max_f32_e32 v3, v3, v3
+; GFX942-DPP-NEXT:    v_max_f32_e32 v2, v2, v3
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
+; GFX942-DPP-NEXT:    s_nop 1
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v3, v2 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_max_f32_e32 v3, v3, v3
+; GFX942-DPP-NEXT:    v_max_f32_e32 v2, v2, v3
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
+; GFX942-DPP-NEXT:    s_nop 1
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v3, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX942-DPP-NEXT:    v_max_f32_e32 v3, v3, v3
+; GFX942-DPP-NEXT:    v_max_f32_e32 v2, v2, v3
+; GFX942-DPP-NEXT:    s_nop 1
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX942-DPP-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX942-DPP-NEXT:    v_max_f32_e32 v1, v2, v1
+; GFX942-DPP-NEXT:    s_nop 0
+; GFX942-DPP-NEXT:    v_readlane_b32 s4, v1, 63
+; GFX942-DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX942-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
+; GFX942-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942-DPP-NEXT:    s_cbranch_execz .LBB1_3
+; GFX942-DPP-NEXT:  ; %bb.1:
+; GFX942-DPP-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-DPP-NEXT:    s_mov_b64 s[2:3], 0
+; GFX942-DPP-NEXT:    v_max_f32_e64 v6, s4, s4
+; GFX942-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-DPP-NEXT:    global_load_dword v5, v0, s[0:1]
+; GFX942-DPP-NEXT:  .LBB1_2: ; %atomicrmw.start
+; GFX942-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-DPP-NEXT:    v_max_f32_e32 v4, v5, v5
+; GFX942-DPP-NEXT:    v_max_f32_e32 v4, v4, v6
+; GFX942-DPP-NEXT:    global_atomic_cmpswap v4, v0, v[4:5], s[0:1] sc0
+; GFX942-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v5
+; GFX942-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v5, v4
+; GFX942-DPP-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX942-DPP-NEXT:    s_cbranch_execnz .LBB1_2
+; GFX942-DPP-NEXT:  .LBB1_3:
+; GFX942-DPP-NEXT:    s_endpgm
+;
 ; GFX1064-DPP-LABEL: global_atomic_fmax_uni_address_div_value_agent_scope_unsafe:
 ; GFX1064-DPP:       ; %bb.0:
 ; GFX1064-DPP-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
@@ -1116,6 +1319,35 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope
 ; GFX9-NEXT:  .LBB2_3:
 ; GFX9-NEXT:    s_endpgm
 ;
+; GFX942-LABEL: global_atomic_fmax_uni_address_uni_value_one_as_scope_unsafe:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX942-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX942-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942-NEXT:    s_cbranch_execz .LBB2_3
+; GFX942-NEXT:  ; %bb.1:
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    s_mov_b64 s[2:3], 0
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b32_e32 v1, s4
+; GFX942-NEXT:  .LBB2_2: ; %atomicrmw.start
+; GFX942-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX942-NEXT:    v_max_f32_e32 v0, 4.0, v0
+; GFX942-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] sc0 sc1
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX942-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX942-NEXT:    v_mov_b32_e32 v1, v0
+; GFX942-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX942-NEXT:    s_cbranch_execnz .LBB2_2
+; GFX942-NEXT:  .LBB2_3:
+; GFX942-NEXT:    s_endpgm
+;
 ; GFX1064-LABEL: global_atomic_fmax_uni_address_uni_value_one_as_scope_unsafe:
 ; GFX1064:       ; %bb.0:
 ; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1241,6 +1473,35 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_one_as_scope
 ; GFX9-DPP-NEXT:  .LBB2_3:
 ; GFX9-DPP-NEXT:    s_endpgm
 ;
+; GFX942-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_one_as_scope_unsafe:
+; GFX942-DPP:       ; %bb.0:
+; GFX942-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX942-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX942-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX942-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942-DPP-NEXT:    s_cbranch_execz .LBB2_3
+; GFX942-DPP-NEXT:  ; %bb.1:
+; GFX942-DPP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-DPP-NEXT:    s_mov_b64 s[2:3], 0
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-DPP-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX942-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v1, s4
+; GFX942-DPP-NEXT:  .LBB2_2: ; %atomicrmw.start
+; GFX942-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX942-DPP-NEXT:    v_max_f32_e32 v0, 4.0, v0
+; GFX942-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] sc0 sc1
+; GFX942-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX942-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX942-DPP-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX942-DPP-NEXT:    s_cbranch_execnz .LBB2_2
+; GFX942-DPP-NEXT:  .LBB2_3:
+; GFX942-DPP-NEXT:    s_endpgm
+;
 ; GFX1064-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_one_as_scope_unsafe:
 ; GFX1064-DPP:       ; %bb.0:
 ; GFX1064-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1457,6 +1718,67 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
 ; GFX9-NEXT:  .LBB3_5:
 ; GFX9-NEXT:    s_endpgm
 ;
+; GFX942-LABEL: global_atomic_fmax_uni_address_div_value_one_as_scope_unsafe:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX942-NEXT:    s_mov_b32 s12, s8
+; GFX942-NEXT:    s_add_u32 s8, s34, 44
+; GFX942-NEXT:    s_mov_b32 s13, s9
+; GFX942-NEXT:    s_addc_u32 s9, s35, 0
+; GFX942-NEXT:    s_getpc_b64 s[4:5]
+; GFX942-NEXT:    s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
+; GFX942-NEXT:    s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
+; GFX942-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX942-NEXT:    s_mov_b32 s14, s10
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX942-NEXT:    v_mov_b32_e32 v31, v0
+; GFX942-NEXT:    s_mov_b32 s32, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX942-NEXT:    s_mov_b64 s[0:1], exec
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
+; GFX942-NEXT:  .LBB3_1: ; %ComputeLoop
+; GFX942-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-NEXT:    s_ff1_i32_b64 s2, s[0:1]
+; GFX942-NEXT:    v_readlane_b32 s4, v0, s2
+; GFX942-NEXT:    s_lshl_b64 s[2:3], 1, s2
+; GFX942-NEXT:    v_max_f32_e32 v1, v2, v2
+; GFX942-NEXT:    v_max_f32_e64 v2, s4, s4
+; GFX942-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX942-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX942-NEXT:    v_max_f32_e32 v2, v1, v2
+; GFX942-NEXT:    s_cbranch_scc1 .LBB3_1
+; GFX942-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX942-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX942-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX942-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
+; GFX942-NEXT:    s_cbranch_execz .LBB3_5
+; GFX942-NEXT:  ; %bb.3:
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v3, 0
+; GFX942-NEXT:    s_mov_b64 s[2:3], 0
+; GFX942-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    global_load_dword v1, v3, s[0:1]
+; GFX942-NEXT:  .LBB3_4: ; %atomicrmw.start
+; GFX942-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX942-NEXT:    v_max_f32_e32 v0, v0, v2
+; GFX942-NEXT:    global_atomic_cmpswap v0, v3, v[0:1], s[0:1] sc0 sc1
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX942-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX942-NEXT:    v_mov_b32_e32 v1, v0
+; GFX942-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX942-NEXT:    s_cbranch_execnz .LBB3_4
+; GFX942-NEXT:  .LBB3_5:
+; GFX942-NEXT:    s_endpgm
+;
 ; GFX1064-LABEL: global_atomic_fmax_uni_address_div_value_one_as_scope_unsafe:
 ; GFX1064:       ; %bb.0:
 ; GFX1064-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
@@ -1815,6 +2137,88 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_one_as_scope
 ; GFX9-DPP-NEXT:  .LBB3_3:
 ; GFX9-DPP-NEXT:    s_endpgm
 ;
+; GFX942-DPP-LABEL: global_atomic_fmax_uni_address_div_value_one_as_scope_unsafe:
+; GFX942-DPP:       ; %bb.0:
+; GFX942-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX942-DPP-NEXT:    s_mov_b32 s12, s8
+; GFX942-DPP-NEXT:    s_add_u32 s8, s34, 44
+; GFX942-DPP-NEXT:    s_mov_b32 s13, s9
+; GFX942-DPP-NEXT:    s_addc_u32 s9, s35, 0
+; GFX942-DPP-NEXT:    s_getpc_b64 s[4:5]
+; GFX942-DPP-NEXT:    s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
+; GFX942-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
+; GFX942-DPP-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX942-DPP-NEXT:    s_mov_b32 s14, s10
+; GFX942-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX942-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v31, v0
+; GFX942-DPP-NEXT:    s_mov_b32 s32, 0
+; GFX942-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX942-DPP-NEXT:    v_mbcnt_lo_u32_b32 v4, exec_lo, 0
+; GFX942-DPP-NEXT:    v_mbcnt_hi_u32_b32 v4, exec_hi, v4
+; GFX942-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v1, 0x7fc00000
+; GFX942-DPP-NEXT:    v_cndmask_b32_e64 v2, v1, v0, s[0:1]
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
+; GFX942-DPP-NEXT:    s_nop 1
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v3, v2 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX942-DPP-NEXT:    v_max_f32_e32 v3, v3, v3
+; GFX942-DPP-NEXT:    v_max_f32_e32 v2, v2, v3
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
+; GFX942-DPP-NEXT:    s_nop 1
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v3, v2 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_max_f32_e32 v3, v3, v3
+; GFX942-DPP-NEXT:    v_max_f32_e32 v2, v2, v3
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
+; GFX942-DPP-NEXT:    s_nop 1
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v3, v2 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_max_f32_e32 v3, v3, v3
+; GFX942-DPP-NEXT:    v_max_f32_e32 v2, v2, v3
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
+; GFX942-DPP-NEXT:    s_nop 1
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v3, v2 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_max_f32_e32 v3, v3, v3
+; GFX942-DPP-NEXT:    v_max_f32_e32 v2, v2, v3
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
+; GFX942-DPP-NEXT:    s_nop 1
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v3, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX942-DPP-NEXT:    v_max_f32_e32 v3, v3, v3
+; GFX942-DPP-NEXT:    v_max_f32_e32 v2, v2, v3
+; GFX942-DPP-NEXT:    s_nop 1
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX942-DPP-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX942-DPP-NEXT:    v_max_f32_e32 v1, v2, v1
+; GFX942-DPP-NEXT:    s_nop 0
+; GFX942-DPP-NEXT:    v_readlane_b32 s4, v1, 63
+; GFX942-DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX942-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
+; GFX942-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942-DPP-NEXT:    s_cbranch_execz .LBB3_3
+; GFX942-DPP-NEXT:  ; %bb.1:
+; GFX942-DPP-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-DPP-NEXT:    s_mov_b64 s[2:3], 0
+; GFX942-DPP-NEXT:    v_max_f32_e64 v6, s4, s4
+; GFX942-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-DPP-NEXT:    global_load_dword v5, v0, s[0:1]
+; GFX942-DPP-NEXT:  .LBB3_2: ; %atomicrmw.start
+; GFX942-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-DPP-NEXT:    v_max_f32_e32 v4, v5, v5
+; GFX942-DPP-NEXT:    v_max_f32_e32 v4, v4, v6
+; GFX942-DPP-NEXT:    global_atomic_cmpswap v4, v0, v[4:5], s[0:1] sc0 sc1
+; GFX942-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v5
+; GFX942-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v5, v4
+; GFX942-DPP-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX942-DPP-NEXT:    s_cbranch_execnz .LBB3_2
+; GFX942-DPP-NEXT:  .LBB3_3:
+; GFX942-DPP-NEXT:    s_endpgm
+;
 ; GFX1064-DPP-LABEL: global_atomic_fmax_uni_address_div_value_one_as_scope_unsafe:
 ; GFX1064-DPP:       ; %bb.0:
 ; GFX1064-DPP-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
@@ -2156,6 +2560,35 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_default_scop
 ; GFX9-NEXT:  .LBB4_3:
 ; GFX9-NEXT:    s_endpgm
 ;
+; GFX942-LABEL: global_atomic_fmax_uni_address_uni_value_default_scope_unsafe:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX942-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX942-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942-NEXT:    s_cbranch_execz .LBB4_3
+; GFX942-NEXT:  ; %bb.1:
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    s_mov_b64 s[2:3], 0
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b32_e32 v1, s4
+; GFX942-NEXT:  .LBB4_2: ; %atomicrmw.start
+; GFX942-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX942-NEXT:    v_max_f32_e32 v0, 4.0, v0
+; GFX942-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] sc0 sc1
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX942-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX942-NEXT:    v_mov_b32_e32 v1, v0
+; GFX942-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX942-NEXT:    s_cbranch_execnz .LBB4_2
+; GFX942-NEXT:  .LBB4_3:
+; GFX942-NEXT:    s_endpgm
+;
 ; GFX1064-LABEL: global_atomic_fmax_uni_address_uni_value_default_scope_unsafe:
 ; GFX1064:       ; %bb.0:
 ; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -2281,6 +2714,35 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_default_scop
 ; GFX9-DPP-NEXT:  .LBB4_3:
 ; GFX9-DPP-NEXT:    s_endpgm
 ;
+; GFX942-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_default_scope_unsafe:
+; GFX942-DPP:       ; %bb.0:
+; GFX942-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX942-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX942-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX942-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942-DPP-NEXT:    s_cbranch_execz .LBB4_3
+; GFX942-DPP-NEXT:  ; %bb.1:
+; GFX942-DPP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-DPP-NEXT:    s_mov_b64 s[2:3], 0
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-DPP-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX942-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v1, s4
+; GFX942-DPP-NEXT:  .LBB4_2: ; %atomicrmw.start
+; GFX942-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX942-DPP-NEXT:    v_max_f32_e32 v0, 4.0, v0
+; GFX942-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] sc0 sc1
+; GFX942-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX942-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX942-DPP-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX942-DPP-NEXT:    s_cbranch_execnz .LBB4_2
+; GFX942-DPP-NEXT:  .LBB4_3:
+; GFX942-DPP-NEXT:    s_endpgm
+;
 ; GFX1064-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_default_scope_unsafe:
 ; GFX1064-DPP:       ; %bb.0:
 ; GFX1064-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -2496,6 +2958,67 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop
 ; GFX9-NEXT:  .LBB5_5:
 ; GFX9-NEXT:    s_endpgm
 ;
+; GFX942-LABEL: global_atomic_fmax_uni_address_div_value_default_scope_unsafe:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX942-NEXT:    s_mov_b32 s12, s8
+; GFX942-NEXT:    s_add_u32 s8, s34, 44
+; GFX942-NEXT:    s_mov_b32 s13, s9
+; GFX942-NEXT:    s_addc_u32 s9, s35, 0
+; GFX942-NEXT:    s_getpc_b64 s[4:5]
+; GFX942-NEXT:    s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
+; GFX942-NEXT:    s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
+; GFX942-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX942-NEXT:    s_mov_b32 s14, s10
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX942-NEXT:    v_mov_b32_e32 v31, v0
+; GFX942-NEXT:    s_mov_b32 s32, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX942-NEXT:    s_mov_b64 s[0:1], exec
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
+; GFX942-NEXT:  .LBB5_1: ; %ComputeLoop
+; GFX942-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-NEXT:    s_ff1_i32_b64 s2, s[0:1]
+; GFX942-NEXT:    v_readlane_b32 s4, v0, s2
+; GFX942-NEXT:    s_lshl_b64 s[2:3], 1, s2
+; GFX942-NEXT:    v_max_f32_e32 v1, v2, v2
+; GFX942-NEXT:    v_max_f32_e64 v2, s4, s4
+; GFX942-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX942-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX942-NEXT:    v_max_f32_e32 v2, v1, v2
+; GFX942-NEXT:    s_cbranch_scc1 .LBB5_1
+; GFX942-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX942-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX942-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX942-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
+; GFX942-NEXT:    s_cbranch_execz .LBB5_5
+; GFX942-NEXT:  ; %bb.3:
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v3, 0
+; GFX942-NEXT:    s_mov_b64 s[2:3], 0
+; GFX942-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    global_load_dword v1, v3, s[0:1]
+; GFX942-NEXT:  .LBB5_4: ; %atomicrmw.start
+; GFX942-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX942-NEXT:    v_max_f32_e32 v0, v0, v2
+; GFX942-NEXT:    global_atomic_cmpswap v0, v3, v[0:1], s[0:1] sc0 sc1
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX942-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX942-NEXT:    v_mov_b32_e32 v1, v0
+; GFX942-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX942-NEXT:    s_cbranch_execnz .LBB5_4
+; GFX942-NEXT:  .LBB5_5:
+; GFX942-NEXT:    s_endpgm
+;
 ; GFX1064-LABEL: global_atomic_fmax_uni_address_div_value_default_scope_unsafe:
 ; GFX1064:       ; %bb.0:
 ; GFX1064-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
@@ -2854,6 +3377,88 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_default_scop
 ; GFX9-DPP-NEXT:  .LBB5_3:
 ; GFX9-DPP-NEXT:    s_endpgm
 ;
+; GFX942-DPP-LABEL: global_atomic_fmax_uni_address_div_value_default_scope_unsafe:
+; GFX942-DPP:       ; %bb.0:
+; GFX942-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX942-DPP-NEXT:    s_mov_b32 s12, s8
+; GFX942-DPP-NEXT:    s_add_u32 s8, s34, 44
+; GFX942-DPP-NEXT:    s_mov_b32 s13, s9
+; GFX942-DPP-NEXT:    s_addc_u32 s9, s35, 0
+; GFX942-DPP-NEXT:    s_getpc_b64 s[4:5]
+; GFX942-DPP-NEXT:    s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
+; GFX942-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
+; GFX942-DPP-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX942-DPP-NEXT:    s_mov_b32 s14, s10
+; GFX942-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX942-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v31, v0
+; GFX942-DPP-NEXT:    s_mov_b32 s32, 0
+; GFX942-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX942-DPP-NEXT:    v_mbcnt_lo_u32_b32 v4, exec_lo, 0
+; GFX942-DPP-NEXT:    v_mbcnt_hi_u32_b32 v4, exec_hi, v4
+; GFX942-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v1, 0x7fc00000
+; GFX942-DPP-NEXT:    v_cndmask_b32_e64 v2, v1, v0, s[0:1]
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
+; GFX942-DPP-NEXT:    s_nop 1
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v3, v2 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX942-DPP-NEXT:    v_max_f32_e32 v3, v3, v3
+; GFX942-DPP-NEXT:    v_max_f32_e32 v2, v2, v3
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
+; GFX942-DPP-NEXT:    s_nop 1
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v3, v2 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_max_f32_e32 v3, v3, v3
+; GFX942-DPP-NEXT:    v_max_f32_e32 v2, v2, v3
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
+; GFX942-DPP-NEXT:    s_nop 1
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v3, v2 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_max_f32_e32 v3, v3, v3
+; GFX942-DPP-NEXT:    v_max_f32_e32 v2, v2, v3
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
+; GFX942-DPP-NEXT:    s_nop 1
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v3, v2 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_max_f32_e32 v3, v3, v3
+; GFX942-DPP-NEXT:    v_max_f32_e32 v2, v2, v3
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
+; GFX942-DPP-NEXT:    s_nop 1
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v3, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX942-DPP-NEXT:    v_max_f32_e32 v3, v3, v3
+; GFX942-DPP-NEXT:    v_max_f32_e32 v2, v2, v3
+; GFX942-DPP-NEXT:    s_nop 1
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX942-DPP-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX942-DPP-NEXT:    v_max_f32_e32 v1, v2, v1
+; GFX942-DPP-NEXT:    s_nop 0
+; GFX942-DPP-NEXT:    v_readlane_b32 s4, v1, 63
+; GFX942-DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX942-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
+; GFX942-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942-DPP-NEXT:    s_cbranch_execz .LBB5_3
+; GFX942-DPP-NEXT:  ; %bb.1:
+; GFX942-DPP-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-DPP-NEXT:    s_mov_b64 s[2:3], 0
+; GFX942-DPP-NEXT:    v_max_f32_e64 v6, s4, s4
+; GFX942-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-DPP-NEXT:    global_load_dword v5, v0, s[0:1]
+; GFX942-DPP-NEXT:  .LBB5_2: ; %atomicrmw.start
+; GFX942-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-DPP-NEXT:    v_max_f32_e32 v4, v5, v5
+; GFX942-DPP-NEXT:    v_max_f32_e32 v4, v4, v6
+; GFX942-DPP-NEXT:    global_atomic_cmpswap v4, v0, v[4:5], s[0:1] sc0 sc1
+; GFX942-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v5
+; GFX942-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v5, v4
+; GFX942-DPP-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX942-DPP-NEXT:    s_cbranch_execnz .LBB5_2
+; GFX942-DPP-NEXT:  .LBB5_3:
+; GFX942-DPP-NEXT:    s_endpgm
+;
 ; GFX1064-DPP-LABEL: global_atomic_fmax_uni_address_div_value_default_scope_unsafe:
 ; GFX1064-DPP:       ; %bb.0:
 ; GFX1064-DPP-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
@@ -3200,6 +3805,22 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent
 ; GFX9-NEXT:  .LBB6_3:
 ; GFX9-NEXT:    s_endpgm
 ;
+; GFX942-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX942-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX942-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942-NEXT:    s_cbranch_execz .LBB6_2
+; GFX942-NEXT:  ; %bb.1:
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], 4.0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    global_atomic_max_f64 v2, v[0:1], s[0:1]
+; GFX942-NEXT:  .LBB6_2:
+; GFX942-NEXT:    s_endpgm
+;
 ; GFX1064-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe:
 ; GFX1064:       ; %bb.0:
 ; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -3366,6 +3987,22 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent
 ; GFX9-DPP-NEXT:  .LBB6_3:
 ; GFX9-DPP-NEXT:    s_endpgm
 ;
+; GFX942-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe:
+; GFX942-DPP:       ; %bb.0:
+; GFX942-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX942-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX942-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX942-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942-DPP-NEXT:    s_cbranch_execz .LBB6_2
+; GFX942-DPP-NEXT:  ; %bb.1:
+; GFX942-DPP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-DPP-NEXT:    v_mov_b64_e32 v[0:1], 4.0
+; GFX942-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-DPP-NEXT:    global_atomic_max_f64 v2, v[0:1], s[0:1]
+; GFX942-DPP-NEXT:  .LBB6_2:
+; GFX942-DPP-NEXT:    s_endpgm
+;
 ; GFX1064-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe:
 ; GFX1064-DPP:       ; %bb.0:
 ; GFX1064-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -3624,6 +4261,55 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
 ; GFX9-NEXT:  .LBB7_5:
 ; GFX9-NEXT:    s_endpgm
 ;
+; GFX942-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX942-NEXT:    s_mov_b32 s12, s8
+; GFX942-NEXT:    s_add_u32 s8, s34, 44
+; GFX942-NEXT:    s_mov_b32 s13, s9
+; GFX942-NEXT:    s_addc_u32 s9, s35, 0
+; GFX942-NEXT:    s_getpc_b64 s[4:5]
+; GFX942-NEXT:    s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
+; GFX942-NEXT:    s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
+; GFX942-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX942-NEXT:    s_mov_b32 s14, s10
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX942-NEXT:    v_mov_b32_e32 v31, v0
+; GFX942-NEXT:    s_mov_b32 s32, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-NEXT:    s_mov_b64 s[0:1], exec
+; GFX942-NEXT:    v_mov_b32_e32 v3, 0x7ff80000
+; GFX942-NEXT:  .LBB7_1: ; %ComputeLoop
+; GFX942-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-NEXT:    s_ff1_i32_b64 s4, s[0:1]
+; GFX942-NEXT:    v_readlane_b32 s3, v1, s4
+; GFX942-NEXT:    v_readlane_b32 s2, v0, s4
+; GFX942-NEXT:    s_lshl_b64 s[4:5], 1, s4
+; GFX942-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX942-NEXT:    v_max_f64 v[4:5], s[2:3], s[2:3]
+; GFX942-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[4:5]
+; GFX942-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX942-NEXT:    v_max_f64 v[2:3], v[2:3], v[4:5]
+; GFX942-NEXT:    s_cbranch_scc1 .LBB7_1
+; GFX942-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX942-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX942-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX942-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
+; GFX942-NEXT:    s_cbranch_execz .LBB7_4
+; GFX942-NEXT:  ; %bb.3:
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    global_atomic_max_f64 v0, v[2:3], s[0:1]
+; GFX942-NEXT:  .LBB7_4:
+; GFX942-NEXT:    s_endpgm
+;
 ; GFX1064-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe:
 ; GFX1064:       ; %bb.0:
 ; GFX1064-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
@@ -4043,6 +4729,89 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent
 ; GFX9-DPP-NEXT:  .LBB7_3:
 ; GFX9-DPP-NEXT:    s_endpgm
 ;
+; GFX942-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe:
+; GFX942-DPP:       ; %bb.0:
+; GFX942-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX942-DPP-NEXT:    s_mov_b32 s12, s8
+; GFX942-DPP-NEXT:    s_add_u32 s8, s34, 44
+; GFX942-DPP-NEXT:    s_mov_b32 s13, s9
+; GFX942-DPP-NEXT:    s_addc_u32 s9, s35, 0
+; GFX942-DPP-NEXT:    s_getpc_b64 s[4:5]
+; GFX942-DPP-NEXT:    s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
+; GFX942-DPP-NEXT:    s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
+; GFX942-DPP-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX942-DPP-NEXT:    s_mov_b32 s14, s10
+; GFX942-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX942-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v31, v0
+; GFX942-DPP-NEXT:    s_mov_b32 s32, 0
+; GFX942-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v8, 0
+; GFX942-DPP-NEXT:    v_mbcnt_lo_u32_b32 v9, exec_lo, 0
+; GFX942-DPP-NEXT:    v_mbcnt_hi_u32_b32 v9, exec_hi, v9
+; GFX942-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v3, 0x7ff80000
+; GFX942-DPP-NEXT:    v_cndmask_b32_e64 v5, v3, v1, s[0:1]
+; GFX942-DPP-NEXT:    v_cndmask_b32_e64 v4, 0, v0, s[0:1]
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v7, 0x7ff80000
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX942-DPP-NEXT:    v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX942-DPP-NEXT:    v_max_f64 v[4:5], v[4:5], v[6:7]
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v7, 0x7ff80000
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942-DPP-NEXT:    s_nop 0
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX942-DPP-NEXT:    v_max_f64 v[4:5], v[4:5], v[6:7]
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v7, 0x7ff80000
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942-DPP-NEXT:    s_nop 0
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX942-DPP-NEXT:    v_max_f64 v[4:5], v[4:5], v[6:7]
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v7, 0x7ff80000
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942-DPP-NEXT:    s_nop 0
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX942-DPP-NEXT:    v_max_f64 v[4:5], v[4:5], v[6:7]
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v7, 0x7ff80000
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942-DPP-NEXT:    s_nop 0
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v6, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX942-DPP-NEXT:    v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX942-DPP-NEXT:    v_max_f64 v[4:5], v[4:5], v[6:7]
+; GFX942-DPP-NEXT:    s_nop 1
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v3, v5 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v2, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX942-DPP-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX942-DPP-NEXT:    v_max_f64 v[2:3], v[4:5], v[2:3]
+; GFX942-DPP-NEXT:    s_nop 0
+; GFX942-DPP-NEXT:    v_readlane_b32 s3, v3, 63
+; GFX942-DPP-NEXT:    v_readlane_b32 s2, v2, 63
+; GFX942-DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX942-DPP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX942-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v9
+; GFX942-DPP-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX942-DPP-NEXT:    s_cbranch_execz .LBB7_2
+; GFX942-DPP-NEXT:  ; %bb.1:
+; GFX942-DPP-NEXT:    s_load_dwordx2 s[2:3], s[34:35], 0x24
+; GFX942-DPP-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-DPP-NEXT:    global_atomic_max_f64 v8, v[0:1], s[2:3]
+; GFX942-DPP-NEXT:  .LBB7_2:
+; GFX942-DPP-NEXT:    s_endpgm
+;
 ; GFX1064-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe:
 ; GFX1064-DPP:       ; %bb.0:
 ; GFX1064-DPP-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
@@ -4468,6 +5237,22 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a
 ; GFX9-NEXT:  .LBB8_3:
 ; GFX9-NEXT:    s_endpgm
 ;
+; GFX942-LABEL: global_atomic_fmax_double_uni_address_uni_value_one_as_scope_unsafe:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX942-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX942-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942-NEXT:    s_cbranch_execz .LBB8_2
+; GFX942-NEXT:  ; %bb.1:
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], 4.0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    global_atomic_max_f64 v2, v[0:1], s[0:1] sc1
+; GFX942-NEXT:  .LBB8_2:
+; GFX942-NEXT:    s_endpgm
+;
 ; GFX1064-LABEL: global_atomic_fmax_double_uni_address_uni_value_one_as_scope_unsafe:
 ; GFX1064:       ; %bb.0:
 ; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -4634,6 +5419,22 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_a
 ; GFX9-DPP-NEXT:  .LBB8_3:
 ; GFX9-DPP-NEXT:    s_endpgm
 ;
+; GFX942-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_one_as_scope_unsafe:
+; GFX942-DPP:       ; %bb.0:
+; GFX942-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX942-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX942-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX942-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942-DPP-NEXT:    s_cbranch_execz .LBB8_2
+; GFX942-DPP-NEXT:  ; %bb.1:
+; GFX942-DPP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-DPP-NEXT:    v_mov_b64_e32 v[0:1], 4.0
+; GFX942-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-DPP-NEXT:    global_atomic_max_f64 v2, v[0:1], s[0:1] sc1
+; GFX942-DPP-NEXT:  .LBB8_2:
+; GFX942-DPP-NEXT:    s_endpgm
+;
 ; GFX1064-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_one_as_scope_unsafe:
 ; GFX1064-DPP:       ; %bb.0:
 ; GFX1064-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -4892,6 +5693,55 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a
 ; GFX9-NEXT:  .LBB9_5:
 ; GFX9-NEXT:    s_endpgm
 ;
+; GFX942-LABEL: global_atomic_fmax_double_uni_address_div_value_one_as_scope_unsafe:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX942-NEXT:    s_mov_b32 s12, s8
+; GFX942-NEXT:    s_add_u32 s8, s34, 44
+; GFX942-NEXT:    s_mov_b32 s13, s9
+; GFX942-NEXT:    s_addc_u32 s9, s35, 0
+; GFX942-NEXT:    s_getpc_b64 s[4:5]
+; GFX942-NEXT:    s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
+; GFX942-NEXT:    s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
+; GFX942-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX942-NEXT:    s_mov_b32 s14, s10
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX942-NEXT:    v_mov_b32_e32 v31, v0
+; GFX942-NEXT:    s_mov_b32 s32, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-NEXT:    s_mov_b64 s[0:1], exec
+; GFX942-NEXT:    v_mov_b32_e32 v3, 0x7ff80000
+; GFX942-NEXT:  .LBB9_1: ; %ComputeLoop
+; GFX942-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-NEXT:    s_ff1_i32_b64 s4, s[0:1]
+; GFX942-NEXT:    v_readlane_b32 s3, v1, s4
+; GFX942-NEXT:    v_readlane_b32 s2, v0, s4
+; GFX942-NEXT:    s_lshl_b64 s[4:5], 1, s4
+; GFX942-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX942-NEXT:    v_max_f64 v[4:5], s[2:3], s[2:3]
+; GFX942-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[4:5]
+; GFX942-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX942-NEXT:    v_max_f64 v[2:3], v[2:3], v[4:5]
+; GFX942-NEXT:    s_cbranch_scc1 .LBB9_1
+; GFX942-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX942-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX942-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX942-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
+; GFX942-NEXT:    s_cbranch_execz .LBB9_4
+; GFX942-NEXT:  ; %bb.3:
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    global_atomic_max_f64 v0, v[2:3], s[0:1] sc1
+; GFX942-NEXT:  .LBB9_4:
+; GFX942-NEXT:    s_endpgm
+;
 ; GFX1064-LABEL: global_atomic_fmax_double_uni_address_div_value_one_as_scope_unsafe:
 ; GFX1064:       ; %bb.0:
 ; GFX1064-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
@@ -5311,6 +6161,89 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_a
 ; GFX9-DPP-NEXT:  .LBB9_3:
 ; GFX9-DPP-NEXT:    s_endpgm
 ;
+; GFX942-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_one_as_scope_unsafe:
+; GFX942-DPP:       ; %bb.0:
+; GFX942-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX942-DPP-NEXT:    s_mov_b32 s12, s8
+; GFX942-DPP-NEXT:    s_add_u32 s8, s34, 44
+; GFX942-DPP-NEXT:    s_mov_b32 s13, s9
+; GFX942-DPP-NEXT:    s_addc_u32 s9, s35, 0
+; GFX942-DPP-NEXT:    s_getpc_b64 s[4:5]
+; GFX942-DPP-NEXT:    s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
+; GFX942-DPP-NEXT:    s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
+; GFX942-DPP-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX942-DPP-NEXT:    s_mov_b32 s14, s10
+; GFX942-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX942-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v31, v0
+; GFX942-DPP-NEXT:    s_mov_b32 s32, 0
+; GFX942-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v8, 0
+; GFX942-DPP-NEXT:    v_mbcnt_lo_u32_b32 v9, exec_lo, 0
+; GFX942-DPP-NEXT:    v_mbcnt_hi_u32_b32 v9, exec_hi, v9
+; GFX942-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v3, 0x7ff80000
+; GFX942-DPP-NEXT:    v_cndmask_b32_e64 v5, v3, v1, s[0:1]
+; GFX942-DPP-NEXT:    v_cndmask_b32_e64 v4, 0, v0, s[0:1]
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v7, 0x7ff80000
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX942-DPP-NEXT:    v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX942-DPP-NEXT:    v_max_f64 v[4:5], v[4:5], v[6:7]
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v7, 0x7ff80000
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942-DPP-NEXT:    s_nop 0
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX942-DPP-NEXT:    v_max_f64 v[4:5], v[4:5], v[6:7]
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v7, 0x7ff80000
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942-DPP-NEXT:    s_nop 0
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX942-DPP-NEXT:    v_max_f64 v[4:5], v[4:5], v[6:7]
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v7, 0x7ff80000
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942-DPP-NEXT:    s_nop 0
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX942-DPP-NEXT:    v_max_f64 v[4:5], v[4:5], v[6:7]
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v7, 0x7ff80000
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942-DPP-NEXT:    s_nop 0
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v6, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX942-DPP-NEXT:    v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX942-DPP-NEXT:    v_max_f64 v[4:5], v[4:5], v[6:7]
+; GFX942-DPP-NEXT:    s_nop 1
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v3, v5 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v2, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX942-DPP-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX942-DPP-NEXT:    v_max_f64 v[2:3], v[4:5], v[2:3]
+; GFX942-DPP-NEXT:    s_nop 0
+; GFX942-DPP-NEXT:    v_readlane_b32 s3, v3, 63
+; GFX942-DPP-NEXT:    v_readlane_b32 s2, v2, 63
+; GFX942-DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX942-DPP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX942-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v9
+; GFX942-DPP-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX942-DPP-NEXT:    s_cbranch_execz .LBB9_2
+; GFX942-DPP-NEXT:  ; %bb.1:
+; GFX942-DPP-NEXT:    s_load_dwordx2 s[2:3], s[34:35], 0x24
+; GFX942-DPP-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-DPP-NEXT:    global_atomic_max_f64 v8, v[0:1], s[2:3] sc1
+; GFX942-DPP-NEXT:  .LBB9_2:
+; GFX942-DPP-NEXT:    s_endpgm
+;
 ; GFX1064-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_one_as_scope_unsafe:
 ; GFX1064-DPP:       ; %bb.0:
 ; GFX1064-DPP-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
@@ -5736,6 +6669,22 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau
 ; GFX9-NEXT:  .LBB10_3:
 ; GFX9-NEXT:    s_endpgm
 ;
+; GFX942-LABEL: global_atomic_fmax_double_uni_address_uni_value_default_scope_unsafe:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX942-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX942-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942-NEXT:    s_cbranch_execz .LBB10_2
+; GFX942-NEXT:  ; %bb.1:
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], 4.0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    global_atomic_max_f64 v2, v[0:1], s[0:1] sc1
+; GFX942-NEXT:  .LBB10_2:
+; GFX942-NEXT:    s_endpgm
+;
 ; GFX1064-LABEL: global_atomic_fmax_double_uni_address_uni_value_default_scope_unsafe:
 ; GFX1064:       ; %bb.0:
 ; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -5902,6 +6851,22 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defau
 ; GFX9-DPP-NEXT:  .LBB10_3:
 ; GFX9-DPP-NEXT:    s_endpgm
 ;
+; GFX942-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_default_scope_unsafe:
+; GFX942-DPP:       ; %bb.0:
+; GFX942-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX942-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX942-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX942-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942-DPP-NEXT:    s_cbranch_execz .LBB10_2
+; GFX942-DPP-NEXT:  ; %bb.1:
+; GFX942-DPP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-DPP-NEXT:    v_mov_b64_e32 v[0:1], 4.0
+; GFX942-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-DPP-NEXT:    global_atomic_max_f64 v2, v[0:1], s[0:1] sc1
+; GFX942-DPP-NEXT:  .LBB10_2:
+; GFX942-DPP-NEXT:    s_endpgm
+;
 ; GFX1064-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_default_scope_unsafe:
 ; GFX1064-DPP:       ; %bb.0:
 ; GFX1064-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -6160,6 +7125,55 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
 ; GFX9-NEXT:  .LBB11_5:
 ; GFX9-NEXT:    s_endpgm
 ;
+; GFX942-LABEL: global_atomic_fmax_double_uni_address_div_value_default_scope_unsafe:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX942-NEXT:    s_mov_b32 s12, s8
+; GFX942-NEXT:    s_add_u32 s8, s34, 44
+; GFX942-NEXT:    s_mov_b32 s13, s9
+; GFX942-NEXT:    s_addc_u32 s9, s35, 0
+; GFX942-NEXT:    s_getpc_b64 s[4:5]
+; GFX942-NEXT:    s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
+; GFX942-NEXT:    s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
+; GFX942-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX942-NEXT:    s_mov_b32 s14, s10
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX942-NEXT:    v_mov_b32_e32 v31, v0
+; GFX942-NEXT:    s_mov_b32 s32, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-NEXT:    s_mov_b64 s[0:1], exec
+; GFX942-NEXT:    v_mov_b32_e32 v3, 0x7ff80000
+; GFX942-NEXT:  .LBB11_1: ; %ComputeLoop
+; GFX942-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-NEXT:    s_ff1_i32_b64 s4, s[0:1]
+; GFX942-NEXT:    v_readlane_b32 s3, v1, s4
+; GFX942-NEXT:    v_readlane_b32 s2, v0, s4
+; GFX942-NEXT:    s_lshl_b64 s[4:5], 1, s4
+; GFX942-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX942-NEXT:    v_max_f64 v[4:5], s[2:3], s[2:3]
+; GFX942-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[4:5]
+; GFX942-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX942-NEXT:    v_max_f64 v[2:3], v[2:3], v[4:5]
+; GFX942-NEXT:    s_cbranch_scc1 .LBB11_1
+; GFX942-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX942-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX942-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX942-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
+; GFX942-NEXT:    s_cbranch_execz .LBB11_4
+; GFX942-NEXT:  ; %bb.3:
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    global_atomic_max_f64 v0, v[2:3], s[0:1] sc1
+; GFX942-NEXT:  .LBB11_4:
+; GFX942-NEXT:    s_endpgm
+;
 ; GFX1064-LABEL: global_atomic_fmax_double_uni_address_div_value_default_scope_unsafe:
 ; GFX1064:       ; %bb.0:
 ; GFX1064-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
@@ -6579,6 +7593,89 @@ define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defau
 ; GFX9-DPP-NEXT:  .LBB11_3:
 ; GFX9-DPP-NEXT:    s_endpgm
 ;
+; GFX942-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_default_scope_unsafe:
+; GFX942-DPP:       ; %bb.0:
+; GFX942-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX942-DPP-NEXT:    s_mov_b32 s12, s8
+; GFX942-DPP-NEXT:    s_add_u32 s8, s34, 44
+; GFX942-DPP-NEXT:    s_mov_b32 s13, s9
+; GFX942-DPP-NEXT:    s_addc_u32 s9, s35, 0
+; GFX942-DPP-NEXT:    s_getpc_b64 s[4:5]
+; GFX942-DPP-NEXT:    s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
+; GFX942-DPP-NEXT:    s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
+; GFX942-DPP-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX942-DPP-NEXT:    s_mov_b32 s14, s10
+; GFX942-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX942-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v31, v0
+; GFX942-DPP-NEXT:    s_mov_b32 s32, 0
+; GFX942-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v8, 0
+; GFX942-DPP-NEXT:    v_mbcnt_lo_u32_b32 v9, exec_lo, 0
+; GFX942-DPP-NEXT:    v_mbcnt_hi_u32_b32 v9, exec_hi, v9
+; GFX942-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v3, 0x7ff80000
+; GFX942-DPP-NEXT:    v_cndmask_b32_e64 v5, v3, v1, s[0:1]
+; GFX942-DPP-NEXT:    v_cndmask_b32_e64 v4, 0, v0, s[0:1]
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v7, 0x7ff80000
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX942-DPP-NEXT:    v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX942-DPP-NEXT:    v_max_f64 v[4:5], v[4:5], v[6:7]
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v7, 0x7ff80000
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942-DPP-NEXT:    s_nop 0
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX942-DPP-NEXT:    v_max_f64 v[4:5], v[4:5], v[6:7]
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v7, 0x7ff80000
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942-DPP-NEXT:    s_nop 0
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX942-DPP-NEXT:    v_max_f64 v[4:5], v[4:5], v[6:7]
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v7, 0x7ff80000
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942-DPP-NEXT:    s_nop 0
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX942-DPP-NEXT:    v_max_f64 v[4:5], v[4:5], v[6:7]
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v7, 0x7ff80000
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942-DPP-NEXT:    s_nop 0
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v6, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX942-DPP-NEXT:    v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX942-DPP-NEXT:    v_max_f64 v[4:5], v[4:5], v[6:7]
+; GFX942-DPP-NEXT:    s_nop 1
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v3, v5 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v2, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX942-DPP-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX942-DPP-NEXT:    v_max_f64 v[2:3], v[4:5], v[2:3]
+; GFX942-DPP-NEXT:    s_nop 0
+; GFX942-DPP-NEXT:    v_readlane_b32 s3, v3, 63
+; GFX942-DPP-NEXT:    v_readlane_b32 s2, v2, 63
+; GFX942-DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX942-DPP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX942-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v9
+; GFX942-DPP-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX942-DPP-NEXT:    s_cbranch_execz .LBB11_2
+; GFX942-DPP-NEXT:  ; %bb.1:
+; GFX942-DPP-NEXT:    s_load_dwordx2 s[2:3], s[34:35], 0x24
+; GFX942-DPP-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-DPP-NEXT:    global_atomic_max_f64 v8, v[0:1], s[2:3] sc1
+; GFX942-DPP-NEXT:  .LBB11_2:
+; GFX942-DPP-NEXT:    s_endpgm
+;
 ; GFX1064-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_default_scope_unsafe:
 ; GFX1064-DPP:       ; %bb.0:
 ; GFX1064-DPP-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
@@ -6998,6 +8095,35 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope
 ; GFX9-NEXT:  .LBB12_3:
 ; GFX9-NEXT:    s_endpgm
 ;
+; GFX942-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX942-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX942-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942-NEXT:    s_cbranch_execz .LBB12_3
+; GFX942-NEXT:  ; %bb.1:
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    s_mov_b64 s[2:3], 0
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b32_e32 v1, s4
+; GFX942-NEXT:  .LBB12_2: ; %atomicrmw.start
+; GFX942-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX942-NEXT:    v_max_f32_e32 v0, 4.0, v0
+; GFX942-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] sc0 sc1
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX942-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX942-NEXT:    v_mov_b32_e32 v1, v0
+; GFX942-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX942-NEXT:    s_cbranch_execnz .LBB12_2
+; GFX942-NEXT:  .LBB12_3:
+; GFX942-NEXT:    s_endpgm
+;
 ; GFX1064-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX1064:       ; %bb.0:
 ; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -7123,6 +8249,35 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope
 ; GFX9-DPP-NEXT:  .LBB12_3:
 ; GFX9-DPP-NEXT:    s_endpgm
 ;
+; GFX942-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
+; GFX942-DPP:       ; %bb.0:
+; GFX942-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX942-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX942-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX942-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942-DPP-NEXT:    s_cbranch_execz .LBB12_3
+; GFX942-DPP-NEXT:  ; %bb.1:
+; GFX942-DPP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-DPP-NEXT:    s_mov_b64 s[2:3], 0
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-DPP-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX942-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v1, s4
+; GFX942-DPP-NEXT:  .LBB12_2: ; %atomicrmw.start
+; GFX942-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX942-DPP-NEXT:    v_max_f32_e32 v0, 4.0, v0
+; GFX942-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] sc0 sc1
+; GFX942-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX942-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX942-DPP-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX942-DPP-NEXT:    s_cbranch_execnz .LBB12_2
+; GFX942-DPP-NEXT:  .LBB12_3:
+; GFX942-DPP-NEXT:    s_endpgm
+;
 ; GFX1064-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX1064-DPP:       ; %bb.0:
 ; GFX1064-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -7252,6 +8407,35 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope
 ; GFX9-NEXT:  .LBB13_3:
 ; GFX9-NEXT:    s_endpgm
 ;
+; GFX942-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX942-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX942-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942-NEXT:    s_cbranch_execz .LBB13_3
+; GFX942-NEXT:  ; %bb.1:
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    s_mov_b64 s[2:3], 0
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b32_e32 v1, s4
+; GFX942-NEXT:  .LBB13_2: ; %atomicrmw.start
+; GFX942-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX942-NEXT:    v_max_f32_e32 v0, 4.0, v0
+; GFX942-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] sc0 sc1
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX942-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX942-NEXT:    v_mov_b32_e32 v1, v0
+; GFX942-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX942-NEXT:    s_cbranch_execnz .LBB13_2
+; GFX942-NEXT:  .LBB13_3:
+; GFX942-NEXT:    s_endpgm
+;
 ; GFX1064-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX1064:       ; %bb.0:
 ; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -7377,6 +8561,35 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_system_scope
 ; GFX9-DPP-NEXT:  .LBB13_3:
 ; GFX9-DPP-NEXT:    s_endpgm
 ;
+; GFX942-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
+; GFX942-DPP:       ; %bb.0:
+; GFX942-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX942-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX942-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX942-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942-DPP-NEXT:    s_cbranch_execz .LBB13_3
+; GFX942-DPP-NEXT:  ; %bb.1:
+; GFX942-DPP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-DPP-NEXT:    s_mov_b64 s[2:3], 0
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-DPP-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX942-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v1, s4
+; GFX942-DPP-NEXT:  .LBB13_2: ; %atomicrmw.start
+; GFX942-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX942-DPP-NEXT:    v_max_f32_e32 v0, 4.0, v0
+; GFX942-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] sc0 sc1
+; GFX942-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX942-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX942-DPP-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX942-DPP-NEXT:    s_cbranch_execnz .LBB13_2
+; GFX942-DPP-NEXT:  .LBB13_3:
+; GFX942-DPP-NEXT:    s_endpgm
+;
 ; GFX1064-DPP-LABEL: global_atomic_fmax_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX1064-DPP:       ; %bb.0:
 ; GFX1064-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
index a9ac00863cd17..04e015642982a 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
@@ -1,12 +1,14 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefix=GFX7LESS %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX942 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132 %s
 ; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefix=GFX7LESS-DPP %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX9-DPP %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX942-DPP %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064-DPP %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032-DPP %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164-DPP %s
@@ -78,6 +80,35 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_agent_scope_
 ; GFX9-NEXT:  .LBB0_3:
 ; GFX9-NEXT:    s_endpgm
 ;
+; GFX942-LABEL: global_atomic_fmin_uni_address_uni_value_agent_scope_unsafe:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX942-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX942-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942-NEXT:    s_cbranch_execz .LBB0_3
+; GFX942-NEXT:  ; %bb.1:
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    s_mov_b64 s[2:3], 0
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b32_e32 v1, s4
+; GFX942-NEXT:  .LBB0_2: ; %atomicrmw.start
+; GFX942-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX942-NEXT:    v_min_f32_e32 v0, 4.0, v0
+; GFX942-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] sc0
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX942-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX942-NEXT:    v_mov_b32_e32 v1, v0
+; GFX942-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX942-NEXT:    s_cbranch_execnz .LBB0_2
+; GFX942-NEXT:  .LBB0_3:
+; GFX942-NEXT:    s_endpgm
+;
 ; GFX1064-LABEL: global_atomic_fmin_uni_address_uni_value_agent_scope_unsafe:
 ; GFX1064:       ; %bb.0:
 ; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -203,6 +234,35 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_agent_scope_
 ; GFX9-DPP-NEXT:  .LBB0_3:
 ; GFX9-DPP-NEXT:    s_endpgm
 ;
+; GFX942-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_agent_scope_unsafe:
+; GFX942-DPP:       ; %bb.0:
+; GFX942-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX942-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX942-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX942-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942-DPP-NEXT:    s_cbranch_execz .LBB0_3
+; GFX942-DPP-NEXT:  ; %bb.1:
+; GFX942-DPP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-DPP-NEXT:    s_mov_b64 s[2:3], 0
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-DPP-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX942-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v1, s4
+; GFX942-DPP-NEXT:  .LBB0_2: ; %atomicrmw.start
+; GFX942-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX942-DPP-NEXT:    v_min_f32_e32 v0, 4.0, v0
+; GFX942-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] sc0
+; GFX942-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX942-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX942-DPP-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX942-DPP-NEXT:    s_cbranch_execnz .LBB0_2
+; GFX942-DPP-NEXT:  .LBB0_3:
+; GFX942-DPP-NEXT:    s_endpgm
+;
 ; GFX1064-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_agent_scope_unsafe:
 ; GFX1064-DPP:       ; %bb.0:
 ; GFX1064-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -418,6 +478,67 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
 ; GFX9-NEXT:  .LBB1_5:
 ; GFX9-NEXT:    s_endpgm
 ;
+; GFX942-LABEL: global_atomic_fmin_uni_address_div_value_agent_scope_unsafe:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX942-NEXT:    s_mov_b32 s12, s8
+; GFX942-NEXT:    s_add_u32 s8, s34, 44
+; GFX942-NEXT:    s_mov_b32 s13, s9
+; GFX942-NEXT:    s_addc_u32 s9, s35, 0
+; GFX942-NEXT:    s_getpc_b64 s[4:5]
+; GFX942-NEXT:    s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
+; GFX942-NEXT:    s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
+; GFX942-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX942-NEXT:    s_mov_b32 s14, s10
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX942-NEXT:    v_mov_b32_e32 v31, v0
+; GFX942-NEXT:    s_mov_b32 s32, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX942-NEXT:    s_mov_b64 s[0:1], exec
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
+; GFX942-NEXT:  .LBB1_1: ; %ComputeLoop
+; GFX942-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-NEXT:    s_ff1_i32_b64 s2, s[0:1]
+; GFX942-NEXT:    v_readlane_b32 s4, v0, s2
+; GFX942-NEXT:    s_lshl_b64 s[2:3], 1, s2
+; GFX942-NEXT:    v_max_f32_e32 v1, v2, v2
+; GFX942-NEXT:    v_max_f32_e64 v2, s4, s4
+; GFX942-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX942-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX942-NEXT:    v_min_f32_e32 v2, v1, v2
+; GFX942-NEXT:    s_cbranch_scc1 .LBB1_1
+; GFX942-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX942-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX942-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX942-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
+; GFX942-NEXT:    s_cbranch_execz .LBB1_5
+; GFX942-NEXT:  ; %bb.3:
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v3, 0
+; GFX942-NEXT:    s_mov_b64 s[2:3], 0
+; GFX942-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    global_load_dword v1, v3, s[0:1]
+; GFX942-NEXT:  .LBB1_4: ; %atomicrmw.start
+; GFX942-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX942-NEXT:    v_min_f32_e32 v0, v0, v2
+; GFX942-NEXT:    global_atomic_cmpswap v0, v3, v[0:1], s[0:1] sc0
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX942-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX942-NEXT:    v_mov_b32_e32 v1, v0
+; GFX942-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX942-NEXT:    s_cbranch_execnz .LBB1_4
+; GFX942-NEXT:  .LBB1_5:
+; GFX942-NEXT:    s_endpgm
+;
 ; GFX1064-LABEL: global_atomic_fmin_uni_address_div_value_agent_scope_unsafe:
 ; GFX1064:       ; %bb.0:
 ; GFX1064-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
@@ -776,6 +897,88 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_agent_scope_
 ; GFX9-DPP-NEXT:  .LBB1_3:
 ; GFX9-DPP-NEXT:    s_endpgm
 ;
+; GFX942-DPP-LABEL: global_atomic_fmin_uni_address_div_value_agent_scope_unsafe:
+; GFX942-DPP:       ; %bb.0:
+; GFX942-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX942-DPP-NEXT:    s_mov_b32 s12, s8
+; GFX942-DPP-NEXT:    s_add_u32 s8, s34, 44
+; GFX942-DPP-NEXT:    s_mov_b32 s13, s9
+; GFX942-DPP-NEXT:    s_addc_u32 s9, s35, 0
+; GFX942-DPP-NEXT:    s_getpc_b64 s[4:5]
+; GFX942-DPP-NEXT:    s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
+; GFX942-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
+; GFX942-DPP-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX942-DPP-NEXT:    s_mov_b32 s14, s10
+; GFX942-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX942-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v31, v0
+; GFX942-DPP-NEXT:    s_mov_b32 s32, 0
+; GFX942-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX942-DPP-NEXT:    v_mbcnt_lo_u32_b32 v4, exec_lo, 0
+; GFX942-DPP-NEXT:    v_mbcnt_hi_u32_b32 v4, exec_hi, v4
+; GFX942-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v1, 0x7fc00000
+; GFX942-DPP-NEXT:    v_cndmask_b32_e64 v2, v1, v0, s[0:1]
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
+; GFX942-DPP-NEXT:    s_nop 1
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v3, v2 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX942-DPP-NEXT:    v_max_f32_e32 v3, v3, v3
+; GFX942-DPP-NEXT:    v_min_f32_e32 v2, v2, v3
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
+; GFX942-DPP-NEXT:    s_nop 1
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v3, v2 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_max_f32_e32 v3, v3, v3
+; GFX942-DPP-NEXT:    v_min_f32_e32 v2, v2, v3
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
+; GFX942-DPP-NEXT:    s_nop 1
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v3, v2 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_max_f32_e32 v3, v3, v3
+; GFX942-DPP-NEXT:    v_min_f32_e32 v2, v2, v3
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
+; GFX942-DPP-NEXT:    s_nop 1
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v3, v2 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_max_f32_e32 v3, v3, v3
+; GFX942-DPP-NEXT:    v_min_f32_e32 v2, v2, v3
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
+; GFX942-DPP-NEXT:    s_nop 1
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v3, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX942-DPP-NEXT:    v_max_f32_e32 v3, v3, v3
+; GFX942-DPP-NEXT:    v_min_f32_e32 v2, v2, v3
+; GFX942-DPP-NEXT:    s_nop 1
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX942-DPP-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX942-DPP-NEXT:    v_min_f32_e32 v1, v2, v1
+; GFX942-DPP-NEXT:    s_nop 0
+; GFX942-DPP-NEXT:    v_readlane_b32 s4, v1, 63
+; GFX942-DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX942-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
+; GFX942-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942-DPP-NEXT:    s_cbranch_execz .LBB1_3
+; GFX942-DPP-NEXT:  ; %bb.1:
+; GFX942-DPP-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-DPP-NEXT:    s_mov_b64 s[2:3], 0
+; GFX942-DPP-NEXT:    v_max_f32_e64 v6, s4, s4
+; GFX942-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-DPP-NEXT:    global_load_dword v5, v0, s[0:1]
+; GFX942-DPP-NEXT:  .LBB1_2: ; %atomicrmw.start
+; GFX942-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-DPP-NEXT:    v_max_f32_e32 v4, v5, v5
+; GFX942-DPP-NEXT:    v_min_f32_e32 v4, v4, v6
+; GFX942-DPP-NEXT:    global_atomic_cmpswap v4, v0, v[4:5], s[0:1] sc0
+; GFX942-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v5
+; GFX942-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v5, v4
+; GFX942-DPP-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX942-DPP-NEXT:    s_cbranch_execnz .LBB1_2
+; GFX942-DPP-NEXT:  .LBB1_3:
+; GFX942-DPP-NEXT:    s_endpgm
+;
 ; GFX1064-DPP-LABEL: global_atomic_fmin_uni_address_div_value_agent_scope_unsafe:
 ; GFX1064-DPP:       ; %bb.0:
 ; GFX1064-DPP-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
@@ -1116,6 +1319,35 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope
 ; GFX9-NEXT:  .LBB2_3:
 ; GFX9-NEXT:    s_endpgm
 ;
+; GFX942-LABEL: global_atomic_fmin_uni_address_uni_value_one_as_scope_unsafe:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX942-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX942-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942-NEXT:    s_cbranch_execz .LBB2_3
+; GFX942-NEXT:  ; %bb.1:
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    s_mov_b64 s[2:3], 0
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b32_e32 v1, s4
+; GFX942-NEXT:  .LBB2_2: ; %atomicrmw.start
+; GFX942-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX942-NEXT:    v_min_f32_e32 v0, 4.0, v0
+; GFX942-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] sc0 sc1
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX942-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX942-NEXT:    v_mov_b32_e32 v1, v0
+; GFX942-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX942-NEXT:    s_cbranch_execnz .LBB2_2
+; GFX942-NEXT:  .LBB2_3:
+; GFX942-NEXT:    s_endpgm
+;
 ; GFX1064-LABEL: global_atomic_fmin_uni_address_uni_value_one_as_scope_unsafe:
 ; GFX1064:       ; %bb.0:
 ; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1241,6 +1473,35 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_one_as_scope
 ; GFX9-DPP-NEXT:  .LBB2_3:
 ; GFX9-DPP-NEXT:    s_endpgm
 ;
+; GFX942-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_one_as_scope_unsafe:
+; GFX942-DPP:       ; %bb.0:
+; GFX942-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX942-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX942-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX942-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942-DPP-NEXT:    s_cbranch_execz .LBB2_3
+; GFX942-DPP-NEXT:  ; %bb.1:
+; GFX942-DPP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-DPP-NEXT:    s_mov_b64 s[2:3], 0
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-DPP-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX942-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v1, s4
+; GFX942-DPP-NEXT:  .LBB2_2: ; %atomicrmw.start
+; GFX942-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX942-DPP-NEXT:    v_min_f32_e32 v0, 4.0, v0
+; GFX942-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] sc0 sc1
+; GFX942-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX942-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX942-DPP-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX942-DPP-NEXT:    s_cbranch_execnz .LBB2_2
+; GFX942-DPP-NEXT:  .LBB2_3:
+; GFX942-DPP-NEXT:    s_endpgm
+;
 ; GFX1064-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_one_as_scope_unsafe:
 ; GFX1064-DPP:       ; %bb.0:
 ; GFX1064-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -1457,6 +1718,67 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
 ; GFX9-NEXT:  .LBB3_5:
 ; GFX9-NEXT:    s_endpgm
 ;
+; GFX942-LABEL: global_atomic_fmin_uni_address_div_value_one_as_scope_unsafe:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX942-NEXT:    s_mov_b32 s12, s8
+; GFX942-NEXT:    s_add_u32 s8, s34, 44
+; GFX942-NEXT:    s_mov_b32 s13, s9
+; GFX942-NEXT:    s_addc_u32 s9, s35, 0
+; GFX942-NEXT:    s_getpc_b64 s[4:5]
+; GFX942-NEXT:    s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
+; GFX942-NEXT:    s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
+; GFX942-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX942-NEXT:    s_mov_b32 s14, s10
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX942-NEXT:    v_mov_b32_e32 v31, v0
+; GFX942-NEXT:    s_mov_b32 s32, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX942-NEXT:    s_mov_b64 s[0:1], exec
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
+; GFX942-NEXT:  .LBB3_1: ; %ComputeLoop
+; GFX942-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-NEXT:    s_ff1_i32_b64 s2, s[0:1]
+; GFX942-NEXT:    v_readlane_b32 s4, v0, s2
+; GFX942-NEXT:    s_lshl_b64 s[2:3], 1, s2
+; GFX942-NEXT:    v_max_f32_e32 v1, v2, v2
+; GFX942-NEXT:    v_max_f32_e64 v2, s4, s4
+; GFX942-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX942-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX942-NEXT:    v_min_f32_e32 v2, v1, v2
+; GFX942-NEXT:    s_cbranch_scc1 .LBB3_1
+; GFX942-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX942-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX942-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX942-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
+; GFX942-NEXT:    s_cbranch_execz .LBB3_5
+; GFX942-NEXT:  ; %bb.3:
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v3, 0
+; GFX942-NEXT:    s_mov_b64 s[2:3], 0
+; GFX942-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    global_load_dword v1, v3, s[0:1]
+; GFX942-NEXT:  .LBB3_4: ; %atomicrmw.start
+; GFX942-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX942-NEXT:    v_min_f32_e32 v0, v0, v2
+; GFX942-NEXT:    global_atomic_cmpswap v0, v3, v[0:1], s[0:1] sc0 sc1
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX942-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX942-NEXT:    v_mov_b32_e32 v1, v0
+; GFX942-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX942-NEXT:    s_cbranch_execnz .LBB3_4
+; GFX942-NEXT:  .LBB3_5:
+; GFX942-NEXT:    s_endpgm
+;
 ; GFX1064-LABEL: global_atomic_fmin_uni_address_div_value_one_as_scope_unsafe:
 ; GFX1064:       ; %bb.0:
 ; GFX1064-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
@@ -1815,6 +2137,88 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_one_as_scope
 ; GFX9-DPP-NEXT:  .LBB3_3:
 ; GFX9-DPP-NEXT:    s_endpgm
 ;
+; GFX942-DPP-LABEL: global_atomic_fmin_uni_address_div_value_one_as_scope_unsafe:
+; GFX942-DPP:       ; %bb.0:
+; GFX942-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX942-DPP-NEXT:    s_mov_b32 s12, s8
+; GFX942-DPP-NEXT:    s_add_u32 s8, s34, 44
+; GFX942-DPP-NEXT:    s_mov_b32 s13, s9
+; GFX942-DPP-NEXT:    s_addc_u32 s9, s35, 0
+; GFX942-DPP-NEXT:    s_getpc_b64 s[4:5]
+; GFX942-DPP-NEXT:    s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
+; GFX942-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
+; GFX942-DPP-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX942-DPP-NEXT:    s_mov_b32 s14, s10
+; GFX942-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX942-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v31, v0
+; GFX942-DPP-NEXT:    s_mov_b32 s32, 0
+; GFX942-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX942-DPP-NEXT:    v_mbcnt_lo_u32_b32 v4, exec_lo, 0
+; GFX942-DPP-NEXT:    v_mbcnt_hi_u32_b32 v4, exec_hi, v4
+; GFX942-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v1, 0x7fc00000
+; GFX942-DPP-NEXT:    v_cndmask_b32_e64 v2, v1, v0, s[0:1]
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
+; GFX942-DPP-NEXT:    s_nop 1
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v3, v2 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX942-DPP-NEXT:    v_max_f32_e32 v3, v3, v3
+; GFX942-DPP-NEXT:    v_min_f32_e32 v2, v2, v3
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
+; GFX942-DPP-NEXT:    s_nop 1
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v3, v2 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_max_f32_e32 v3, v3, v3
+; GFX942-DPP-NEXT:    v_min_f32_e32 v2, v2, v3
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
+; GFX942-DPP-NEXT:    s_nop 1
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v3, v2 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_max_f32_e32 v3, v3, v3
+; GFX942-DPP-NEXT:    v_min_f32_e32 v2, v2, v3
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
+; GFX942-DPP-NEXT:    s_nop 1
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v3, v2 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_max_f32_e32 v3, v3, v3
+; GFX942-DPP-NEXT:    v_min_f32_e32 v2, v2, v3
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
+; GFX942-DPP-NEXT:    s_nop 1
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v3, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX942-DPP-NEXT:    v_max_f32_e32 v3, v3, v3
+; GFX942-DPP-NEXT:    v_min_f32_e32 v2, v2, v3
+; GFX942-DPP-NEXT:    s_nop 1
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX942-DPP-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX942-DPP-NEXT:    v_min_f32_e32 v1, v2, v1
+; GFX942-DPP-NEXT:    s_nop 0
+; GFX942-DPP-NEXT:    v_readlane_b32 s4, v1, 63
+; GFX942-DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX942-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
+; GFX942-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942-DPP-NEXT:    s_cbranch_execz .LBB3_3
+; GFX942-DPP-NEXT:  ; %bb.1:
+; GFX942-DPP-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-DPP-NEXT:    s_mov_b64 s[2:3], 0
+; GFX942-DPP-NEXT:    v_max_f32_e64 v6, s4, s4
+; GFX942-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-DPP-NEXT:    global_load_dword v5, v0, s[0:1]
+; GFX942-DPP-NEXT:  .LBB3_2: ; %atomicrmw.start
+; GFX942-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-DPP-NEXT:    v_max_f32_e32 v4, v5, v5
+; GFX942-DPP-NEXT:    v_min_f32_e32 v4, v4, v6
+; GFX942-DPP-NEXT:    global_atomic_cmpswap v4, v0, v[4:5], s[0:1] sc0 sc1
+; GFX942-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v5
+; GFX942-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v5, v4
+; GFX942-DPP-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX942-DPP-NEXT:    s_cbranch_execnz .LBB3_2
+; GFX942-DPP-NEXT:  .LBB3_3:
+; GFX942-DPP-NEXT:    s_endpgm
+;
 ; GFX1064-DPP-LABEL: global_atomic_fmin_uni_address_div_value_one_as_scope_unsafe:
 ; GFX1064-DPP:       ; %bb.0:
 ; GFX1064-DPP-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
@@ -2156,6 +2560,35 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_default_scop
 ; GFX9-NEXT:  .LBB4_3:
 ; GFX9-NEXT:    s_endpgm
 ;
+; GFX942-LABEL: global_atomic_fmin_uni_address_uni_value_default_scope_unsafe:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX942-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX942-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942-NEXT:    s_cbranch_execz .LBB4_3
+; GFX942-NEXT:  ; %bb.1:
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    s_mov_b64 s[2:3], 0
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b32_e32 v1, s4
+; GFX942-NEXT:  .LBB4_2: ; %atomicrmw.start
+; GFX942-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX942-NEXT:    v_min_f32_e32 v0, 4.0, v0
+; GFX942-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] sc0 sc1
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX942-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX942-NEXT:    v_mov_b32_e32 v1, v0
+; GFX942-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX942-NEXT:    s_cbranch_execnz .LBB4_2
+; GFX942-NEXT:  .LBB4_3:
+; GFX942-NEXT:    s_endpgm
+;
 ; GFX1064-LABEL: global_atomic_fmin_uni_address_uni_value_default_scope_unsafe:
 ; GFX1064:       ; %bb.0:
 ; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -2281,6 +2714,35 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_default_scop
 ; GFX9-DPP-NEXT:  .LBB4_3:
 ; GFX9-DPP-NEXT:    s_endpgm
 ;
+; GFX942-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_default_scope_unsafe:
+; GFX942-DPP:       ; %bb.0:
+; GFX942-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX942-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX942-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX942-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942-DPP-NEXT:    s_cbranch_execz .LBB4_3
+; GFX942-DPP-NEXT:  ; %bb.1:
+; GFX942-DPP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-DPP-NEXT:    s_mov_b64 s[2:3], 0
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-DPP-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX942-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v1, s4
+; GFX942-DPP-NEXT:  .LBB4_2: ; %atomicrmw.start
+; GFX942-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX942-DPP-NEXT:    v_min_f32_e32 v0, 4.0, v0
+; GFX942-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] sc0 sc1
+; GFX942-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX942-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX942-DPP-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX942-DPP-NEXT:    s_cbranch_execnz .LBB4_2
+; GFX942-DPP-NEXT:  .LBB4_3:
+; GFX942-DPP-NEXT:    s_endpgm
+;
 ; GFX1064-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_default_scope_unsafe:
 ; GFX1064-DPP:       ; %bb.0:
 ; GFX1064-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -2496,6 +2958,67 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop
 ; GFX9-NEXT:  .LBB5_5:
 ; GFX9-NEXT:    s_endpgm
 ;
+; GFX942-LABEL: global_atomic_fmin_uni_address_div_value_default_scope_unsafe:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX942-NEXT:    s_mov_b32 s12, s8
+; GFX942-NEXT:    s_add_u32 s8, s34, 44
+; GFX942-NEXT:    s_mov_b32 s13, s9
+; GFX942-NEXT:    s_addc_u32 s9, s35, 0
+; GFX942-NEXT:    s_getpc_b64 s[4:5]
+; GFX942-NEXT:    s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
+; GFX942-NEXT:    s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
+; GFX942-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX942-NEXT:    s_mov_b32 s14, s10
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX942-NEXT:    v_mov_b32_e32 v31, v0
+; GFX942-NEXT:    s_mov_b32 s32, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX942-NEXT:    s_mov_b64 s[0:1], exec
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0x7fc00000
+; GFX942-NEXT:  .LBB5_1: ; %ComputeLoop
+; GFX942-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-NEXT:    s_ff1_i32_b64 s2, s[0:1]
+; GFX942-NEXT:    v_readlane_b32 s4, v0, s2
+; GFX942-NEXT:    s_lshl_b64 s[2:3], 1, s2
+; GFX942-NEXT:    v_max_f32_e32 v1, v2, v2
+; GFX942-NEXT:    v_max_f32_e64 v2, s4, s4
+; GFX942-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX942-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX942-NEXT:    v_min_f32_e32 v2, v1, v2
+; GFX942-NEXT:    s_cbranch_scc1 .LBB5_1
+; GFX942-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX942-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX942-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX942-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
+; GFX942-NEXT:    s_cbranch_execz .LBB5_5
+; GFX942-NEXT:  ; %bb.3:
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v3, 0
+; GFX942-NEXT:    s_mov_b64 s[2:3], 0
+; GFX942-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    global_load_dword v1, v3, s[0:1]
+; GFX942-NEXT:  .LBB5_4: ; %atomicrmw.start
+; GFX942-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX942-NEXT:    v_min_f32_e32 v0, v0, v2
+; GFX942-NEXT:    global_atomic_cmpswap v0, v3, v[0:1], s[0:1] sc0 sc1
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX942-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX942-NEXT:    v_mov_b32_e32 v1, v0
+; GFX942-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX942-NEXT:    s_cbranch_execnz .LBB5_4
+; GFX942-NEXT:  .LBB5_5:
+; GFX942-NEXT:    s_endpgm
+;
 ; GFX1064-LABEL: global_atomic_fmin_uni_address_div_value_default_scope_unsafe:
 ; GFX1064:       ; %bb.0:
 ; GFX1064-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
@@ -2854,6 +3377,88 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_default_scop
 ; GFX9-DPP-NEXT:  .LBB5_3:
 ; GFX9-DPP-NEXT:    s_endpgm
 ;
+; GFX942-DPP-LABEL: global_atomic_fmin_uni_address_div_value_default_scope_unsafe:
+; GFX942-DPP:       ; %bb.0:
+; GFX942-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX942-DPP-NEXT:    s_mov_b32 s12, s8
+; GFX942-DPP-NEXT:    s_add_u32 s8, s34, 44
+; GFX942-DPP-NEXT:    s_mov_b32 s13, s9
+; GFX942-DPP-NEXT:    s_addc_u32 s9, s35, 0
+; GFX942-DPP-NEXT:    s_getpc_b64 s[4:5]
+; GFX942-DPP-NEXT:    s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
+; GFX942-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
+; GFX942-DPP-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX942-DPP-NEXT:    s_mov_b32 s14, s10
+; GFX942-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX942-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v31, v0
+; GFX942-DPP-NEXT:    s_mov_b32 s32, 0
+; GFX942-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX942-DPP-NEXT:    v_mbcnt_lo_u32_b32 v4, exec_lo, 0
+; GFX942-DPP-NEXT:    v_mbcnt_hi_u32_b32 v4, exec_hi, v4
+; GFX942-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v1, 0x7fc00000
+; GFX942-DPP-NEXT:    v_cndmask_b32_e64 v2, v1, v0, s[0:1]
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
+; GFX942-DPP-NEXT:    s_nop 1
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v3, v2 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_max_f32_e32 v2, v2, v2
+; GFX942-DPP-NEXT:    v_max_f32_e32 v3, v3, v3
+; GFX942-DPP-NEXT:    v_min_f32_e32 v2, v2, v3
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
+; GFX942-DPP-NEXT:    s_nop 1
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v3, v2 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_max_f32_e32 v3, v3, v3
+; GFX942-DPP-NEXT:    v_min_f32_e32 v2, v2, v3
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
+; GFX942-DPP-NEXT:    s_nop 1
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v3, v2 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_max_f32_e32 v3, v3, v3
+; GFX942-DPP-NEXT:    v_min_f32_e32 v2, v2, v3
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
+; GFX942-DPP-NEXT:    s_nop 1
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v3, v2 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_max_f32_e32 v3, v3, v3
+; GFX942-DPP-NEXT:    v_min_f32_e32 v2, v2, v3
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v3, 0x7fc00000
+; GFX942-DPP-NEXT:    s_nop 1
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v3, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX942-DPP-NEXT:    v_max_f32_e32 v3, v3, v3
+; GFX942-DPP-NEXT:    v_min_f32_e32 v2, v2, v3
+; GFX942-DPP-NEXT:    s_nop 1
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX942-DPP-NEXT:    v_max_f32_e32 v1, v1, v1
+; GFX942-DPP-NEXT:    v_min_f32_e32 v1, v2, v1
+; GFX942-DPP-NEXT:    s_nop 0
+; GFX942-DPP-NEXT:    v_readlane_b32 s4, v1, 63
+; GFX942-DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX942-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
+; GFX942-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942-DPP-NEXT:    s_cbranch_execz .LBB5_3
+; GFX942-DPP-NEXT:  ; %bb.1:
+; GFX942-DPP-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-DPP-NEXT:    s_mov_b64 s[2:3], 0
+; GFX942-DPP-NEXT:    v_max_f32_e64 v6, s4, s4
+; GFX942-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-DPP-NEXT:    global_load_dword v5, v0, s[0:1]
+; GFX942-DPP-NEXT:  .LBB5_2: ; %atomicrmw.start
+; GFX942-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-DPP-NEXT:    v_max_f32_e32 v4, v5, v5
+; GFX942-DPP-NEXT:    v_min_f32_e32 v4, v4, v6
+; GFX942-DPP-NEXT:    global_atomic_cmpswap v4, v0, v[4:5], s[0:1] sc0 sc1
+; GFX942-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v5
+; GFX942-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v5, v4
+; GFX942-DPP-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX942-DPP-NEXT:    s_cbranch_execnz .LBB5_2
+; GFX942-DPP-NEXT:  .LBB5_3:
+; GFX942-DPP-NEXT:    s_endpgm
+;
 ; GFX1064-DPP-LABEL: global_atomic_fmin_uni_address_div_value_default_scope_unsafe:
 ; GFX1064-DPP:       ; %bb.0:
 ; GFX1064-DPP-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
@@ -3200,6 +3805,22 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent
 ; GFX9-NEXT:  .LBB6_3:
 ; GFX9-NEXT:    s_endpgm
 ;
+; GFX942-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX942-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX942-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942-NEXT:    s_cbranch_execz .LBB6_2
+; GFX942-NEXT:  ; %bb.1:
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], 4.0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    global_atomic_min_f64 v2, v[0:1], s[0:1]
+; GFX942-NEXT:  .LBB6_2:
+; GFX942-NEXT:    s_endpgm
+;
 ; GFX1064-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe:
 ; GFX1064:       ; %bb.0:
 ; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -3366,6 +3987,22 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent
 ; GFX9-DPP-NEXT:  .LBB6_3:
 ; GFX9-DPP-NEXT:    s_endpgm
 ;
+; GFX942-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe:
+; GFX942-DPP:       ; %bb.0:
+; GFX942-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX942-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX942-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX942-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942-DPP-NEXT:    s_cbranch_execz .LBB6_2
+; GFX942-DPP-NEXT:  ; %bb.1:
+; GFX942-DPP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-DPP-NEXT:    v_mov_b64_e32 v[0:1], 4.0
+; GFX942-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-DPP-NEXT:    global_atomic_min_f64 v2, v[0:1], s[0:1]
+; GFX942-DPP-NEXT:  .LBB6_2:
+; GFX942-DPP-NEXT:    s_endpgm
+;
 ; GFX1064-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe:
 ; GFX1064-DPP:       ; %bb.0:
 ; GFX1064-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -3624,6 +4261,55 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
 ; GFX9-NEXT:  .LBB7_5:
 ; GFX9-NEXT:    s_endpgm
 ;
+; GFX942-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX942-NEXT:    s_mov_b32 s12, s8
+; GFX942-NEXT:    s_add_u32 s8, s34, 44
+; GFX942-NEXT:    s_mov_b32 s13, s9
+; GFX942-NEXT:    s_addc_u32 s9, s35, 0
+; GFX942-NEXT:    s_getpc_b64 s[4:5]
+; GFX942-NEXT:    s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
+; GFX942-NEXT:    s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
+; GFX942-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX942-NEXT:    s_mov_b32 s14, s10
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX942-NEXT:    v_mov_b32_e32 v31, v0
+; GFX942-NEXT:    s_mov_b32 s32, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-NEXT:    s_mov_b64 s[0:1], exec
+; GFX942-NEXT:    v_mov_b32_e32 v3, 0x7ff80000
+; GFX942-NEXT:  .LBB7_1: ; %ComputeLoop
+; GFX942-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-NEXT:    s_ff1_i32_b64 s4, s[0:1]
+; GFX942-NEXT:    v_readlane_b32 s3, v1, s4
+; GFX942-NEXT:    v_readlane_b32 s2, v0, s4
+; GFX942-NEXT:    s_lshl_b64 s[4:5], 1, s4
+; GFX942-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX942-NEXT:    v_max_f64 v[4:5], s[2:3], s[2:3]
+; GFX942-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[4:5]
+; GFX942-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX942-NEXT:    v_min_f64 v[2:3], v[2:3], v[4:5]
+; GFX942-NEXT:    s_cbranch_scc1 .LBB7_1
+; GFX942-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX942-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX942-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX942-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
+; GFX942-NEXT:    s_cbranch_execz .LBB7_4
+; GFX942-NEXT:  ; %bb.3:
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    global_atomic_min_f64 v0, v[2:3], s[0:1]
+; GFX942-NEXT:  .LBB7_4:
+; GFX942-NEXT:    s_endpgm
+;
 ; GFX1064-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe:
 ; GFX1064:       ; %bb.0:
 ; GFX1064-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
@@ -4043,6 +4729,89 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent
 ; GFX9-DPP-NEXT:  .LBB7_3:
 ; GFX9-DPP-NEXT:    s_endpgm
 ;
+; GFX942-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe:
+; GFX942-DPP:       ; %bb.0:
+; GFX942-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX942-DPP-NEXT:    s_mov_b32 s12, s8
+; GFX942-DPP-NEXT:    s_add_u32 s8, s34, 44
+; GFX942-DPP-NEXT:    s_mov_b32 s13, s9
+; GFX942-DPP-NEXT:    s_addc_u32 s9, s35, 0
+; GFX942-DPP-NEXT:    s_getpc_b64 s[4:5]
+; GFX942-DPP-NEXT:    s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
+; GFX942-DPP-NEXT:    s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
+; GFX942-DPP-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX942-DPP-NEXT:    s_mov_b32 s14, s10
+; GFX942-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX942-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v31, v0
+; GFX942-DPP-NEXT:    s_mov_b32 s32, 0
+; GFX942-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v8, 0
+; GFX942-DPP-NEXT:    v_mbcnt_lo_u32_b32 v9, exec_lo, 0
+; GFX942-DPP-NEXT:    v_mbcnt_hi_u32_b32 v9, exec_hi, v9
+; GFX942-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v3, 0x7ff80000
+; GFX942-DPP-NEXT:    v_cndmask_b32_e64 v5, v3, v1, s[0:1]
+; GFX942-DPP-NEXT:    v_cndmask_b32_e64 v4, 0, v0, s[0:1]
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v7, 0x7ff80000
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX942-DPP-NEXT:    v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX942-DPP-NEXT:    v_min_f64 v[4:5], v[4:5], v[6:7]
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v7, 0x7ff80000
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942-DPP-NEXT:    s_nop 0
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX942-DPP-NEXT:    v_min_f64 v[4:5], v[4:5], v[6:7]
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v7, 0x7ff80000
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942-DPP-NEXT:    s_nop 0
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX942-DPP-NEXT:    v_min_f64 v[4:5], v[4:5], v[6:7]
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v7, 0x7ff80000
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942-DPP-NEXT:    s_nop 0
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX942-DPP-NEXT:    v_min_f64 v[4:5], v[4:5], v[6:7]
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v7, 0x7ff80000
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942-DPP-NEXT:    s_nop 0
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v6, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX942-DPP-NEXT:    v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX942-DPP-NEXT:    v_min_f64 v[4:5], v[4:5], v[6:7]
+; GFX942-DPP-NEXT:    s_nop 1
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v3, v5 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v2, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX942-DPP-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX942-DPP-NEXT:    v_min_f64 v[2:3], v[4:5], v[2:3]
+; GFX942-DPP-NEXT:    s_nop 0
+; GFX942-DPP-NEXT:    v_readlane_b32 s3, v3, 63
+; GFX942-DPP-NEXT:    v_readlane_b32 s2, v2, 63
+; GFX942-DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX942-DPP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX942-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v9
+; GFX942-DPP-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX942-DPP-NEXT:    s_cbranch_execz .LBB7_2
+; GFX942-DPP-NEXT:  ; %bb.1:
+; GFX942-DPP-NEXT:    s_load_dwordx2 s[2:3], s[34:35], 0x24
+; GFX942-DPP-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-DPP-NEXT:    global_atomic_min_f64 v8, v[0:1], s[2:3]
+; GFX942-DPP-NEXT:  .LBB7_2:
+; GFX942-DPP-NEXT:    s_endpgm
+;
 ; GFX1064-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe:
 ; GFX1064-DPP:       ; %bb.0:
 ; GFX1064-DPP-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
@@ -4468,6 +5237,22 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a
 ; GFX9-NEXT:  .LBB8_3:
 ; GFX9-NEXT:    s_endpgm
 ;
+; GFX942-LABEL: global_atomic_fmin_double_uni_address_uni_value_one_as_scope_unsafe:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX942-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX942-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942-NEXT:    s_cbranch_execz .LBB8_2
+; GFX942-NEXT:  ; %bb.1:
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], 4.0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    global_atomic_min_f64 v2, v[0:1], s[0:1] sc1
+; GFX942-NEXT:  .LBB8_2:
+; GFX942-NEXT:    s_endpgm
+;
 ; GFX1064-LABEL: global_atomic_fmin_double_uni_address_uni_value_one_as_scope_unsafe:
 ; GFX1064:       ; %bb.0:
 ; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -4634,6 +5419,22 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_a
 ; GFX9-DPP-NEXT:  .LBB8_3:
 ; GFX9-DPP-NEXT:    s_endpgm
 ;
+; GFX942-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_one_as_scope_unsafe:
+; GFX942-DPP:       ; %bb.0:
+; GFX942-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX942-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX942-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX942-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942-DPP-NEXT:    s_cbranch_execz .LBB8_2
+; GFX942-DPP-NEXT:  ; %bb.1:
+; GFX942-DPP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-DPP-NEXT:    v_mov_b64_e32 v[0:1], 4.0
+; GFX942-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-DPP-NEXT:    global_atomic_min_f64 v2, v[0:1], s[0:1] sc1
+; GFX942-DPP-NEXT:  .LBB8_2:
+; GFX942-DPP-NEXT:    s_endpgm
+;
 ; GFX1064-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_one_as_scope_unsafe:
 ; GFX1064-DPP:       ; %bb.0:
 ; GFX1064-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -4892,6 +5693,55 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a
 ; GFX9-NEXT:  .LBB9_5:
 ; GFX9-NEXT:    s_endpgm
 ;
+; GFX942-LABEL: global_atomic_fmin_double_uni_address_div_value_one_as_scope_unsafe:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX942-NEXT:    s_mov_b32 s12, s8
+; GFX942-NEXT:    s_add_u32 s8, s34, 44
+; GFX942-NEXT:    s_mov_b32 s13, s9
+; GFX942-NEXT:    s_addc_u32 s9, s35, 0
+; GFX942-NEXT:    s_getpc_b64 s[4:5]
+; GFX942-NEXT:    s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
+; GFX942-NEXT:    s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
+; GFX942-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX942-NEXT:    s_mov_b32 s14, s10
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX942-NEXT:    v_mov_b32_e32 v31, v0
+; GFX942-NEXT:    s_mov_b32 s32, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-NEXT:    s_mov_b64 s[0:1], exec
+; GFX942-NEXT:    v_mov_b32_e32 v3, 0x7ff80000
+; GFX942-NEXT:  .LBB9_1: ; %ComputeLoop
+; GFX942-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-NEXT:    s_ff1_i32_b64 s4, s[0:1]
+; GFX942-NEXT:    v_readlane_b32 s3, v1, s4
+; GFX942-NEXT:    v_readlane_b32 s2, v0, s4
+; GFX942-NEXT:    s_lshl_b64 s[4:5], 1, s4
+; GFX942-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX942-NEXT:    v_max_f64 v[4:5], s[2:3], s[2:3]
+; GFX942-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[4:5]
+; GFX942-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX942-NEXT:    v_min_f64 v[2:3], v[2:3], v[4:5]
+; GFX942-NEXT:    s_cbranch_scc1 .LBB9_1
+; GFX942-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX942-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX942-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX942-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
+; GFX942-NEXT:    s_cbranch_execz .LBB9_4
+; GFX942-NEXT:  ; %bb.3:
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    global_atomic_min_f64 v0, v[2:3], s[0:1] sc1
+; GFX942-NEXT:  .LBB9_4:
+; GFX942-NEXT:    s_endpgm
+;
 ; GFX1064-LABEL: global_atomic_fmin_double_uni_address_div_value_one_as_scope_unsafe:
 ; GFX1064:       ; %bb.0:
 ; GFX1064-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
@@ -5311,6 +6161,89 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_a
 ; GFX9-DPP-NEXT:  .LBB9_3:
 ; GFX9-DPP-NEXT:    s_endpgm
 ;
+; GFX942-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_one_as_scope_unsafe:
+; GFX942-DPP:       ; %bb.0:
+; GFX942-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX942-DPP-NEXT:    s_mov_b32 s12, s8
+; GFX942-DPP-NEXT:    s_add_u32 s8, s34, 44
+; GFX942-DPP-NEXT:    s_mov_b32 s13, s9
+; GFX942-DPP-NEXT:    s_addc_u32 s9, s35, 0
+; GFX942-DPP-NEXT:    s_getpc_b64 s[4:5]
+; GFX942-DPP-NEXT:    s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
+; GFX942-DPP-NEXT:    s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
+; GFX942-DPP-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX942-DPP-NEXT:    s_mov_b32 s14, s10
+; GFX942-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX942-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v31, v0
+; GFX942-DPP-NEXT:    s_mov_b32 s32, 0
+; GFX942-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v8, 0
+; GFX942-DPP-NEXT:    v_mbcnt_lo_u32_b32 v9, exec_lo, 0
+; GFX942-DPP-NEXT:    v_mbcnt_hi_u32_b32 v9, exec_hi, v9
+; GFX942-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v3, 0x7ff80000
+; GFX942-DPP-NEXT:    v_cndmask_b32_e64 v5, v3, v1, s[0:1]
+; GFX942-DPP-NEXT:    v_cndmask_b32_e64 v4, 0, v0, s[0:1]
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v7, 0x7ff80000
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX942-DPP-NEXT:    v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX942-DPP-NEXT:    v_min_f64 v[4:5], v[4:5], v[6:7]
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v7, 0x7ff80000
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942-DPP-NEXT:    s_nop 0
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX942-DPP-NEXT:    v_min_f64 v[4:5], v[4:5], v[6:7]
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v7, 0x7ff80000
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942-DPP-NEXT:    s_nop 0
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX942-DPP-NEXT:    v_min_f64 v[4:5], v[4:5], v[6:7]
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v7, 0x7ff80000
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942-DPP-NEXT:    s_nop 0
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX942-DPP-NEXT:    v_min_f64 v[4:5], v[4:5], v[6:7]
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v7, 0x7ff80000
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942-DPP-NEXT:    s_nop 0
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v6, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX942-DPP-NEXT:    v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX942-DPP-NEXT:    v_min_f64 v[4:5], v[4:5], v[6:7]
+; GFX942-DPP-NEXT:    s_nop 1
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v3, v5 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v2, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX942-DPP-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX942-DPP-NEXT:    v_min_f64 v[2:3], v[4:5], v[2:3]
+; GFX942-DPP-NEXT:    s_nop 0
+; GFX942-DPP-NEXT:    v_readlane_b32 s3, v3, 63
+; GFX942-DPP-NEXT:    v_readlane_b32 s2, v2, 63
+; GFX942-DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX942-DPP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX942-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v9
+; GFX942-DPP-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX942-DPP-NEXT:    s_cbranch_execz .LBB9_2
+; GFX942-DPP-NEXT:  ; %bb.1:
+; GFX942-DPP-NEXT:    s_load_dwordx2 s[2:3], s[34:35], 0x24
+; GFX942-DPP-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-DPP-NEXT:    global_atomic_min_f64 v8, v[0:1], s[2:3] sc1
+; GFX942-DPP-NEXT:  .LBB9_2:
+; GFX942-DPP-NEXT:    s_endpgm
+;
 ; GFX1064-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_one_as_scope_unsafe:
 ; GFX1064-DPP:       ; %bb.0:
 ; GFX1064-DPP-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
@@ -5736,6 +6669,22 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau
 ; GFX9-NEXT:  .LBB10_3:
 ; GFX9-NEXT:    s_endpgm
 ;
+; GFX942-LABEL: global_atomic_fmin_double_uni_address_uni_value_default_scope_unsafe:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX942-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX942-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942-NEXT:    s_cbranch_execz .LBB10_2
+; GFX942-NEXT:  ; %bb.1:
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], 4.0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    global_atomic_min_f64 v2, v[0:1], s[0:1] sc1
+; GFX942-NEXT:  .LBB10_2:
+; GFX942-NEXT:    s_endpgm
+;
 ; GFX1064-LABEL: global_atomic_fmin_double_uni_address_uni_value_default_scope_unsafe:
 ; GFX1064:       ; %bb.0:
 ; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -5902,6 +6851,22 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defau
 ; GFX9-DPP-NEXT:  .LBB10_3:
 ; GFX9-DPP-NEXT:    s_endpgm
 ;
+; GFX942-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_default_scope_unsafe:
+; GFX942-DPP:       ; %bb.0:
+; GFX942-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX942-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX942-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX942-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942-DPP-NEXT:    s_cbranch_execz .LBB10_2
+; GFX942-DPP-NEXT:  ; %bb.1:
+; GFX942-DPP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-DPP-NEXT:    v_mov_b64_e32 v[0:1], 4.0
+; GFX942-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-DPP-NEXT:    global_atomic_min_f64 v2, v[0:1], s[0:1] sc1
+; GFX942-DPP-NEXT:  .LBB10_2:
+; GFX942-DPP-NEXT:    s_endpgm
+;
 ; GFX1064-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_default_scope_unsafe:
 ; GFX1064-DPP:       ; %bb.0:
 ; GFX1064-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -6160,6 +7125,55 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
 ; GFX9-NEXT:  .LBB11_5:
 ; GFX9-NEXT:    s_endpgm
 ;
+; GFX942-LABEL: global_atomic_fmin_double_uni_address_div_value_default_scope_unsafe:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX942-NEXT:    s_mov_b32 s12, s8
+; GFX942-NEXT:    s_add_u32 s8, s34, 44
+; GFX942-NEXT:    s_mov_b32 s13, s9
+; GFX942-NEXT:    s_addc_u32 s9, s35, 0
+; GFX942-NEXT:    s_getpc_b64 s[4:5]
+; GFX942-NEXT:    s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
+; GFX942-NEXT:    s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
+; GFX942-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX942-NEXT:    s_mov_b32 s14, s10
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX942-NEXT:    v_mov_b32_e32 v31, v0
+; GFX942-NEXT:    s_mov_b32 s32, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-NEXT:    s_mov_b64 s[0:1], exec
+; GFX942-NEXT:    v_mov_b32_e32 v3, 0x7ff80000
+; GFX942-NEXT:  .LBB11_1: ; %ComputeLoop
+; GFX942-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-NEXT:    s_ff1_i32_b64 s4, s[0:1]
+; GFX942-NEXT:    v_readlane_b32 s3, v1, s4
+; GFX942-NEXT:    v_readlane_b32 s2, v0, s4
+; GFX942-NEXT:    s_lshl_b64 s[4:5], 1, s4
+; GFX942-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX942-NEXT:    v_max_f64 v[4:5], s[2:3], s[2:3]
+; GFX942-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[4:5]
+; GFX942-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX942-NEXT:    v_min_f64 v[2:3], v[2:3], v[4:5]
+; GFX942-NEXT:    s_cbranch_scc1 .LBB11_1
+; GFX942-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX942-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX942-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX942-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
+; GFX942-NEXT:    s_cbranch_execz .LBB11_4
+; GFX942-NEXT:  ; %bb.3:
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    global_atomic_min_f64 v0, v[2:3], s[0:1] sc1
+; GFX942-NEXT:  .LBB11_4:
+; GFX942-NEXT:    s_endpgm
+;
 ; GFX1064-LABEL: global_atomic_fmin_double_uni_address_div_value_default_scope_unsafe:
 ; GFX1064:       ; %bb.0:
 ; GFX1064-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
@@ -6579,6 +7593,89 @@ define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defau
 ; GFX9-DPP-NEXT:  .LBB11_3:
 ; GFX9-DPP-NEXT:    s_endpgm
 ;
+; GFX942-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_default_scope_unsafe:
+; GFX942-DPP:       ; %bb.0:
+; GFX942-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX942-DPP-NEXT:    s_mov_b32 s12, s8
+; GFX942-DPP-NEXT:    s_add_u32 s8, s34, 44
+; GFX942-DPP-NEXT:    s_mov_b32 s13, s9
+; GFX942-DPP-NEXT:    s_addc_u32 s9, s35, 0
+; GFX942-DPP-NEXT:    s_getpc_b64 s[4:5]
+; GFX942-DPP-NEXT:    s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
+; GFX942-DPP-NEXT:    s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
+; GFX942-DPP-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX942-DPP-NEXT:    s_mov_b32 s14, s10
+; GFX942-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX942-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v31, v0
+; GFX942-DPP-NEXT:    s_mov_b32 s32, 0
+; GFX942-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v8, 0
+; GFX942-DPP-NEXT:    v_mbcnt_lo_u32_b32 v9, exec_lo, 0
+; GFX942-DPP-NEXT:    v_mbcnt_hi_u32_b32 v9, exec_hi, v9
+; GFX942-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v3, 0x7ff80000
+; GFX942-DPP-NEXT:    v_cndmask_b32_e64 v5, v3, v1, s[0:1]
+; GFX942-DPP-NEXT:    v_cndmask_b32_e64 v4, 0, v0, s[0:1]
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v7, 0x7ff80000
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_max_f64 v[4:5], v[4:5], v[4:5]
+; GFX942-DPP-NEXT:    v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX942-DPP-NEXT:    v_min_f64 v[4:5], v[4:5], v[6:7]
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v7, 0x7ff80000
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942-DPP-NEXT:    s_nop 0
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX942-DPP-NEXT:    v_min_f64 v[4:5], v[4:5], v[6:7]
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v7, 0x7ff80000
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942-DPP-NEXT:    s_nop 0
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX942-DPP-NEXT:    v_min_f64 v[4:5], v[4:5], v[6:7]
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v7, 0x7ff80000
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942-DPP-NEXT:    s_nop 0
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX942-DPP-NEXT:    v_min_f64 v[4:5], v[4:5], v[6:7]
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v7, 0x7ff80000
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942-DPP-NEXT:    s_nop 0
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v6, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX942-DPP-NEXT:    v_max_f64 v[6:7], v[6:7], v[6:7]
+; GFX942-DPP-NEXT:    v_min_f64 v[4:5], v[4:5], v[6:7]
+; GFX942-DPP-NEXT:    s_nop 1
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v3, v5 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v2, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX942-DPP-NEXT:    v_max_f64 v[2:3], v[2:3], v[2:3]
+; GFX942-DPP-NEXT:    v_min_f64 v[2:3], v[4:5], v[2:3]
+; GFX942-DPP-NEXT:    s_nop 0
+; GFX942-DPP-NEXT:    v_readlane_b32 s3, v3, 63
+; GFX942-DPP-NEXT:    v_readlane_b32 s2, v2, 63
+; GFX942-DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX942-DPP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX942-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v9
+; GFX942-DPP-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX942-DPP-NEXT:    s_cbranch_execz .LBB11_2
+; GFX942-DPP-NEXT:  ; %bb.1:
+; GFX942-DPP-NEXT:    s_load_dwordx2 s[2:3], s[34:35], 0x24
+; GFX942-DPP-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-DPP-NEXT:    global_atomic_min_f64 v8, v[0:1], s[2:3] sc1
+; GFX942-DPP-NEXT:  .LBB11_2:
+; GFX942-DPP-NEXT:    s_endpgm
+;
 ; GFX1064-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_default_scope_unsafe:
 ; GFX1064-DPP:       ; %bb.0:
 ; GFX1064-DPP-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
@@ -6998,6 +8095,35 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope
 ; GFX9-NEXT:  .LBB12_3:
 ; GFX9-NEXT:    s_endpgm
 ;
+; GFX942-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX942-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX942-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942-NEXT:    s_cbranch_execz .LBB12_3
+; GFX942-NEXT:  ; %bb.1:
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    s_mov_b64 s[2:3], 0
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b32_e32 v1, s4
+; GFX942-NEXT:  .LBB12_2: ; %atomicrmw.start
+; GFX942-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX942-NEXT:    v_min_f32_e32 v0, 4.0, v0
+; GFX942-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] sc0 sc1
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX942-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX942-NEXT:    v_mov_b32_e32 v1, v0
+; GFX942-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX942-NEXT:    s_cbranch_execnz .LBB12_2
+; GFX942-NEXT:  .LBB12_3:
+; GFX942-NEXT:    s_endpgm
+;
 ; GFX1064-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX1064:       ; %bb.0:
 ; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -7123,6 +8249,35 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope
 ; GFX9-DPP-NEXT:  .LBB12_3:
 ; GFX9-DPP-NEXT:    s_endpgm
 ;
+; GFX942-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
+; GFX942-DPP:       ; %bb.0:
+; GFX942-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX942-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX942-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX942-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942-DPP-NEXT:    s_cbranch_execz .LBB12_3
+; GFX942-DPP-NEXT:  ; %bb.1:
+; GFX942-DPP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-DPP-NEXT:    s_mov_b64 s[2:3], 0
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-DPP-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX942-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v1, s4
+; GFX942-DPP-NEXT:  .LBB12_2: ; %atomicrmw.start
+; GFX942-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX942-DPP-NEXT:    v_min_f32_e32 v0, 4.0, v0
+; GFX942-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] sc0 sc1
+; GFX942-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX942-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX942-DPP-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX942-DPP-NEXT:    s_cbranch_execnz .LBB12_2
+; GFX942-DPP-NEXT:  .LBB12_3:
+; GFX942-DPP-NEXT:    s_endpgm
+;
 ; GFX1064-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_ignore_denormal_mode__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX1064-DPP:       ; %bb.0:
 ; GFX1064-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -7252,6 +8407,35 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope
 ; GFX9-NEXT:  .LBB13_3:
 ; GFX9-NEXT:    s_endpgm
 ;
+; GFX942-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX942-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX942-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942-NEXT:    s_cbranch_execz .LBB13_3
+; GFX942-NEXT:  ; %bb.1:
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    s_mov_b64 s[2:3], 0
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b32_e32 v1, s4
+; GFX942-NEXT:  .LBB13_2: ; %atomicrmw.start
+; GFX942-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX942-NEXT:    v_min_f32_e32 v0, 4.0, v0
+; GFX942-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] sc0 sc1
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX942-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX942-NEXT:    v_mov_b32_e32 v1, v0
+; GFX942-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX942-NEXT:    s_cbranch_execnz .LBB13_2
+; GFX942-NEXT:  .LBB13_3:
+; GFX942-NEXT:    s_endpgm
+;
 ; GFX1064-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX1064:       ; %bb.0:
 ; GFX1064-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
@@ -7377,6 +8561,35 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_system_scope
 ; GFX9-DPP-NEXT:  .LBB13_3:
 ; GFX9-DPP-NEXT:    s_endpgm
 ;
+; GFX942-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
+; GFX942-DPP:       ; %bb.0:
+; GFX942-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX942-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX942-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX942-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942-DPP-NEXT:    s_cbranch_execz .LBB13_3
+; GFX942-DPP-NEXT:  ; %bb.1:
+; GFX942-DPP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-DPP-NEXT:    s_mov_b64 s[2:3], 0
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-DPP-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX942-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v1, s4
+; GFX942-DPP-NEXT:  .LBB13_2: ; %atomicrmw.start
+; GFX942-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-DPP-NEXT:    v_max_f32_e32 v0, v1, v1
+; GFX942-DPP-NEXT:    v_min_f32_e32 v0, 4.0, v0
+; GFX942-DPP-NEXT:    global_atomic_cmpswap v0, v2, v[0:1], s[0:1] sc0 sc1
+; GFX942-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX942-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX942-DPP-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX942-DPP-NEXT:    s_cbranch_execnz .LBB13_2
+; GFX942-DPP-NEXT:  .LBB13_3:
+; GFX942-DPP-NEXT:    s_endpgm
+;
 ; GFX1064-DPP-LABEL: global_atomic_fmin_uni_address_uni_value_system_scope__amdgpu_no_fine_grained_memory__amdgpu_no_remote_memory:
 ; GFX1064-DPP:       ; %bb.0:
 ; GFX1064-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
index 6311143f57260..58fae702f08eb 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
@@ -1,12 +1,14 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefix=GFX7LESS %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX9 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX942 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=Iterative < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132 %s
 ; RUN: llc -mtriple=amdgcn -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefix=GFX7LESS-DPP %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX9-DPP %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX942-DPP %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX1064-DPP %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize32 -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX1032-DPP %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP < %s | FileCheck -enable-var-scope -check-prefixes=GFX1164-DPP %s
@@ -84,6 +86,38 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_
 ; GFX9-NEXT:  .LBB0_3:
 ; GFX9-NEXT:    s_endpgm
 ;
+; GFX942-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_mov_b64 s[2:3], exec
+; GFX942-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX942-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX942-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942-NEXT:    s_cbranch_execz .LBB0_3
+; GFX942-NEXT:  ; %bb.1:
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    s_bcnt1_i32_b64 s5, s[2:3]
+; GFX942-NEXT:    v_cvt_f32_ubyte0_e32 v0, s5
+; GFX942-NEXT:    s_mov_b64 s[2:3], 0
+; GFX942-NEXT:    v_mul_f32_e32 v2, 4.0, v0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX942-NEXT:    v_mov_b32_e32 v3, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b32_e32 v1, s4
+; GFX942-NEXT:  .LBB0_2: ; %atomicrmw.start
+; GFX942-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-NEXT:    v_sub_f32_e32 v0, v1, v2
+; GFX942-NEXT:    global_atomic_cmpswap v0, v3, v[0:1], s[0:1] sc0
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX942-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX942-NEXT:    v_mov_b32_e32 v1, v0
+; GFX942-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX942-NEXT:    s_cbranch_execnz .LBB0_2
+; GFX942-NEXT:  .LBB0_3:
+; GFX942-NEXT:    s_endpgm
+;
 ; GFX1064-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe:
 ; GFX1064:       ; %bb.0:
 ; GFX1064-NEXT:    s_mov_b64 s[2:3], exec
@@ -285,6 +319,38 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_
 ; GFX9-DPP-NEXT:  .LBB0_3:
 ; GFX9-DPP-NEXT:    s_endpgm
 ;
+; GFX942-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe:
+; GFX942-DPP:       ; %bb.0:
+; GFX942-DPP-NEXT:    s_mov_b64 s[2:3], exec
+; GFX942-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX942-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX942-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX942-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942-DPP-NEXT:    s_cbranch_execz .LBB0_3
+; GFX942-DPP-NEXT:  ; %bb.1:
+; GFX942-DPP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-DPP-NEXT:    s_bcnt1_i32_b64 s5, s[2:3]
+; GFX942-DPP-NEXT:    v_cvt_f32_ubyte0_e32 v0, s5
+; GFX942-DPP-NEXT:    s_mov_b64 s[2:3], 0
+; GFX942-DPP-NEXT:    v_mul_f32_e32 v2, 4.0, v0
+; GFX942-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-DPP-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v3, 0
+; GFX942-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v1, s4
+; GFX942-DPP-NEXT:  .LBB0_2: ; %atomicrmw.start
+; GFX942-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-DPP-NEXT:    v_sub_f32_e32 v0, v1, v2
+; GFX942-DPP-NEXT:    global_atomic_cmpswap v0, v3, v[0:1], s[0:1] sc0
+; GFX942-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX942-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX942-DPP-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX942-DPP-NEXT:    s_cbranch_execnz .LBB0_2
+; GFX942-DPP-NEXT:  .LBB0_3:
+; GFX942-DPP-NEXT:    s_endpgm
+;
 ; GFX1064-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe:
 ; GFX1064-DPP:       ; %bb.0:
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], exec
@@ -562,6 +628,63 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ; GFX9-NEXT:  .LBB1_5:
 ; GFX9-NEXT:    s_endpgm
 ;
+; GFX942-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_align4_unsafe:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX942-NEXT:    s_mov_b32 s12, s8
+; GFX942-NEXT:    s_add_u32 s8, s34, 44
+; GFX942-NEXT:    s_mov_b32 s13, s9
+; GFX942-NEXT:    s_addc_u32 s9, s35, 0
+; GFX942-NEXT:    s_getpc_b64 s[4:5]
+; GFX942-NEXT:    s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
+; GFX942-NEXT:    s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
+; GFX942-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX942-NEXT:    s_mov_b32 s14, s10
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX942-NEXT:    v_mov_b32_e32 v31, v0
+; GFX942-NEXT:    s_mov_b32 s32, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX942-NEXT:    s_mov_b64 s[0:1], exec
+; GFX942-NEXT:    v_bfrev_b32_e32 v2, 1
+; GFX942-NEXT:  .LBB1_1: ; %ComputeLoop
+; GFX942-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-NEXT:    s_ff1_i32_b64 s2, s[0:1]
+; GFX942-NEXT:    v_readlane_b32 s4, v0, s2
+; GFX942-NEXT:    s_lshl_b64 s[2:3], 1, s2
+; GFX942-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX942-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX942-NEXT:    v_add_f32_e32 v2, s4, v2
+; GFX942-NEXT:    s_cbranch_scc1 .LBB1_1
+; GFX942-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX942-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX942-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX942-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
+; GFX942-NEXT:    s_cbranch_execz .LBB1_5
+; GFX942-NEXT:  ; %bb.3:
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v3, 0
+; GFX942-NEXT:    s_mov_b64 s[2:3], 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    global_load_dword v1, v3, s[0:1]
+; GFX942-NEXT:  .LBB1_4: ; %atomicrmw.start
+; GFX942-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    v_sub_f32_e32 v0, v1, v2
+; GFX942-NEXT:    global_atomic_cmpswap v0, v3, v[0:1], s[0:1] sc0
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX942-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX942-NEXT:    v_mov_b32_e32 v1, v0
+; GFX942-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX942-NEXT:    s_cbranch_execnz .LBB1_4
+; GFX942-NEXT:  .LBB1_5:
+; GFX942-NEXT:    s_endpgm
+;
 ; GFX1064-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_align4_unsafe:
 ; GFX1064:       ; %bb.0:
 ; GFX1064-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
@@ -949,6 +1072,79 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ; GFX9-DPP-NEXT:  .LBB1_3:
 ; GFX9-DPP-NEXT:    s_endpgm
 ;
+; GFX942-DPP-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_align4_unsafe:
+; GFX942-DPP:       ; %bb.0:
+; GFX942-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX942-DPP-NEXT:    s_mov_b32 s12, s8
+; GFX942-DPP-NEXT:    s_add_u32 s8, s34, 44
+; GFX942-DPP-NEXT:    s_mov_b32 s13, s9
+; GFX942-DPP-NEXT:    s_addc_u32 s9, s35, 0
+; GFX942-DPP-NEXT:    s_getpc_b64 s[4:5]
+; GFX942-DPP-NEXT:    s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
+; GFX942-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
+; GFX942-DPP-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX942-DPP-NEXT:    s_mov_b32 s14, s10
+; GFX942-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX942-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v31, v0
+; GFX942-DPP-NEXT:    s_mov_b32 s32, 0
+; GFX942-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX942-DPP-NEXT:    v_mbcnt_lo_u32_b32 v4, exec_lo, 0
+; GFX942-DPP-NEXT:    v_mbcnt_hi_u32_b32 v4, exec_hi, v4
+; GFX942-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX942-DPP-NEXT:    v_bfrev_b32_e32 v1, 1
+; GFX942-DPP-NEXT:    v_cndmask_b32_e64 v2, v1, v0, s[0:1]
+; GFX942-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX942-DPP-NEXT:    s_nop 1
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v3, v2 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_add_f32_e32 v2, v2, v3
+; GFX942-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX942-DPP-NEXT:    s_nop 1
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v3, v2 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_add_f32_e32 v2, v2, v3
+; GFX942-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX942-DPP-NEXT:    s_nop 1
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v3, v2 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_add_f32_e32 v2, v2, v3
+; GFX942-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX942-DPP-NEXT:    s_nop 1
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v3, v2 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_add_f32_e32 v2, v2, v3
+; GFX942-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX942-DPP-NEXT:    s_nop 1
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v3, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX942-DPP-NEXT:    v_add_f32_e32 v2, v2, v3
+; GFX942-DPP-NEXT:    s_nop 1
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX942-DPP-NEXT:    v_add_f32_e32 v1, v2, v1
+; GFX942-DPP-NEXT:    s_nop 0
+; GFX942-DPP-NEXT:    v_readlane_b32 s4, v1, 63
+; GFX942-DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX942-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
+; GFX942-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942-DPP-NEXT:    s_cbranch_execz .LBB1_3
+; GFX942-DPP-NEXT:  ; %bb.1:
+; GFX942-DPP-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-DPP-NEXT:    s_mov_b64 s[2:3], 0
+; GFX942-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-DPP-NEXT:    global_load_dword v5, v0, s[0:1]
+; GFX942-DPP-NEXT:  .LBB1_2: ; %atomicrmw.start
+; GFX942-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-DPP-NEXT:    v_subrev_f32_e32 v4, s4, v5
+; GFX942-DPP-NEXT:    global_atomic_cmpswap v4, v0, v[4:5], s[0:1] sc0
+; GFX942-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v5
+; GFX942-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v5, v4
+; GFX942-DPP-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX942-DPP-NEXT:    s_cbranch_execnz .LBB1_2
+; GFX942-DPP-NEXT:  .LBB1_3:
+; GFX942-DPP-NEXT:    s_endpgm
+;
 ; GFX1064-DPP-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_align4_unsafe:
 ; GFX1064-DPP:       ; %bb.0:
 ; GFX1064-DPP-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
@@ -1335,6 +1531,46 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope
 ; GFX9-NEXT:  .LBB2_3:
 ; GFX9-NEXT:    s_endpgm
 ;
+; GFX942-LABEL: global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_structfp:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_bcnt1_i32_b64 s0, exec
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0x43300000
+; GFX942-NEXT:    scratch_store_dword off, v0, off offset:4
+; GFX942-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-NEXT:    scratch_store_dword off, v0, off
+; GFX942-NEXT:    scratch_load_dwordx2 v[0:1], off, off
+; GFX942-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX942-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v2
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX942-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942-NEXT:    s_cbranch_execz .LBB2_3
+; GFX942-NEXT:  ; %bb.1:
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    s_mov_b32 s4, 0
+; GFX942-NEXT:    s_mov_b32 s5, 0xc3300000
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    v_add_f64 v[0:1], v[0:1], s[4:5]
+; GFX942-NEXT:    v_cvt_f32_f64_e32 v0, v[0:1]
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_load_dword s6, s[0:1], 0x0
+; GFX942-NEXT:    s_mov_b64 s[2:3], 0
+; GFX942-NEXT:    v_mul_f32_e32 v2, 4.0, v0
+; GFX942-NEXT:    v_mov_b32_e32 v3, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b32_e32 v1, s6
+; GFX942-NEXT:  .LBB2_2: ; %atomicrmw.start
+; GFX942-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-NEXT:    v_sub_f32_e32 v0, v1, v2
+; GFX942-NEXT:    global_atomic_cmpswap v0, v3, v[0:1], s[0:1] sc0 sc1
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX942-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX942-NEXT:    v_mov_b32_e32 v1, v0
+; GFX942-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX942-NEXT:    s_cbranch_execnz .LBB2_2
+; GFX942-NEXT:  .LBB2_3:
+; GFX942-NEXT:    s_endpgm
+;
 ; GFX1064-LABEL: global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_structfp:
 ; GFX1064:       ; %bb.0:
 ; GFX1064-NEXT:    s_mov_b64 s[0:1], exec
@@ -1585,6 +1821,46 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_one_as_scope
 ; GFX9-DPP-NEXT:  .LBB2_3:
 ; GFX9-DPP-NEXT:    s_endpgm
 ;
+; GFX942-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_structfp:
+; GFX942-DPP:       ; %bb.0:
+; GFX942-DPP-NEXT:    s_bcnt1_i32_b64 s0, exec
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v0, 0x43300000
+; GFX942-DPP-NEXT:    scratch_store_dword off, v0, off offset:4
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-DPP-NEXT:    scratch_store_dword off, v0, off
+; GFX942-DPP-NEXT:    scratch_load_dwordx2 v[0:1], off, off
+; GFX942-DPP-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX942-DPP-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v2
+; GFX942-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX942-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942-DPP-NEXT:    s_cbranch_execz .LBB2_3
+; GFX942-DPP-NEXT:  ; %bb.1:
+; GFX942-DPP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-DPP-NEXT:    s_mov_b32 s4, 0
+; GFX942-DPP-NEXT:    s_mov_b32 s5, 0xc3300000
+; GFX942-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-DPP-NEXT:    v_add_f64 v[0:1], v[0:1], s[4:5]
+; GFX942-DPP-NEXT:    v_cvt_f32_f64_e32 v0, v[0:1]
+; GFX942-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-DPP-NEXT:    s_load_dword s6, s[0:1], 0x0
+; GFX942-DPP-NEXT:    s_mov_b64 s[2:3], 0
+; GFX942-DPP-NEXT:    v_mul_f32_e32 v2, 4.0, v0
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v3, 0
+; GFX942-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX942-DPP-NEXT:  .LBB2_2: ; %atomicrmw.start
+; GFX942-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-DPP-NEXT:    v_sub_f32_e32 v0, v1, v2
+; GFX942-DPP-NEXT:    global_atomic_cmpswap v0, v3, v[0:1], s[0:1] sc0 sc1
+; GFX942-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX942-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX942-DPP-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX942-DPP-NEXT:    s_cbranch_execnz .LBB2_2
+; GFX942-DPP-NEXT:  .LBB2_3:
+; GFX942-DPP-NEXT:    s_endpgm
+;
 ; GFX1064-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_one_as_scope_unsafe_structfp:
 ; GFX1064-DPP:       ; %bb.0:
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[0:1], exec
@@ -1892,6 +2168,63 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope
 ; GFX9-NEXT:  .LBB3_5:
 ; GFX9-NEXT:    s_endpgm
 ;
+; GFX942-LABEL: global_atomic_fsub_uni_address_div_value_one_as_scope_unsafe_structfp:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX942-NEXT:    s_mov_b32 s12, s8
+; GFX942-NEXT:    s_add_u32 s8, s34, 44
+; GFX942-NEXT:    s_mov_b32 s13, s9
+; GFX942-NEXT:    s_addc_u32 s9, s35, 0
+; GFX942-NEXT:    s_getpc_b64 s[4:5]
+; GFX942-NEXT:    s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
+; GFX942-NEXT:    s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
+; GFX942-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX942-NEXT:    s_mov_b32 s14, s10
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX942-NEXT:    v_mov_b32_e32 v31, v0
+; GFX942-NEXT:    s_mov_b32 s32, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX942-NEXT:    s_mov_b64 s[0:1], exec
+; GFX942-NEXT:    v_bfrev_b32_e32 v2, 1
+; GFX942-NEXT:  .LBB3_1: ; %ComputeLoop
+; GFX942-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-NEXT:    s_ff1_i32_b64 s2, s[0:1]
+; GFX942-NEXT:    v_readlane_b32 s4, v0, s2
+; GFX942-NEXT:    s_lshl_b64 s[2:3], 1, s2
+; GFX942-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX942-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX942-NEXT:    v_add_f32_e32 v2, s4, v2
+; GFX942-NEXT:    s_cbranch_scc1 .LBB3_1
+; GFX942-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX942-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX942-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX942-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
+; GFX942-NEXT:    s_cbranch_execz .LBB3_5
+; GFX942-NEXT:  ; %bb.3:
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v3, 0
+; GFX942-NEXT:    s_mov_b64 s[2:3], 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    global_load_dword v1, v3, s[0:1]
+; GFX942-NEXT:  .LBB3_4: ; %atomicrmw.start
+; GFX942-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    v_sub_f32_e32 v0, v1, v2
+; GFX942-NEXT:    global_atomic_cmpswap v0, v3, v[0:1], s[0:1] sc0 sc1
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX942-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX942-NEXT:    v_mov_b32_e32 v1, v0
+; GFX942-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX942-NEXT:    s_cbranch_execnz .LBB3_4
+; GFX942-NEXT:  .LBB3_5:
+; GFX942-NEXT:    s_endpgm
+;
 ; GFX1064-LABEL: global_atomic_fsub_uni_address_div_value_one_as_scope_unsafe_structfp:
 ; GFX1064:       ; %bb.0:
 ; GFX1064-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
@@ -2279,6 +2612,79 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_one_as_scope
 ; GFX9-DPP-NEXT:  .LBB3_3:
 ; GFX9-DPP-NEXT:    s_endpgm
 ;
+; GFX942-DPP-LABEL: global_atomic_fsub_uni_address_div_value_one_as_scope_unsafe_structfp:
+; GFX942-DPP:       ; %bb.0:
+; GFX942-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX942-DPP-NEXT:    s_mov_b32 s12, s8
+; GFX942-DPP-NEXT:    s_add_u32 s8, s34, 44
+; GFX942-DPP-NEXT:    s_mov_b32 s13, s9
+; GFX942-DPP-NEXT:    s_addc_u32 s9, s35, 0
+; GFX942-DPP-NEXT:    s_getpc_b64 s[4:5]
+; GFX942-DPP-NEXT:    s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
+; GFX942-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
+; GFX942-DPP-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX942-DPP-NEXT:    s_mov_b32 s14, s10
+; GFX942-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX942-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v31, v0
+; GFX942-DPP-NEXT:    s_mov_b32 s32, 0
+; GFX942-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX942-DPP-NEXT:    v_mbcnt_lo_u32_b32 v4, exec_lo, 0
+; GFX942-DPP-NEXT:    v_mbcnt_hi_u32_b32 v4, exec_hi, v4
+; GFX942-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX942-DPP-NEXT:    v_bfrev_b32_e32 v1, 1
+; GFX942-DPP-NEXT:    v_cndmask_b32_e64 v2, v1, v0, s[0:1]
+; GFX942-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX942-DPP-NEXT:    s_nop 1
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v3, v2 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_add_f32_e32 v2, v2, v3
+; GFX942-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX942-DPP-NEXT:    s_nop 1
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v3, v2 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_add_f32_e32 v2, v2, v3
+; GFX942-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX942-DPP-NEXT:    s_nop 1
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v3, v2 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_add_f32_e32 v2, v2, v3
+; GFX942-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX942-DPP-NEXT:    s_nop 1
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v3, v2 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_add_f32_e32 v2, v2, v3
+; GFX942-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX942-DPP-NEXT:    s_nop 1
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v3, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX942-DPP-NEXT:    v_add_f32_e32 v2, v2, v3
+; GFX942-DPP-NEXT:    s_nop 1
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX942-DPP-NEXT:    v_add_f32_e32 v1, v2, v1
+; GFX942-DPP-NEXT:    s_nop 0
+; GFX942-DPP-NEXT:    v_readlane_b32 s4, v1, 63
+; GFX942-DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX942-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
+; GFX942-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942-DPP-NEXT:    s_cbranch_execz .LBB3_3
+; GFX942-DPP-NEXT:  ; %bb.1:
+; GFX942-DPP-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-DPP-NEXT:    s_mov_b64 s[2:3], 0
+; GFX942-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-DPP-NEXT:    global_load_dword v5, v0, s[0:1]
+; GFX942-DPP-NEXT:  .LBB3_2: ; %atomicrmw.start
+; GFX942-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-DPP-NEXT:    v_subrev_f32_e32 v4, s4, v5
+; GFX942-DPP-NEXT:    global_atomic_cmpswap v4, v0, v[4:5], s[0:1] sc0 sc1
+; GFX942-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v5
+; GFX942-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v5, v4
+; GFX942-DPP-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX942-DPP-NEXT:    s_cbranch_execnz .LBB3_2
+; GFX942-DPP-NEXT:  .LBB3_3:
+; GFX942-DPP-NEXT:    s_endpgm
+;
 ; GFX1064-DPP-LABEL: global_atomic_fsub_uni_address_div_value_one_as_scope_unsafe_structfp:
 ; GFX1064-DPP:       ; %bb.0:
 ; GFX1064-DPP-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
@@ -2665,6 +3071,46 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_
 ; GFX9-NEXT:  .LBB4_3:
 ; GFX9-NEXT:    s_endpgm
 ;
+; GFX942-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_strictfp:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_bcnt1_i32_b64 s0, exec
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0x43300000
+; GFX942-NEXT:    scratch_store_dword off, v0, off offset:4
+; GFX942-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-NEXT:    scratch_store_dword off, v0, off
+; GFX942-NEXT:    scratch_load_dwordx2 v[0:1], off, off
+; GFX942-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX942-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v2
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX942-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942-NEXT:    s_cbranch_execz .LBB4_3
+; GFX942-NEXT:  ; %bb.1:
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    s_mov_b32 s4, 0
+; GFX942-NEXT:    s_mov_b32 s5, 0xc3300000
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    v_add_f64 v[0:1], v[0:1], s[4:5]
+; GFX942-NEXT:    v_cvt_f32_f64_e32 v0, v[0:1]
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_load_dword s6, s[0:1], 0x0
+; GFX942-NEXT:    s_mov_b64 s[2:3], 0
+; GFX942-NEXT:    v_mul_f32_e32 v2, 4.0, v0
+; GFX942-NEXT:    v_mov_b32_e32 v3, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b32_e32 v1, s6
+; GFX942-NEXT:  .LBB4_2: ; %atomicrmw.start
+; GFX942-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-NEXT:    v_sub_f32_e32 v0, v1, v2
+; GFX942-NEXT:    global_atomic_cmpswap v0, v3, v[0:1], s[0:1] sc0
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX942-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX942-NEXT:    v_mov_b32_e32 v1, v0
+; GFX942-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX942-NEXT:    s_cbranch_execnz .LBB4_2
+; GFX942-NEXT:  .LBB4_3:
+; GFX942-NEXT:    s_endpgm
+;
 ; GFX1064-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_strictfp:
 ; GFX1064:       ; %bb.0:
 ; GFX1064-NEXT:    s_mov_b64 s[0:1], exec
@@ -2915,6 +3361,46 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_
 ; GFX9-DPP-NEXT:  .LBB4_3:
 ; GFX9-DPP-NEXT:    s_endpgm
 ;
+; GFX942-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_strictfp:
+; GFX942-DPP:       ; %bb.0:
+; GFX942-DPP-NEXT:    s_bcnt1_i32_b64 s0, exec
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v0, 0x43300000
+; GFX942-DPP-NEXT:    scratch_store_dword off, v0, off offset:4
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-DPP-NEXT:    scratch_store_dword off, v0, off
+; GFX942-DPP-NEXT:    scratch_load_dwordx2 v[0:1], off, off
+; GFX942-DPP-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX942-DPP-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v2
+; GFX942-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX942-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942-DPP-NEXT:    s_cbranch_execz .LBB4_3
+; GFX942-DPP-NEXT:  ; %bb.1:
+; GFX942-DPP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-DPP-NEXT:    s_mov_b32 s4, 0
+; GFX942-DPP-NEXT:    s_mov_b32 s5, 0xc3300000
+; GFX942-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-DPP-NEXT:    v_add_f64 v[0:1], v[0:1], s[4:5]
+; GFX942-DPP-NEXT:    v_cvt_f32_f64_e32 v0, v[0:1]
+; GFX942-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-DPP-NEXT:    s_load_dword s6, s[0:1], 0x0
+; GFX942-DPP-NEXT:    s_mov_b64 s[2:3], 0
+; GFX942-DPP-NEXT:    v_mul_f32_e32 v2, 4.0, v0
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v3, 0
+; GFX942-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX942-DPP-NEXT:  .LBB4_2: ; %atomicrmw.start
+; GFX942-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-DPP-NEXT:    v_sub_f32_e32 v0, v1, v2
+; GFX942-DPP-NEXT:    global_atomic_cmpswap v0, v3, v[0:1], s[0:1] sc0
+; GFX942-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX942-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX942-DPP-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX942-DPP-NEXT:    s_cbranch_execnz .LBB4_2
+; GFX942-DPP-NEXT:  .LBB4_3:
+; GFX942-DPP-NEXT:    s_endpgm
+;
 ; GFX1064-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_strictfp:
 ; GFX1064-DPP:       ; %bb.0:
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[0:1], exec
@@ -3222,6 +3708,63 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ; GFX9-NEXT:  .LBB5_5:
 ; GFX9-NEXT:    s_endpgm
 ;
+; GFX942-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX942-NEXT:    s_mov_b32 s12, s8
+; GFX942-NEXT:    s_add_u32 s8, s34, 44
+; GFX942-NEXT:    s_mov_b32 s13, s9
+; GFX942-NEXT:    s_addc_u32 s9, s35, 0
+; GFX942-NEXT:    s_getpc_b64 s[4:5]
+; GFX942-NEXT:    s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
+; GFX942-NEXT:    s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
+; GFX942-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX942-NEXT:    s_mov_b32 s14, s10
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX942-NEXT:    v_mov_b32_e32 v31, v0
+; GFX942-NEXT:    s_mov_b32 s32, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX942-NEXT:    s_mov_b64 s[0:1], exec
+; GFX942-NEXT:    v_bfrev_b32_e32 v2, 1
+; GFX942-NEXT:  .LBB5_1: ; %ComputeLoop
+; GFX942-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-NEXT:    s_ff1_i32_b64 s2, s[0:1]
+; GFX942-NEXT:    v_readlane_b32 s4, v0, s2
+; GFX942-NEXT:    s_lshl_b64 s[2:3], 1, s2
+; GFX942-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX942-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX942-NEXT:    v_add_f32_e32 v2, s4, v2
+; GFX942-NEXT:    s_cbranch_scc1 .LBB5_1
+; GFX942-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX942-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX942-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX942-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
+; GFX942-NEXT:    s_cbranch_execz .LBB5_5
+; GFX942-NEXT:  ; %bb.3:
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v3, 0
+; GFX942-NEXT:    s_mov_b64 s[2:3], 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    global_load_dword v1, v3, s[0:1]
+; GFX942-NEXT:  .LBB5_4: ; %atomicrmw.start
+; GFX942-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    v_sub_f32_e32 v0, v1, v2
+; GFX942-NEXT:    global_atomic_cmpswap v0, v3, v[0:1], s[0:1] sc0
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX942-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX942-NEXT:    v_mov_b32_e32 v1, v0
+; GFX942-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX942-NEXT:    s_cbranch_execnz .LBB5_4
+; GFX942-NEXT:  .LBB5_5:
+; GFX942-NEXT:    s_endpgm
+;
 ; GFX1064-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe:
 ; GFX1064:       ; %bb.0:
 ; GFX1064-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
@@ -3609,6 +4152,79 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ; GFX9-DPP-NEXT:  .LBB5_3:
 ; GFX9-DPP-NEXT:    s_endpgm
 ;
+; GFX942-DPP-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe:
+; GFX942-DPP:       ; %bb.0:
+; GFX942-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX942-DPP-NEXT:    s_mov_b32 s12, s8
+; GFX942-DPP-NEXT:    s_add_u32 s8, s34, 44
+; GFX942-DPP-NEXT:    s_mov_b32 s13, s9
+; GFX942-DPP-NEXT:    s_addc_u32 s9, s35, 0
+; GFX942-DPP-NEXT:    s_getpc_b64 s[4:5]
+; GFX942-DPP-NEXT:    s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
+; GFX942-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
+; GFX942-DPP-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX942-DPP-NEXT:    s_mov_b32 s14, s10
+; GFX942-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX942-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v31, v0
+; GFX942-DPP-NEXT:    s_mov_b32 s32, 0
+; GFX942-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX942-DPP-NEXT:    v_mbcnt_lo_u32_b32 v4, exec_lo, 0
+; GFX942-DPP-NEXT:    v_mbcnt_hi_u32_b32 v4, exec_hi, v4
+; GFX942-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX942-DPP-NEXT:    v_bfrev_b32_e32 v1, 1
+; GFX942-DPP-NEXT:    v_cndmask_b32_e64 v2, v1, v0, s[0:1]
+; GFX942-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX942-DPP-NEXT:    s_nop 1
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v3, v2 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_add_f32_e32 v2, v2, v3
+; GFX942-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX942-DPP-NEXT:    s_nop 1
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v3, v2 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_add_f32_e32 v2, v2, v3
+; GFX942-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX942-DPP-NEXT:    s_nop 1
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v3, v2 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_add_f32_e32 v2, v2, v3
+; GFX942-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX942-DPP-NEXT:    s_nop 1
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v3, v2 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_add_f32_e32 v2, v2, v3
+; GFX942-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX942-DPP-NEXT:    s_nop 1
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v3, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX942-DPP-NEXT:    v_add_f32_e32 v2, v2, v3
+; GFX942-DPP-NEXT:    s_nop 1
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX942-DPP-NEXT:    v_add_f32_e32 v1, v2, v1
+; GFX942-DPP-NEXT:    s_nop 0
+; GFX942-DPP-NEXT:    v_readlane_b32 s4, v1, 63
+; GFX942-DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX942-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
+; GFX942-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942-DPP-NEXT:    s_cbranch_execz .LBB5_3
+; GFX942-DPP-NEXT:  ; %bb.1:
+; GFX942-DPP-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-DPP-NEXT:    s_mov_b64 s[2:3], 0
+; GFX942-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-DPP-NEXT:    global_load_dword v5, v0, s[0:1]
+; GFX942-DPP-NEXT:  .LBB5_2: ; %atomicrmw.start
+; GFX942-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-DPP-NEXT:    v_subrev_f32_e32 v4, s4, v5
+; GFX942-DPP-NEXT:    global_atomic_cmpswap v4, v0, v[4:5], s[0:1] sc0
+; GFX942-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v5
+; GFX942-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v5, v4
+; GFX942-DPP-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX942-DPP-NEXT:    s_cbranch_execnz .LBB5_2
+; GFX942-DPP-NEXT:  .LBB5_3:
+; GFX942-DPP-NEXT:    s_endpgm
+;
 ; GFX1064-DPP-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe:
 ; GFX1064-DPP:       ; %bb.0:
 ; GFX1064-DPP-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
@@ -4048,6 +4664,63 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ; GFX9-NEXT:  .LBB6_5:
 ; GFX9-NEXT:    s_endpgm
 ;
+; GFX942-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe_structfp:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX942-NEXT:    s_mov_b32 s12, s8
+; GFX942-NEXT:    s_add_u32 s8, s34, 44
+; GFX942-NEXT:    s_mov_b32 s13, s9
+; GFX942-NEXT:    s_addc_u32 s9, s35, 0
+; GFX942-NEXT:    s_getpc_b64 s[4:5]
+; GFX942-NEXT:    s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
+; GFX942-NEXT:    s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
+; GFX942-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX942-NEXT:    s_mov_b32 s14, s10
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX942-NEXT:    v_mov_b32_e32 v31, v0
+; GFX942-NEXT:    s_mov_b32 s32, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX942-NEXT:    s_mov_b64 s[0:1], exec
+; GFX942-NEXT:    v_bfrev_b32_e32 v2, 1
+; GFX942-NEXT:  .LBB6_1: ; %ComputeLoop
+; GFX942-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-NEXT:    s_ff1_i32_b64 s2, s[0:1]
+; GFX942-NEXT:    v_readlane_b32 s4, v0, s2
+; GFX942-NEXT:    s_lshl_b64 s[2:3], 1, s2
+; GFX942-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX942-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX942-NEXT:    v_add_f32_e32 v2, s4, v2
+; GFX942-NEXT:    s_cbranch_scc1 .LBB6_1
+; GFX942-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX942-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX942-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX942-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
+; GFX942-NEXT:    s_cbranch_execz .LBB6_5
+; GFX942-NEXT:  ; %bb.3:
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v3, 0
+; GFX942-NEXT:    s_mov_b64 s[2:3], 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    global_load_dword v1, v3, s[0:1]
+; GFX942-NEXT:  .LBB6_4: ; %atomicrmw.start
+; GFX942-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    v_sub_f32_e32 v0, v1, v2
+; GFX942-NEXT:    global_atomic_cmpswap v0, v3, v[0:1], s[0:1] sc0
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX942-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX942-NEXT:    v_mov_b32_e32 v1, v0
+; GFX942-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX942-NEXT:    s_cbranch_execnz .LBB6_4
+; GFX942-NEXT:  .LBB6_5:
+; GFX942-NEXT:    s_endpgm
+;
 ; GFX1064-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe_structfp:
 ; GFX1064:       ; %bb.0:
 ; GFX1064-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
@@ -4435,6 +5108,79 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_agent_scope_
 ; GFX9-DPP-NEXT:  .LBB6_3:
 ; GFX9-DPP-NEXT:    s_endpgm
 ;
+; GFX942-DPP-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe_structfp:
+; GFX942-DPP:       ; %bb.0:
+; GFX942-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX942-DPP-NEXT:    s_mov_b32 s12, s8
+; GFX942-DPP-NEXT:    s_add_u32 s8, s34, 44
+; GFX942-DPP-NEXT:    s_mov_b32 s13, s9
+; GFX942-DPP-NEXT:    s_addc_u32 s9, s35, 0
+; GFX942-DPP-NEXT:    s_getpc_b64 s[4:5]
+; GFX942-DPP-NEXT:    s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
+; GFX942-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
+; GFX942-DPP-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX942-DPP-NEXT:    s_mov_b32 s14, s10
+; GFX942-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX942-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v31, v0
+; GFX942-DPP-NEXT:    s_mov_b32 s32, 0
+; GFX942-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX942-DPP-NEXT:    v_mbcnt_lo_u32_b32 v4, exec_lo, 0
+; GFX942-DPP-NEXT:    v_mbcnt_hi_u32_b32 v4, exec_hi, v4
+; GFX942-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX942-DPP-NEXT:    v_bfrev_b32_e32 v1, 1
+; GFX942-DPP-NEXT:    v_cndmask_b32_e64 v2, v1, v0, s[0:1]
+; GFX942-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX942-DPP-NEXT:    s_nop 1
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v3, v2 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_add_f32_e32 v2, v2, v3
+; GFX942-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX942-DPP-NEXT:    s_nop 1
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v3, v2 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_add_f32_e32 v2, v2, v3
+; GFX942-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX942-DPP-NEXT:    s_nop 1
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v3, v2 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_add_f32_e32 v2, v2, v3
+; GFX942-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX942-DPP-NEXT:    s_nop 1
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v3, v2 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_add_f32_e32 v2, v2, v3
+; GFX942-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX942-DPP-NEXT:    s_nop 1
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v3, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX942-DPP-NEXT:    v_add_f32_e32 v2, v2, v3
+; GFX942-DPP-NEXT:    s_nop 1
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX942-DPP-NEXT:    v_add_f32_e32 v1, v2, v1
+; GFX942-DPP-NEXT:    s_nop 0
+; GFX942-DPP-NEXT:    v_readlane_b32 s4, v1, 63
+; GFX942-DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX942-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
+; GFX942-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942-DPP-NEXT:    s_cbranch_execz .LBB6_3
+; GFX942-DPP-NEXT:  ; %bb.1:
+; GFX942-DPP-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-DPP-NEXT:    s_mov_b64 s[2:3], 0
+; GFX942-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-DPP-NEXT:    global_load_dword v5, v0, s[0:1]
+; GFX942-DPP-NEXT:  .LBB6_2: ; %atomicrmw.start
+; GFX942-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-DPP-NEXT:    v_subrev_f32_e32 v4, s4, v5
+; GFX942-DPP-NEXT:    global_atomic_cmpswap v4, v0, v[4:5], s[0:1] sc0
+; GFX942-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v5
+; GFX942-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v5, v4
+; GFX942-DPP-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX942-DPP-NEXT:    s_cbranch_execnz .LBB6_2
+; GFX942-DPP-NEXT:  .LBB6_3:
+; GFX942-DPP-NEXT:    s_endpgm
+;
 ; GFX1064-DPP-LABEL: global_atomic_fsub_uni_address_div_value_agent_scope_unsafe_structfp:
 ; GFX1064-DPP:       ; %bb.0:
 ; GFX1064-DPP-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
@@ -4821,6 +5567,46 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop
 ; GFX9-NEXT:  .LBB7_3:
 ; GFX9-NEXT:    s_endpgm
 ;
+; GFX942-LABEL: global_atomic_fsub_uni_address_uni_value_default_scope_strictfp:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_bcnt1_i32_b64 s0, exec
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0x43300000
+; GFX942-NEXT:    scratch_store_dword off, v0, off offset:4
+; GFX942-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-NEXT:    scratch_store_dword off, v0, off
+; GFX942-NEXT:    scratch_load_dwordx2 v[0:1], off, off
+; GFX942-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX942-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v2
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX942-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942-NEXT:    s_cbranch_execz .LBB7_3
+; GFX942-NEXT:  ; %bb.1:
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    s_mov_b32 s4, 0
+; GFX942-NEXT:    s_mov_b32 s5, 0xc3300000
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    v_add_f64 v[0:1], v[0:1], s[4:5]
+; GFX942-NEXT:    v_cvt_f32_f64_e32 v0, v[0:1]
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_load_dword s6, s[0:1], 0x0
+; GFX942-NEXT:    s_mov_b64 s[2:3], 0
+; GFX942-NEXT:    v_mul_f32_e32 v2, 4.0, v0
+; GFX942-NEXT:    v_mov_b32_e32 v3, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b32_e32 v1, s6
+; GFX942-NEXT:  .LBB7_2: ; %atomicrmw.start
+; GFX942-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-NEXT:    v_sub_f32_e32 v0, v1, v2
+; GFX942-NEXT:    global_atomic_cmpswap v0, v3, v[0:1], s[0:1] sc0 sc1
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX942-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX942-NEXT:    v_mov_b32_e32 v1, v0
+; GFX942-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX942-NEXT:    s_cbranch_execnz .LBB7_2
+; GFX942-NEXT:  .LBB7_3:
+; GFX942-NEXT:    s_endpgm
+;
 ; GFX1064-LABEL: global_atomic_fsub_uni_address_uni_value_default_scope_strictfp:
 ; GFX1064:       ; %bb.0:
 ; GFX1064-NEXT:    s_mov_b64 s[0:1], exec
@@ -5071,6 +5857,46 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_default_scop
 ; GFX9-DPP-NEXT:  .LBB7_3:
 ; GFX9-DPP-NEXT:    s_endpgm
 ;
+; GFX942-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_default_scope_strictfp:
+; GFX942-DPP:       ; %bb.0:
+; GFX942-DPP-NEXT:    s_bcnt1_i32_b64 s0, exec
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v0, 0x43300000
+; GFX942-DPP-NEXT:    scratch_store_dword off, v0, off offset:4
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-DPP-NEXT:    scratch_store_dword off, v0, off
+; GFX942-DPP-NEXT:    scratch_load_dwordx2 v[0:1], off, off
+; GFX942-DPP-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX942-DPP-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v2
+; GFX942-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX942-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942-DPP-NEXT:    s_cbranch_execz .LBB7_3
+; GFX942-DPP-NEXT:  ; %bb.1:
+; GFX942-DPP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-DPP-NEXT:    s_mov_b32 s4, 0
+; GFX942-DPP-NEXT:    s_mov_b32 s5, 0xc3300000
+; GFX942-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-DPP-NEXT:    v_add_f64 v[0:1], v[0:1], s[4:5]
+; GFX942-DPP-NEXT:    v_cvt_f32_f64_e32 v0, v[0:1]
+; GFX942-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-DPP-NEXT:    s_load_dword s6, s[0:1], 0x0
+; GFX942-DPP-NEXT:    s_mov_b64 s[2:3], 0
+; GFX942-DPP-NEXT:    v_mul_f32_e32 v2, 4.0, v0
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v3, 0
+; GFX942-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v1, s6
+; GFX942-DPP-NEXT:  .LBB7_2: ; %atomicrmw.start
+; GFX942-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-DPP-NEXT:    v_sub_f32_e32 v0, v1, v2
+; GFX942-DPP-NEXT:    global_atomic_cmpswap v0, v3, v[0:1], s[0:1] sc0 sc1
+; GFX942-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX942-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v1, v0
+; GFX942-DPP-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX942-DPP-NEXT:    s_cbranch_execnz .LBB7_2
+; GFX942-DPP-NEXT:  .LBB7_3:
+; GFX942-DPP-NEXT:    s_endpgm
+;
 ; GFX1064-DPP-LABEL: global_atomic_fsub_uni_address_uni_value_default_scope_strictfp:
 ; GFX1064-DPP:       ; %bb.0:
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[0:1], exec
@@ -5377,6 +6203,63 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop
 ; GFX9-NEXT:  .LBB8_5:
 ; GFX9-NEXT:    s_endpgm
 ;
+; GFX942-LABEL: global_atomic_fsub_uni_address_div_value_default_scope_strictfp:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX942-NEXT:    s_mov_b32 s12, s8
+; GFX942-NEXT:    s_add_u32 s8, s34, 44
+; GFX942-NEXT:    s_mov_b32 s13, s9
+; GFX942-NEXT:    s_addc_u32 s9, s35, 0
+; GFX942-NEXT:    s_getpc_b64 s[4:5]
+; GFX942-NEXT:    s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
+; GFX942-NEXT:    s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
+; GFX942-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX942-NEXT:    s_mov_b32 s14, s10
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX942-NEXT:    v_mov_b32_e32 v31, v0
+; GFX942-NEXT:    s_mov_b32 s32, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX942-NEXT:    s_mov_b64 s[0:1], exec
+; GFX942-NEXT:    v_bfrev_b32_e32 v2, 1
+; GFX942-NEXT:  .LBB8_1: ; %ComputeLoop
+; GFX942-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-NEXT:    s_ff1_i32_b64 s2, s[0:1]
+; GFX942-NEXT:    v_readlane_b32 s4, v0, s2
+; GFX942-NEXT:    s_lshl_b64 s[2:3], 1, s2
+; GFX942-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[2:3]
+; GFX942-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX942-NEXT:    v_add_f32_e32 v2, s4, v2
+; GFX942-NEXT:    s_cbranch_scc1 .LBB8_1
+; GFX942-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX942-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX942-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX942-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
+; GFX942-NEXT:    s_cbranch_execz .LBB8_5
+; GFX942-NEXT:  ; %bb.3:
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v3, 0
+; GFX942-NEXT:    s_mov_b64 s[2:3], 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    global_load_dword v1, v3, s[0:1]
+; GFX942-NEXT:  .LBB8_4: ; %atomicrmw.start
+; GFX942-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    v_sub_f32_e32 v0, v1, v2
+; GFX942-NEXT:    global_atomic_cmpswap v0, v3, v[0:1], s[0:1] sc0 sc1
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX942-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX942-NEXT:    v_mov_b32_e32 v1, v0
+; GFX942-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX942-NEXT:    s_cbranch_execnz .LBB8_4
+; GFX942-NEXT:  .LBB8_5:
+; GFX942-NEXT:    s_endpgm
+;
 ; GFX1064-LABEL: global_atomic_fsub_uni_address_div_value_default_scope_strictfp:
 ; GFX1064:       ; %bb.0:
 ; GFX1064-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
@@ -5764,6 +6647,79 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_default_scop
 ; GFX9-DPP-NEXT:  .LBB8_3:
 ; GFX9-DPP-NEXT:    s_endpgm
 ;
+; GFX942-DPP-LABEL: global_atomic_fsub_uni_address_div_value_default_scope_strictfp:
+; GFX942-DPP:       ; %bb.0:
+; GFX942-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX942-DPP-NEXT:    s_mov_b32 s12, s8
+; GFX942-DPP-NEXT:    s_add_u32 s8, s34, 44
+; GFX942-DPP-NEXT:    s_mov_b32 s13, s9
+; GFX942-DPP-NEXT:    s_addc_u32 s9, s35, 0
+; GFX942-DPP-NEXT:    s_getpc_b64 s[4:5]
+; GFX942-DPP-NEXT:    s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
+; GFX942-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
+; GFX942-DPP-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX942-DPP-NEXT:    s_mov_b32 s14, s10
+; GFX942-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX942-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v31, v0
+; GFX942-DPP-NEXT:    s_mov_b32 s32, 0
+; GFX942-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX942-DPP-NEXT:    v_mbcnt_lo_u32_b32 v4, exec_lo, 0
+; GFX942-DPP-NEXT:    v_mbcnt_hi_u32_b32 v4, exec_hi, v4
+; GFX942-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX942-DPP-NEXT:    v_bfrev_b32_e32 v1, 1
+; GFX942-DPP-NEXT:    v_cndmask_b32_e64 v2, v1, v0, s[0:1]
+; GFX942-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX942-DPP-NEXT:    s_nop 1
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v3, v2 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_add_f32_e32 v2, v2, v3
+; GFX942-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX942-DPP-NEXT:    s_nop 1
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v3, v2 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_add_f32_e32 v2, v2, v3
+; GFX942-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX942-DPP-NEXT:    s_nop 1
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v3, v2 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_add_f32_e32 v2, v2, v3
+; GFX942-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX942-DPP-NEXT:    s_nop 1
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v3, v2 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_add_f32_e32 v2, v2, v3
+; GFX942-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX942-DPP-NEXT:    s_nop 1
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v3, v2 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX942-DPP-NEXT:    v_add_f32_e32 v2, v2, v3
+; GFX942-DPP-NEXT:    s_nop 1
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v1, v2 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX942-DPP-NEXT:    v_add_f32_e32 v1, v2, v1
+; GFX942-DPP-NEXT:    s_nop 0
+; GFX942-DPP-NEXT:    v_readlane_b32 s4, v1, 63
+; GFX942-DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX942-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v4
+; GFX942-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942-DPP-NEXT:    s_cbranch_execz .LBB8_3
+; GFX942-DPP-NEXT:  ; %bb.1:
+; GFX942-DPP-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-DPP-NEXT:    s_mov_b64 s[2:3], 0
+; GFX942-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-DPP-NEXT:    global_load_dword v5, v0, s[0:1]
+; GFX942-DPP-NEXT:  .LBB8_2: ; %atomicrmw.start
+; GFX942-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-DPP-NEXT:    v_subrev_f32_e32 v4, s4, v5
+; GFX942-DPP-NEXT:    global_atomic_cmpswap v4, v0, v[4:5], s[0:1] sc0 sc1
+; GFX942-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v5
+; GFX942-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v5, v4
+; GFX942-DPP-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX942-DPP-NEXT:    s_cbranch_execnz .LBB8_2
+; GFX942-DPP-NEXT:  .LBB8_3:
+; GFX942-DPP-NEXT:    s_endpgm
+;
 ; GFX1064-DPP-LABEL: global_atomic_fsub_uni_address_div_value_default_scope_strictfp:
 ; GFX1064-DPP:       ; %bb.0:
 ; GFX1064-DPP-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
@@ -6136,6 +7092,38 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent
 ; GFX9-NEXT:  .LBB9_3:
 ; GFX9-NEXT:    s_endpgm
 ;
+; GFX942-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_mov_b64 s[2:3], exec
+; GFX942-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX942-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX942-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942-NEXT:    s_cbranch_execz .LBB9_3
+; GFX942-NEXT:  ; %bb.1:
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    s_bcnt1_i32_b64 s6, s[2:3]
+; GFX942-NEXT:    v_cvt_f64_u32_e32 v[0:1], s6
+; GFX942-NEXT:    s_mov_b64 s[2:3], 0
+; GFX942-NEXT:    v_mul_f64 v[4:5], v[0:1], 4.0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX942-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], s[4:5]
+; GFX942-NEXT:  .LBB9_2: ; %atomicrmw.start
+; GFX942-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-NEXT:    v_add_f64 v[0:1], v[2:3], -v[4:5]
+; GFX942-NEXT:    global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] sc0
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX942-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[0:1]
+; GFX942-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX942-NEXT:    s_cbranch_execnz .LBB9_2
+; GFX942-NEXT:  .LBB9_3:
+; GFX942-NEXT:    s_endpgm
+;
 ; GFX1064-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe:
 ; GFX1064:       ; %bb.0:
 ; GFX1064-NEXT:    s_mov_b64 s[2:3], exec
@@ -6350,6 +7338,38 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent
 ; GFX9-DPP-NEXT:  .LBB9_3:
 ; GFX9-DPP-NEXT:    s_endpgm
 ;
+; GFX942-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe:
+; GFX942-DPP:       ; %bb.0:
+; GFX942-DPP-NEXT:    s_mov_b64 s[2:3], exec
+; GFX942-DPP-NEXT:    v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX942-DPP-NEXT:    v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX942-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX942-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942-DPP-NEXT:    s_cbranch_execz .LBB9_3
+; GFX942-DPP-NEXT:  ; %bb.1:
+; GFX942-DPP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-DPP-NEXT:    s_bcnt1_i32_b64 s6, s[2:3]
+; GFX942-DPP-NEXT:    v_cvt_f64_u32_e32 v[0:1], s6
+; GFX942-DPP-NEXT:    s_mov_b64 s[2:3], 0
+; GFX942-DPP-NEXT:    v_mul_f64 v[4:5], v[0:1], 4.0
+; GFX942-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-DPP-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-DPP-NEXT:    v_mov_b64_e32 v[2:3], s[4:5]
+; GFX942-DPP-NEXT:  .LBB9_2: ; %atomicrmw.start
+; GFX942-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-DPP-NEXT:    v_add_f64 v[0:1], v[2:3], -v[4:5]
+; GFX942-DPP-NEXT:    global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] sc0
+; GFX942-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-DPP-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX942-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX942-DPP-NEXT:    v_mov_b64_e32 v[2:3], v[0:1]
+; GFX942-DPP-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX942-DPP-NEXT:    s_cbranch_execnz .LBB9_2
+; GFX942-DPP-NEXT:  .LBB9_3:
+; GFX942-DPP-NEXT:    s_endpgm
+;
 ; GFX1064-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe:
 ; GFX1064-DPP:       ; %bb.0:
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[2:3], exec
@@ -6642,6 +7662,65 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
 ; GFX9-NEXT:  .LBB10_5:
 ; GFX9-NEXT:    s_endpgm
 ;
+; GFX942-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX942-NEXT:    s_mov_b32 s12, s8
+; GFX942-NEXT:    s_add_u32 s8, s34, 44
+; GFX942-NEXT:    s_mov_b32 s13, s9
+; GFX942-NEXT:    s_addc_u32 s9, s35, 0
+; GFX942-NEXT:    s_getpc_b64 s[4:5]
+; GFX942-NEXT:    s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
+; GFX942-NEXT:    s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
+; GFX942-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX942-NEXT:    s_mov_b32 s14, s10
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX942-NEXT:    v_mov_b32_e32 v31, v0
+; GFX942-NEXT:    s_mov_b32 s32, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX942-NEXT:    v_mov_b32_e32 v4, 0
+; GFX942-NEXT:    s_mov_b64 s[0:1], exec
+; GFX942-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX942-NEXT:  .LBB10_1: ; %ComputeLoop
+; GFX942-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-NEXT:    s_ff1_i32_b64 s4, s[0:1]
+; GFX942-NEXT:    v_readlane_b32 s3, v1, s4
+; GFX942-NEXT:    v_readlane_b32 s2, v0, s4
+; GFX942-NEXT:    s_lshl_b64 s[4:5], 1, s4
+; GFX942-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[4:5]
+; GFX942-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX942-NEXT:    v_add_f64 v[4:5], v[4:5], s[2:3]
+; GFX942-NEXT:    s_cbranch_scc1 .LBB10_1
+; GFX942-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX942-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX942-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX942-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
+; GFX942-NEXT:    s_cbranch_execz .LBB10_5
+; GFX942-NEXT:  ; %bb.3:
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942-NEXT:    s_mov_b64 s[2:3], 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    global_load_dwordx2 v[2:3], v6, s[0:1]
+; GFX942-NEXT:  .LBB10_4: ; %atomicrmw.start
+; GFX942-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    v_add_f64 v[0:1], v[2:3], -v[4:5]
+; GFX942-NEXT:    global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] sc0
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX942-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[0:1]
+; GFX942-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX942-NEXT:    s_cbranch_execnz .LBB10_4
+; GFX942-NEXT:  .LBB10_5:
+; GFX942-NEXT:    s_endpgm
+;
 ; GFX1064-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe:
 ; GFX1064:       ; %bb.0:
 ; GFX1064-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
@@ -7061,6 +8140,93 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
 ; GFX9-DPP-NEXT:  .LBB10_3:
 ; GFX9-DPP-NEXT:    s_endpgm
 ;
+; GFX942-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe:
+; GFX942-DPP:       ; %bb.0:
+; GFX942-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX942-DPP-NEXT:    s_mov_b32 s12, s8
+; GFX942-DPP-NEXT:    s_add_u32 s8, s34, 44
+; GFX942-DPP-NEXT:    s_mov_b32 s13, s9
+; GFX942-DPP-NEXT:    s_addc_u32 s9, s35, 0
+; GFX942-DPP-NEXT:    s_getpc_b64 s[4:5]
+; GFX942-DPP-NEXT:    s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
+; GFX942-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
+; GFX942-DPP-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX942-DPP-NEXT:    s_mov_b32 s14, s10
+; GFX942-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX942-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v31, v0
+; GFX942-DPP-NEXT:    s_mov_b32 s32, 0
+; GFX942-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v12, 0
+; GFX942-DPP-NEXT:    v_mbcnt_lo_u32_b32 v8, exec_lo, 0
+; GFX942-DPP-NEXT:    v_mbcnt_hi_u32_b32 v8, exec_hi, v8
+; GFX942-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX942-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX942-DPP-NEXT:    v_cndmask_b32_e64 v5, v3, v1, s[0:1]
+; GFX942-DPP-NEXT:    v_cndmask_b32_e64 v4, 0, v0, s[0:1]
+; GFX942-DPP-NEXT:    v_bfrev_b32_e32 v7, 1
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX942-DPP-NEXT:    v_bfrev_b32_e32 v7, 1
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942-DPP-NEXT:    s_nop 0
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX942-DPP-NEXT:    v_bfrev_b32_e32 v7, 1
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942-DPP-NEXT:    s_nop 0
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX942-DPP-NEXT:    v_bfrev_b32_e32 v7, 1
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942-DPP-NEXT:    s_nop 0
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX942-DPP-NEXT:    v_bfrev_b32_e32 v7, 1
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942-DPP-NEXT:    s_nop 0
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v6, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX942-DPP-NEXT:    v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX942-DPP-NEXT:    s_nop 1
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v3, v5 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v2, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX942-DPP-NEXT:    v_add_f64 v[2:3], v[4:5], v[2:3]
+; GFX942-DPP-NEXT:    s_nop 0
+; GFX942-DPP-NEXT:    v_readlane_b32 s3, v3, 63
+; GFX942-DPP-NEXT:    v_readlane_b32 s2, v2, 63
+; GFX942-DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX942-DPP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX942-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v8
+; GFX942-DPP-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX942-DPP-NEXT:    s_cbranch_execz .LBB10_3
+; GFX942-DPP-NEXT:  ; %bb.1:
+; GFX942-DPP-NEXT:    s_load_dwordx2 s[2:3], s[34:35], 0x24
+; GFX942-DPP-NEXT:    s_mov_b64 s[4:5], 0
+; GFX942-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-DPP-NEXT:    global_load_dwordx2 v[10:11], v12, s[2:3]
+; GFX942-DPP-NEXT:  .LBB10_2: ; %atomicrmw.start
+; GFX942-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-DPP-NEXT:    v_add_f64 v[8:9], v[10:11], -s[0:1]
+; GFX942-DPP-NEXT:    global_atomic_cmpswap_x2 v[0:1], v12, v[8:11], s[2:3] sc0
+; GFX942-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-DPP-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
+; GFX942-DPP-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX942-DPP-NEXT:    v_mov_b64_e32 v[10:11], v[0:1]
+; GFX942-DPP-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX942-DPP-NEXT:    s_cbranch_execnz .LBB10_2
+; GFX942-DPP-NEXT:  .LBB10_3:
+; GFX942-DPP-NEXT:    s_endpgm
+;
 ; GFX1064-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe:
 ; GFX1064-DPP:       ; %bb.0:
 ; GFX1064-DPP-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
@@ -7501,6 +8667,45 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a
 ; GFX9-NEXT:  .LBB11_3:
 ; GFX9-NEXT:    s_endpgm
 ;
+; GFX942-LABEL: global_atomic_fsub_double_uni_address_uni_value_one_as_scope_unsafe_structfp:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_bcnt1_i32_b64 s0, exec
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0x43300000
+; GFX942-NEXT:    scratch_store_dword off, v0, off offset:4
+; GFX942-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-NEXT:    scratch_store_dword off, v0, off
+; GFX942-NEXT:    scratch_load_dwordx2 v[0:1], off, off
+; GFX942-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX942-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v2
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX942-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942-NEXT:    s_cbranch_execz .LBB11_3
+; GFX942-NEXT:  ; %bb.1:
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    s_mov_b32 s6, 0
+; GFX942-NEXT:    s_mov_b32 s7, 0xc3300000
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    v_add_f64 v[0:1], v[0:1], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[2:3], 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX942-NEXT:    v_mul_f64 v[4:5], 4.0, v[0:1]
+; GFX942-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], s[4:5]
+; GFX942-NEXT:  .LBB11_2: ; %atomicrmw.start
+; GFX942-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-NEXT:    v_add_f64 v[0:1], v[2:3], -v[4:5]
+; GFX942-NEXT:    global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] sc0 sc1
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX942-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[0:1]
+; GFX942-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX942-NEXT:    s_cbranch_execnz .LBB11_2
+; GFX942-NEXT:  .LBB11_3:
+; GFX942-NEXT:    s_endpgm
+;
 ; GFX1064-LABEL: global_atomic_fsub_double_uni_address_uni_value_one_as_scope_unsafe_structfp:
 ; GFX1064:       ; %bb.0:
 ; GFX1064-NEXT:    s_mov_b64 s[0:1], exec
@@ -7758,6 +8963,45 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_a
 ; GFX9-DPP-NEXT:  .LBB11_3:
 ; GFX9-DPP-NEXT:    s_endpgm
 ;
+; GFX942-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_one_as_scope_unsafe_structfp:
+; GFX942-DPP:       ; %bb.0:
+; GFX942-DPP-NEXT:    s_bcnt1_i32_b64 s0, exec
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v0, 0x43300000
+; GFX942-DPP-NEXT:    scratch_store_dword off, v0, off offset:4
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-DPP-NEXT:    scratch_store_dword off, v0, off
+; GFX942-DPP-NEXT:    scratch_load_dwordx2 v[0:1], off, off
+; GFX942-DPP-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX942-DPP-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v2
+; GFX942-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX942-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942-DPP-NEXT:    s_cbranch_execz .LBB11_3
+; GFX942-DPP-NEXT:  ; %bb.1:
+; GFX942-DPP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-DPP-NEXT:    s_mov_b32 s6, 0
+; GFX942-DPP-NEXT:    s_mov_b32 s7, 0xc3300000
+; GFX942-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-DPP-NEXT:    v_add_f64 v[0:1], v[0:1], s[6:7]
+; GFX942-DPP-NEXT:    s_mov_b64 s[2:3], 0
+; GFX942-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-DPP-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX942-DPP-NEXT:    v_mul_f64 v[4:5], 4.0, v[0:1]
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-DPP-NEXT:    v_mov_b64_e32 v[2:3], s[4:5]
+; GFX942-DPP-NEXT:  .LBB11_2: ; %atomicrmw.start
+; GFX942-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-DPP-NEXT:    v_add_f64 v[0:1], v[2:3], -v[4:5]
+; GFX942-DPP-NEXT:    global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] sc0 sc1
+; GFX942-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-DPP-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX942-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX942-DPP-NEXT:    v_mov_b64_e32 v[2:3], v[0:1]
+; GFX942-DPP-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX942-DPP-NEXT:    s_cbranch_execnz .LBB11_2
+; GFX942-DPP-NEXT:  .LBB11_3:
+; GFX942-DPP-NEXT:    s_endpgm
+;
 ; GFX1064-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_one_as_scope_unsafe_structfp:
 ; GFX1064-DPP:       ; %bb.0:
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[0:1], exec
@@ -8074,6 +9318,65 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a
 ; GFX9-NEXT:  .LBB12_5:
 ; GFX9-NEXT:    s_endpgm
 ;
+; GFX942-LABEL: global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_structfp:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX942-NEXT:    s_mov_b32 s12, s8
+; GFX942-NEXT:    s_add_u32 s8, s34, 44
+; GFX942-NEXT:    s_mov_b32 s13, s9
+; GFX942-NEXT:    s_addc_u32 s9, s35, 0
+; GFX942-NEXT:    s_getpc_b64 s[4:5]
+; GFX942-NEXT:    s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
+; GFX942-NEXT:    s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
+; GFX942-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX942-NEXT:    s_mov_b32 s14, s10
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX942-NEXT:    v_mov_b32_e32 v31, v0
+; GFX942-NEXT:    s_mov_b32 s32, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX942-NEXT:    v_mov_b32_e32 v4, 0
+; GFX942-NEXT:    s_mov_b64 s[0:1], exec
+; GFX942-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX942-NEXT:  .LBB12_1: ; %ComputeLoop
+; GFX942-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-NEXT:    s_ff1_i32_b64 s4, s[0:1]
+; GFX942-NEXT:    v_readlane_b32 s3, v1, s4
+; GFX942-NEXT:    v_readlane_b32 s2, v0, s4
+; GFX942-NEXT:    s_lshl_b64 s[4:5], 1, s4
+; GFX942-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[4:5]
+; GFX942-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX942-NEXT:    v_add_f64 v[4:5], v[4:5], s[2:3]
+; GFX942-NEXT:    s_cbranch_scc1 .LBB12_1
+; GFX942-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX942-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX942-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX942-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
+; GFX942-NEXT:    s_cbranch_execz .LBB12_5
+; GFX942-NEXT:  ; %bb.3:
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942-NEXT:    s_mov_b64 s[2:3], 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    global_load_dwordx2 v[2:3], v6, s[0:1]
+; GFX942-NEXT:  .LBB12_4: ; %atomicrmw.start
+; GFX942-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    v_add_f64 v[0:1], v[2:3], -v[4:5]
+; GFX942-NEXT:    global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] sc0 sc1
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX942-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[0:1]
+; GFX942-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX942-NEXT:    s_cbranch_execnz .LBB12_4
+; GFX942-NEXT:  .LBB12_5:
+; GFX942-NEXT:    s_endpgm
+;
 ; GFX1064-LABEL: global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_structfp:
 ; GFX1064:       ; %bb.0:
 ; GFX1064-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
@@ -8493,6 +9796,93 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_a
 ; GFX9-DPP-NEXT:  .LBB12_3:
 ; GFX9-DPP-NEXT:    s_endpgm
 ;
+; GFX942-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_structfp:
+; GFX942-DPP:       ; %bb.0:
+; GFX942-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX942-DPP-NEXT:    s_mov_b32 s12, s8
+; GFX942-DPP-NEXT:    s_add_u32 s8, s34, 44
+; GFX942-DPP-NEXT:    s_mov_b32 s13, s9
+; GFX942-DPP-NEXT:    s_addc_u32 s9, s35, 0
+; GFX942-DPP-NEXT:    s_getpc_b64 s[4:5]
+; GFX942-DPP-NEXT:    s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
+; GFX942-DPP-NEXT:    s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
+; GFX942-DPP-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX942-DPP-NEXT:    s_mov_b32 s14, s10
+; GFX942-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX942-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v31, v0
+; GFX942-DPP-NEXT:    s_mov_b32 s32, 0
+; GFX942-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v12, 0
+; GFX942-DPP-NEXT:    v_mbcnt_lo_u32_b32 v8, exec_lo, 0
+; GFX942-DPP-NEXT:    v_mbcnt_hi_u32_b32 v8, exec_hi, v8
+; GFX942-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX942-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX942-DPP-NEXT:    v_cndmask_b32_e64 v5, v3, v1, s[0:1]
+; GFX942-DPP-NEXT:    v_cndmask_b32_e64 v4, 0, v0, s[0:1]
+; GFX942-DPP-NEXT:    v_bfrev_b32_e32 v7, 1
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX942-DPP-NEXT:    v_bfrev_b32_e32 v7, 1
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942-DPP-NEXT:    s_nop 0
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX942-DPP-NEXT:    v_bfrev_b32_e32 v7, 1
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942-DPP-NEXT:    s_nop 0
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX942-DPP-NEXT:    v_bfrev_b32_e32 v7, 1
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942-DPP-NEXT:    s_nop 0
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX942-DPP-NEXT:    v_bfrev_b32_e32 v7, 1
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942-DPP-NEXT:    s_nop 0
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v6, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX942-DPP-NEXT:    v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX942-DPP-NEXT:    s_nop 1
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v3, v5 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v2, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX942-DPP-NEXT:    v_add_f64 v[2:3], v[4:5], v[2:3]
+; GFX942-DPP-NEXT:    s_nop 0
+; GFX942-DPP-NEXT:    v_readlane_b32 s3, v3, 63
+; GFX942-DPP-NEXT:    v_readlane_b32 s2, v2, 63
+; GFX942-DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX942-DPP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX942-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v8
+; GFX942-DPP-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX942-DPP-NEXT:    s_cbranch_execz .LBB12_3
+; GFX942-DPP-NEXT:  ; %bb.1:
+; GFX942-DPP-NEXT:    s_load_dwordx2 s[2:3], s[34:35], 0x24
+; GFX942-DPP-NEXT:    s_mov_b64 s[4:5], 0
+; GFX942-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-DPP-NEXT:    global_load_dwordx2 v[10:11], v12, s[2:3]
+; GFX942-DPP-NEXT:  .LBB12_2: ; %atomicrmw.start
+; GFX942-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-DPP-NEXT:    v_add_f64 v[8:9], v[10:11], -s[0:1]
+; GFX942-DPP-NEXT:    global_atomic_cmpswap_x2 v[0:1], v12, v[8:11], s[2:3] sc0 sc1
+; GFX942-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-DPP-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
+; GFX942-DPP-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX942-DPP-NEXT:    v_mov_b64_e32 v[10:11], v[0:1]
+; GFX942-DPP-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX942-DPP-NEXT:    s_cbranch_execnz .LBB12_2
+; GFX942-DPP-NEXT:  .LBB12_3:
+; GFX942-DPP-NEXT:    s_endpgm
+;
 ; GFX1064-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_structfp:
 ; GFX1064-DPP:       ; %bb.0:
 ; GFX1064-DPP-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
@@ -8933,6 +10323,45 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent
 ; GFX9-NEXT:  .LBB13_3:
 ; GFX9-NEXT:    s_endpgm
 ;
+; GFX942-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_strictfp:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_bcnt1_i32_b64 s0, exec
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0x43300000
+; GFX942-NEXT:    scratch_store_dword off, v0, off offset:4
+; GFX942-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-NEXT:    scratch_store_dword off, v0, off
+; GFX942-NEXT:    scratch_load_dwordx2 v[0:1], off, off
+; GFX942-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX942-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v2
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX942-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942-NEXT:    s_cbranch_execz .LBB13_3
+; GFX942-NEXT:  ; %bb.1:
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    s_mov_b32 s6, 0
+; GFX942-NEXT:    s_mov_b32 s7, 0xc3300000
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    v_add_f64 v[0:1], v[0:1], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[2:3], 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX942-NEXT:    v_mul_f64 v[4:5], 4.0, v[0:1]
+; GFX942-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], s[4:5]
+; GFX942-NEXT:  .LBB13_2: ; %atomicrmw.start
+; GFX942-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-NEXT:    v_add_f64 v[0:1], v[2:3], -v[4:5]
+; GFX942-NEXT:    global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] sc0
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX942-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[0:1]
+; GFX942-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX942-NEXT:    s_cbranch_execnz .LBB13_2
+; GFX942-NEXT:  .LBB13_3:
+; GFX942-NEXT:    s_endpgm
+;
 ; GFX1064-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_strictfp:
 ; GFX1064:       ; %bb.0:
 ; GFX1064-NEXT:    s_mov_b64 s[0:1], exec
@@ -9190,6 +10619,45 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent
 ; GFX9-DPP-NEXT:  .LBB13_3:
 ; GFX9-DPP-NEXT:    s_endpgm
 ;
+; GFX942-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_strictfp:
+; GFX942-DPP:       ; %bb.0:
+; GFX942-DPP-NEXT:    s_bcnt1_i32_b64 s0, exec
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v0, 0x43300000
+; GFX942-DPP-NEXT:    scratch_store_dword off, v0, off offset:4
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-DPP-NEXT:    scratch_store_dword off, v0, off
+; GFX942-DPP-NEXT:    scratch_load_dwordx2 v[0:1], off, off
+; GFX942-DPP-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX942-DPP-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v2
+; GFX942-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX942-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942-DPP-NEXT:    s_cbranch_execz .LBB13_3
+; GFX942-DPP-NEXT:  ; %bb.1:
+; GFX942-DPP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-DPP-NEXT:    s_mov_b32 s6, 0
+; GFX942-DPP-NEXT:    s_mov_b32 s7, 0xc3300000
+; GFX942-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-DPP-NEXT:    v_add_f64 v[0:1], v[0:1], s[6:7]
+; GFX942-DPP-NEXT:    s_mov_b64 s[2:3], 0
+; GFX942-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-DPP-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX942-DPP-NEXT:    v_mul_f64 v[4:5], 4.0, v[0:1]
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-DPP-NEXT:    v_mov_b64_e32 v[2:3], s[4:5]
+; GFX942-DPP-NEXT:  .LBB13_2: ; %atomicrmw.start
+; GFX942-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-DPP-NEXT:    v_add_f64 v[0:1], v[2:3], -v[4:5]
+; GFX942-DPP-NEXT:    global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] sc0
+; GFX942-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-DPP-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX942-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX942-DPP-NEXT:    v_mov_b64_e32 v[2:3], v[0:1]
+; GFX942-DPP-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX942-DPP-NEXT:    s_cbranch_execnz .LBB13_2
+; GFX942-DPP-NEXT:  .LBB13_3:
+; GFX942-DPP-NEXT:    s_endpgm
+;
 ; GFX1064-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_strictfp:
 ; GFX1064-DPP:       ; %bb.0:
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[0:1], exec
@@ -9507,6 +10975,65 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
 ; GFX9-NEXT:  .LBB14_5:
 ; GFX9-NEXT:    s_endpgm
 ;
+; GFX942-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX942-NEXT:    s_mov_b32 s12, s8
+; GFX942-NEXT:    s_add_u32 s8, s34, 44
+; GFX942-NEXT:    s_mov_b32 s13, s9
+; GFX942-NEXT:    s_addc_u32 s9, s35, 0
+; GFX942-NEXT:    s_getpc_b64 s[4:5]
+; GFX942-NEXT:    s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
+; GFX942-NEXT:    s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
+; GFX942-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX942-NEXT:    s_mov_b32 s14, s10
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX942-NEXT:    v_mov_b32_e32 v31, v0
+; GFX942-NEXT:    s_mov_b32 s32, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX942-NEXT:    v_mov_b32_e32 v4, 0
+; GFX942-NEXT:    s_mov_b64 s[0:1], exec
+; GFX942-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX942-NEXT:  .LBB14_1: ; %ComputeLoop
+; GFX942-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-NEXT:    s_ff1_i32_b64 s4, s[0:1]
+; GFX942-NEXT:    v_readlane_b32 s3, v1, s4
+; GFX942-NEXT:    v_readlane_b32 s2, v0, s4
+; GFX942-NEXT:    s_lshl_b64 s[4:5], 1, s4
+; GFX942-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[4:5]
+; GFX942-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX942-NEXT:    v_add_f64 v[4:5], v[4:5], s[2:3]
+; GFX942-NEXT:    s_cbranch_scc1 .LBB14_1
+; GFX942-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX942-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX942-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX942-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
+; GFX942-NEXT:    s_cbranch_execz .LBB14_5
+; GFX942-NEXT:  ; %bb.3:
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942-NEXT:    s_mov_b64 s[2:3], 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    global_load_dwordx2 v[2:3], v6, s[0:1]
+; GFX942-NEXT:  .LBB14_4: ; %atomicrmw.start
+; GFX942-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    v_add_f64 v[0:1], v[2:3], -v[4:5]
+; GFX942-NEXT:    global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] sc0
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX942-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[0:1]
+; GFX942-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX942-NEXT:    s_cbranch_execnz .LBB14_4
+; GFX942-NEXT:  .LBB14_5:
+; GFX942-NEXT:    s_endpgm
+;
 ; GFX1064-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe:
 ; GFX1064:       ; %bb.0:
 ; GFX1064-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
@@ -9926,6 +11453,93 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
 ; GFX9-DPP-NEXT:  .LBB14_3:
 ; GFX9-DPP-NEXT:    s_endpgm
 ;
+; GFX942-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe:
+; GFX942-DPP:       ; %bb.0:
+; GFX942-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX942-DPP-NEXT:    s_mov_b32 s12, s8
+; GFX942-DPP-NEXT:    s_add_u32 s8, s34, 44
+; GFX942-DPP-NEXT:    s_mov_b32 s13, s9
+; GFX942-DPP-NEXT:    s_addc_u32 s9, s35, 0
+; GFX942-DPP-NEXT:    s_getpc_b64 s[4:5]
+; GFX942-DPP-NEXT:    s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
+; GFX942-DPP-NEXT:    s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
+; GFX942-DPP-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX942-DPP-NEXT:    s_mov_b32 s14, s10
+; GFX942-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX942-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v31, v0
+; GFX942-DPP-NEXT:    s_mov_b32 s32, 0
+; GFX942-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v12, 0
+; GFX942-DPP-NEXT:    v_mbcnt_lo_u32_b32 v8, exec_lo, 0
+; GFX942-DPP-NEXT:    v_mbcnt_hi_u32_b32 v8, exec_hi, v8
+; GFX942-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX942-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX942-DPP-NEXT:    v_cndmask_b32_e64 v5, v3, v1, s[0:1]
+; GFX942-DPP-NEXT:    v_cndmask_b32_e64 v4, 0, v0, s[0:1]
+; GFX942-DPP-NEXT:    v_bfrev_b32_e32 v7, 1
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX942-DPP-NEXT:    v_bfrev_b32_e32 v7, 1
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942-DPP-NEXT:    s_nop 0
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX942-DPP-NEXT:    v_bfrev_b32_e32 v7, 1
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942-DPP-NEXT:    s_nop 0
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX942-DPP-NEXT:    v_bfrev_b32_e32 v7, 1
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942-DPP-NEXT:    s_nop 0
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX942-DPP-NEXT:    v_bfrev_b32_e32 v7, 1
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942-DPP-NEXT:    s_nop 0
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v6, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX942-DPP-NEXT:    v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX942-DPP-NEXT:    s_nop 1
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v3, v5 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v2, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX942-DPP-NEXT:    v_add_f64 v[2:3], v[4:5], v[2:3]
+; GFX942-DPP-NEXT:    s_nop 0
+; GFX942-DPP-NEXT:    v_readlane_b32 s3, v3, 63
+; GFX942-DPP-NEXT:    v_readlane_b32 s2, v2, 63
+; GFX942-DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX942-DPP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX942-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v8
+; GFX942-DPP-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX942-DPP-NEXT:    s_cbranch_execz .LBB14_3
+; GFX942-DPP-NEXT:  ; %bb.1:
+; GFX942-DPP-NEXT:    s_load_dwordx2 s[2:3], s[34:35], 0x24
+; GFX942-DPP-NEXT:    s_mov_b64 s[4:5], 0
+; GFX942-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-DPP-NEXT:    global_load_dwordx2 v[10:11], v12, s[2:3]
+; GFX942-DPP-NEXT:  .LBB14_2: ; %atomicrmw.start
+; GFX942-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-DPP-NEXT:    v_add_f64 v[8:9], v[10:11], -s[0:1]
+; GFX942-DPP-NEXT:    global_atomic_cmpswap_x2 v[0:1], v12, v[8:11], s[2:3] sc0
+; GFX942-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-DPP-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
+; GFX942-DPP-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX942-DPP-NEXT:    v_mov_b64_e32 v[10:11], v[0:1]
+; GFX942-DPP-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX942-DPP-NEXT:    s_cbranch_execnz .LBB14_2
+; GFX942-DPP-NEXT:  .LBB14_3:
+; GFX942-DPP-NEXT:    s_endpgm
+;
 ; GFX1064-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe:
 ; GFX1064-DPP:       ; %bb.0:
 ; GFX1064-DPP-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
@@ -10422,6 +12036,65 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
 ; GFX9-NEXT:  .LBB15_5:
 ; GFX9-NEXT:    s_endpgm
 ;
+; GFX942-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_structfp:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX942-NEXT:    s_mov_b32 s12, s8
+; GFX942-NEXT:    s_add_u32 s8, s34, 44
+; GFX942-NEXT:    s_mov_b32 s13, s9
+; GFX942-NEXT:    s_addc_u32 s9, s35, 0
+; GFX942-NEXT:    s_getpc_b64 s[4:5]
+; GFX942-NEXT:    s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
+; GFX942-NEXT:    s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
+; GFX942-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX942-NEXT:    s_mov_b32 s14, s10
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX942-NEXT:    v_mov_b32_e32 v31, v0
+; GFX942-NEXT:    s_mov_b32 s32, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX942-NEXT:    v_mov_b32_e32 v4, 0
+; GFX942-NEXT:    s_mov_b64 s[0:1], exec
+; GFX942-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX942-NEXT:  .LBB15_1: ; %ComputeLoop
+; GFX942-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-NEXT:    s_ff1_i32_b64 s4, s[0:1]
+; GFX942-NEXT:    v_readlane_b32 s3, v1, s4
+; GFX942-NEXT:    v_readlane_b32 s2, v0, s4
+; GFX942-NEXT:    s_lshl_b64 s[4:5], 1, s4
+; GFX942-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[4:5]
+; GFX942-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX942-NEXT:    v_add_f64 v[4:5], v[4:5], s[2:3]
+; GFX942-NEXT:    s_cbranch_scc1 .LBB15_1
+; GFX942-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX942-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX942-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX942-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
+; GFX942-NEXT:    s_cbranch_execz .LBB15_5
+; GFX942-NEXT:  ; %bb.3:
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942-NEXT:    s_mov_b64 s[2:3], 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    global_load_dwordx2 v[2:3], v6, s[0:1]
+; GFX942-NEXT:  .LBB15_4: ; %atomicrmw.start
+; GFX942-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    v_add_f64 v[0:1], v[2:3], -v[4:5]
+; GFX942-NEXT:    global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] sc0
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX942-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[0:1]
+; GFX942-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX942-NEXT:    s_cbranch_execnz .LBB15_4
+; GFX942-NEXT:  .LBB15_5:
+; GFX942-NEXT:    s_endpgm
+;
 ; GFX1064-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_structfp:
 ; GFX1064:       ; %bb.0:
 ; GFX1064-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
@@ -10841,6 +12514,93 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent
 ; GFX9-DPP-NEXT:  .LBB15_3:
 ; GFX9-DPP-NEXT:    s_endpgm
 ;
+; GFX942-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_structfp:
+; GFX942-DPP:       ; %bb.0:
+; GFX942-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX942-DPP-NEXT:    s_mov_b32 s12, s8
+; GFX942-DPP-NEXT:    s_add_u32 s8, s34, 44
+; GFX942-DPP-NEXT:    s_mov_b32 s13, s9
+; GFX942-DPP-NEXT:    s_addc_u32 s9, s35, 0
+; GFX942-DPP-NEXT:    s_getpc_b64 s[4:5]
+; GFX942-DPP-NEXT:    s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
+; GFX942-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
+; GFX942-DPP-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX942-DPP-NEXT:    s_mov_b32 s14, s10
+; GFX942-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX942-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v31, v0
+; GFX942-DPP-NEXT:    s_mov_b32 s32, 0
+; GFX942-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v12, 0
+; GFX942-DPP-NEXT:    v_mbcnt_lo_u32_b32 v8, exec_lo, 0
+; GFX942-DPP-NEXT:    v_mbcnt_hi_u32_b32 v8, exec_hi, v8
+; GFX942-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX942-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX942-DPP-NEXT:    v_cndmask_b32_e64 v5, v3, v1, s[0:1]
+; GFX942-DPP-NEXT:    v_cndmask_b32_e64 v4, 0, v0, s[0:1]
+; GFX942-DPP-NEXT:    v_bfrev_b32_e32 v7, 1
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX942-DPP-NEXT:    v_bfrev_b32_e32 v7, 1
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942-DPP-NEXT:    s_nop 0
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX942-DPP-NEXT:    v_bfrev_b32_e32 v7, 1
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942-DPP-NEXT:    s_nop 0
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX942-DPP-NEXT:    v_bfrev_b32_e32 v7, 1
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942-DPP-NEXT:    s_nop 0
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX942-DPP-NEXT:    v_bfrev_b32_e32 v7, 1
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942-DPP-NEXT:    s_nop 0
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v6, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX942-DPP-NEXT:    v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX942-DPP-NEXT:    s_nop 1
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v3, v5 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v2, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX942-DPP-NEXT:    v_add_f64 v[2:3], v[4:5], v[2:3]
+; GFX942-DPP-NEXT:    s_nop 0
+; GFX942-DPP-NEXT:    v_readlane_b32 s3, v3, 63
+; GFX942-DPP-NEXT:    v_readlane_b32 s2, v2, 63
+; GFX942-DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX942-DPP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX942-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v8
+; GFX942-DPP-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX942-DPP-NEXT:    s_cbranch_execz .LBB15_3
+; GFX942-DPP-NEXT:  ; %bb.1:
+; GFX942-DPP-NEXT:    s_load_dwordx2 s[2:3], s[34:35], 0x24
+; GFX942-DPP-NEXT:    s_mov_b64 s[4:5], 0
+; GFX942-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-DPP-NEXT:    global_load_dwordx2 v[10:11], v12, s[2:3]
+; GFX942-DPP-NEXT:  .LBB15_2: ; %atomicrmw.start
+; GFX942-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-DPP-NEXT:    v_add_f64 v[8:9], v[10:11], -s[0:1]
+; GFX942-DPP-NEXT:    global_atomic_cmpswap_x2 v[0:1], v12, v[8:11], s[2:3] sc0
+; GFX942-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-DPP-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
+; GFX942-DPP-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX942-DPP-NEXT:    v_mov_b64_e32 v[10:11], v[0:1]
+; GFX942-DPP-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX942-DPP-NEXT:    s_cbranch_execnz .LBB15_2
+; GFX942-DPP-NEXT:  .LBB15_3:
+; GFX942-DPP-NEXT:    s_endpgm
+;
 ; GFX1064-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_structfp:
 ; GFX1064-DPP:       ; %bb.0:
 ; GFX1064-DPP-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
@@ -11280,6 +13040,45 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau
 ; GFX9-NEXT:  .LBB16_3:
 ; GFX9-NEXT:    s_endpgm
 ;
+; GFX942-LABEL: global_atomic_fsub_double_uni_address_uni_value_default_scope_strictfp:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_bcnt1_i32_b64 s0, exec
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0x43300000
+; GFX942-NEXT:    scratch_store_dword off, v0, off offset:4
+; GFX942-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-NEXT:    scratch_store_dword off, v0, off
+; GFX942-NEXT:    scratch_load_dwordx2 v[0:1], off, off
+; GFX942-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX942-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v2
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX942-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942-NEXT:    s_cbranch_execz .LBB16_3
+; GFX942-NEXT:  ; %bb.1:
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    s_mov_b32 s6, 0
+; GFX942-NEXT:    s_mov_b32 s7, 0xc3300000
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    v_add_f64 v[0:1], v[0:1], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[2:3], 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX942-NEXT:    v_mul_f64 v[4:5], 4.0, v[0:1]
+; GFX942-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], s[4:5]
+; GFX942-NEXT:  .LBB16_2: ; %atomicrmw.start
+; GFX942-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-NEXT:    v_add_f64 v[0:1], v[2:3], -v[4:5]
+; GFX942-NEXT:    global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] sc0 sc1
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX942-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[0:1]
+; GFX942-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX942-NEXT:    s_cbranch_execnz .LBB16_2
+; GFX942-NEXT:  .LBB16_3:
+; GFX942-NEXT:    s_endpgm
+;
 ; GFX1064-LABEL: global_atomic_fsub_double_uni_address_uni_value_default_scope_strictfp:
 ; GFX1064:       ; %bb.0:
 ; GFX1064-NEXT:    s_mov_b64 s[0:1], exec
@@ -11537,6 +13336,45 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defau
 ; GFX9-DPP-NEXT:  .LBB16_3:
 ; GFX9-DPP-NEXT:    s_endpgm
 ;
+; GFX942-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_default_scope_strictfp:
+; GFX942-DPP:       ; %bb.0:
+; GFX942-DPP-NEXT:    s_bcnt1_i32_b64 s0, exec
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v0, 0x43300000
+; GFX942-DPP-NEXT:    scratch_store_dword off, v0, off offset:4
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-DPP-NEXT:    scratch_store_dword off, v0, off
+; GFX942-DPP-NEXT:    scratch_load_dwordx2 v[0:1], off, off
+; GFX942-DPP-NEXT:    v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX942-DPP-NEXT:    v_mbcnt_hi_u32_b32 v2, exec_hi, v2
+; GFX942-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX942-DPP-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942-DPP-NEXT:    s_cbranch_execz .LBB16_3
+; GFX942-DPP-NEXT:  ; %bb.1:
+; GFX942-DPP-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-DPP-NEXT:    s_mov_b32 s6, 0
+; GFX942-DPP-NEXT:    s_mov_b32 s7, 0xc3300000
+; GFX942-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-DPP-NEXT:    v_add_f64 v[0:1], v[0:1], s[6:7]
+; GFX942-DPP-NEXT:    s_mov_b64 s[2:3], 0
+; GFX942-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-DPP-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX942-DPP-NEXT:    v_mul_f64 v[4:5], 4.0, v[0:1]
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-DPP-NEXT:    v_mov_b64_e32 v[2:3], s[4:5]
+; GFX942-DPP-NEXT:  .LBB16_2: ; %atomicrmw.start
+; GFX942-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-DPP-NEXT:    v_add_f64 v[0:1], v[2:3], -v[4:5]
+; GFX942-DPP-NEXT:    global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] sc0 sc1
+; GFX942-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-DPP-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX942-DPP-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX942-DPP-NEXT:    v_mov_b64_e32 v[2:3], v[0:1]
+; GFX942-DPP-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX942-DPP-NEXT:    s_cbranch_execnz .LBB16_2
+; GFX942-DPP-NEXT:  .LBB16_3:
+; GFX942-DPP-NEXT:    s_endpgm
+;
 ; GFX1064-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_default_scope_strictfp:
 ; GFX1064-DPP:       ; %bb.0:
 ; GFX1064-DPP-NEXT:    s_mov_b64 s[0:1], exec
@@ -11854,6 +13692,65 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
 ; GFX9-NEXT:  .LBB17_5:
 ; GFX9-NEXT:    s_endpgm
 ;
+; GFX942-LABEL: global_atomic_fsub_double_uni_address_div_value_default_scope_strictfp:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX942-NEXT:    s_mov_b32 s12, s8
+; GFX942-NEXT:    s_add_u32 s8, s34, 44
+; GFX942-NEXT:    s_mov_b32 s13, s9
+; GFX942-NEXT:    s_addc_u32 s9, s35, 0
+; GFX942-NEXT:    s_getpc_b64 s[4:5]
+; GFX942-NEXT:    s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
+; GFX942-NEXT:    s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
+; GFX942-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX942-NEXT:    s_mov_b32 s14, s10
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX942-NEXT:    v_mov_b32_e32 v31, v0
+; GFX942-NEXT:    s_mov_b32 s32, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX942-NEXT:    v_mov_b32_e32 v4, 0
+; GFX942-NEXT:    s_mov_b64 s[0:1], exec
+; GFX942-NEXT:    v_bfrev_b32_e32 v5, 1
+; GFX942-NEXT:  .LBB17_1: ; %ComputeLoop
+; GFX942-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-NEXT:    s_ff1_i32_b64 s4, s[0:1]
+; GFX942-NEXT:    v_readlane_b32 s3, v1, s4
+; GFX942-NEXT:    v_readlane_b32 s2, v0, s4
+; GFX942-NEXT:    s_lshl_b64 s[4:5], 1, s4
+; GFX942-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[4:5]
+; GFX942-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX942-NEXT:    v_add_f64 v[4:5], v[4:5], s[2:3]
+; GFX942-NEXT:    s_cbranch_scc1 .LBB17_1
+; GFX942-NEXT:  ; %bb.2: ; %ComputeEnd
+; GFX942-NEXT:    v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX942-NEXT:    v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX942-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
+; GFX942-NEXT:    s_cbranch_execz .LBB17_5
+; GFX942-NEXT:  ; %bb.3:
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[34:35], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942-NEXT:    s_mov_b64 s[2:3], 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    global_load_dwordx2 v[2:3], v6, s[0:1]
+; GFX942-NEXT:  .LBB17_4: ; %atomicrmw.start
+; GFX942-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    v_add_f64 v[0:1], v[2:3], -v[4:5]
+; GFX942-NEXT:    global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] sc0 sc1
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX942-NEXT:    s_or_b64 s[2:3], vcc, s[2:3]
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], v[0:1]
+; GFX942-NEXT:    s_andn2_b64 exec, exec, s[2:3]
+; GFX942-NEXT:    s_cbranch_execnz .LBB17_4
+; GFX942-NEXT:  .LBB17_5:
+; GFX942-NEXT:    s_endpgm
+;
 ; GFX1064-LABEL: global_atomic_fsub_double_uni_address_div_value_default_scope_strictfp:
 ; GFX1064:       ; %bb.0:
 ; GFX1064-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
@@ -12273,6 +14170,93 @@ define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defau
 ; GFX9-DPP-NEXT:  .LBB17_3:
 ; GFX9-DPP-NEXT:    s_endpgm
 ;
+; GFX942-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_default_scope_strictfp:
+; GFX942-DPP:       ; %bb.0:
+; GFX942-DPP-NEXT:    s_mov_b64 s[34:35], s[4:5]
+; GFX942-DPP-NEXT:    s_mov_b32 s12, s8
+; GFX942-DPP-NEXT:    s_add_u32 s8, s34, 44
+; GFX942-DPP-NEXT:    s_mov_b32 s13, s9
+; GFX942-DPP-NEXT:    s_addc_u32 s9, s35, 0
+; GFX942-DPP-NEXT:    s_getpc_b64 s[4:5]
+; GFX942-DPP-NEXT:    s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
+; GFX942-DPP-NEXT:    s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
+; GFX942-DPP-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX942-DPP-NEXT:    s_mov_b32 s14, s10
+; GFX942-DPP-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-DPP-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX942-DPP-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v31, v0
+; GFX942-DPP-NEXT:    s_mov_b32 s32, 0
+; GFX942-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-DPP-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v12, 0
+; GFX942-DPP-NEXT:    v_mbcnt_lo_u32_b32 v8, exec_lo, 0
+; GFX942-DPP-NEXT:    v_mbcnt_hi_u32_b32 v8, exec_hi, v8
+; GFX942-DPP-NEXT:    s_or_saveexec_b64 s[0:1], -1
+; GFX942-DPP-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX942-DPP-NEXT:    v_cndmask_b32_e64 v5, v3, v1, s[0:1]
+; GFX942-DPP-NEXT:    v_cndmask_b32_e64 v4, 0, v0, s[0:1]
+; GFX942-DPP-NEXT:    v_bfrev_b32_e32 v7, 1
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:1 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX942-DPP-NEXT:    v_bfrev_b32_e32 v7, 1
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942-DPP-NEXT:    s_nop 0
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:2 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX942-DPP-NEXT:    v_bfrev_b32_e32 v7, 1
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942-DPP-NEXT:    s_nop 0
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:4 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX942-DPP-NEXT:    v_bfrev_b32_e32 v7, 1
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942-DPP-NEXT:    s_nop 0
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v6, v4 row_shr:8 row_mask:0xf bank_mask:0xf
+; GFX942-DPP-NEXT:    v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX942-DPP-NEXT:    v_bfrev_b32_e32 v7, 1
+; GFX942-DPP-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942-DPP-NEXT:    s_nop 0
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v7, v5 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v6, v4 row_bcast:15 row_mask:0xa bank_mask:0xf
+; GFX942-DPP-NEXT:    v_add_f64 v[4:5], v[4:5], v[6:7]
+; GFX942-DPP-NEXT:    s_nop 1
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v3, v5 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX942-DPP-NEXT:    v_mov_b32_dpp v2, v4 row_bcast:31 row_mask:0xc bank_mask:0xf
+; GFX942-DPP-NEXT:    v_add_f64 v[2:3], v[4:5], v[2:3]
+; GFX942-DPP-NEXT:    s_nop 0
+; GFX942-DPP-NEXT:    v_readlane_b32 s3, v3, 63
+; GFX942-DPP-NEXT:    v_readlane_b32 s2, v2, 63
+; GFX942-DPP-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX942-DPP-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX942-DPP-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v8
+; GFX942-DPP-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX942-DPP-NEXT:    s_cbranch_execz .LBB17_3
+; GFX942-DPP-NEXT:  ; %bb.1:
+; GFX942-DPP-NEXT:    s_load_dwordx2 s[2:3], s[34:35], 0x24
+; GFX942-DPP-NEXT:    s_mov_b64 s[4:5], 0
+; GFX942-DPP-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-DPP-NEXT:    global_load_dwordx2 v[10:11], v12, s[2:3]
+; GFX942-DPP-NEXT:  .LBB17_2: ; %atomicrmw.start
+; GFX942-DPP-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-DPP-NEXT:    v_add_f64 v[8:9], v[10:11], -s[0:1]
+; GFX942-DPP-NEXT:    global_atomic_cmpswap_x2 v[0:1], v12, v[8:11], s[2:3] sc0 sc1
+; GFX942-DPP-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-DPP-NEXT:    v_cmp_eq_u64_e32 vcc, v[0:1], v[10:11]
+; GFX942-DPP-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX942-DPP-NEXT:    v_mov_b64_e32 v[10:11], v[0:1]
+; GFX942-DPP-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX942-DPP-NEXT:    s_cbranch_execnz .LBB17_2
+; GFX942-DPP-NEXT:  .LBB17_3:
+; GFX942-DPP-NEXT:    s_endpgm
+;
 ; GFX1064-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_default_scope_strictfp:
 ; GFX1064-DPP:       ; %bb.0:
 ; GFX1064-DPP-NEXT:    s_mov_b32 s36, SCRATCH_RSRC_DWORD0
diff --git a/llvm/test/CodeGen/AMDGPU/imm.ll b/llvm/test/CodeGen/AMDGPU/imm.ll
index 58cfd40113be2..21390003ee565 100644
--- a/llvm/test/CodeGen/AMDGPU/imm.ll
+++ b/llvm/test/CodeGen/AMDGPU/imm.ll
@@ -1,6 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn -mcpu=verde < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s
 ; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s
+; RUN:  llc -amdgpu-scalarize-global-loads=false  -mtriple=amdgcn -mcpu=gfx942 -mattr=-flat-for-global < %s | FileCheck -enable-var-scope -check-prefixes=GFX942 %s
 
 ; Use a 64-bit value with lo bits that can be represented as an inline constant
 define amdgpu_kernel void @i64_imm_inline_lo(ptr addrspace(1) %out) {
@@ -25,6 +26,17 @@ define amdgpu_kernel void @i64_imm_inline_lo(ptr addrspace(1) %out) {
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
+;
+; GFX942-LABEL: i64_imm_inline_lo:
+; GFX942:       ; %bb.0: ; %entry
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    s_mov_b32 s3, 0xf000
+; GFX942-NEXT:    s_mov_b32 s2, -1
+; GFX942-NEXT:    v_mov_b32_e32 v0, 5
+; GFX942-NEXT:    v_mov_b32_e32 v1, 0x12345678
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX942-NEXT:    s_endpgm
 entry:
   store i64 1311768464867721221, ptr addrspace(1) %out ; 0x1234567800000005
   ret void
@@ -53,6 +65,17 @@ define amdgpu_kernel void @i64_imm_inline_hi(ptr addrspace(1) %out) {
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
+;
+; GFX942-LABEL: i64_imm_inline_hi:
+; GFX942:       ; %bb.0: ; %entry
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    s_mov_b32 s3, 0xf000
+; GFX942-NEXT:    s_mov_b32 s2, -1
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0x12345678
+; GFX942-NEXT:    v_mov_b32_e32 v1, 5
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX942-NEXT:    s_endpgm
 entry:
   store i64 21780256376, ptr addrspace(1) %out ; 0x0000000512345678
   ret void
@@ -80,6 +103,17 @@ define amdgpu_kernel void @store_imm_neg_0.0_i64(ptr addrspace(1) %out) {
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
+;
+; GFX942-LABEL: store_imm_neg_0.0_i64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    s_mov_b32 s3, 0xf000
+; GFX942-NEXT:    s_mov_b32 s2, -1
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    v_bfrev_b32_e32 v1, 1
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX942-NEXT:    s_endpgm
   store i64 -9223372036854775808, ptr addrspace(1) %out
   ret void
 }
@@ -104,6 +138,16 @@ define amdgpu_kernel void @store_inline_imm_neg_0.0_i32(ptr addrspace(1) %out) {
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
+;
+; GFX942-LABEL: store_inline_imm_neg_0.0_i32:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    s_mov_b32 s3, 0xf000
+; GFX942-NEXT:    s_mov_b32 s2, -1
+; GFX942-NEXT:    v_bfrev_b32_e32 v0, 1
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX942-NEXT:    s_endpgm
   store i32 -2147483648, ptr addrspace(1) %out
   ret void
 }
@@ -128,6 +172,16 @@ define amdgpu_kernel void @store_inline_imm_0.0_f32(ptr addrspace(1) %out) {
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
+;
+; GFX942-LABEL: store_inline_imm_0.0_f32:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    s_mov_b32 s3, 0xf000
+; GFX942-NEXT:    s_mov_b32 s2, -1
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX942-NEXT:    s_endpgm
   store float 0.0, ptr addrspace(1) %out
   ret void
 }
@@ -152,6 +206,16 @@ define amdgpu_kernel void @store_imm_neg_0.0_f32(ptr addrspace(1) %out) {
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
+;
+; GFX942-LABEL: store_imm_neg_0.0_f32:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    s_mov_b32 s3, 0xf000
+; GFX942-NEXT:    s_mov_b32 s2, -1
+; GFX942-NEXT:    v_bfrev_b32_e32 v0, 1
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX942-NEXT:    s_endpgm
   store float -0.0, ptr addrspace(1) %out
   ret void
 }
@@ -176,6 +240,16 @@ define amdgpu_kernel void @store_inline_imm_0.5_f32(ptr addrspace(1) %out) {
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
+;
+; GFX942-LABEL: store_inline_imm_0.5_f32:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    s_mov_b32 s3, 0xf000
+; GFX942-NEXT:    s_mov_b32 s2, -1
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0.5
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX942-NEXT:    s_endpgm
   store float 0.5, ptr addrspace(1) %out
   ret void
 }
@@ -200,6 +274,16 @@ define amdgpu_kernel void @store_inline_imm_m_0.5_f32(ptr addrspace(1) %out) {
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
+;
+; GFX942-LABEL: store_inline_imm_m_0.5_f32:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    s_mov_b32 s3, 0xf000
+; GFX942-NEXT:    s_mov_b32 s2, -1
+; GFX942-NEXT:    v_mov_b32_e32 v0, -0.5
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX942-NEXT:    s_endpgm
   store float -0.5, ptr addrspace(1) %out
   ret void
 }
@@ -224,6 +308,16 @@ define amdgpu_kernel void @store_inline_imm_1.0_f32(ptr addrspace(1) %out) {
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
+;
+; GFX942-LABEL: store_inline_imm_1.0_f32:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    s_mov_b32 s3, 0xf000
+; GFX942-NEXT:    s_mov_b32 s2, -1
+; GFX942-NEXT:    v_mov_b32_e32 v0, 1.0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX942-NEXT:    s_endpgm
   store float 1.0, ptr addrspace(1) %out
   ret void
 }
@@ -248,6 +342,16 @@ define amdgpu_kernel void @store_inline_imm_m_1.0_f32(ptr addrspace(1) %out) {
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
+;
+; GFX942-LABEL: store_inline_imm_m_1.0_f32:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    s_mov_b32 s3, 0xf000
+; GFX942-NEXT:    s_mov_b32 s2, -1
+; GFX942-NEXT:    v_mov_b32_e32 v0, -1.0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX942-NEXT:    s_endpgm
   store float -1.0, ptr addrspace(1) %out
   ret void
 }
@@ -272,6 +376,16 @@ define amdgpu_kernel void @store_inline_imm_2.0_f32(ptr addrspace(1) %out) {
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
+;
+; GFX942-LABEL: store_inline_imm_2.0_f32:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    s_mov_b32 s3, 0xf000
+; GFX942-NEXT:    s_mov_b32 s2, -1
+; GFX942-NEXT:    v_mov_b32_e32 v0, 2.0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX942-NEXT:    s_endpgm
   store float 2.0, ptr addrspace(1) %out
   ret void
 }
@@ -296,6 +410,16 @@ define amdgpu_kernel void @store_inline_imm_m_2.0_f32(ptr addrspace(1) %out) {
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
+;
+; GFX942-LABEL: store_inline_imm_m_2.0_f32:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    s_mov_b32 s3, 0xf000
+; GFX942-NEXT:    s_mov_b32 s2, -1
+; GFX942-NEXT:    v_mov_b32_e32 v0, -2.0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX942-NEXT:    s_endpgm
   store float -2.0, ptr addrspace(1) %out
   ret void
 }
@@ -320,6 +444,16 @@ define amdgpu_kernel void @store_inline_imm_4.0_f32(ptr addrspace(1) %out) {
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
+;
+; GFX942-LABEL: store_inline_imm_4.0_f32:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    s_mov_b32 s3, 0xf000
+; GFX942-NEXT:    s_mov_b32 s2, -1
+; GFX942-NEXT:    v_mov_b32_e32 v0, 4.0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX942-NEXT:    s_endpgm
   store float 4.0, ptr addrspace(1) %out
   ret void
 }
@@ -344,6 +478,16 @@ define amdgpu_kernel void @store_inline_imm_m_4.0_f32(ptr addrspace(1) %out) {
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
+;
+; GFX942-LABEL: store_inline_imm_m_4.0_f32:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    s_mov_b32 s3, 0xf000
+; GFX942-NEXT:    s_mov_b32 s2, -1
+; GFX942-NEXT:    v_mov_b32_e32 v0, -4.0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX942-NEXT:    s_endpgm
   store float -4.0, ptr addrspace(1) %out
   ret void
 }
@@ -368,6 +512,16 @@ define amdgpu_kernel void @store_inline_imm_inv_2pi_f32(ptr addrspace(1) %out) {
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
+;
+; GFX942-LABEL: store_inline_imm_inv_2pi_f32:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    s_mov_b32 s3, 0xf000
+; GFX942-NEXT:    s_mov_b32 s2, -1
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0.15915494
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX942-NEXT:    s_endpgm
   store float 0x3FC45F3060000000, ptr addrspace(1) %out
   ret void
 }
@@ -392,6 +546,16 @@ define amdgpu_kernel void @store_inline_imm_m_inv_2pi_f32(ptr addrspace(1) %out)
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
+;
+; GFX942-LABEL: store_inline_imm_m_inv_2pi_f32:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    s_mov_b32 s3, 0xf000
+; GFX942-NEXT:    s_mov_b32 s2, -1
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0xbe22f983
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX942-NEXT:    s_endpgm
   store float 0xBFC45F3060000000, ptr addrspace(1) %out
   ret void
 }
@@ -416,6 +580,16 @@ define amdgpu_kernel void @store_literal_imm_f32(ptr addrspace(1) %out) {
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
+;
+; GFX942-LABEL: store_literal_imm_f32:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    s_mov_b32 s3, 0xf000
+; GFX942-NEXT:    s_mov_b32 s2, -1
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0x45800000
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX942-NEXT:    s_endpgm
   store float 4096.0, ptr addrspace(1) %out
   ret void
 }
@@ -442,6 +616,17 @@ define amdgpu_kernel void @add_inline_imm_0.0_f32(ptr addrspace(1) %out, float %
 ; VI-NEXT:    v_add_f32_e64 v0, s6, 0
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
+;
+; GFX942-LABEL: add_inline_imm_0.0_f32:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dword s6, s[4:5], 0x2c
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    s_mov_b32 s3, 0xf000
+; GFX942-NEXT:    s_mov_b32 s2, -1
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    v_add_f32_e64 v0, s6, 0
+; GFX942-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX942-NEXT:    s_endpgm
   %y = fadd float %x, 0.0
   store float %y, ptr addrspace(1) %out
   ret void
@@ -469,6 +654,17 @@ define amdgpu_kernel void @add_inline_imm_0.5_f32(ptr addrspace(1) %out, float %
 ; VI-NEXT:    v_add_f32_e64 v0, s6, 0.5
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
+;
+; GFX942-LABEL: add_inline_imm_0.5_f32:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dword s6, s[4:5], 0x2c
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    s_mov_b32 s3, 0xf000
+; GFX942-NEXT:    s_mov_b32 s2, -1
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    v_add_f32_e64 v0, s6, 0.5
+; GFX942-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX942-NEXT:    s_endpgm
   %y = fadd float %x, 0.5
   store float %y, ptr addrspace(1) %out
   ret void
@@ -496,6 +692,17 @@ define amdgpu_kernel void @add_inline_imm_neg_0.5_f32(ptr addrspace(1) %out, flo
 ; VI-NEXT:    v_add_f32_e64 v0, s6, -0.5
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
+;
+; GFX942-LABEL: add_inline_imm_neg_0.5_f32:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dword s6, s[4:5], 0x2c
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    s_mov_b32 s3, 0xf000
+; GFX942-NEXT:    s_mov_b32 s2, -1
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    v_add_f32_e64 v0, s6, -0.5
+; GFX942-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX942-NEXT:    s_endpgm
   %y = fadd float %x, -0.5
   store float %y, ptr addrspace(1) %out
   ret void
@@ -523,6 +730,17 @@ define amdgpu_kernel void @add_inline_imm_1.0_f32(ptr addrspace(1) %out, float %
 ; VI-NEXT:    v_add_f32_e64 v0, s6, 1.0
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
+;
+; GFX942-LABEL: add_inline_imm_1.0_f32:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dword s6, s[4:5], 0x2c
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    s_mov_b32 s3, 0xf000
+; GFX942-NEXT:    s_mov_b32 s2, -1
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    v_add_f32_e64 v0, s6, 1.0
+; GFX942-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX942-NEXT:    s_endpgm
   %y = fadd float %x, 1.0
   store float %y, ptr addrspace(1) %out
   ret void
@@ -550,6 +768,17 @@ define amdgpu_kernel void @add_inline_imm_neg_1.0_f32(ptr addrspace(1) %out, flo
 ; VI-NEXT:    v_add_f32_e64 v0, s6, -1.0
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
+;
+; GFX942-LABEL: add_inline_imm_neg_1.0_f32:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dword s6, s[4:5], 0x2c
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    s_mov_b32 s3, 0xf000
+; GFX942-NEXT:    s_mov_b32 s2, -1
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    v_add_f32_e64 v0, s6, -1.0
+; GFX942-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX942-NEXT:    s_endpgm
   %y = fadd float %x, -1.0
   store float %y, ptr addrspace(1) %out
   ret void
@@ -577,6 +806,17 @@ define amdgpu_kernel void @add_inline_imm_2.0_f32(ptr addrspace(1) %out, float %
 ; VI-NEXT:    v_add_f32_e64 v0, s6, 2.0
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
+;
+; GFX942-LABEL: add_inline_imm_2.0_f32:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dword s6, s[4:5], 0x2c
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    s_mov_b32 s3, 0xf000
+; GFX942-NEXT:    s_mov_b32 s2, -1
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    v_add_f32_e64 v0, s6, 2.0
+; GFX942-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX942-NEXT:    s_endpgm
   %y = fadd float %x, 2.0
   store float %y, ptr addrspace(1) %out
   ret void
@@ -604,6 +844,17 @@ define amdgpu_kernel void @add_inline_imm_neg_2.0_f32(ptr addrspace(1) %out, flo
 ; VI-NEXT:    v_add_f32_e64 v0, s6, -2.0
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
+;
+; GFX942-LABEL: add_inline_imm_neg_2.0_f32:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dword s6, s[4:5], 0x2c
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    s_mov_b32 s3, 0xf000
+; GFX942-NEXT:    s_mov_b32 s2, -1
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    v_add_f32_e64 v0, s6, -2.0
+; GFX942-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX942-NEXT:    s_endpgm
   %y = fadd float %x, -2.0
   store float %y, ptr addrspace(1) %out
   ret void
@@ -631,6 +882,17 @@ define amdgpu_kernel void @add_inline_imm_4.0_f32(ptr addrspace(1) %out, float %
 ; VI-NEXT:    v_add_f32_e64 v0, s6, 4.0
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
+;
+; GFX942-LABEL: add_inline_imm_4.0_f32:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dword s6, s[4:5], 0x2c
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    s_mov_b32 s3, 0xf000
+; GFX942-NEXT:    s_mov_b32 s2, -1
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    v_add_f32_e64 v0, s6, 4.0
+; GFX942-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX942-NEXT:    s_endpgm
   %y = fadd float %x, 4.0
   store float %y, ptr addrspace(1) %out
   ret void
@@ -658,6 +920,17 @@ define amdgpu_kernel void @add_inline_imm_neg_4.0_f32(ptr addrspace(1) %out, flo
 ; VI-NEXT:    v_add_f32_e64 v0, s6, -4.0
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
+;
+; GFX942-LABEL: add_inline_imm_neg_4.0_f32:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dword s6, s[4:5], 0x2c
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    s_mov_b32 s3, 0xf000
+; GFX942-NEXT:    s_mov_b32 s2, -1
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    v_add_f32_e64 v0, s6, -4.0
+; GFX942-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX942-NEXT:    s_endpgm
   %y = fadd float %x, -4.0
   store float %y, ptr addrspace(1) %out
   ret void
@@ -699,6 +972,24 @@ define amdgpu_kernel void @commute_add_inline_imm_0.5_f32(ptr addrspace(1) %out,
 ; VI-NEXT:    v_add_f32_e32 v0, 0.5, v0
 ; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; VI-NEXT:    s_endpgm
+;
+; GFX942-LABEL: commute_add_inline_imm_0.5_f32:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-NEXT:    s_mov_b32 s7, 0xf000
+; GFX942-NEXT:    s_mov_b32 s6, -1
+; GFX942-NEXT:    s_mov_b32 s10, s6
+; GFX942-NEXT:    s_mov_b32 s11, s7
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_mov_b32 s8, s2
+; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    buffer_load_dword v0, off, s[8:11], 0
+; GFX942-NEXT:    s_mov_b32 s4, s0
+; GFX942-NEXT:    s_mov_b32 s5, s1
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    v_add_f32_e32 v0, 0.5, v0
+; GFX942-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX942-NEXT:    s_endpgm
   %x = load float, ptr addrspace(1) %in
   %y = fadd float %x, 0.5
   store float %y, ptr addrspace(1) %out
@@ -741,6 +1032,24 @@ define amdgpu_kernel void @commute_add_literal_f32(ptr addrspace(1) %out, ptr ad
 ; VI-NEXT:    v_add_f32_e32 v0, 0x44800000, v0
 ; VI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; VI-NEXT:    s_endpgm
+;
+; GFX942-LABEL: commute_add_literal_f32:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-NEXT:    s_mov_b32 s7, 0xf000
+; GFX942-NEXT:    s_mov_b32 s6, -1
+; GFX942-NEXT:    s_mov_b32 s10, s6
+; GFX942-NEXT:    s_mov_b32 s11, s7
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_mov_b32 s8, s2
+; GFX942-NEXT:    s_mov_b32 s9, s3
+; GFX942-NEXT:    buffer_load_dword v0, off, s[8:11], 0
+; GFX942-NEXT:    s_mov_b32 s4, s0
+; GFX942-NEXT:    s_mov_b32 s5, s1
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    v_add_f32_e32 v0, 0x44800000, v0
+; GFX942-NEXT:    buffer_store_dword v0, off, s[4:7], 0
+; GFX942-NEXT:    s_endpgm
   %x = load float, ptr addrspace(1) %in
   %y = fadd float %x, 1024.0
   store float %y, ptr addrspace(1) %out
@@ -769,6 +1078,17 @@ define amdgpu_kernel void @add_inline_imm_1_f32(ptr addrspace(1) %out, float %x)
 ; VI-NEXT:    v_add_f32_e64 v0, s6, 1
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
+;
+; GFX942-LABEL: add_inline_imm_1_f32:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dword s6, s[4:5], 0x2c
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    s_mov_b32 s3, 0xf000
+; GFX942-NEXT:    s_mov_b32 s2, -1
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    v_add_f32_e64 v0, s6, 1
+; GFX942-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX942-NEXT:    s_endpgm
   %y = fadd float %x, 0x36a0000000000000
   store float %y, ptr addrspace(1) %out
   ret void
@@ -796,6 +1116,17 @@ define amdgpu_kernel void @add_inline_imm_2_f32(ptr addrspace(1) %out, float %x)
 ; VI-NEXT:    v_add_f32_e64 v0, s6, 2
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
+;
+; GFX942-LABEL: add_inline_imm_2_f32:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dword s6, s[4:5], 0x2c
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    s_mov_b32 s3, 0xf000
+; GFX942-NEXT:    s_mov_b32 s2, -1
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    v_add_f32_e64 v0, s6, 2
+; GFX942-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX942-NEXT:    s_endpgm
   %y = fadd float %x, 0x36b0000000000000
   store float %y, ptr addrspace(1) %out
   ret void
@@ -823,6 +1154,17 @@ define amdgpu_kernel void @add_inline_imm_16_f32(ptr addrspace(1) %out, float %x
 ; VI-NEXT:    v_add_f32_e64 v0, s6, 16
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
+;
+; GFX942-LABEL: add_inline_imm_16_f32:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dword s6, s[4:5], 0x2c
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    s_mov_b32 s3, 0xf000
+; GFX942-NEXT:    s_mov_b32 s2, -1
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    v_add_f32_e64 v0, s6, 16
+; GFX942-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX942-NEXT:    s_endpgm
   %y = fadd float %x, 0x36e0000000000000
   store float %y, ptr addrspace(1) %out
   ret void
@@ -852,6 +1194,18 @@ define amdgpu_kernel void @add_inline_imm_neg_1_f32(ptr addrspace(1) %out, float
 ; VI-NEXT:    v_mov_b32_e32 v0, s4
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
+;
+; GFX942-LABEL: add_inline_imm_neg_1_f32:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dword s6, s[4:5], 0x2c
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    s_mov_b32 s3, 0xf000
+; GFX942-NEXT:    s_mov_b32 s2, -1
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_add_i32 s4, s6, -1
+; GFX942-NEXT:    v_mov_b32_e32 v0, s4
+; GFX942-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX942-NEXT:    s_endpgm
   %xbc = bitcast float %x to i32
   %y = add i32 %xbc, -1
   %ybc = bitcast i32 %y to float
@@ -883,6 +1237,18 @@ define amdgpu_kernel void @add_inline_imm_neg_2_f32(ptr addrspace(1) %out, float
 ; VI-NEXT:    v_mov_b32_e32 v0, s4
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
+;
+; GFX942-LABEL: add_inline_imm_neg_2_f32:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dword s6, s[4:5], 0x2c
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    s_mov_b32 s3, 0xf000
+; GFX942-NEXT:    s_mov_b32 s2, -1
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_add_i32 s4, s6, -2
+; GFX942-NEXT:    v_mov_b32_e32 v0, s4
+; GFX942-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX942-NEXT:    s_endpgm
   %xbc = bitcast float %x to i32
   %y = add i32 %xbc, -2
   %ybc = bitcast i32 %y to float
@@ -914,6 +1280,18 @@ define amdgpu_kernel void @add_inline_imm_neg_16_f32(ptr addrspace(1) %out, floa
 ; VI-NEXT:    v_mov_b32_e32 v0, s4
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
+;
+; GFX942-LABEL: add_inline_imm_neg_16_f32:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dword s6, s[4:5], 0x2c
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    s_mov_b32 s3, 0xf000
+; GFX942-NEXT:    s_mov_b32 s2, -1
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_add_i32 s4, s6, -16
+; GFX942-NEXT:    v_mov_b32_e32 v0, s4
+; GFX942-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX942-NEXT:    s_endpgm
   %xbc = bitcast float %x to i32
   %y = add i32 %xbc, -16
   %ybc = bitcast i32 %y to float
@@ -943,6 +1321,17 @@ define amdgpu_kernel void @add_inline_imm_63_f32(ptr addrspace(1) %out, float %x
 ; VI-NEXT:    v_add_f32_e64 v0, s6, 63
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
+;
+; GFX942-LABEL: add_inline_imm_63_f32:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dword s6, s[4:5], 0x2c
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    s_mov_b32 s3, 0xf000
+; GFX942-NEXT:    s_mov_b32 s2, -1
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    v_add_f32_e64 v0, s6, 63
+; GFX942-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX942-NEXT:    s_endpgm
   %y = fadd float %x, 0x36ff800000000000
   store float %y, ptr addrspace(1) %out
   ret void
@@ -970,6 +1359,17 @@ define amdgpu_kernel void @add_inline_imm_64_f32(ptr addrspace(1) %out, float %x
 ; VI-NEXT:    v_add_f32_e64 v0, s6, 64
 ; VI-NEXT:    buffer_store_dword v0, off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
+;
+; GFX942-LABEL: add_inline_imm_64_f32:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dword s6, s[4:5], 0x2c
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    s_mov_b32 s3, 0xf000
+; GFX942-NEXT:    s_mov_b32 s2, -1
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    v_add_f32_e64 v0, s6, 64
+; GFX942-NEXT:    buffer_store_dword v0, off, s[0:3], 0
+; GFX942-NEXT:    s_endpgm
   %y = fadd float %x, 0x3700000000000000
   store float %y, ptr addrspace(1) %out
   ret void
@@ -999,6 +1399,17 @@ define amdgpu_kernel void @add_inline_imm_0.0_f64(ptr addrspace(1) %out, [8 x i3
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
+;
+; GFX942-LABEL: add_inline_imm_0.0_f64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x4c
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    s_mov_b32 s3, 0xf000
+; GFX942-NEXT:    s_mov_b32 s2, -1
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    v_add_f64 v[0:1], s[6:7], 0
+; GFX942-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX942-NEXT:    s_endpgm
   %y = fadd double %x, 0.0
   store double %y, ptr addrspace(1) %out
   ret void
@@ -1028,6 +1439,17 @@ define amdgpu_kernel void @add_inline_imm_0.5_f64(ptr addrspace(1) %out, [8 x i3
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
+;
+; GFX942-LABEL: add_inline_imm_0.5_f64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x4c
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    s_mov_b32 s3, 0xf000
+; GFX942-NEXT:    s_mov_b32 s2, -1
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    v_add_f64 v[0:1], s[6:7], 0.5
+; GFX942-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX942-NEXT:    s_endpgm
   %y = fadd double %x, 0.5
   store double %y, ptr addrspace(1) %out
   ret void
@@ -1057,6 +1479,17 @@ define amdgpu_kernel void @add_inline_imm_neg_0.5_f64(ptr addrspace(1) %out, [8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
+;
+; GFX942-LABEL: add_inline_imm_neg_0.5_f64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x4c
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    s_mov_b32 s3, 0xf000
+; GFX942-NEXT:    s_mov_b32 s2, -1
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    v_add_f64 v[0:1], s[6:7], -0.5
+; GFX942-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX942-NEXT:    s_endpgm
   %y = fadd double %x, -0.5
   store double %y, ptr addrspace(1) %out
   ret void
@@ -1086,6 +1519,17 @@ define amdgpu_kernel void @add_inline_imm_1.0_f64(ptr addrspace(1) %out, [8 x i3
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
+;
+; GFX942-LABEL: add_inline_imm_1.0_f64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x4c
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    s_mov_b32 s3, 0xf000
+; GFX942-NEXT:    s_mov_b32 s2, -1
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    v_add_f64 v[0:1], s[6:7], 1.0
+; GFX942-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX942-NEXT:    s_endpgm
   %y = fadd double %x, 1.0
   store double %y, ptr addrspace(1) %out
   ret void
@@ -1115,6 +1559,17 @@ define amdgpu_kernel void @add_inline_imm_neg_1.0_f64(ptr addrspace(1) %out, [8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
+;
+; GFX942-LABEL: add_inline_imm_neg_1.0_f64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x4c
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    s_mov_b32 s3, 0xf000
+; GFX942-NEXT:    s_mov_b32 s2, -1
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    v_add_f64 v[0:1], s[6:7], -1.0
+; GFX942-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX942-NEXT:    s_endpgm
   %y = fadd double %x, -1.0
   store double %y, ptr addrspace(1) %out
   ret void
@@ -1144,6 +1599,17 @@ define amdgpu_kernel void @add_inline_imm_2.0_f64(ptr addrspace(1) %out, [8 x i3
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
+;
+; GFX942-LABEL: add_inline_imm_2.0_f64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x4c
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    s_mov_b32 s3, 0xf000
+; GFX942-NEXT:    s_mov_b32 s2, -1
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    v_add_f64 v[0:1], s[6:7], 2.0
+; GFX942-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX942-NEXT:    s_endpgm
   %y = fadd double %x, 2.0
   store double %y, ptr addrspace(1) %out
   ret void
@@ -1173,6 +1639,17 @@ define amdgpu_kernel void @add_inline_imm_neg_2.0_f64(ptr addrspace(1) %out, [8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
+;
+; GFX942-LABEL: add_inline_imm_neg_2.0_f64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x4c
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    s_mov_b32 s3, 0xf000
+; GFX942-NEXT:    s_mov_b32 s2, -1
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    v_add_f64 v[0:1], s[6:7], -2.0
+; GFX942-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX942-NEXT:    s_endpgm
   %y = fadd double %x, -2.0
   store double %y, ptr addrspace(1) %out
   ret void
@@ -1202,6 +1679,17 @@ define amdgpu_kernel void @add_inline_imm_4.0_f64(ptr addrspace(1) %out, [8 x i3
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
+;
+; GFX942-LABEL: add_inline_imm_4.0_f64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x4c
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    s_mov_b32 s3, 0xf000
+; GFX942-NEXT:    s_mov_b32 s2, -1
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    v_add_f64 v[0:1], s[6:7], 4.0
+; GFX942-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX942-NEXT:    s_endpgm
   %y = fadd double %x, 4.0
   store double %y, ptr addrspace(1) %out
   ret void
@@ -1231,6 +1719,17 @@ define amdgpu_kernel void @add_inline_imm_neg_4.0_f64(ptr addrspace(1) %out, [8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
+;
+; GFX942-LABEL: add_inline_imm_neg_4.0_f64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x4c
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    s_mov_b32 s3, 0xf000
+; GFX942-NEXT:    s_mov_b32 s2, -1
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    v_add_f64 v[0:1], s[6:7], -4.0
+; GFX942-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX942-NEXT:    s_endpgm
   %y = fadd double %x, -4.0
   store double %y, ptr addrspace(1) %out
   ret void
@@ -1262,6 +1761,17 @@ define amdgpu_kernel void @add_inline_imm_inv_2pi_f64(ptr addrspace(1) %out, [8
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
+;
+; GFX942-LABEL: add_inline_imm_inv_2pi_f64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x4c
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    s_mov_b32 s3, 0xf000
+; GFX942-NEXT:    s_mov_b32 s2, -1
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    v_add_f64 v[0:1], s[6:7], 0.15915494309189532
+; GFX942-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX942-NEXT:    s_endpgm
   %y = fadd double %x, 0x3fc45f306dc9c882
   store double %y, ptr addrspace(1) %out
   ret void
@@ -1295,6 +1805,19 @@ define amdgpu_kernel void @add_m_inv_2pi_f64(ptr addrspace(1) %out, [8 x i32], d
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
+;
+; GFX942-LABEL: add_m_inv_2pi_f64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x4c
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0x6dc9c882
+; GFX942-NEXT:    v_mov_b32_e32 v1, 0xbfc45f30
+; GFX942-NEXT:    s_mov_b32 s3, 0xf000
+; GFX942-NEXT:    s_mov_b32 s2, -1
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    v_add_f64 v[0:1], s[6:7], v[0:1]
+; GFX942-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX942-NEXT:    s_endpgm
   %y = fadd double %x, 0xbfc45f306dc9c882
   store double %y, ptr addrspace(1) %out
   ret void
@@ -1324,6 +1847,17 @@ define amdgpu_kernel void @add_inline_imm_1_f64(ptr addrspace(1) %out, [8 x i32]
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
+;
+; GFX942-LABEL: add_inline_imm_1_f64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x4c
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    s_mov_b32 s3, 0xf000
+; GFX942-NEXT:    s_mov_b32 s2, -1
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    v_add_f64 v[0:1], s[6:7], 1
+; GFX942-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX942-NEXT:    s_endpgm
   %y = fadd double %x, 0x0000000000000001
   store double %y, ptr addrspace(1) %out
   ret void
@@ -1353,6 +1887,17 @@ define amdgpu_kernel void @add_inline_imm_2_f64(ptr addrspace(1) %out, [8 x i32]
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
+;
+; GFX942-LABEL: add_inline_imm_2_f64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x4c
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    s_mov_b32 s3, 0xf000
+; GFX942-NEXT:    s_mov_b32 s2, -1
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    v_add_f64 v[0:1], s[6:7], 2
+; GFX942-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX942-NEXT:    s_endpgm
   %y = fadd double %x, 0x0000000000000002
   store double %y, ptr addrspace(1) %out
   ret void
@@ -1382,6 +1927,17 @@ define amdgpu_kernel void @add_inline_imm_16_f64(ptr addrspace(1) %out, [8 x i32
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
+;
+; GFX942-LABEL: add_inline_imm_16_f64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x4c
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    s_mov_b32 s3, 0xf000
+; GFX942-NEXT:    s_mov_b32 s2, -1
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    v_add_f64 v[0:1], s[6:7], 16
+; GFX942-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX942-NEXT:    s_endpgm
   %y = fadd double %x, 0x0000000000000010
   store double %y, ptr addrspace(1) %out
   ret void
@@ -1409,6 +1965,17 @@ define amdgpu_kernel void @add_inline_imm_neg_1_f64(ptr addrspace(1) %out, [8 x
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
+;
+; GFX942-LABEL: add_inline_imm_neg_1_f64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v0, -1
+; GFX942-NEXT:    s_mov_b32 s3, 0xf000
+; GFX942-NEXT:    s_mov_b32 s2, -1
+; GFX942-NEXT:    v_mov_b32_e32 v1, v0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX942-NEXT:    s_endpgm
   %y = fadd double %x, 0xffffffffffffffff
   store double %y, ptr addrspace(1) %out
   ret void
@@ -1436,6 +2003,17 @@ define amdgpu_kernel void @add_inline_imm_neg_2_f64(ptr addrspace(1) %out, [8 x
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
+;
+; GFX942-LABEL: add_inline_imm_neg_2_f64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    s_mov_b32 s3, 0xf000
+; GFX942-NEXT:    s_mov_b32 s2, -1
+; GFX942-NEXT:    v_mov_b32_e32 v0, -2
+; GFX942-NEXT:    v_mov_b32_e32 v1, -1
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX942-NEXT:    s_endpgm
   %y = fadd double %x, 0xfffffffffffffffe
   store double %y, ptr addrspace(1) %out
   ret void
@@ -1463,6 +2041,17 @@ define amdgpu_kernel void @add_inline_imm_neg_16_f64(ptr addrspace(1) %out, [8 x
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
+;
+; GFX942-LABEL: add_inline_imm_neg_16_f64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    s_mov_b32 s3, 0xf000
+; GFX942-NEXT:    s_mov_b32 s2, -1
+; GFX942-NEXT:    v_mov_b32_e32 v0, -16
+; GFX942-NEXT:    v_mov_b32_e32 v1, -1
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX942-NEXT:    s_endpgm
   %y = fadd double %x, 0xfffffffffffffff0
   store double %y, ptr addrspace(1) %out
   ret void
@@ -1492,6 +2081,17 @@ define amdgpu_kernel void @add_inline_imm_63_f64(ptr addrspace(1) %out, [8 x i32
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
+;
+; GFX942-LABEL: add_inline_imm_63_f64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x4c
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    s_mov_b32 s3, 0xf000
+; GFX942-NEXT:    s_mov_b32 s2, -1
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    v_add_f64 v[0:1], s[6:7], 63
+; GFX942-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX942-NEXT:    s_endpgm
   %y = fadd double %x, 0x000000000000003F
   store double %y, ptr addrspace(1) %out
   ret void
@@ -1521,6 +2121,17 @@ define amdgpu_kernel void @add_inline_imm_64_f64(ptr addrspace(1) %out, [8 x i32
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
+;
+; GFX942-LABEL: add_inline_imm_64_f64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x4c
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    s_mov_b32 s3, 0xf000
+; GFX942-NEXT:    s_mov_b32 s2, -1
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    v_add_f64 v[0:1], s[6:7], 64
+; GFX942-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX942-NEXT:    s_endpgm
   %y = fadd double %x, 0x0000000000000040
   store double %y, ptr addrspace(1) %out
   ret void
@@ -1548,6 +2159,17 @@ define amdgpu_kernel void @store_inline_imm_0.0_f64(ptr addrspace(1) %out) {
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
+;
+; GFX942-LABEL: store_inline_imm_0.0_f64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    s_mov_b32 s3, 0xf000
+; GFX942-NEXT:    s_mov_b32 s2, -1
+; GFX942-NEXT:    v_mov_b32_e32 v1, v0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX942-NEXT:    s_endpgm
   store double 0.0, ptr addrspace(1) %out
   ret void
 }
@@ -1574,6 +2196,17 @@ define amdgpu_kernel void @store_literal_imm_neg_0.0_f64(ptr addrspace(1) %out)
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
+;
+; GFX942-LABEL: store_literal_imm_neg_0.0_f64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    s_mov_b32 s3, 0xf000
+; GFX942-NEXT:    s_mov_b32 s2, -1
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    v_bfrev_b32_e32 v1, 1
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX942-NEXT:    s_endpgm
   store double -0.0, ptr addrspace(1) %out
   ret void
 }
@@ -1600,6 +2233,17 @@ define amdgpu_kernel void @store_inline_imm_0.5_f64(ptr addrspace(1) %out) {
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
+;
+; GFX942-LABEL: store_inline_imm_0.5_f64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    s_mov_b32 s3, 0xf000
+; GFX942-NEXT:    s_mov_b32 s2, -1
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    v_mov_b32_e32 v1, 0x3fe00000
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX942-NEXT:    s_endpgm
   store double 0.5, ptr addrspace(1) %out
   ret void
 }
@@ -1626,6 +2270,17 @@ define amdgpu_kernel void @store_inline_imm_m_0.5_f64(ptr addrspace(1) %out) {
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
+;
+; GFX942-LABEL: store_inline_imm_m_0.5_f64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    s_mov_b32 s3, 0xf000
+; GFX942-NEXT:    s_mov_b32 s2, -1
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    v_mov_b32_e32 v1, 0xbfe00000
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX942-NEXT:    s_endpgm
   store double -0.5, ptr addrspace(1) %out
   ret void
 }
@@ -1652,6 +2307,17 @@ define amdgpu_kernel void @store_inline_imm_1.0_f64(ptr addrspace(1) %out) {
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
+;
+; GFX942-LABEL: store_inline_imm_1.0_f64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    s_mov_b32 s3, 0xf000
+; GFX942-NEXT:    s_mov_b32 s2, -1
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    v_mov_b32_e32 v1, 0x3ff00000
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX942-NEXT:    s_endpgm
   store double 1.0, ptr addrspace(1) %out
   ret void
 }
@@ -1678,6 +2344,17 @@ define amdgpu_kernel void @store_inline_imm_m_1.0_f64(ptr addrspace(1) %out) {
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
+;
+; GFX942-LABEL: store_inline_imm_m_1.0_f64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    s_mov_b32 s3, 0xf000
+; GFX942-NEXT:    s_mov_b32 s2, -1
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    v_mov_b32_e32 v1, 0xbff00000
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX942-NEXT:    s_endpgm
   store double -1.0, ptr addrspace(1) %out
   ret void
 }
@@ -1704,6 +2381,17 @@ define amdgpu_kernel void @store_inline_imm_2.0_f64(ptr addrspace(1) %out) {
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
+;
+; GFX942-LABEL: store_inline_imm_2.0_f64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    s_mov_b32 s3, 0xf000
+; GFX942-NEXT:    s_mov_b32 s2, -1
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    v_mov_b32_e32 v1, 2.0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX942-NEXT:    s_endpgm
   store double 2.0, ptr addrspace(1) %out
   ret void
 }
@@ -1730,6 +2418,17 @@ define amdgpu_kernel void @store_inline_imm_m_2.0_f64(ptr addrspace(1) %out) {
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
+;
+; GFX942-LABEL: store_inline_imm_m_2.0_f64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    s_mov_b32 s3, 0xf000
+; GFX942-NEXT:    s_mov_b32 s2, -1
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    v_mov_b32_e32 v1, -2.0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX942-NEXT:    s_endpgm
   store double -2.0, ptr addrspace(1) %out
   ret void
 }
@@ -1756,6 +2455,17 @@ define amdgpu_kernel void @store_inline_imm_4.0_f64(ptr addrspace(1) %out) {
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
+;
+; GFX942-LABEL: store_inline_imm_4.0_f64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    s_mov_b32 s3, 0xf000
+; GFX942-NEXT:    s_mov_b32 s2, -1
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    v_mov_b32_e32 v1, 0x40100000
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX942-NEXT:    s_endpgm
   store double 4.0, ptr addrspace(1) %out
   ret void
 }
@@ -1782,6 +2492,17 @@ define amdgpu_kernel void @store_inline_imm_m_4.0_f64(ptr addrspace(1) %out) {
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
+;
+; GFX942-LABEL: store_inline_imm_m_4.0_f64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    s_mov_b32 s3, 0xf000
+; GFX942-NEXT:    s_mov_b32 s2, -1
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    v_mov_b32_e32 v1, 0xc0100000
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX942-NEXT:    s_endpgm
   store double -4.0, ptr addrspace(1) %out
   ret void
 }
@@ -1808,6 +2529,17 @@ define amdgpu_kernel void @store_inv_2pi_f64(ptr addrspace(1) %out) {
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
+;
+; GFX942-LABEL: store_inv_2pi_f64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    s_mov_b32 s3, 0xf000
+; GFX942-NEXT:    s_mov_b32 s2, -1
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0x6dc9c882
+; GFX942-NEXT:    v_mov_b32_e32 v1, 0x3fc45f30
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX942-NEXT:    s_endpgm
   store double 0x3fc45f306dc9c882, ptr addrspace(1) %out
   ret void
 }
@@ -1834,6 +2566,17 @@ define amdgpu_kernel void @store_inline_imm_m_inv_2pi_f64(ptr addrspace(1) %out)
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
+;
+; GFX942-LABEL: store_inline_imm_m_inv_2pi_f64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    s_mov_b32 s3, 0xf000
+; GFX942-NEXT:    s_mov_b32 s2, -1
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0x6dc9c882
+; GFX942-NEXT:    v_mov_b32_e32 v1, 0xbfc45f30
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX942-NEXT:    s_endpgm
   store double 0xbfc45f306dc9c882, ptr addrspace(1) %out
   ret void
 }
@@ -1860,6 +2603,17 @@ define amdgpu_kernel void @store_literal_imm_f64(ptr addrspace(1) %out) {
 ; VI-NEXT:    s_waitcnt lgkmcnt(0)
 ; VI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; VI-NEXT:    s_endpgm
+;
+; GFX942-LABEL: store_literal_imm_f64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    s_mov_b32 s3, 0xf000
+; GFX942-NEXT:    s_mov_b32 s2, -1
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    v_mov_b32_e32 v1, 0x40b00000
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
+; GFX942-NEXT:    s_endpgm
   store double 4096.0, ptr addrspace(1) %out
   ret void
 }
@@ -1871,6 +2625,13 @@ define amdgpu_vs void @literal_folding(float %arg) {
 ; GCN-NEXT:    v_mul_f32_e32 v0, 0xbf4353f8, v0
 ; GCN-NEXT:    exp pos0 v1, v1, v0, v0 done
 ; GCN-NEXT:    s_endpgm
+;
+; GFX942-LABEL: literal_folding:
+; GFX942:       ; %bb.0: ; %main_body
+; GFX942-NEXT:    v_mul_f32_e32 v1, 0x3f4353f8, v0
+; GFX942-NEXT:    v_mul_f32_e32 v0, 0xbf4353f8, v0
+; GFX942-NEXT:    exp pos0 v1, v1, v0, v0 done
+; GFX942-NEXT:    s_endpgm
 main_body:
   %tmp = fmul float %arg, 0x3FE86A7F00000000
   %tmp1 = fmul float %arg, 0xBFE86A7F00000000
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll
index a10c861601c2c..3f5c6681612a9 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.writelane.ll
@@ -1,9 +1,11 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx802 < %s | FileCheck -check-prefixes=GFX802-SDAG %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx942 < %s | FileCheck -check-prefixes=GFX942-SDAG %s
 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX1010-SDAG %s
 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1100 -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefixes=GFX1100-SDAG %s
 
 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx802 -global-isel -global-isel-abort=2 < %s | FileCheck -check-prefixes=GFX802-GISEL %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx942 -global-isel -global-isel-abort=2 < %s | FileCheck -check-prefixes=GFX942-GISEL %s
 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1010 -global-isel -global-isel-abort=2 < %s | FileCheck -check-prefixes=GFX1010-GISEL %s
 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx1100 -amdgpu-enable-vopd=0 -global-isel -global-isel-abort=2 < %s | FileCheck -check-prefixes=GFX1100-GISEL %s
 
@@ -29,6 +31,19 @@ define amdgpu_kernel void @test_writelane_sreg_i32(ptr addrspace(1) %out, i32 %s
 ; GFX802-SDAG-NEXT:    flat_store_dword v[0:1], v2
 ; GFX802-SDAG-NEXT:    s_endpgm
 ;
+; GFX942-SDAG-LABEL: test_writelane_sreg_i32:
+; GFX942-SDAG:       ; %bb.0:
+; GFX942-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX942-SDAG-NEXT:    s_mov_b32 m0, s3
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v1, s4
+; GFX942-SDAG-NEXT:    v_writelane_b32 v1, s2, m0
+; GFX942-SDAG-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX942-SDAG-NEXT:    s_endpgm
+;
 ; GFX1010-SDAG-LABEL: test_writelane_sreg_i32:
 ; GFX1010-SDAG:       ; %bb.0:
 ; GFX1010-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
@@ -70,6 +85,19 @@ define amdgpu_kernel void @test_writelane_sreg_i32(ptr addrspace(1) %out, i32 %s
 ; GFX802-GISEL-NEXT:    flat_store_dword v[0:1], v2
 ; GFX802-GISEL-NEXT:    s_endpgm
 ;
+; GFX942-GISEL-LABEL: test_writelane_sreg_i32:
+; GFX942-GISEL:       ; %bb.0:
+; GFX942-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT:    s_load_dword s4, s[0:1], 0x0
+; GFX942-GISEL-NEXT:    s_mov_b32 m0, s3
+; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX942-GISEL-NEXT:    v_writelane_b32 v0, s2, m0
+; GFX942-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX942-GISEL-NEXT:    s_endpgm
+;
 ; GFX1010-GISEL-LABEL: test_writelane_sreg_i32:
 ; GFX1010-GISEL:       ; %bb.0:
 ; GFX1010-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
@@ -120,6 +148,22 @@ define amdgpu_kernel void @test_writelane_sreg_i64(ptr addrspace(1) %out, i64 %s
 ; GFX802-SDAG-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GFX802-SDAG-NEXT:    s_endpgm
 ;
+; GFX942-SDAG-LABEL: test_writelane_sreg_i64:
+; GFX942-SDAG:       ; %bb.0:
+; GFX942-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX942-SDAG-NEXT:    s_load_dword s6, s[4:5], 0x10
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX942-SDAG-NEXT:    s_mov_b32 m0, s6
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v1, s5
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v0, s4
+; GFX942-SDAG-NEXT:    v_writelane_b32 v1, s3, m0
+; GFX942-SDAG-NEXT:    v_writelane_b32 v0, s2, m0
+; GFX942-SDAG-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX942-SDAG-NEXT:    s_endpgm
+;
 ; GFX1010-SDAG-LABEL: test_writelane_sreg_i64:
 ; GFX1010-SDAG:       ; %bb.0:
 ; GFX1010-SDAG-NEXT:    s_clause 0x1
@@ -172,6 +216,22 @@ define amdgpu_kernel void @test_writelane_sreg_i64(ptr addrspace(1) %out, i64 %s
 ; GFX802-GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GFX802-GISEL-NEXT:    s_endpgm
 ;
+; GFX942-GISEL-LABEL: test_writelane_sreg_i64:
+; GFX942-GISEL:       ; %bb.0:
+; GFX942-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX942-GISEL-NEXT:    s_load_dword s6, s[4:5], 0x10
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX942-GISEL-NEXT:    s_mov_b32 m0, s6
+; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v1, s5
+; GFX942-GISEL-NEXT:    v_writelane_b32 v0, s2, m0
+; GFX942-GISEL-NEXT:    v_writelane_b32 v1, s3, m0
+; GFX942-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX942-GISEL-NEXT:    s_endpgm
+;
 ; GFX1010-GISEL-LABEL: test_writelane_sreg_i64:
 ; GFX1010-GISEL:       ; %bb.0:
 ; GFX1010-GISEL-NEXT:    s_clause 0x1
@@ -230,6 +290,22 @@ define amdgpu_kernel void @test_writelane_sreg_f64(ptr addrspace(1) %out, double
 ; GFX802-SDAG-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GFX802-SDAG-NEXT:    s_endpgm
 ;
+; GFX942-SDAG-LABEL: test_writelane_sreg_f64:
+; GFX942-SDAG:       ; %bb.0:
+; GFX942-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX942-SDAG-NEXT:    s_load_dword s6, s[4:5], 0x10
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX942-SDAG-NEXT:    s_mov_b32 m0, s6
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v1, s5
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v0, s4
+; GFX942-SDAG-NEXT:    v_writelane_b32 v1, s3, m0
+; GFX942-SDAG-NEXT:    v_writelane_b32 v0, s2, m0
+; GFX942-SDAG-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX942-SDAG-NEXT:    s_endpgm
+;
 ; GFX1010-SDAG-LABEL: test_writelane_sreg_f64:
 ; GFX1010-SDAG:       ; %bb.0:
 ; GFX1010-SDAG-NEXT:    s_clause 0x1
@@ -282,6 +358,22 @@ define amdgpu_kernel void @test_writelane_sreg_f64(ptr addrspace(1) %out, double
 ; GFX802-GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GFX802-GISEL-NEXT:    s_endpgm
 ;
+; GFX942-GISEL-LABEL: test_writelane_sreg_f64:
+; GFX942-GISEL:       ; %bb.0:
+; GFX942-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX942-GISEL-NEXT:    s_load_dword s6, s[4:5], 0x10
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX942-GISEL-NEXT:    s_mov_b32 m0, s6
+; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v1, s5
+; GFX942-GISEL-NEXT:    v_writelane_b32 v0, s2, m0
+; GFX942-GISEL-NEXT:    v_writelane_b32 v1, s3, m0
+; GFX942-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX942-GISEL-NEXT:    s_endpgm
+;
 ; GFX1010-GISEL-LABEL: test_writelane_sreg_f64:
 ; GFX1010-GISEL:       ; %bb.0:
 ; GFX1010-GISEL-NEXT:    s_clause 0x1
@@ -337,6 +429,19 @@ define amdgpu_kernel void @test_writelane_imm_sreg_i32(ptr addrspace(1) %out, i3
 ; GFX802-SDAG-NEXT:    flat_store_dword v[0:1], v2
 ; GFX802-SDAG-NEXT:    s_endpgm
 ;
+; GFX942-SDAG-LABEL: test_writelane_imm_sreg_i32:
+; GFX942-SDAG:       ; %bb.0:
+; GFX942-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-SDAG-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    s_load_dword s3, s[0:1], 0x0
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v1, s3
+; GFX942-SDAG-NEXT:    v_writelane_b32 v1, 32, s2
+; GFX942-SDAG-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX942-SDAG-NEXT:    s_endpgm
+;
 ; GFX1010-SDAG-LABEL: test_writelane_imm_sreg_i32:
 ; GFX1010-SDAG:       ; %bb.0:
 ; GFX1010-SDAG-NEXT:    s_clause 0x1
@@ -382,6 +487,19 @@ define amdgpu_kernel void @test_writelane_imm_sreg_i32(ptr addrspace(1) %out, i3
 ; GFX802-GISEL-NEXT:    flat_store_dword v[0:1], v2
 ; GFX802-GISEL-NEXT:    s_endpgm
 ;
+; GFX942-GISEL-LABEL: test_writelane_imm_sreg_i32:
+; GFX942-GISEL:       ; %bb.0:
+; GFX942-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-GISEL-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT:    s_load_dword s3, s[0:1], 0x0
+; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v0, s3
+; GFX942-GISEL-NEXT:    v_writelane_b32 v0, 32, s2
+; GFX942-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX942-GISEL-NEXT:    s_endpgm
+;
 ; GFX1010-GISEL-LABEL: test_writelane_imm_sreg_i32:
 ; GFX1010-GISEL:       ; %bb.0:
 ; GFX1010-GISEL-NEXT:    s_clause 0x1
@@ -435,6 +553,21 @@ define amdgpu_kernel void @test_writelane_imm_sreg_i64(ptr addrspace(1) %out, i3
 ; GFX802-SDAG-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GFX802-SDAG-NEXT:    s_endpgm
 ;
+; GFX942-SDAG-LABEL: test_writelane_imm_sreg_i64:
+; GFX942-SDAG:       ; %bb.0:
+; GFX942-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-SDAG-NEXT:    s_load_dword s6, s[4:5], 0x8
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v1, s3
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-SDAG-NEXT:    v_writelane_b32 v1, 0, s6
+; GFX942-SDAG-NEXT:    v_writelane_b32 v0, 32, s6
+; GFX942-SDAG-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX942-SDAG-NEXT:    s_endpgm
+;
 ; GFX1010-SDAG-LABEL: test_writelane_imm_sreg_i64:
 ; GFX1010-SDAG:       ; %bb.0:
 ; GFX1010-SDAG-NEXT:    s_clause 0x1
@@ -486,6 +619,21 @@ define amdgpu_kernel void @test_writelane_imm_sreg_i64(ptr addrspace(1) %out, i3
 ; GFX802-GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GFX802-GISEL-NEXT:    s_endpgm
 ;
+; GFX942-GISEL-LABEL: test_writelane_imm_sreg_i64:
+; GFX942-GISEL:       ; %bb.0:
+; GFX942-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-GISEL-NEXT:    s_load_dword s6, s[4:5], 0x8
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX942-GISEL-NEXT:    v_writelane_b32 v0, 32, s6
+; GFX942-GISEL-NEXT:    v_writelane_b32 v1, 0, s6
+; GFX942-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX942-GISEL-NEXT:    s_endpgm
+;
 ; GFX1010-GISEL-LABEL: test_writelane_imm_sreg_i64:
 ; GFX1010-GISEL:       ; %bb.0:
 ; GFX1010-GISEL-NEXT:    s_clause 0x1
@@ -545,6 +693,23 @@ define amdgpu_kernel void @test_writelane_imm_sreg_f64(ptr addrspace(1) %out, i3
 ; GFX802-SDAG-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GFX802-SDAG-NEXT:    s_endpgm
 ;
+; GFX942-SDAG-LABEL: test_writelane_imm_sreg_f64:
+; GFX942-SDAG:       ; %bb.0:
+; GFX942-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-SDAG-NEXT:    s_load_dword s6, s[4:5], 0x8
+; GFX942-SDAG-NEXT:    s_mov_b32 s4, 0x40400000
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-SDAG-NEXT:    s_mov_b32 m0, s6
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v1, s3
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-SDAG-NEXT:    v_writelane_b32 v1, s4, m0
+; GFX942-SDAG-NEXT:    v_writelane_b32 v0, 0, s6
+; GFX942-SDAG-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX942-SDAG-NEXT:    s_endpgm
+;
 ; GFX1010-SDAG-LABEL: test_writelane_imm_sreg_f64:
 ; GFX1010-SDAG:       ; %bb.0:
 ; GFX1010-SDAG-NEXT:    s_clause 0x1
@@ -601,6 +766,23 @@ define amdgpu_kernel void @test_writelane_imm_sreg_f64(ptr addrspace(1) %out, i3
 ; GFX802-GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GFX802-GISEL-NEXT:    s_endpgm
 ;
+; GFX942-GISEL-LABEL: test_writelane_imm_sreg_f64:
+; GFX942-GISEL:       ; %bb.0:
+; GFX942-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-GISEL-NEXT:    s_load_dword s6, s[4:5], 0x8
+; GFX942-GISEL-NEXT:    s_mov_b32 s4, 0x40400000
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-GISEL-NEXT:    s_mov_b32 m0, s6
+; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v1, s3
+; GFX942-GISEL-NEXT:    v_writelane_b32 v0, 0, s6
+; GFX942-GISEL-NEXT:    v_writelane_b32 v1, s4, m0
+; GFX942-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX942-GISEL-NEXT:    s_endpgm
+;
 ; GFX1010-GISEL-LABEL: test_writelane_imm_sreg_f64:
 ; GFX1010-GISEL:       ; %bb.0:
 ; GFX1010-GISEL-NEXT:    s_clause 0x1
@@ -667,6 +849,24 @@ define amdgpu_kernel void @test_writelane_vreg_lane_i32(ptr addrspace(1) %out, p
 ; GFX802-SDAG-NEXT:    flat_store_dword v[0:1], v2
 ; GFX802-SDAG-NEXT:    s_endpgm
 ;
+; GFX942-SDAG-LABEL: test_writelane_vreg_lane_i32:
+; GFX942-SDAG:       ; %bb.0:
+; GFX942-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX942-SDAG-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX942-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    global_load_dword v0, v0, s[2:3] offset:4
+; GFX942-SDAG-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v2, s2
+; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX942-SDAG-NEXT:    s_nop 3
+; GFX942-SDAG-NEXT:    v_writelane_b32 v2, 12, s2
+; GFX942-SDAG-NEXT:    global_store_dword v1, v2, s[0:1]
+; GFX942-SDAG-NEXT:    s_endpgm
+;
 ; GFX1010-SDAG-LABEL: test_writelane_vreg_lane_i32:
 ; GFX1010-SDAG:       ; %bb.0:
 ; GFX1010-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
@@ -730,6 +930,24 @@ define amdgpu_kernel void @test_writelane_vreg_lane_i32(ptr addrspace(1) %out, p
 ; GFX802-GISEL-NEXT:    flat_store_dword v[0:1], v2
 ; GFX802-GISEL-NEXT:    s_endpgm
 ;
+; GFX942-GISEL-LABEL: test_writelane_vreg_lane_i32:
+; GFX942-GISEL:       ; %bb.0:
+; GFX942-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX942-GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
+; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT:    global_load_dword v0, v0, s[2:3] offset:4
+; GFX942-GISEL-NEXT:    s_load_dword s2, s[0:1], 0x0
+; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX942-GISEL-NEXT:    s_nop 3
+; GFX942-GISEL-NEXT:    v_writelane_b32 v1, 12, s2
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-GISEL-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX942-GISEL-NEXT:    s_endpgm
+;
 ; GFX1010-GISEL-LABEL: test_writelane_vreg_lane_i32:
 ; GFX1010-GISEL:       ; %bb.0:
 ; GFX1010-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
@@ -804,6 +1022,26 @@ define amdgpu_kernel void @test_writelane_vreg_lane_i64(ptr addrspace(1) %out, p
 ; GFX802-SDAG-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GFX802-SDAG-NEXT:    s_endpgm
 ;
+; GFX942-SDAG-LABEL: test_writelane_vreg_lane_i64:
+; GFX942-SDAG:       ; %bb.0:
+; GFX942-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX942-SDAG-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX942-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v3, 0
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    global_load_dword v2, v0, s[2:3] offset:8
+; GFX942-SDAG-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v1, s3
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s2, v2
+; GFX942-SDAG-NEXT:    s_nop 3
+; GFX942-SDAG-NEXT:    v_writelane_b32 v1, 0, s2
+; GFX942-SDAG-NEXT:    v_writelane_b32 v0, 12, s2
+; GFX942-SDAG-NEXT:    global_store_dwordx2 v3, v[0:1], s[0:1]
+; GFX942-SDAG-NEXT:    s_endpgm
+;
 ; GFX1010-SDAG-LABEL: test_writelane_vreg_lane_i64:
 ; GFX1010-SDAG:       ; %bb.0:
 ; GFX1010-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
@@ -872,6 +1110,26 @@ define amdgpu_kernel void @test_writelane_vreg_lane_i64(ptr addrspace(1) %out, p
 ; GFX802-GISEL-NEXT:    flat_store_dwordx2 v[3:4], v[1:2]
 ; GFX802-GISEL-NEXT:    s_endpgm
 ;
+; GFX942-GISEL-LABEL: test_writelane_vreg_lane_i64:
+; GFX942-GISEL:       ; %bb.0:
+; GFX942-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX942-GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
+; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3] offset:8
+; GFX942-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v3, s3
+; GFX942-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX942-GISEL-NEXT:    s_nop 3
+; GFX942-GISEL-NEXT:    v_writelane_b32 v2, 12, s2
+; GFX942-GISEL-NEXT:    v_writelane_b32 v3, 0, s2
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-GISEL-NEXT:    global_store_dwordx2 v0, v[2:3], s[0:1]
+; GFX942-GISEL-NEXT:    s_endpgm
+;
 ; GFX1010-GISEL-LABEL: test_writelane_vreg_lane_i64:
 ; GFX1010-GISEL:       ; %bb.0:
 ; GFX1010-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
@@ -951,6 +1209,28 @@ define amdgpu_kernel void @test_writelane_vreg_lane_f64(ptr addrspace(1) %out, p
 ; GFX802-SDAG-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GFX802-SDAG-NEXT:    s_endpgm
 ;
+; GFX942-SDAG-LABEL: test_writelane_vreg_lane_f64:
+; GFX942-SDAG:       ; %bb.0:
+; GFX942-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX942-SDAG-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX942-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
+; GFX942-SDAG-NEXT:    s_mov_b32 s4, 0x40280000
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v3, 0
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    global_load_dword v2, v0, s[2:3] offset:8
+; GFX942-SDAG-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v1, s3
+; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s2, v2
+; GFX942-SDAG-NEXT:    s_mov_b32 m0, s2
+; GFX942-SDAG-NEXT:    v_writelane_b32 v1, s4, m0
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_writelane_b32 v0, 0, s2
+; GFX942-SDAG-NEXT:    global_store_dwordx2 v3, v[0:1], s[0:1]
+; GFX942-SDAG-NEXT:    s_endpgm
+;
 ; GFX1010-SDAG-LABEL: test_writelane_vreg_lane_f64:
 ; GFX1010-SDAG:       ; %bb.0:
 ; GFX1010-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
@@ -1023,6 +1303,28 @@ define amdgpu_kernel void @test_writelane_vreg_lane_f64(ptr addrspace(1) %out, p
 ; GFX802-GISEL-NEXT:    flat_store_dwordx2 v[3:4], v[1:2]
 ; GFX802-GISEL-NEXT:    s_endpgm
 ;
+; GFX942-GISEL-LABEL: test_writelane_vreg_lane_f64:
+; GFX942-GISEL:       ; %bb.0:
+; GFX942-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX942-GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX942-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
+; GFX942-GISEL-NEXT:    s_mov_b32 s4, 0x40280000
+; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3] offset:8
+; GFX942-GISEL-NEXT:    s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v2, s2
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v3, s3
+; GFX942-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX942-GISEL-NEXT:    s_mov_b32 m0, s2
+; GFX942-GISEL-NEXT:    s_nop 2
+; GFX942-GISEL-NEXT:    v_writelane_b32 v2, 0, s2
+; GFX942-GISEL-NEXT:    v_writelane_b32 v3, s4, m0
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-GISEL-NEXT:    global_store_dwordx2 v0, v[2:3], s[0:1]
+; GFX942-GISEL-NEXT:    s_endpgm
+;
 ; GFX1010-GISEL-LABEL: test_writelane_vreg_lane_f64:
 ; GFX1010-GISEL:       ; %bb.0:
 ; GFX1010-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
@@ -1097,6 +1399,24 @@ define amdgpu_kernel void @test_writelane_m0_sreg_i32(ptr addrspace(1) %out, i32
 ; GFX802-SDAG-NEXT:    flat_store_dword v[0:1], v2
 ; GFX802-SDAG-NEXT:    s_endpgm
 ;
+; GFX942-SDAG-LABEL: test_writelane_m0_sreg_i32:
+; GFX942-SDAG:       ; %bb.0:
+; GFX942-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-SDAG-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-SDAG-NEXT:    ;;#ASMSTART
+; GFX942-SDAG-NEXT:    s_mov_b32 m0, -1
+; GFX942-SDAG-NEXT:    ;;#ASMEND
+; GFX942-SDAG-NEXT:    s_mov_b32 s4, m0
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    s_load_dword s3, s[0:1], 0x0
+; GFX942-SDAG-NEXT:    s_mov_b32 m0, s2
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v1, s3
+; GFX942-SDAG-NEXT:    v_writelane_b32 v1, s4, m0
+; GFX942-SDAG-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX942-SDAG-NEXT:    s_endpgm
+;
 ; GFX1010-SDAG-LABEL: test_writelane_m0_sreg_i32:
 ; GFX1010-SDAG:       ; %bb.0:
 ; GFX1010-SDAG-NEXT:    s_clause 0x1
@@ -1153,6 +1473,24 @@ define amdgpu_kernel void @test_writelane_m0_sreg_i32(ptr addrspace(1) %out, i32
 ; GFX802-GISEL-NEXT:    flat_store_dword v[0:1], v2
 ; GFX802-GISEL-NEXT:    s_endpgm
 ;
+; GFX942-GISEL-LABEL: test_writelane_m0_sreg_i32:
+; GFX942-GISEL:       ; %bb.0:
+; GFX942-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-GISEL-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-GISEL-NEXT:    ;;#ASMSTART
+; GFX942-GISEL-NEXT:    s_mov_b32 m0, -1
+; GFX942-GISEL-NEXT:    ;;#ASMEND
+; GFX942-GISEL-NEXT:    s_mov_b32 s4, m0
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT:    s_load_dword s3, s[0:1], 0x0
+; GFX942-GISEL-NEXT:    s_mov_b32 m0, s2
+; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v0, s3
+; GFX942-GISEL-NEXT:    v_writelane_b32 v0, s4, m0
+; GFX942-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX942-GISEL-NEXT:    s_endpgm
+;
 ; GFX1010-GISEL-LABEL: test_writelane_m0_sreg_i32:
 ; GFX1010-GISEL:       ; %bb.0:
 ; GFX1010-GISEL-NEXT:    s_clause 0x1
@@ -1211,6 +1549,19 @@ define amdgpu_kernel void @test_writelane_imm_i32(ptr addrspace(1) %out, i32 %sr
 ; GFX802-SDAG-NEXT:    flat_store_dword v[0:1], v2
 ; GFX802-SDAG-NEXT:    s_endpgm
 ;
+; GFX942-SDAG-LABEL: test_writelane_imm_i32:
+; GFX942-SDAG:       ; %bb.0:
+; GFX942-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-SDAG-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    s_load_dword s3, s[0:1], 0x0
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v1, s3
+; GFX942-SDAG-NEXT:    v_writelane_b32 v1, s2, 32
+; GFX942-SDAG-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX942-SDAG-NEXT:    s_endpgm
+;
 ; GFX1010-SDAG-LABEL: test_writelane_imm_i32:
 ; GFX1010-SDAG:       ; %bb.0:
 ; GFX1010-SDAG-NEXT:    s_clause 0x1
@@ -1256,6 +1607,19 @@ define amdgpu_kernel void @test_writelane_imm_i32(ptr addrspace(1) %out, i32 %sr
 ; GFX802-GISEL-NEXT:    flat_store_dword v[0:1], v2
 ; GFX802-GISEL-NEXT:    s_endpgm
 ;
+; GFX942-GISEL-LABEL: test_writelane_imm_i32:
+; GFX942-GISEL:       ; %bb.0:
+; GFX942-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-GISEL-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT:    s_load_dword s3, s[0:1], 0x0
+; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v0, s3
+; GFX942-GISEL-NEXT:    v_writelane_b32 v0, s2, 32
+; GFX942-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX942-GISEL-NEXT:    s_endpgm
+;
 ; GFX1010-GISEL-LABEL: test_writelane_imm_i32:
 ; GFX1010-GISEL:       ; %bb.0:
 ; GFX1010-GISEL-NEXT:    s_clause 0x1
@@ -1308,6 +1672,20 @@ define amdgpu_kernel void @test_writelane_imm_i64(ptr addrspace(1) %out, i64 %sr
 ; GFX802-SDAG-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GFX802-SDAG-NEXT:    s_endpgm
 ;
+; GFX942-SDAG-LABEL: test_writelane_imm_i64:
+; GFX942-SDAG:       ; %bb.0:
+; GFX942-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v1, s5
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v0, s4
+; GFX942-SDAG-NEXT:    v_writelane_b32 v1, s3, 32
+; GFX942-SDAG-NEXT:    v_writelane_b32 v0, s2, 32
+; GFX942-SDAG-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX942-SDAG-NEXT:    s_endpgm
+;
 ; GFX1010-SDAG-LABEL: test_writelane_imm_i64:
 ; GFX1010-SDAG:       ; %bb.0:
 ; GFX1010-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
@@ -1354,6 +1732,20 @@ define amdgpu_kernel void @test_writelane_imm_i64(ptr addrspace(1) %out, i64 %sr
 ; GFX802-GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GFX802-GISEL-NEXT:    s_endpgm
 ;
+; GFX942-GISEL-LABEL: test_writelane_imm_i64:
+; GFX942-GISEL:       ; %bb.0:
+; GFX942-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v1, s5
+; GFX942-GISEL-NEXT:    v_writelane_b32 v0, s2, 32
+; GFX942-GISEL-NEXT:    v_writelane_b32 v1, s3, 32
+; GFX942-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX942-GISEL-NEXT:    s_endpgm
+;
 ; GFX1010-GISEL-LABEL: test_writelane_imm_i64:
 ; GFX1010-GISEL:       ; %bb.0:
 ; GFX1010-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
@@ -1406,6 +1798,20 @@ define amdgpu_kernel void @test_writelane_imm_f64(ptr addrspace(1) %out, double
 ; GFX802-SDAG-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GFX802-SDAG-NEXT:    s_endpgm
 ;
+; GFX942-SDAG-LABEL: test_writelane_imm_f64:
+; GFX942-SDAG:       ; %bb.0:
+; GFX942-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v1, s5
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v0, s4
+; GFX942-SDAG-NEXT:    v_writelane_b32 v1, s3, 32
+; GFX942-SDAG-NEXT:    v_writelane_b32 v0, s2, 32
+; GFX942-SDAG-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX942-SDAG-NEXT:    s_endpgm
+;
 ; GFX1010-SDAG-LABEL: test_writelane_imm_f64:
 ; GFX1010-SDAG:       ; %bb.0:
 ; GFX1010-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
@@ -1452,6 +1858,20 @@ define amdgpu_kernel void @test_writelane_imm_f64(ptr addrspace(1) %out, double
 ; GFX802-GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GFX802-GISEL-NEXT:    s_endpgm
 ;
+; GFX942-GISEL-LABEL: test_writelane_imm_f64:
+; GFX942-GISEL:       ; %bb.0:
+; GFX942-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT:    s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v0, s4
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v1, s5
+; GFX942-GISEL-NEXT:    v_writelane_b32 v0, s2, 32
+; GFX942-GISEL-NEXT:    v_writelane_b32 v1, s3, 32
+; GFX942-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX942-GISEL-NEXT:    s_endpgm
+;
 ; GFX1010-GISEL-LABEL: test_writelane_imm_f64:
 ; GFX1010-GISEL:       ; %bb.0:
 ; GFX1010-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
@@ -1502,6 +1922,18 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_i32(i32 %oldval, ptr addrs
 ; GFX802-SDAG-NEXT:    flat_store_dword v[0:1], v2
 ; GFX802-SDAG-NEXT:    s_endpgm
 ;
+; GFX942-SDAG-LABEL: test_writelane_sreg_oldval_i32:
+; GFX942-SDAG:       ; %bb.0:
+; GFX942-SDAG-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX942-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x8
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v1, s6
+; GFX942-SDAG-NEXT:    s_mov_b32 m0, s3
+; GFX942-SDAG-NEXT:    v_writelane_b32 v1, s2, m0
+; GFX942-SDAG-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX942-SDAG-NEXT:    s_endpgm
+;
 ; GFX1010-SDAG-LABEL: test_writelane_sreg_oldval_i32:
 ; GFX1010-SDAG:       ; %bb.0:
 ; GFX1010-SDAG-NEXT:    s_clause 0x1
@@ -1542,6 +1974,18 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_i32(i32 %oldval, ptr addrs
 ; GFX802-GISEL-NEXT:    flat_store_dword v[0:1], v2
 ; GFX802-GISEL-NEXT:    s_endpgm
 ;
+; GFX942-GISEL-LABEL: test_writelane_sreg_oldval_i32:
+; GFX942-GISEL:       ; %bb.0:
+; GFX942-GISEL-NEXT:    s_load_dword s6, s[4:5], 0x0
+; GFX942-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x8
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v0, s6
+; GFX942-GISEL-NEXT:    s_mov_b32 m0, s3
+; GFX942-GISEL-NEXT:    v_writelane_b32 v0, s2, m0
+; GFX942-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX942-GISEL-NEXT:    s_endpgm
+;
 ; GFX1010-GISEL-LABEL: test_writelane_sreg_oldval_i32:
 ; GFX1010-GISEL:       ; %bb.0:
 ; GFX1010-GISEL-NEXT:    s_clause 0x1
@@ -1590,6 +2034,21 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_i64(i64 %oldval, ptr addrs
 ; GFX802-SDAG-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; GFX802-SDAG-NEXT:    s_endpgm
 ;
+; GFX942-SDAG-LABEL: test_writelane_sreg_oldval_i64:
+; GFX942-SDAG:       ; %bb.0:
+; GFX942-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX942-SDAG-NEXT:    s_load_dword s8, s[4:5], 0x18
+; GFX942-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x10
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v1, s1
+; GFX942-SDAG-NEXT:    s_mov_b32 m0, s8
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-SDAG-NEXT:    v_writelane_b32 v1, s7, m0
+; GFX942-SDAG-NEXT:    v_writelane_b32 v0, s6, m0
+; GFX942-SDAG-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX942-SDAG-NEXT:    s_endpgm
+;
 ; GFX1010-SDAG-LABEL: test_writelane_sreg_oldval_i64:
 ; GFX1010-SDAG:       ; %bb.0:
 ; GFX1010-SDAG-NEXT:    s_clause 0x2
@@ -1639,6 +2098,21 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_i64(i64 %oldval, ptr addrs
 ; GFX802-GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GFX802-GISEL-NEXT:    s_endpgm
 ;
+; GFX942-GISEL-LABEL: test_writelane_sreg_oldval_i64:
+; GFX942-GISEL:       ; %bb.0:
+; GFX942-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX942-GISEL-NEXT:    s_load_dword s8, s[4:5], 0x18
+; GFX942-GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x10
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX942-GISEL-NEXT:    s_mov_b32 m0, s8
+; GFX942-GISEL-NEXT:    v_writelane_b32 v0, s6, m0
+; GFX942-GISEL-NEXT:    v_writelane_b32 v1, s7, m0
+; GFX942-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX942-GISEL-NEXT:    s_endpgm
+;
 ; GFX1010-GISEL-LABEL: test_writelane_sreg_oldval_i64:
 ; GFX1010-GISEL:       ; %bb.0:
 ; GFX1010-GISEL-NEXT:    s_clause 0x2
@@ -1693,6 +2167,21 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_f64(double %oldval, ptr ad
 ; GFX802-SDAG-NEXT:    flat_store_dwordx2 v[0:1], v[2:3]
 ; GFX802-SDAG-NEXT:    s_endpgm
 ;
+; GFX942-SDAG-LABEL: test_writelane_sreg_oldval_f64:
+; GFX942-SDAG:       ; %bb.0:
+; GFX942-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX942-SDAG-NEXT:    s_load_dword s8, s[4:5], 0x18
+; GFX942-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x10
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v1, s1
+; GFX942-SDAG-NEXT:    s_mov_b32 m0, s8
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-SDAG-NEXT:    v_writelane_b32 v1, s7, m0
+; GFX942-SDAG-NEXT:    v_writelane_b32 v0, s6, m0
+; GFX942-SDAG-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX942-SDAG-NEXT:    s_endpgm
+;
 ; GFX1010-SDAG-LABEL: test_writelane_sreg_oldval_f64:
 ; GFX1010-SDAG:       ; %bb.0:
 ; GFX1010-SDAG-NEXT:    s_clause 0x2
@@ -1742,6 +2231,21 @@ define amdgpu_kernel void @test_writelane_sreg_oldval_f64(double %oldval, ptr ad
 ; GFX802-GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GFX802-GISEL-NEXT:    s_endpgm
 ;
+; GFX942-GISEL-LABEL: test_writelane_sreg_oldval_f64:
+; GFX942-GISEL:       ; %bb.0:
+; GFX942-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX942-GISEL-NEXT:    s_load_dword s8, s[4:5], 0x18
+; GFX942-GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x10
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX942-GISEL-NEXT:    s_mov_b32 m0, s8
+; GFX942-GISEL-NEXT:    v_writelane_b32 v0, s6, m0
+; GFX942-GISEL-NEXT:    v_writelane_b32 v1, s7, m0
+; GFX942-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[2:3]
+; GFX942-GISEL-NEXT:    s_endpgm
+;
 ; GFX1010-GISEL-LABEL: test_writelane_sreg_oldval_f64:
 ; GFX1010-GISEL:       ; %bb.0:
 ; GFX1010-GISEL-NEXT:    s_clause 0x2
@@ -1792,6 +2296,17 @@ define amdgpu_kernel void @test_writelane_imm_oldval_i32(ptr addrspace(1) %out,
 ; GFX802-SDAG-NEXT:    flat_store_dword v[0:1], v2
 ; GFX802-SDAG-NEXT:    s_endpgm
 ;
+; GFX942-SDAG-LABEL: test_writelane_imm_oldval_i32:
+; GFX942-SDAG:       ; %bb.0:
+; GFX942-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v1, 42
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    s_mov_b32 m0, s3
+; GFX942-SDAG-NEXT:    v_writelane_b32 v1, s2, m0
+; GFX942-SDAG-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX942-SDAG-NEXT:    s_endpgm
+;
 ; GFX1010-SDAG-LABEL: test_writelane_imm_oldval_i32:
 ; GFX1010-SDAG:       ; %bb.0:
 ; GFX1010-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
@@ -1827,6 +2342,17 @@ define amdgpu_kernel void @test_writelane_imm_oldval_i32(ptr addrspace(1) %out,
 ; GFX802-GISEL-NEXT:    flat_store_dword v[0:1], v2
 ; GFX802-GISEL-NEXT:    s_endpgm
 ;
+; GFX942-GISEL-LABEL: test_writelane_imm_oldval_i32:
+; GFX942-GISEL:       ; %bb.0:
+; GFX942-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v0, 42
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT:    s_mov_b32 m0, s3
+; GFX942-GISEL-NEXT:    v_writelane_b32 v0, s2, m0
+; GFX942-GISEL-NEXT:    global_store_dword v1, v0, s[0:1]
+; GFX942-GISEL-NEXT:    s_endpgm
+;
 ; GFX1010-GISEL-LABEL: test_writelane_imm_oldval_i32:
 ; GFX1010-GISEL:       ; %bb.0:
 ; GFX1010-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[8:9], 0x0
@@ -1870,6 +2396,20 @@ define amdgpu_kernel void @test_writelane_imm_oldval_i64(ptr addrspace(1) %out,
 ; GFX802-SDAG-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GFX802-SDAG-NEXT:    s_endpgm
 ;
+; GFX942-SDAG-LABEL: test_writelane_imm_oldval_i64:
+; GFX942-SDAG:       ; %bb.0:
+; GFX942-SDAG-NEXT:    s_load_dword s6, s[4:5], 0x10
+; GFX942-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v0, 42
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    s_mov_b32 m0, s6
+; GFX942-SDAG-NEXT:    v_writelane_b32 v1, s3, m0
+; GFX942-SDAG-NEXT:    v_writelane_b32 v0, s2, m0
+; GFX942-SDAG-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX942-SDAG-NEXT:    s_endpgm
+;
 ; GFX1010-SDAG-LABEL: test_writelane_imm_oldval_i64:
 ; GFX1010-SDAG:       ; %bb.0:
 ; GFX1010-SDAG-NEXT:    s_clause 0x1
@@ -1916,6 +2456,20 @@ define amdgpu_kernel void @test_writelane_imm_oldval_i64(ptr addrspace(1) %out,
 ; GFX802-GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GFX802-GISEL-NEXT:    s_endpgm
 ;
+; GFX942-GISEL-LABEL: test_writelane_imm_oldval_i64:
+; GFX942-GISEL:       ; %bb.0:
+; GFX942-GISEL-NEXT:    s_load_dword s6, s[4:5], 0x10
+; GFX942-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v0, 42
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT:    s_mov_b32 m0, s6
+; GFX942-GISEL-NEXT:    v_writelane_b32 v0, s2, m0
+; GFX942-GISEL-NEXT:    v_writelane_b32 v1, s3, m0
+; GFX942-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX942-GISEL-NEXT:    s_endpgm
+;
 ; GFX1010-GISEL-LABEL: test_writelane_imm_oldval_i64:
 ; GFX1010-GISEL:       ; %bb.0:
 ; GFX1010-GISEL-NEXT:    s_clause 0x1
@@ -1967,6 +2521,20 @@ define amdgpu_kernel void @test_writelane_imm_oldval_f64(ptr addrspace(1) %out,
 ; GFX802-SDAG-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GFX802-SDAG-NEXT:    s_endpgm
 ;
+; GFX942-SDAG-LABEL: test_writelane_imm_oldval_f64:
+; GFX942-SDAG:       ; %bb.0:
+; GFX942-SDAG-NEXT:    s_load_dword s6, s[4:5], 0x10
+; GFX942-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v1, 0x40450000
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-SDAG-NEXT:    s_mov_b32 m0, s6
+; GFX942-SDAG-NEXT:    v_writelane_b32 v1, s3, m0
+; GFX942-SDAG-NEXT:    v_writelane_b32 v0, s2, m0
+; GFX942-SDAG-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX942-SDAG-NEXT:    s_endpgm
+;
 ; GFX1010-SDAG-LABEL: test_writelane_imm_oldval_f64:
 ; GFX1010-SDAG:       ; %bb.0:
 ; GFX1010-SDAG-NEXT:    s_clause 0x1
@@ -2013,6 +2581,20 @@ define amdgpu_kernel void @test_writelane_imm_oldval_f64(ptr addrspace(1) %out,
 ; GFX802-GISEL-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; GFX802-GISEL-NEXT:    s_endpgm
 ;
+; GFX942-GISEL-LABEL: test_writelane_imm_oldval_f64:
+; GFX942-GISEL:       ; %bb.0:
+; GFX942-GISEL-NEXT:    s_load_dword s6, s[4:5], 0x10
+; GFX942-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v1, 0x40450000
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-GISEL-NEXT:    s_mov_b32 m0, s6
+; GFX942-GISEL-NEXT:    v_writelane_b32 v0, s2, m0
+; GFX942-GISEL-NEXT:    v_writelane_b32 v1, s3, m0
+; GFX942-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX942-GISEL-NEXT:    s_endpgm
+;
 ; GFX1010-GISEL-LABEL: test_writelane_imm_oldval_f64:
 ; GFX1010-GISEL:       ; %bb.0:
 ; GFX1010-GISEL-NEXT:    s_clause 0x1
@@ -2059,6 +2641,19 @@ define void @test_writelane_half(ptr addrspace(1) %out, half %src, i32 %src1) {
 ; GFX802-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX802-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX942-SDAG-LABEL: test_writelane_half:
+; GFX942-SDAG:       ; %bb.0:
+; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-SDAG-NEXT:    global_load_ushort v4, v[0:1], off
+; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s0, v3
+; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s1, v2
+; GFX942-SDAG-NEXT:    s_mov_b32 m0, s0
+; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-SDAG-NEXT:    v_writelane_b32 v4, s1, m0
+; GFX942-SDAG-NEXT:    global_store_short v[0:1], v4, off
+; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX1010-SDAG-LABEL: test_writelane_half:
 ; GFX1010-SDAG:       ; %bb.0:
 ; GFX1010-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2095,6 +2690,19 @@ define void @test_writelane_half(ptr addrspace(1) %out, half %src, i32 %src1) {
 ; GFX802-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX802-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX942-GISEL-LABEL: test_writelane_half:
+; GFX942-GISEL:       ; %bb.0:
+; GFX942-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-GISEL-NEXT:    global_load_ushort v4, v[0:1], off
+; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s1, v3
+; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s0, v2
+; GFX942-GISEL-NEXT:    s_mov_b32 m0, s1
+; GFX942-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-GISEL-NEXT:    v_writelane_b32 v4, s0, m0
+; GFX942-GISEL-NEXT:    global_store_short v[0:1], v4, off
+; GFX942-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX1010-GISEL-LABEL: test_writelane_half:
 ; GFX1010-GISEL:       ; %bb.0:
 ; GFX1010-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2137,6 +2745,19 @@ define void @test_writelane_float(ptr addrspace(1) %out, float %src, i32 %src1)
 ; GFX802-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX802-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX942-SDAG-LABEL: test_writelane_float:
+; GFX942-SDAG:       ; %bb.0:
+; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-SDAG-NEXT:    global_load_dword v4, v[0:1], off
+; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s0, v3
+; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s1, v2
+; GFX942-SDAG-NEXT:    s_mov_b32 m0, s0
+; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-SDAG-NEXT:    v_writelane_b32 v4, s1, m0
+; GFX942-SDAG-NEXT:    global_store_dword v[0:1], v4, off
+; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX1010-SDAG-LABEL: test_writelane_float:
 ; GFX1010-SDAG:       ; %bb.0:
 ; GFX1010-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2173,6 +2794,19 @@ define void @test_writelane_float(ptr addrspace(1) %out, float %src, i32 %src1)
 ; GFX802-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX802-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX942-GISEL-LABEL: test_writelane_float:
+; GFX942-GISEL:       ; %bb.0:
+; GFX942-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-GISEL-NEXT:    global_load_dword v4, v[0:1], off
+; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s1, v3
+; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s0, v2
+; GFX942-GISEL-NEXT:    s_mov_b32 m0, s1
+; GFX942-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-GISEL-NEXT:    v_writelane_b32 v4, s0, m0
+; GFX942-GISEL-NEXT:    global_store_dword v[0:1], v4, off
+; GFX942-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX1010-GISEL-LABEL: test_writelane_float:
 ; GFX1010-GISEL:       ; %bb.0:
 ; GFX1010-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2215,6 +2849,19 @@ define void @test_writelane_bfloat(ptr addrspace(1) %out, bfloat %src, i32 %src1
 ; GFX802-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX802-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX942-SDAG-LABEL: test_writelane_bfloat:
+; GFX942-SDAG:       ; %bb.0:
+; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-SDAG-NEXT:    global_load_ushort v4, v[0:1], off
+; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s0, v3
+; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s1, v2
+; GFX942-SDAG-NEXT:    s_mov_b32 m0, s0
+; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-SDAG-NEXT:    v_writelane_b32 v4, s1, m0
+; GFX942-SDAG-NEXT:    global_store_short v[0:1], v4, off
+; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX1010-SDAG-LABEL: test_writelane_bfloat:
 ; GFX1010-SDAG:       ; %bb.0:
 ; GFX1010-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2251,6 +2898,19 @@ define void @test_writelane_bfloat(ptr addrspace(1) %out, bfloat %src, i32 %src1
 ; GFX802-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX802-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX942-GISEL-LABEL: test_writelane_bfloat:
+; GFX942-GISEL:       ; %bb.0:
+; GFX942-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-GISEL-NEXT:    global_load_ushort v4, v[0:1], off
+; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s0, v3
+; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s1, v2
+; GFX942-GISEL-NEXT:    s_mov_b32 m0, s0
+; GFX942-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-GISEL-NEXT:    v_writelane_b32 v4, s1, m0
+; GFX942-GISEL-NEXT:    global_store_short v[0:1], v4, off
+; GFX942-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX1010-GISEL-LABEL: test_writelane_bfloat:
 ; GFX1010-GISEL:       ; %bb.0:
 ; GFX1010-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2293,6 +2953,19 @@ define void @test_writelane_i16(ptr addrspace(1) %out, i16 %src, i32 %src1) {
 ; GFX802-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX802-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX942-SDAG-LABEL: test_writelane_i16:
+; GFX942-SDAG:       ; %bb.0:
+; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-SDAG-NEXT:    global_load_ushort v4, v[0:1], off
+; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s0, v3
+; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s1, v2
+; GFX942-SDAG-NEXT:    s_mov_b32 m0, s0
+; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-SDAG-NEXT:    v_writelane_b32 v4, s1, m0
+; GFX942-SDAG-NEXT:    global_store_short v[0:1], v4, off
+; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX1010-SDAG-LABEL: test_writelane_i16:
 ; GFX1010-SDAG:       ; %bb.0:
 ; GFX1010-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2329,6 +3002,19 @@ define void @test_writelane_i16(ptr addrspace(1) %out, i16 %src, i32 %src1) {
 ; GFX802-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX802-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX942-GISEL-LABEL: test_writelane_i16:
+; GFX942-GISEL:       ; %bb.0:
+; GFX942-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-GISEL-NEXT:    global_load_ushort v4, v[0:1], off
+; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s1, v3
+; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s0, v2
+; GFX942-GISEL-NEXT:    s_mov_b32 m0, s1
+; GFX942-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-GISEL-NEXT:    v_writelane_b32 v4, s0, m0
+; GFX942-GISEL-NEXT:    global_store_short v[0:1], v4, off
+; GFX942-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX1010-GISEL-LABEL: test_writelane_i16:
 ; GFX1010-GISEL:       ; %bb.0:
 ; GFX1010-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2371,6 +3057,19 @@ define void @test_writelane_v2f16(ptr addrspace(1) %out, <2 x half> %src, i32 %s
 ; GFX802-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX802-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX942-SDAG-LABEL: test_writelane_v2f16:
+; GFX942-SDAG:       ; %bb.0:
+; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-SDAG-NEXT:    global_load_dword v4, v[0:1], off
+; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s0, v3
+; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s1, v2
+; GFX942-SDAG-NEXT:    s_mov_b32 m0, s0
+; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-SDAG-NEXT:    v_writelane_b32 v4, s1, m0
+; GFX942-SDAG-NEXT:    global_store_dword v[0:1], v4, off
+; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX1010-SDAG-LABEL: test_writelane_v2f16:
 ; GFX1010-SDAG:       ; %bb.0:
 ; GFX1010-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2407,6 +3106,19 @@ define void @test_writelane_v2f16(ptr addrspace(1) %out, <2 x half> %src, i32 %s
 ; GFX802-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX802-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX942-GISEL-LABEL: test_writelane_v2f16:
+; GFX942-GISEL:       ; %bb.0:
+; GFX942-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-GISEL-NEXT:    global_load_dword v4, v[0:1], off
+; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s1, v3
+; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s0, v2
+; GFX942-GISEL-NEXT:    s_mov_b32 m0, s1
+; GFX942-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-GISEL-NEXT:    v_writelane_b32 v4, s0, m0
+; GFX942-GISEL-NEXT:    global_store_dword v[0:1], v4, off
+; GFX942-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX1010-GISEL-LABEL: test_writelane_v2f16:
 ; GFX1010-GISEL:       ; %bb.0:
 ; GFX1010-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2451,6 +3163,21 @@ define void @test_readlane_v2f32(ptr addrspace(1) %out, <2 x float> %src, i32 %s
 ; GFX802-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX802-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX942-SDAG-LABEL: test_readlane_v2f32:
+; GFX942-SDAG:       ; %bb.0:
+; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-SDAG-NEXT:    global_load_dwordx2 v[6:7], v[0:1], off
+; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s0, v4
+; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s1, v3
+; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s2, v2
+; GFX942-SDAG-NEXT:    s_mov_b32 m0, s0
+; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-SDAG-NEXT:    v_writelane_b32 v7, s1, m0
+; GFX942-SDAG-NEXT:    v_writelane_b32 v6, s2, m0
+; GFX942-SDAG-NEXT:    global_store_dwordx2 v[0:1], v[6:7], off
+; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX1010-SDAG-LABEL: test_readlane_v2f32:
 ; GFX1010-SDAG:       ; %bb.0:
 ; GFX1010-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2493,6 +3220,21 @@ define void @test_readlane_v2f32(ptr addrspace(1) %out, <2 x float> %src, i32 %s
 ; GFX802-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX802-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX942-GISEL-LABEL: test_readlane_v2f32:
+; GFX942-GISEL:       ; %bb.0:
+; GFX942-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-GISEL-NEXT:    global_load_dwordx2 v[6:7], v[0:1], off
+; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s1, v4
+; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s0, v2
+; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s2, v3
+; GFX942-GISEL-NEXT:    s_mov_b32 m0, s1
+; GFX942-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-GISEL-NEXT:    v_writelane_b32 v6, s0, m0
+; GFX942-GISEL-NEXT:    v_writelane_b32 v7, s2, m0
+; GFX942-GISEL-NEXT:    global_store_dwordx2 v[0:1], v[6:7], off
+; GFX942-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX1010-GISEL-LABEL: test_readlane_v2f32:
 ; GFX1010-GISEL:       ; %bb.0:
 ; GFX1010-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2556,6 +3298,34 @@ define void @test_writelane_v7i32(ptr addrspace(1) %out, <7 x i32> %src, i32 %sr
 ; GFX802-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX802-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX942-SDAG-LABEL: test_writelane_v7i32:
+; GFX942-SDAG:       ; %bb.0:
+; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-SDAG-NEXT:    global_load_dwordx3 v[14:16], v[0:1], off offset:16
+; GFX942-SDAG-NEXT:    global_load_dwordx4 v[10:13], v[0:1], off
+; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s0, v9
+; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s4, v5
+; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s5, v4
+; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s6, v3
+; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s7, v2
+; GFX942-SDAG-NEXT:    s_mov_b32 m0, s0
+; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s1, v8
+; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s2, v7
+; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s3, v6
+; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(1)
+; GFX942-SDAG-NEXT:    v_writelane_b32 v16, s1, m0
+; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-SDAG-NEXT:    v_writelane_b32 v13, s4, m0
+; GFX942-SDAG-NEXT:    v_writelane_b32 v12, s5, m0
+; GFX942-SDAG-NEXT:    v_writelane_b32 v11, s6, m0
+; GFX942-SDAG-NEXT:    v_writelane_b32 v10, s7, m0
+; GFX942-SDAG-NEXT:    v_writelane_b32 v15, s2, m0
+; GFX942-SDAG-NEXT:    v_writelane_b32 v14, s3, m0
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v[0:1], v[10:13], off
+; GFX942-SDAG-NEXT:    global_store_dwordx3 v[0:1], v[14:16], off offset:16
+; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX1010-SDAG-LABEL: test_writelane_v7i32:
 ; GFX1010-SDAG:       ; %bb.0:
 ; GFX1010-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2642,6 +3412,34 @@ define void @test_writelane_v7i32(ptr addrspace(1) %out, <7 x i32> %src, i32 %sr
 ; GFX802-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX802-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX942-GISEL-LABEL: test_writelane_v7i32:
+; GFX942-GISEL:       ; %bb.0:
+; GFX942-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-GISEL-NEXT:    global_load_dwordx4 v[10:13], v[0:1], off
+; GFX942-GISEL-NEXT:    global_load_dwordx4 v[14:17], v[0:1], off offset:16
+; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s1, v9
+; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s0, v2
+; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s2, v3
+; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s3, v4
+; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s4, v5
+; GFX942-GISEL-NEXT:    s_mov_b32 m0, s1
+; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s5, v6
+; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s6, v7
+; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s7, v8
+; GFX942-GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GFX942-GISEL-NEXT:    v_writelane_b32 v10, s0, m0
+; GFX942-GISEL-NEXT:    v_writelane_b32 v11, s2, m0
+; GFX942-GISEL-NEXT:    v_writelane_b32 v12, s3, m0
+; GFX942-GISEL-NEXT:    v_writelane_b32 v13, s4, m0
+; GFX942-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-GISEL-NEXT:    v_writelane_b32 v14, s5, m0
+; GFX942-GISEL-NEXT:    v_writelane_b32 v15, s6, m0
+; GFX942-GISEL-NEXT:    v_writelane_b32 v16, s7, m0
+; GFX942-GISEL-NEXT:    global_store_dwordx4 v[0:1], v[10:13], off
+; GFX942-GISEL-NEXT:    global_store_dwordx3 v[0:1], v[14:16], off offset:16
+; GFX942-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX1010-GISEL-LABEL: test_writelane_v7i32:
 ; GFX1010-GISEL:       ; %bb.0:
 ; GFX1010-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2722,6 +3520,25 @@ define void @test_writelane_v8i16(ptr addrspace(1) %out, <8 x i16> %src, i32 %sr
 ; GFX802-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX802-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX942-SDAG-LABEL: test_writelane_v8i16:
+; GFX942-SDAG:       ; %bb.0:
+; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-SDAG-NEXT:    global_load_dwordx4 v[8:11], v[0:1], off
+; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s0, v6
+; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s1, v5
+; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s2, v4
+; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s3, v3
+; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s4, v2
+; GFX942-SDAG-NEXT:    s_mov_b32 m0, s0
+; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-SDAG-NEXT:    v_writelane_b32 v11, s1, m0
+; GFX942-SDAG-NEXT:    v_writelane_b32 v10, s2, m0
+; GFX942-SDAG-NEXT:    v_writelane_b32 v9, s3, m0
+; GFX942-SDAG-NEXT:    v_writelane_b32 v8, s4, m0
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v[0:1], v[8:11], off
+; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX1010-SDAG-LABEL: test_writelane_v8i16:
 ; GFX1010-SDAG:       ; %bb.0:
 ; GFX1010-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2775,6 +3592,25 @@ define void @test_writelane_v8i16(ptr addrspace(1) %out, <8 x i16> %src, i32 %sr
 ; GFX802-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX802-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX942-GISEL-LABEL: test_writelane_v8i16:
+; GFX942-GISEL:       ; %bb.0:
+; GFX942-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-GISEL-NEXT:    global_load_dwordx4 v[8:11], v[0:1], off
+; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s1, v6
+; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s0, v2
+; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s2, v3
+; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s3, v4
+; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s4, v5
+; GFX942-GISEL-NEXT:    s_mov_b32 m0, s1
+; GFX942-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-GISEL-NEXT:    v_writelane_b32 v8, s0, m0
+; GFX942-GISEL-NEXT:    v_writelane_b32 v9, s2, m0
+; GFX942-GISEL-NEXT:    v_writelane_b32 v10, s3, m0
+; GFX942-GISEL-NEXT:    v_writelane_b32 v11, s4, m0
+; GFX942-GISEL-NEXT:    global_store_dwordx4 v[0:1], v[8:11], off
+; GFX942-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX1010-GISEL-LABEL: test_writelane_v8i16:
 ; GFX1010-GISEL:       ; %bb.0:
 ; GFX1010-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2834,6 +3670,25 @@ define void @test_writelane_v2i64(ptr addrspace(1) %out, <2 x i64> %src, i32 %sr
 ; GFX802-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX802-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX942-SDAG-LABEL: test_writelane_v2i64:
+; GFX942-SDAG:       ; %bb.0:
+; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-SDAG-NEXT:    global_load_dwordx4 v[8:11], v[0:1], off
+; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s0, v6
+; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s1, v5
+; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s2, v4
+; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s3, v3
+; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s4, v2
+; GFX942-SDAG-NEXT:    s_mov_b32 m0, s0
+; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-SDAG-NEXT:    v_writelane_b32 v11, s1, m0
+; GFX942-SDAG-NEXT:    v_writelane_b32 v10, s2, m0
+; GFX942-SDAG-NEXT:    v_writelane_b32 v9, s3, m0
+; GFX942-SDAG-NEXT:    v_writelane_b32 v8, s4, m0
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v[0:1], v[8:11], off
+; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX1010-SDAG-LABEL: test_writelane_v2i64:
 ; GFX1010-SDAG:       ; %bb.0:
 ; GFX1010-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2887,6 +3742,25 @@ define void @test_writelane_v2i64(ptr addrspace(1) %out, <2 x i64> %src, i32 %sr
 ; GFX802-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX802-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX942-GISEL-LABEL: test_writelane_v2i64:
+; GFX942-GISEL:       ; %bb.0:
+; GFX942-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-GISEL-NEXT:    global_load_dwordx4 v[8:11], v[0:1], off
+; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s1, v6
+; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s0, v2
+; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s2, v3
+; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s3, v4
+; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s4, v5
+; GFX942-GISEL-NEXT:    s_mov_b32 m0, s1
+; GFX942-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-GISEL-NEXT:    v_writelane_b32 v8, s0, m0
+; GFX942-GISEL-NEXT:    v_writelane_b32 v9, s2, m0
+; GFX942-GISEL-NEXT:    v_writelane_b32 v10, s3, m0
+; GFX942-GISEL-NEXT:    v_writelane_b32 v11, s4, m0
+; GFX942-GISEL-NEXT:    global_store_dwordx4 v[0:1], v[8:11], off
+; GFX942-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX1010-GISEL-LABEL: test_writelane_v2i64:
 ; GFX1010-GISEL:       ; %bb.0:
 ; GFX1010-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -2955,6 +3829,31 @@ define void @test_writelane_v3i64(ptr addrspace(1) %out, <3 x i64> %src, i32 %sr
 ; GFX802-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX802-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX942-SDAG-LABEL: test_writelane_v3i64:
+; GFX942-SDAG:       ; %bb.0:
+; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-SDAG-NEXT:    global_load_dwordx2 v[14:15], v[0:1], off offset:16
+; GFX942-SDAG-NEXT:    global_load_dwordx4 v[10:13], v[0:1], off
+; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s0, v8
+; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s3, v5
+; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s4, v4
+; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s5, v3
+; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s6, v2
+; GFX942-SDAG-NEXT:    s_mov_b32 m0, s0
+; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s1, v7
+; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s2, v6
+; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-SDAG-NEXT:    v_writelane_b32 v13, s3, m0
+; GFX942-SDAG-NEXT:    v_writelane_b32 v12, s4, m0
+; GFX942-SDAG-NEXT:    v_writelane_b32 v11, s5, m0
+; GFX942-SDAG-NEXT:    v_writelane_b32 v10, s6, m0
+; GFX942-SDAG-NEXT:    v_writelane_b32 v15, s1, m0
+; GFX942-SDAG-NEXT:    v_writelane_b32 v14, s2, m0
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v[0:1], v[10:13], off
+; GFX942-SDAG-NEXT:    global_store_dwordx2 v[0:1], v[14:15], off offset:16
+; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX1010-SDAG-LABEL: test_writelane_v3i64:
 ; GFX1010-SDAG:       ; %bb.0:
 ; GFX1010-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3037,6 +3936,34 @@ define void @test_writelane_v3i64(ptr addrspace(1) %out, <3 x i64> %src, i32 %sr
 ; GFX802-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX802-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX942-GISEL-LABEL: test_writelane_v3i64:
+; GFX942-GISEL:       ; %bb.0:
+; GFX942-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-GISEL-NEXT:    global_load_dwordx4 v[10:13], v[0:1], off
+; GFX942-GISEL-NEXT:    global_load_dwordx4 v[14:17], v[0:1], off offset:16
+; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s1, v8
+; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s0, v2
+; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s2, v3
+; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s3, v4
+; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s4, v5
+; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s5, v6
+; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s6, v7
+; GFX942-GISEL-NEXT:    s_mov_b32 m0, s1
+; GFX942-GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GFX942-GISEL-NEXT:    v_writelane_b32 v10, s0, m0
+; GFX942-GISEL-NEXT:    v_writelane_b32 v11, s2, m0
+; GFX942-GISEL-NEXT:    v_writelane_b32 v12, s3, m0
+; GFX942-GISEL-NEXT:    v_writelane_b32 v13, s4, m0
+; GFX942-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-GISEL-NEXT:    v_writelane_b32 v14, s5, m0
+; GFX942-GISEL-NEXT:    v_writelane_b32 v15, s6, m0
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v2, v14
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v3, v15
+; GFX942-GISEL-NEXT:    global_store_dwordx4 v[0:1], v[10:13], off
+; GFX942-GISEL-NEXT:    global_store_dwordx2 v[0:1], v[2:3], off offset:16
+; GFX942-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX1010-GISEL-LABEL: test_writelane_v3i64:
 ; GFX1010-GISEL:       ; %bb.0:
 ; GFX1010-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3130,6 +4057,36 @@ define void @test_writelane_v4f64(ptr addrspace(1) %out, <4 x double> %src, i32
 ; GFX802-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX802-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX942-SDAG-LABEL: test_writelane_v4f64:
+; GFX942-SDAG:       ; %bb.0:
+; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-SDAG-NEXT:    global_load_dwordx4 v[12:15], v[0:1], off offset:16
+; GFX942-SDAG-NEXT:    global_load_dwordx4 v[16:19], v[0:1], off
+; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s0, v10
+; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s5, v5
+; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s6, v4
+; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s7, v3
+; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s8, v2
+; GFX942-SDAG-NEXT:    s_mov_b32 m0, s0
+; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s1, v9
+; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s2, v8
+; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s3, v7
+; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s4, v6
+; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(1)
+; GFX942-SDAG-NEXT:    v_writelane_b32 v15, s1, m0
+; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-SDAG-NEXT:    v_writelane_b32 v19, s5, m0
+; GFX942-SDAG-NEXT:    v_writelane_b32 v18, s6, m0
+; GFX942-SDAG-NEXT:    v_writelane_b32 v17, s7, m0
+; GFX942-SDAG-NEXT:    v_writelane_b32 v16, s8, m0
+; GFX942-SDAG-NEXT:    v_writelane_b32 v14, s2, m0
+; GFX942-SDAG-NEXT:    v_writelane_b32 v13, s3, m0
+; GFX942-SDAG-NEXT:    v_writelane_b32 v12, s4, m0
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v[0:1], v[16:19], off
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v[0:1], v[12:15], off offset:16
+; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX1010-SDAG-LABEL: test_writelane_v4f64:
 ; GFX1010-SDAG:       ; %bb.0:
 ; GFX1010-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3221,6 +4178,36 @@ define void @test_writelane_v4f64(ptr addrspace(1) %out, <4 x double> %src, i32
 ; GFX802-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX802-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX942-GISEL-LABEL: test_writelane_v4f64:
+; GFX942-GISEL:       ; %bb.0:
+; GFX942-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-GISEL-NEXT:    global_load_dwordx4 v[12:15], v[0:1], off
+; GFX942-GISEL-NEXT:    global_load_dwordx4 v[16:19], v[0:1], off offset:16
+; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s1, v10
+; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s0, v2
+; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s2, v3
+; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s3, v4
+; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s4, v5
+; GFX942-GISEL-NEXT:    s_mov_b32 m0, s1
+; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s5, v6
+; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s6, v7
+; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s7, v8
+; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s8, v9
+; GFX942-GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GFX942-GISEL-NEXT:    v_writelane_b32 v12, s0, m0
+; GFX942-GISEL-NEXT:    v_writelane_b32 v13, s2, m0
+; GFX942-GISEL-NEXT:    v_writelane_b32 v14, s3, m0
+; GFX942-GISEL-NEXT:    v_writelane_b32 v15, s4, m0
+; GFX942-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-GISEL-NEXT:    v_writelane_b32 v16, s5, m0
+; GFX942-GISEL-NEXT:    v_writelane_b32 v17, s6, m0
+; GFX942-GISEL-NEXT:    v_writelane_b32 v18, s7, m0
+; GFX942-GISEL-NEXT:    v_writelane_b32 v19, s8, m0
+; GFX942-GISEL-NEXT:    global_store_dwordx4 v[0:1], v[12:15], off
+; GFX942-GISEL-NEXT:    global_store_dwordx4 v[0:1], v[16:19], off offset:16
+; GFX942-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX1010-GISEL-LABEL: test_writelane_v4f64:
 ; GFX1010-GISEL:       ; %bb.0:
 ; GFX1010-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3344,6 +4331,56 @@ define void @test_writelane_v8f64(ptr addrspace(1) %out, <8 x double> %src, i32
 ; GFX802-SDAG-NEXT:    s_waitcnt vmcnt(0)
 ; GFX802-SDAG-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX942-SDAG-LABEL: test_writelane_v8f64:
+; GFX942-SDAG:       ; %bb.0:
+; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-SDAG-NEXT:    global_load_dwordx4 v[20:23], v[0:1], off offset:16
+; GFX942-SDAG-NEXT:    global_load_dwordx4 v[24:27], v[0:1], off
+; GFX942-SDAG-NEXT:    global_load_dwordx4 v[28:31], v[0:1], off offset:48
+; GFX942-SDAG-NEXT:    global_load_dwordx4 v[32:35], v[0:1], off offset:32
+; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s0, v18
+; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s13, v13
+; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s14, v12
+; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s15, v11
+; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s16, v10
+; GFX942-SDAG-NEXT:    s_mov_b32 m0, s0
+; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s1, v9
+; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s2, v8
+; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s3, v7
+; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s4, v6
+; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s5, v5
+; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s6, v4
+; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s7, v3
+; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s8, v2
+; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s9, v17
+; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s10, v16
+; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s11, v15
+; GFX942-SDAG-NEXT:    v_readfirstlane_b32 s12, v14
+; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(3)
+; GFX942-SDAG-NEXT:    v_writelane_b32 v23, s1, m0
+; GFX942-SDAG-NEXT:    v_writelane_b32 v22, s2, m0
+; GFX942-SDAG-NEXT:    v_writelane_b32 v21, s3, m0
+; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-SDAG-NEXT:    v_writelane_b32 v35, s13, m0
+; GFX942-SDAG-NEXT:    v_writelane_b32 v34, s14, m0
+; GFX942-SDAG-NEXT:    v_writelane_b32 v33, s15, m0
+; GFX942-SDAG-NEXT:    v_writelane_b32 v32, s16, m0
+; GFX942-SDAG-NEXT:    v_writelane_b32 v20, s4, m0
+; GFX942-SDAG-NEXT:    v_writelane_b32 v27, s5, m0
+; GFX942-SDAG-NEXT:    v_writelane_b32 v26, s6, m0
+; GFX942-SDAG-NEXT:    v_writelane_b32 v25, s7, m0
+; GFX942-SDAG-NEXT:    v_writelane_b32 v24, s8, m0
+; GFX942-SDAG-NEXT:    v_writelane_b32 v31, s9, m0
+; GFX942-SDAG-NEXT:    v_writelane_b32 v30, s10, m0
+; GFX942-SDAG-NEXT:    v_writelane_b32 v29, s11, m0
+; GFX942-SDAG-NEXT:    v_writelane_b32 v28, s12, m0
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v[0:1], v[32:35], off offset:32
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v[0:1], v[28:31], off offset:48
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v[0:1], v[24:27], off
+; GFX942-SDAG-NEXT:    global_store_dwordx4 v[0:1], v[20:23], off offset:16
+; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX1010-SDAG-LABEL: test_writelane_v8f64:
 ; GFX1010-SDAG:       ; %bb.0:
 ; GFX1010-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -3505,6 +4542,58 @@ define void @test_writelane_v8f64(ptr addrspace(1) %out, <8 x double> %src, i32
 ; GFX802-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX802-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX942-GISEL-LABEL: test_writelane_v8f64:
+; GFX942-GISEL:       ; %bb.0:
+; GFX942-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-GISEL-NEXT:    global_load_dwordx4 v[20:23], v[0:1], off
+; GFX942-GISEL-NEXT:    global_load_dwordx4 v[24:27], v[0:1], off offset:16
+; GFX942-GISEL-NEXT:    global_load_dwordx4 v[28:31], v[0:1], off offset:32
+; GFX942-GISEL-NEXT:    global_load_dwordx4 v[32:35], v[0:1], off offset:48
+; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s1, v18
+; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s0, v2
+; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s2, v3
+; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s3, v4
+; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s4, v5
+; GFX942-GISEL-NEXT:    s_mov_b32 m0, s1
+; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s5, v6
+; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s6, v7
+; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s7, v8
+; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s8, v9
+; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s9, v10
+; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s10, v11
+; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s11, v12
+; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s12, v13
+; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s13, v14
+; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s14, v15
+; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s15, v16
+; GFX942-GISEL-NEXT:    v_readfirstlane_b32 s16, v17
+; GFX942-GISEL-NEXT:    s_waitcnt vmcnt(3)
+; GFX942-GISEL-NEXT:    v_writelane_b32 v20, s0, m0
+; GFX942-GISEL-NEXT:    v_writelane_b32 v21, s2, m0
+; GFX942-GISEL-NEXT:    v_writelane_b32 v22, s3, m0
+; GFX942-GISEL-NEXT:    v_writelane_b32 v23, s4, m0
+; GFX942-GISEL-NEXT:    s_waitcnt vmcnt(2)
+; GFX942-GISEL-NEXT:    v_writelane_b32 v24, s5, m0
+; GFX942-GISEL-NEXT:    v_writelane_b32 v25, s6, m0
+; GFX942-GISEL-NEXT:    v_writelane_b32 v26, s7, m0
+; GFX942-GISEL-NEXT:    v_writelane_b32 v27, s8, m0
+; GFX942-GISEL-NEXT:    s_waitcnt vmcnt(1)
+; GFX942-GISEL-NEXT:    v_writelane_b32 v28, s9, m0
+; GFX942-GISEL-NEXT:    v_writelane_b32 v29, s10, m0
+; GFX942-GISEL-NEXT:    v_writelane_b32 v30, s11, m0
+; GFX942-GISEL-NEXT:    v_writelane_b32 v31, s12, m0
+; GFX942-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-GISEL-NEXT:    v_writelane_b32 v32, s13, m0
+; GFX942-GISEL-NEXT:    v_writelane_b32 v33, s14, m0
+; GFX942-GISEL-NEXT:    v_writelane_b32 v34, s15, m0
+; GFX942-GISEL-NEXT:    v_writelane_b32 v35, s16, m0
+; GFX942-GISEL-NEXT:    global_store_dwordx4 v[0:1], v[20:23], off
+; GFX942-GISEL-NEXT:    global_store_dwordx4 v[0:1], v[24:27], off offset:16
+; GFX942-GISEL-NEXT:    global_store_dwordx4 v[0:1], v[28:31], off offset:32
+; GFX942-GISEL-NEXT:    global_store_dwordx4 v[0:1], v[32:35], off offset:48
+; GFX942-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX1010-GISEL-LABEL: test_writelane_v8f64:
 ; GFX1010-GISEL:       ; %bb.0:
 ; GFX1010-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll b/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll
index af914bd4043cf..3c923ef0aa0e2 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll
@@ -1,6 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck -check-prefix=SI %s
 ; RUN: llc -mtriple=amdgcn -mcpu=hawaii < %s | FileCheck -check-prefix=CI %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck -check-prefix=GFX942 %s
 
 define amdgpu_kernel void @round_f64(ptr addrspace(1) %out, double %x) #0 {
 ; SI-LABEL: round_f64:
@@ -60,6 +61,24 @@ define amdgpu_kernel void @round_f64(ptr addrspace(1) %out, double %x) #0 {
 ; CI-NEXT:    s_mov_b32 s5, s1
 ; CI-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; CI-NEXT:    s_endpgm
+;
+; GFX942-LABEL: round_f64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-NEXT:    s_brev_b32 s6, -2
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    v_trunc_f64_e32 v[2:3], s[2:3]
+; GFX942-NEXT:    v_add_f64 v[4:5], s[2:3], -v[2:3]
+; GFX942-NEXT:    v_cmp_ge_f64_e64 s[4:5], |v[4:5]|, 0.5
+; GFX942-NEXT:    s_and_b64 s[4:5], s[4:5], exec
+; GFX942-NEXT:    s_cselect_b32 s2, 0x3ff00000, 0
+; GFX942-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-NEXT:    v_mov_b32_e32 v4, s3
+; GFX942-NEXT:    v_bfi_b32 v1, s6, v1, v4
+; GFX942-NEXT:    v_add_f64 v[2:3], v[2:3], v[0:1]
+; GFX942-NEXT:    global_store_dwordx2 v0, v[2:3], s[0:1]
+; GFX942-NEXT:    s_endpgm
   %result = call double @llvm.round.f64(double %x) #1
   store double %result, ptr addrspace(1) %out
   ret void
@@ -128,6 +147,27 @@ define amdgpu_kernel void @v_round_f64(ptr addrspace(1) %out, ptr addrspace(1) %
 ; CI-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; CI-NEXT:    buffer_store_dwordx2 v[2:3], v[0:1], s[0:3], 0 addr64
 ; CI-NEXT:    s_endpgm
+;
+; GFX942-LABEL: v_round_f64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX942-NEXT:    v_lshlrev_b32_e32 v6, 3, v0
+; GFX942-NEXT:    v_mov_b32_e32 v7, 0x3ff00000
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    global_load_dwordx2 v[0:1], v6, s[2:3]
+; GFX942-NEXT:    s_brev_b32 s2, -2
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    v_trunc_f64_e32 v[2:3], v[0:1]
+; GFX942-NEXT:    v_add_f64 v[4:5], v[0:1], -v[2:3]
+; GFX942-NEXT:    v_cmp_ge_f64_e64 vcc, |v[4:5]|, 0.5
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, 0, v7, vcc
+; GFX942-NEXT:    v_bfi_b32 v1, s2, v0, v1
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    v_add_f64 v[0:1], v[2:3], v[0:1]
+; GFX942-NEXT:    global_store_dwordx2 v6, v[0:1], s[0:1]
+; GFX942-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %gep = getelementptr double, ptr addrspace(1) %in, i32 %tid
   %out.gep = getelementptr double, ptr addrspace(1) %out, i32 %tid
@@ -224,6 +264,34 @@ define amdgpu_kernel void @round_v2f64(ptr addrspace(1) %out, <2 x double> %in)
 ; CI-NEXT:    s_mov_b32 s2, -1
 ; CI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; CI-NEXT:    s_endpgm
+;
+; GFX942-LABEL: round_v2f64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x34
+; GFX942-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX942-NEXT:    s_brev_b32 s4, -2
+; GFX942-NEXT:    v_mov_b32_e32 v4, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    v_trunc_f64_e32 v[0:1], s[2:3]
+; GFX942-NEXT:    v_add_f64 v[2:3], s[2:3], -v[0:1]
+; GFX942-NEXT:    v_mov_b32_e32 v5, s3
+; GFX942-NEXT:    v_cmp_ge_f64_e64 s[2:3], |v[2:3]|, 0.5
+; GFX942-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GFX942-NEXT:    s_cselect_b32 s2, 0x3ff00000, 0
+; GFX942-NEXT:    v_mov_b32_e32 v2, s2
+; GFX942-NEXT:    v_bfi_b32 v5, s4, v2, v5
+; GFX942-NEXT:    v_add_f64 v[2:3], v[0:1], v[4:5]
+; GFX942-NEXT:    v_trunc_f64_e32 v[0:1], s[0:1]
+; GFX942-NEXT:    v_add_f64 v[6:7], s[0:1], -v[0:1]
+; GFX942-NEXT:    v_cmp_ge_f64_e64 s[2:3], |v[6:7]|, 0.5
+; GFX942-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GFX942-NEXT:    s_cselect_b32 s0, 0x3ff00000, 0
+; GFX942-NEXT:    v_mov_b32_e32 v5, s0
+; GFX942-NEXT:    v_mov_b32_e32 v6, s1
+; GFX942-NEXT:    v_bfi_b32 v5, s4, v5, v6
+; GFX942-NEXT:    v_add_f64 v[0:1], v[0:1], v[4:5]
+; GFX942-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7]
+; GFX942-NEXT:    s_endpgm
   %result = call <2 x double> @llvm.round.v2f64(<2 x double> %in) #1
   store <2 x double> %result, ptr addrspace(1) %out
   ret void
@@ -378,6 +446,53 @@ define amdgpu_kernel void @round_v4f64(ptr addrspace(1) %out, <4 x double> %in)
 ; CI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
 ; CI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; CI-NEXT:    s_endpgm
+;
+; GFX942-LABEL: round_v4f64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x44
+; GFX942-NEXT:    s_brev_b32 s6, -2
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    v_trunc_f64_e32 v[2:3], s[10:11]
+; GFX942-NEXT:    v_add_f64 v[4:5], s[10:11], -v[2:3]
+; GFX942-NEXT:    v_cmp_ge_f64_e64 s[2:3], |v[4:5]|, 0.5
+; GFX942-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GFX942-NEXT:    s_cselect_b32 s2, 0x3ff00000, 0
+; GFX942-NEXT:    v_mov_b32_e32 v1, s11
+; GFX942-NEXT:    v_mov_b32_e32 v4, s2
+; GFX942-NEXT:    v_bfi_b32 v1, s6, v4, v1
+; GFX942-NEXT:    v_add_f64 v[4:5], v[2:3], v[0:1]
+; GFX942-NEXT:    v_trunc_f64_e32 v[2:3], s[8:9]
+; GFX942-NEXT:    v_add_f64 v[6:7], s[8:9], -v[2:3]
+; GFX942-NEXT:    v_cmp_ge_f64_e64 s[2:3], |v[6:7]|, 0.5
+; GFX942-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GFX942-NEXT:    s_cselect_b32 s2, 0x3ff00000, 0
+; GFX942-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-NEXT:    v_mov_b32_e32 v6, s9
+; GFX942-NEXT:    v_bfi_b32 v1, s6, v1, v6
+; GFX942-NEXT:    v_trunc_f64_e32 v[6:7], s[14:15]
+; GFX942-NEXT:    v_add_f64 v[8:9], s[14:15], -v[6:7]
+; GFX942-NEXT:    v_cmp_ge_f64_e64 s[2:3], |v[8:9]|, 0.5
+; GFX942-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GFX942-NEXT:    s_cselect_b32 s2, 0x3ff00000, 0
+; GFX942-NEXT:    v_add_f64 v[2:3], v[2:3], v[0:1]
+; GFX942-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-NEXT:    v_mov_b32_e32 v8, s15
+; GFX942-NEXT:    v_bfi_b32 v1, s6, v1, v8
+; GFX942-NEXT:    v_add_f64 v[8:9], v[6:7], v[0:1]
+; GFX942-NEXT:    v_trunc_f64_e32 v[6:7], s[12:13]
+; GFX942-NEXT:    v_add_f64 v[10:11], s[12:13], -v[6:7]
+; GFX942-NEXT:    v_cmp_ge_f64_e64 s[2:3], |v[10:11]|, 0.5
+; GFX942-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GFX942-NEXT:    s_cselect_b32 s2, 0x3ff00000, 0
+; GFX942-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-NEXT:    v_mov_b32_e32 v10, s13
+; GFX942-NEXT:    v_bfi_b32 v1, s6, v1, v10
+; GFX942-NEXT:    v_add_f64 v[6:7], v[6:7], v[0:1]
+; GFX942-NEXT:    global_store_dwordx4 v0, v[6:9], s[0:1] offset:16
+; GFX942-NEXT:    global_store_dwordx4 v0, v[2:5], s[0:1]
+; GFX942-NEXT:    s_endpgm
   %result = call <4 x double> @llvm.round.v4f64(<4 x double> %in) #1
   store <4 x double> %result, ptr addrspace(1) %out
   ret void
@@ -657,6 +772,91 @@ define amdgpu_kernel void @round_v8f64(ptr addrspace(1) %out, <8 x double> %in)
 ; CI-NEXT:    buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16
 ; CI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; CI-NEXT:    s_endpgm
+;
+; GFX942-LABEL: round_v8f64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x64
+; GFX942-NEXT:    s_brev_b32 s6, -2
+; GFX942-NEXT:    v_mov_b32_e32 v8, 0
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    v_trunc_f64_e32 v[0:1], s[10:11]
+; GFX942-NEXT:    v_add_f64 v[2:3], s[10:11], -v[0:1]
+; GFX942-NEXT:    v_cmp_ge_f64_e64 s[2:3], |v[2:3]|, 0.5
+; GFX942-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GFX942-NEXT:    s_cselect_b32 s2, 0x3ff00000, 0
+; GFX942-NEXT:    v_mov_b32_e32 v4, s11
+; GFX942-NEXT:    v_mov_b32_e32 v2, s2
+; GFX942-NEXT:    v_bfi_b32 v9, s6, v2, v4
+; GFX942-NEXT:    v_add_f64 v[2:3], v[0:1], v[8:9]
+; GFX942-NEXT:    v_trunc_f64_e32 v[0:1], s[8:9]
+; GFX942-NEXT:    v_add_f64 v[4:5], s[8:9], -v[0:1]
+; GFX942-NEXT:    v_cmp_ge_f64_e64 s[2:3], |v[4:5]|, 0.5
+; GFX942-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GFX942-NEXT:    s_cselect_b32 s2, 0x3ff00000, 0
+; GFX942-NEXT:    v_mov_b32_e32 v4, s2
+; GFX942-NEXT:    v_mov_b32_e32 v5, s9
+; GFX942-NEXT:    v_bfi_b32 v9, s6, v4, v5
+; GFX942-NEXT:    v_trunc_f64_e32 v[4:5], s[14:15]
+; GFX942-NEXT:    v_add_f64 v[6:7], s[14:15], -v[4:5]
+; GFX942-NEXT:    v_cmp_ge_f64_e64 s[2:3], |v[6:7]|, 0.5
+; GFX942-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GFX942-NEXT:    s_cselect_b32 s2, 0x3ff00000, 0
+; GFX942-NEXT:    v_mov_b32_e32 v6, s2
+; GFX942-NEXT:    v_mov_b32_e32 v7, s15
+; GFX942-NEXT:    v_add_f64 v[0:1], v[0:1], v[8:9]
+; GFX942-NEXT:    v_bfi_b32 v9, s6, v6, v7
+; GFX942-NEXT:    v_add_f64 v[6:7], v[4:5], v[8:9]
+; GFX942-NEXT:    v_trunc_f64_e32 v[4:5], s[12:13]
+; GFX942-NEXT:    v_add_f64 v[10:11], s[12:13], -v[4:5]
+; GFX942-NEXT:    v_cmp_ge_f64_e64 s[2:3], |v[10:11]|, 0.5
+; GFX942-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GFX942-NEXT:    s_cselect_b32 s2, 0x3ff00000, 0
+; GFX942-NEXT:    v_mov_b32_e32 v9, s2
+; GFX942-NEXT:    v_mov_b32_e32 v10, s13
+; GFX942-NEXT:    v_bfi_b32 v9, s6, v9, v10
+; GFX942-NEXT:    v_trunc_f64_e32 v[10:11], s[18:19]
+; GFX942-NEXT:    v_add_f64 v[12:13], s[18:19], -v[10:11]
+; GFX942-NEXT:    v_cmp_ge_f64_e64 s[2:3], |v[12:13]|, 0.5
+; GFX942-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GFX942-NEXT:    s_cselect_b32 s2, 0x3ff00000, 0
+; GFX942-NEXT:    v_add_f64 v[4:5], v[4:5], v[8:9]
+; GFX942-NEXT:    v_mov_b32_e32 v9, s2
+; GFX942-NEXT:    v_mov_b32_e32 v12, s19
+; GFX942-NEXT:    v_bfi_b32 v9, s6, v9, v12
+; GFX942-NEXT:    v_add_f64 v[12:13], v[10:11], v[8:9]
+; GFX942-NEXT:    v_trunc_f64_e32 v[10:11], s[16:17]
+; GFX942-NEXT:    v_add_f64 v[14:15], s[16:17], -v[10:11]
+; GFX942-NEXT:    v_cmp_ge_f64_e64 s[2:3], |v[14:15]|, 0.5
+; GFX942-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GFX942-NEXT:    s_cselect_b32 s2, 0x3ff00000, 0
+; GFX942-NEXT:    v_mov_b32_e32 v9, s2
+; GFX942-NEXT:    v_mov_b32_e32 v14, s17
+; GFX942-NEXT:    v_bfi_b32 v9, s6, v9, v14
+; GFX942-NEXT:    v_trunc_f64_e32 v[14:15], s[22:23]
+; GFX942-NEXT:    v_add_f64 v[16:17], s[22:23], -v[14:15]
+; GFX942-NEXT:    v_cmp_ge_f64_e64 s[2:3], |v[16:17]|, 0.5
+; GFX942-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GFX942-NEXT:    s_cselect_b32 s2, 0x3ff00000, 0
+; GFX942-NEXT:    v_add_f64 v[10:11], v[10:11], v[8:9]
+; GFX942-NEXT:    v_mov_b32_e32 v9, s2
+; GFX942-NEXT:    v_mov_b32_e32 v16, s23
+; GFX942-NEXT:    v_bfi_b32 v9, s6, v9, v16
+; GFX942-NEXT:    v_add_f64 v[16:17], v[14:15], v[8:9]
+; GFX942-NEXT:    v_trunc_f64_e32 v[14:15], s[20:21]
+; GFX942-NEXT:    v_add_f64 v[18:19], s[20:21], -v[14:15]
+; GFX942-NEXT:    v_cmp_ge_f64_e64 s[2:3], |v[18:19]|, 0.5
+; GFX942-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GFX942-NEXT:    s_cselect_b32 s2, 0x3ff00000, 0
+; GFX942-NEXT:    v_mov_b32_e32 v9, s2
+; GFX942-NEXT:    v_mov_b32_e32 v18, s21
+; GFX942-NEXT:    v_bfi_b32 v9, s6, v9, v18
+; GFX942-NEXT:    v_add_f64 v[14:15], v[14:15], v[8:9]
+; GFX942-NEXT:    global_store_dwordx4 v8, v[14:17], s[0:1] offset:48
+; GFX942-NEXT:    global_store_dwordx4 v8, v[10:13], s[0:1] offset:32
+; GFX942-NEXT:    global_store_dwordx4 v8, v[4:7], s[0:1] offset:16
+; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]
+; GFX942-NEXT:    s_endpgm
   %result = call <8 x double> @llvm.round.v8f64(<8 x double> %in) #1
   store <8 x double> %result, ptr addrspace(1) %out
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/lower-lds-with-alias-scope.ll b/llvm/test/CodeGen/AMDGPU/lower-lds-with-alias-scope.ll
index d8d7fc1d7a3bd..4ceed675d41c3 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-lds-with-alias-scope.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-lds-with-alias-scope.ll
@@ -1,31 +1,52 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -O3 < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -O3 < %s | FileCheck -check-prefixes=GCN,GFX9 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -O3 < %s | FileCheck -check-prefixes=GCN,GFX942 %s
 
 @a = internal unnamed_addr addrspace(3) global [64 x i32] poison, align 4
 @b = internal unnamed_addr addrspace(3) global [64 x i32] poison, align 4
 @c = internal unnamed_addr addrspace(3) global [64 x i32] poison, align 4
 
 define amdgpu_kernel void @ds_load_stores_aainfo(ptr addrspace(1) %arg, i32 %i) {
-; GCN-LABEL: ds_load_stores_aainfo:
-; GCN:       ; %bb.0: ; %bb
-; GCN-NEXT:    s_load_dword s0, s[4:5], 0x2c
-; GCN-NEXT:    v_mov_b32_e32 v0, 1
-; GCN-NEXT:    v_mov_b32_e32 v1, 0
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    s_lshl_b32 s0, s0, 2
-; GCN-NEXT:    v_mov_b32_e32 v4, s0
-; GCN-NEXT:    ds_read2_b32 v[2:3], v4 offset1:1
-; GCN-NEXT:    ds_write_b64 v1, v[0:1] offset:512
-; GCN-NEXT:    ds_read2_b32 v[4:5], v4 offset0:64 offset1:65
-; GCN-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GCN-NEXT:    ; sched_group_barrier mask(0x00000100) size(1) SyncID(0)
-; GCN-NEXT:    ; sched_group_barrier mask(0x00000200) size(1) SyncID(0)
-; GCN-NEXT:    ; sched_group_barrier mask(0x00000100) size(1) SyncID(0)
-; GCN-NEXT:    s_waitcnt lgkmcnt(0)
-; GCN-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
-; GCN-NEXT:    v_addc_co_u32_e32 v3, vcc, v3, v5, vcc
-; GCN-NEXT:    global_store_dwordx2 v1, v[2:3], s[0:1]
-; GCN-NEXT:    s_endpgm
+; GFX9-LABEL: ds_load_stores_aainfo:
+; GFX9:       ; %bb.0: ; %bb
+; GFX9-NEXT:    s_load_dword s0, s[4:5], 0x2c
+; GFX9-NEXT:    v_mov_b32_e32 v0, 1
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    s_lshl_b32 s0, s0, 2
+; GFX9-NEXT:    v_mov_b32_e32 v4, s0
+; GFX9-NEXT:    ds_read2_b32 v[2:3], v4 offset1:1
+; GFX9-NEXT:    ds_write_b64 v1, v[0:1] offset:512
+; GFX9-NEXT:    ds_read2_b32 v[4:5], v4 offset0:64 offset1:65
+; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX9-NEXT:    ; sched_group_barrier mask(0x00000100) size(1) SyncID(0)
+; GFX9-NEXT:    ; sched_group_barrier mask(0x00000200) size(1) SyncID(0)
+; GFX9-NEXT:    ; sched_group_barrier mask(0x00000100) size(1) SyncID(0)
+; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX9-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v4
+; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, v3, v5, vcc
+; GFX9-NEXT:    global_store_dwordx2 v1, v[2:3], s[0:1]
+; GFX9-NEXT:    s_endpgm
+;
+; GFX942-LABEL: ds_load_stores_aainfo:
+; GFX942:       ; %bb.0: ; %bb
+; GFX942-NEXT:    s_load_dword s0, s[4:5], 0x2c
+; GFX942-NEXT:    v_mov_b32_e32 v0, 1
+; GFX942-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_lshl_b32 s0, s0, 2
+; GFX942-NEXT:    v_mov_b32_e32 v4, s0
+; GFX942-NEXT:    ds_read2_b32 v[2:3], v4 offset1:1
+; GFX942-NEXT:    ds_write_b64 v1, v[0:1] offset:512
+; GFX942-NEXT:    ds_read2_b32 v[4:5], v4 offset0:64 offset1:65
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    ; sched_group_barrier mask(0x00000100) size(1) SyncID(0)
+; GFX942-NEXT:    ; sched_group_barrier mask(0x00000200) size(1) SyncID(0)
+; GFX942-NEXT:    ; sched_group_barrier mask(0x00000100) size(1) SyncID(0)
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    v_lshl_add_u64 v[2:3], v[2:3], 0, v[4:5]
+; GFX942-NEXT:    global_store_dwordx2 v1, v[2:3], s[0:1]
+; GFX942-NEXT:    s_endpgm
 bb:
   %gep.a = getelementptr inbounds [64 x i32], ptr addrspace(3) @a, i32 0, i32 %i
   %gep.b = getelementptr inbounds [64 x i32], ptr addrspace(3) @b, i32 0, i32 %i
@@ -52,3 +73,5 @@ bb:
   !5 = !{!3}
   !6 = !{!7}
   !7 = !{!7, !4}
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GCN: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/lround.ll b/llvm/test/CodeGen/AMDGPU/lround.ll
index 8036e32f90eb0..d2eb839f705c4 100644
--- a/llvm/test/CodeGen/AMDGPU/lround.ll
+++ b/llvm/test/CodeGen/AMDGPU/lround.ll
@@ -1,6 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --extra_scrub --version 5
 ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9-SDAG %s
 ; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck -check-prefixes=GFX9-GISEL %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck -check-prefixes=GFX942-SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck -check-prefixes=GFX942-GISEL %s
 ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10-SDAG %s
 ; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck -check-prefixes=GFX10-GISEL %s
 ; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11-SDAG,GFX11-SDAG-TRUE16 %s
@@ -35,6 +37,34 @@ define i32 @intrinsic_lround_i32_f32(float %arg) {
 ; GFX9-GISEL-NEXT:    v_cvt_i32_f32_e32 v0, v0
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX942-SDAG-LABEL: intrinsic_lround_i32_f32:
+; GFX942-SDAG:       ; %bb.0: ; %entry
+; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-SDAG-NEXT:    v_trunc_f32_e32 v1, v0
+; GFX942-SDAG-NEXT:    v_sub_f32_e32 v2, v0, v1
+; GFX942-SDAG-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v2|, 0.5
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1.0, s[0:1]
+; GFX942-SDAG-NEXT:    s_brev_b32 s0, -2
+; GFX942-SDAG-NEXT:    v_bfi_b32 v0, s0, v2, v0
+; GFX942-SDAG-NEXT:    v_add_f32_e32 v0, v1, v0
+; GFX942-SDAG-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; GFX942-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-GISEL-LABEL: intrinsic_lround_i32_f32:
+; GFX942-GISEL:       ; %bb.0: ; %entry
+; GFX942-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-GISEL-NEXT:    v_trunc_f32_e32 v1, v0
+; GFX942-GISEL-NEXT:    v_sub_f32_e32 v2, v0, v1
+; GFX942-GISEL-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v2|, 0.5
+; GFX942-GISEL-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 1.0, s[0:1]
+; GFX942-GISEL-NEXT:    v_and_or_b32 v0, v0, v3, v2
+; GFX942-GISEL-NEXT:    v_add_f32_e32 v0, v1, v0
+; GFX942-GISEL-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; GFX942-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX10-SDAG-LABEL: intrinsic_lround_i32_f32:
 ; GFX10-SDAG:       ; %bb.0: ; %entry
 ; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -124,6 +154,37 @@ define i32 @intrinsic_lround_i32_f64(double %arg) {
 ; GFX9-GISEL-NEXT:    v_cvt_i32_f64_e32 v0, v[0:1]
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX942-SDAG-LABEL: intrinsic_lround_i32_f64:
+; GFX942-SDAG:       ; %bb.0: ; %entry
+; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-SDAG-NEXT:    v_trunc_f64_e32 v[2:3], v[0:1]
+; GFX942-SDAG-NEXT:    v_add_f64 v[4:5], v[0:1], -v[2:3]
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v0, 0x3ff00000
+; GFX942-SDAG-NEXT:    v_cmp_ge_f64_e64 vcc, |v[4:5]|, 0.5
+; GFX942-SDAG-NEXT:    s_brev_b32 s0, -2
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
+; GFX942-SDAG-NEXT:    v_bfi_b32 v1, s0, v0, v1
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-SDAG-NEXT:    v_add_f64 v[0:1], v[2:3], v[0:1]
+; GFX942-SDAG-NEXT:    v_cvt_i32_f64_e32 v0, v[0:1]
+; GFX942-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-GISEL-LABEL: intrinsic_lround_i32_f64:
+; GFX942-GISEL:       ; %bb.0: ; %entry
+; GFX942-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-GISEL-NEXT:    v_trunc_f64_e32 v[2:3], v[0:1]
+; GFX942-GISEL-NEXT:    v_add_f64 v[4:5], v[0:1], -v[2:3]
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v6, 0x3ff00000
+; GFX942-GISEL-NEXT:    v_cmp_ge_f64_e64 vcc, |v[4:5]|, 0.5
+; GFX942-GISEL-NEXT:    s_brev_b32 s0, 1
+; GFX942-GISEL-NEXT:    v_and_or_b32 v0, v0, 0, 0
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v6, vcc
+; GFX942-GISEL-NEXT:    v_and_or_b32 v1, v1, s0, v4
+; GFX942-GISEL-NEXT:    v_add_f64 v[0:1], v[2:3], v[0:1]
+; GFX942-GISEL-NEXT:    v_cvt_i32_f64_e32 v0, v[0:1]
+; GFX942-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX10-SDAG-LABEL: intrinsic_lround_i32_f64:
 ; GFX10-SDAG:       ; %bb.0: ; %entry
 ; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -237,6 +298,60 @@ define i64 @intrinsic_lround_i64_f32(float %arg) {
 ; GFX9-GISEL-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX942-SDAG-LABEL: intrinsic_lround_i64_f32:
+; GFX942-SDAG:       ; %bb.0: ; %entry
+; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-SDAG-NEXT:    v_trunc_f32_e32 v1, v0
+; GFX942-SDAG-NEXT:    v_sub_f32_e32 v2, v0, v1
+; GFX942-SDAG-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v2|, 0.5
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1.0, s[0:1]
+; GFX942-SDAG-NEXT:    s_brev_b32 s0, -2
+; GFX942-SDAG-NEXT:    v_bfi_b32 v0, s0, v2, v0
+; GFX942-SDAG-NEXT:    v_add_f32_e32 v0, v1, v0
+; GFX942-SDAG-NEXT:    v_trunc_f32_e32 v0, v0
+; GFX942-SDAG-NEXT:    s_mov_b32 s0, 0x2f800000
+; GFX942-SDAG-NEXT:    v_mul_f32_e64 v1, |v0|, s0
+; GFX942-SDAG-NEXT:    v_floor_f32_e32 v1, v1
+; GFX942-SDAG-NEXT:    s_mov_b32 s0, 0xcf800000
+; GFX942-SDAG-NEXT:    v_fma_f32 v2, v1, s0, |v0|
+; GFX942-SDAG-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; GFX942-SDAG-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX942-SDAG-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
+; GFX942-SDAG-NEXT:    v_xor_b32_e32 v0, v2, v3
+; GFX942-SDAG-NEXT:    v_xor_b32_e32 v1, v1, v3
+; GFX942-SDAG-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v3
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX942-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-GISEL-LABEL: intrinsic_lround_i64_f32:
+; GFX942-GISEL:       ; %bb.0: ; %entry
+; GFX942-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-GISEL-NEXT:    v_trunc_f32_e32 v1, v0
+; GFX942-GISEL-NEXT:    v_sub_f32_e32 v2, v0, v1
+; GFX942-GISEL-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v2|, 0.5
+; GFX942-GISEL-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 1.0, s[0:1]
+; GFX942-GISEL-NEXT:    v_and_or_b32 v0, v0, v3, v2
+; GFX942-GISEL-NEXT:    v_add_f32_e32 v0, v1, v0
+; GFX942-GISEL-NEXT:    v_trunc_f32_e32 v1, v0
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v2, 0x2f800000
+; GFX942-GISEL-NEXT:    v_mul_f32_e64 v2, |v1|, v2
+; GFX942-GISEL-NEXT:    v_floor_f32_e32 v2, v2
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v3, 0xcf800000
+; GFX942-GISEL-NEXT:    v_fma_f32 v1, v2, v3, |v1|
+; GFX942-GISEL-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX942-GISEL-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; GFX942-GISEL-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
+; GFX942-GISEL-NEXT:    v_xor_b32_e32 v0, v1, v3
+; GFX942-GISEL-NEXT:    v_xor_b32_e32 v1, v2, v3
+; GFX942-GISEL-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v3
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX942-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX10-SDAG-LABEL: intrinsic_lround_i64_f32:
 ; GFX10-SDAG:       ; %bb.0: ; %entry
 ; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -391,6 +506,50 @@ define i64 @intrinsic_lround_i64_f64(double %arg) {
 ; GFX9-GISEL-NEXT:    v_cvt_i32_f64_e32 v1, v[2:3]
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX942-SDAG-LABEL: intrinsic_lround_i64_f64:
+; GFX942-SDAG:       ; %bb.0: ; %entry
+; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-SDAG-NEXT:    v_trunc_f64_e32 v[2:3], v[0:1]
+; GFX942-SDAG-NEXT:    v_add_f64 v[4:5], v[0:1], -v[2:3]
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v0, 0x3ff00000
+; GFX942-SDAG-NEXT:    v_cmp_ge_f64_e64 vcc, |v[4:5]|, 0.5
+; GFX942-SDAG-NEXT:    s_brev_b32 s0, -2
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
+; GFX942-SDAG-NEXT:    v_bfi_b32 v1, s0, v0, v1
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-SDAG-NEXT:    v_add_f64 v[0:1], v[2:3], v[0:1]
+; GFX942-SDAG-NEXT:    v_trunc_f64_e32 v[0:1], v[0:1]
+; GFX942-SDAG-NEXT:    s_movk_i32 s0, 0xffe0
+; GFX942-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[0:1], s0
+; GFX942-SDAG-NEXT:    v_floor_f64_e32 v[2:3], v[2:3]
+; GFX942-SDAG-NEXT:    v_fmac_f64_e32 v[0:1], 0xc1f00000, v[2:3]
+; GFX942-SDAG-NEXT:    v_cvt_u32_f64_e32 v0, v[0:1]
+; GFX942-SDAG-NEXT:    v_cvt_i32_f64_e32 v1, v[2:3]
+; GFX942-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-GISEL-LABEL: intrinsic_lround_i64_f64:
+; GFX942-GISEL:       ; %bb.0: ; %entry
+; GFX942-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-GISEL-NEXT:    v_trunc_f64_e32 v[2:3], v[0:1]
+; GFX942-GISEL-NEXT:    v_add_f64 v[4:5], v[0:1], -v[2:3]
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v6, 0x3ff00000
+; GFX942-GISEL-NEXT:    v_cmp_ge_f64_e64 vcc, |v[4:5]|, 0.5
+; GFX942-GISEL-NEXT:    s_brev_b32 s0, 1
+; GFX942-GISEL-NEXT:    v_and_or_b32 v0, v0, 0, 0
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v6, vcc
+; GFX942-GISEL-NEXT:    v_and_or_b32 v1, v1, s0, v4
+; GFX942-GISEL-NEXT:    v_add_f64 v[0:1], v[2:3], v[0:1]
+; GFX942-GISEL-NEXT:    v_trunc_f64_e32 v[2:3], v[0:1]
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3df00000
+; GFX942-GISEL-NEXT:    v_mul_f64 v[0:1], v[2:3], v[0:1]
+; GFX942-GISEL-NEXT:    v_floor_f64_e32 v[0:1], v[0:1]
+; GFX942-GISEL-NEXT:    v_fmac_f64_e32 v[2:3], 0xc1f00000, v[0:1]
+; GFX942-GISEL-NEXT:    v_cvt_i32_f64_e32 v1, v[0:1]
+; GFX942-GISEL-NEXT:    v_cvt_u32_f64_e32 v0, v[2:3]
+; GFX942-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX10-SDAG-LABEL: intrinsic_lround_i64_f64:
 ; GFX10-SDAG:       ; %bb.0: ; %entry
 ; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -528,6 +687,60 @@ define i64 @intrinsic_llround_i64_f32(float %arg) {
 ; GFX9-GISEL-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX942-SDAG-LABEL: intrinsic_llround_i64_f32:
+; GFX942-SDAG:       ; %bb.0: ; %entry
+; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-SDAG-NEXT:    v_trunc_f32_e32 v1, v0
+; GFX942-SDAG-NEXT:    v_sub_f32_e32 v2, v0, v1
+; GFX942-SDAG-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v2|, 0.5
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v2, 0, 1.0, s[0:1]
+; GFX942-SDAG-NEXT:    s_brev_b32 s0, -2
+; GFX942-SDAG-NEXT:    v_bfi_b32 v0, s0, v2, v0
+; GFX942-SDAG-NEXT:    v_add_f32_e32 v0, v1, v0
+; GFX942-SDAG-NEXT:    v_trunc_f32_e32 v0, v0
+; GFX942-SDAG-NEXT:    s_mov_b32 s0, 0x2f800000
+; GFX942-SDAG-NEXT:    v_mul_f32_e64 v1, |v0|, s0
+; GFX942-SDAG-NEXT:    v_floor_f32_e32 v1, v1
+; GFX942-SDAG-NEXT:    s_mov_b32 s0, 0xcf800000
+; GFX942-SDAG-NEXT:    v_fma_f32 v2, v1, s0, |v0|
+; GFX942-SDAG-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; GFX942-SDAG-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX942-SDAG-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
+; GFX942-SDAG-NEXT:    v_xor_b32_e32 v0, v2, v3
+; GFX942-SDAG-NEXT:    v_xor_b32_e32 v1, v1, v3
+; GFX942-SDAG-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v3
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX942-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-GISEL-LABEL: intrinsic_llround_i64_f32:
+; GFX942-GISEL:       ; %bb.0: ; %entry
+; GFX942-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-GISEL-NEXT:    v_trunc_f32_e32 v1, v0
+; GFX942-GISEL-NEXT:    v_sub_f32_e32 v2, v0, v1
+; GFX942-GISEL-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v2|, 0.5
+; GFX942-GISEL-NEXT:    v_bfrev_b32_e32 v3, 1
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v2, 0, 1.0, s[0:1]
+; GFX942-GISEL-NEXT:    v_and_or_b32 v0, v0, v3, v2
+; GFX942-GISEL-NEXT:    v_add_f32_e32 v0, v1, v0
+; GFX942-GISEL-NEXT:    v_trunc_f32_e32 v1, v0
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v2, 0x2f800000
+; GFX942-GISEL-NEXT:    v_mul_f32_e64 v2, |v1|, v2
+; GFX942-GISEL-NEXT:    v_floor_f32_e32 v2, v2
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v3, 0xcf800000
+; GFX942-GISEL-NEXT:    v_fma_f32 v1, v2, v3, |v1|
+; GFX942-GISEL-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX942-GISEL-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; GFX942-GISEL-NEXT:    v_ashrrev_i32_e32 v3, 31, v0
+; GFX942-GISEL-NEXT:    v_xor_b32_e32 v0, v1, v3
+; GFX942-GISEL-NEXT:    v_xor_b32_e32 v1, v2, v3
+; GFX942-GISEL-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v3
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX942-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX10-SDAG-LABEL: intrinsic_llround_i64_f32:
 ; GFX10-SDAG:       ; %bb.0: ; %entry
 ; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -682,6 +895,50 @@ define i64 @intrinsic_llround_i64_f64(double %arg) {
 ; GFX9-GISEL-NEXT:    v_cvt_i32_f64_e32 v1, v[2:3]
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX942-SDAG-LABEL: intrinsic_llround_i64_f64:
+; GFX942-SDAG:       ; %bb.0: ; %entry
+; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-SDAG-NEXT:    v_trunc_f64_e32 v[2:3], v[0:1]
+; GFX942-SDAG-NEXT:    v_add_f64 v[4:5], v[0:1], -v[2:3]
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v0, 0x3ff00000
+; GFX942-SDAG-NEXT:    v_cmp_ge_f64_e64 vcc, |v[4:5]|, 0.5
+; GFX942-SDAG-NEXT:    s_brev_b32 s0, -2
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
+; GFX942-SDAG-NEXT:    v_bfi_b32 v1, s0, v0, v1
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-SDAG-NEXT:    v_add_f64 v[0:1], v[2:3], v[0:1]
+; GFX942-SDAG-NEXT:    v_trunc_f64_e32 v[0:1], v[0:1]
+; GFX942-SDAG-NEXT:    s_movk_i32 s0, 0xffe0
+; GFX942-SDAG-NEXT:    v_ldexp_f64 v[2:3], v[0:1], s0
+; GFX942-SDAG-NEXT:    v_floor_f64_e32 v[2:3], v[2:3]
+; GFX942-SDAG-NEXT:    v_fmac_f64_e32 v[0:1], 0xc1f00000, v[2:3]
+; GFX942-SDAG-NEXT:    v_cvt_u32_f64_e32 v0, v[0:1]
+; GFX942-SDAG-NEXT:    v_cvt_i32_f64_e32 v1, v[2:3]
+; GFX942-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-GISEL-LABEL: intrinsic_llround_i64_f64:
+; GFX942-GISEL:       ; %bb.0: ; %entry
+; GFX942-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-GISEL-NEXT:    v_trunc_f64_e32 v[2:3], v[0:1]
+; GFX942-GISEL-NEXT:    v_add_f64 v[4:5], v[0:1], -v[2:3]
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v6, 0x3ff00000
+; GFX942-GISEL-NEXT:    v_cmp_ge_f64_e64 vcc, |v[4:5]|, 0.5
+; GFX942-GISEL-NEXT:    s_brev_b32 s0, 1
+; GFX942-GISEL-NEXT:    v_and_or_b32 v0, v0, 0, 0
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e32 v4, 0, v6, vcc
+; GFX942-GISEL-NEXT:    v_and_or_b32 v1, v1, s0, v4
+; GFX942-GISEL-NEXT:    v_add_f64 v[0:1], v[2:3], v[0:1]
+; GFX942-GISEL-NEXT:    v_trunc_f64_e32 v[2:3], v[0:1]
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3df00000
+; GFX942-GISEL-NEXT:    v_mul_f64 v[0:1], v[2:3], v[0:1]
+; GFX942-GISEL-NEXT:    v_floor_f64_e32 v[0:1], v[0:1]
+; GFX942-GISEL-NEXT:    v_fmac_f64_e32 v[2:3], 0xc1f00000, v[0:1]
+; GFX942-GISEL-NEXT:    v_cvt_i32_f64_e32 v1, v[0:1]
+; GFX942-GISEL-NEXT:    v_cvt_u32_f64_e32 v0, v[2:3]
+; GFX942-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX10-SDAG-LABEL: intrinsic_llround_i64_f64:
 ; GFX10-SDAG:       ; %bb.0: ; %entry
 ; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -795,6 +1052,34 @@ define half @intrinsic_fround_half(half %arg) {
 ; GFX9-GISEL-NEXT:    v_add_f16_e32 v0, v1, v0
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX942-SDAG-LABEL: intrinsic_fround_half:
+; GFX942-SDAG:       ; %bb.0: ; %entry
+; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-SDAG-NEXT:    v_trunc_f16_e32 v1, v0
+; GFX942-SDAG-NEXT:    v_sub_f16_e32 v2, v0, v1
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v3, 0x3c00
+; GFX942-SDAG-NEXT:    v_cmp_ge_f16_e64 vcc, |v2|, 0.5
+; GFX942-SDAG-NEXT:    s_movk_i32 s0, 0x7fff
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e32 v2, 0, v3, vcc
+; GFX942-SDAG-NEXT:    v_bfi_b32 v0, s0, v2, v0
+; GFX942-SDAG-NEXT:    v_add_f16_e32 v0, v1, v0
+; GFX942-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-GISEL-LABEL: intrinsic_fround_half:
+; GFX942-GISEL:       ; %bb.0: ; %entry
+; GFX942-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-GISEL-NEXT:    v_trunc_f16_e32 v1, v0
+; GFX942-GISEL-NEXT:    v_sub_f16_e32 v2, v0, v1
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3c00
+; GFX942-GISEL-NEXT:    v_cmp_ge_f16_e64 vcc, |v2|, 0.5
+; GFX942-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff8000, v0
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v3, vcc
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v0, v2, v0
+; GFX942-GISEL-NEXT:    v_add_f16_e32 v0, v1, v0
+; GFX942-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX10-SDAG-LABEL: intrinsic_fround_half:
 ; GFX10-SDAG:       ; %bb.0: ; %entry
 ; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -914,6 +1199,38 @@ define i32 @intrinsic_lround_i32_f16(half %arg) {
 ; GFX9-GISEL-NEXT:    v_cvt_i32_f32_e32 v0, v0
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX942-SDAG-LABEL: intrinsic_lround_i32_f16:
+; GFX942-SDAG:       ; %bb.0: ; %entry
+; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-SDAG-NEXT:    v_trunc_f16_e32 v1, v0
+; GFX942-SDAG-NEXT:    v_sub_f16_e32 v2, v0, v1
+; GFX942-SDAG-NEXT:    v_mov_b32_e32 v3, 0x3c00
+; GFX942-SDAG-NEXT:    v_cmp_ge_f16_e64 vcc, |v2|, 0.5
+; GFX942-SDAG-NEXT:    s_movk_i32 s0, 0x7fff
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e32 v2, 0, v3, vcc
+; GFX942-SDAG-NEXT:    v_bfi_b32 v0, s0, v2, v0
+; GFX942-SDAG-NEXT:    v_add_f16_e32 v0, v1, v0
+; GFX942-SDAG-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX942-SDAG-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; GFX942-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-GISEL-LABEL: intrinsic_lround_i32_f16:
+; GFX942-GISEL:       ; %bb.0: ; %entry
+; GFX942-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-GISEL-NEXT:    v_trunc_f16_e32 v1, v0
+; GFX942-GISEL-NEXT:    v_sub_f16_e32 v2, v0, v1
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v3, 0x3c00
+; GFX942-GISEL-NEXT:    v_cmp_ge_f16_e64 vcc, |v2|, 0.5
+; GFX942-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff8000, v0
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e32 v2, 0, v3, vcc
+; GFX942-GISEL-NEXT:    v_or_b32_e32 v0, v2, v0
+; GFX942-GISEL-NEXT:    v_add_f16_e32 v0, v1, v0
+; GFX942-GISEL-NEXT:    v_cvt_f32_f16_e32 v0, v0
+; GFX942-GISEL-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; GFX942-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX10-SDAG-LABEL: intrinsic_lround_i32_f16:
 ; GFX10-SDAG:       ; %bb.0: ; %entry
 ; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1059,6 +1376,50 @@ define <2 x i32> @intrinsic_lround_v2i32_v2f32(<2 x float> %arg) {
 ; GFX9-GISEL-NEXT:    v_cvt_i32_f32_e32 v1, v1
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX942-SDAG-LABEL: intrinsic_lround_v2i32_v2f32:
+; GFX942-SDAG:       ; %bb.0: ; %entry
+; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-SDAG-NEXT:    v_trunc_f32_e32 v2, v0
+; GFX942-SDAG-NEXT:    v_sub_f32_e32 v3, v0, v2
+; GFX942-SDAG-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v3|, 0.5
+; GFX942-SDAG-NEXT:    s_brev_b32 s2, -2
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v3, 0, 1.0, s[0:1]
+; GFX942-SDAG-NEXT:    v_bfi_b32 v0, s2, v3, v0
+; GFX942-SDAG-NEXT:    v_add_f32_e32 v0, v2, v0
+; GFX942-SDAG-NEXT:    v_trunc_f32_e32 v2, v1
+; GFX942-SDAG-NEXT:    v_sub_f32_e32 v3, v1, v2
+; GFX942-SDAG-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v3|, 0.5
+; GFX942-SDAG-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; GFX942-SDAG-NEXT:    s_nop 0
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v3, 0, 1.0, s[0:1]
+; GFX942-SDAG-NEXT:    v_bfi_b32 v1, s2, v3, v1
+; GFX942-SDAG-NEXT:    v_add_f32_e32 v1, v2, v1
+; GFX942-SDAG-NEXT:    v_cvt_i32_f32_e32 v1, v1
+; GFX942-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-GISEL-LABEL: intrinsic_lround_v2i32_v2f32:
+; GFX942-GISEL:       ; %bb.0: ; %entry
+; GFX942-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-GISEL-NEXT:    v_trunc_f32_e32 v2, v0
+; GFX942-GISEL-NEXT:    v_sub_f32_e32 v3, v0, v2
+; GFX942-GISEL-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v3|, 0.5
+; GFX942-GISEL-NEXT:    v_bfrev_b32_e32 v4, 1
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v3, 0, 1.0, s[0:1]
+; GFX942-GISEL-NEXT:    v_and_or_b32 v0, v0, v4, v3
+; GFX942-GISEL-NEXT:    v_add_f32_e32 v0, v2, v0
+; GFX942-GISEL-NEXT:    v_trunc_f32_e32 v2, v1
+; GFX942-GISEL-NEXT:    v_sub_f32_e32 v3, v1, v2
+; GFX942-GISEL-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v3|, 0.5
+; GFX942-GISEL-NEXT:    v_cvt_i32_f32_e32 v0, v0
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v3, 0, 1.0, s[0:1]
+; GFX942-GISEL-NEXT:    v_and_or_b32 v1, v1, v4, v3
+; GFX942-GISEL-NEXT:    v_add_f32_e32 v1, v2, v1
+; GFX942-GISEL-NEXT:    v_cvt_i32_f32_e32 v1, v1
+; GFX942-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX10-SDAG-LABEL: intrinsic_lround_v2i32_v2f32:
 ; GFX10-SDAG:       ; %bb.0: ; %entry
 ; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1230,6 +1591,93 @@ define <2 x i64> @intrinsic_lround_v2i64_v2f32(<2 x float> %arg) {
 ; GFX9-GISEL-NEXT:    v_subb_co_u32_e32 v3, vcc, v3, v4, vcc
 ; GFX9-GISEL-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX942-SDAG-LABEL: intrinsic_lround_v2i64_v2f32:
+; GFX942-SDAG:       ; %bb.0: ; %entry
+; GFX942-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-SDAG-NEXT:    v_trunc_f32_e32 v2, v0
+; GFX942-SDAG-NEXT:    v_sub_f32_e32 v3, v0, v2
+; GFX942-SDAG-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v3|, 0.5
+; GFX942-SDAG-NEXT:    s_brev_b32 s2, -2
+; GFX942-SDAG-NEXT:    s_mov_b32 s3, 0x2f800000
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v3, 0, 1.0, s[0:1]
+; GFX942-SDAG-NEXT:    v_bfi_b32 v0, s2, v3, v0
+; GFX942-SDAG-NEXT:    v_add_f32_e32 v0, v2, v0
+; GFX942-SDAG-NEXT:    v_trunc_f32_e32 v0, v0
+; GFX942-SDAG-NEXT:    v_mul_f32_e64 v2, |v0|, s3
+; GFX942-SDAG-NEXT:    v_floor_f32_e32 v2, v2
+; GFX942-SDAG-NEXT:    s_mov_b32 s4, 0xcf800000
+; GFX942-SDAG-NEXT:    v_fma_f32 v3, v2, s4, |v0|
+; GFX942-SDAG-NEXT:    v_cvt_u32_f32_e32 v3, v3
+; GFX942-SDAG-NEXT:    v_ashrrev_i32_e32 v4, 31, v0
+; GFX942-SDAG-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; GFX942-SDAG-NEXT:    v_xor_b32_e32 v0, v3, v4
+; GFX942-SDAG-NEXT:    v_trunc_f32_e32 v3, v1
+; GFX942-SDAG-NEXT:    v_sub_f32_e32 v5, v1, v3
+; GFX942-SDAG-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v5|, 0.5
+; GFX942-SDAG-NEXT:    v_xor_b32_e32 v2, v2, v4
+; GFX942-SDAG-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v4
+; GFX942-SDAG-NEXT:    v_cndmask_b32_e64 v5, 0, 1.0, s[0:1]
+; GFX942-SDAG-NEXT:    v_bfi_b32 v1, s2, v5, v1
+; GFX942-SDAG-NEXT:    v_add_f32_e32 v1, v3, v1
+; GFX942-SDAG-NEXT:    v_trunc_f32_e32 v3, v1
+; GFX942-SDAG-NEXT:    v_mul_f32_e64 v1, |v3|, s3
+; GFX942-SDAG-NEXT:    v_floor_f32_e32 v1, v1
+; GFX942-SDAG-NEXT:    v_fma_f32 v5, v1, s4, |v3|
+; GFX942-SDAG-NEXT:    v_cvt_u32_f32_e32 v5, v5
+; GFX942-SDAG-NEXT:    v_cvt_u32_f32_e32 v6, v1
+; GFX942-SDAG-NEXT:    v_ashrrev_i32_e32 v3, 31, v3
+; GFX942-SDAG-NEXT:    v_subb_co_u32_e32 v1, vcc, v2, v4, vcc
+; GFX942-SDAG-NEXT:    v_xor_b32_e32 v2, v5, v3
+; GFX942-SDAG-NEXT:    v_xor_b32_e32 v4, v6, v3
+; GFX942-SDAG-NEXT:    v_sub_co_u32_e32 v2, vcc, v2, v3
+; GFX942-SDAG-NEXT:    s_nop 1
+; GFX942-SDAG-NEXT:    v_subb_co_u32_e32 v3, vcc, v4, v3, vcc
+; GFX942-SDAG-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-GISEL-LABEL: intrinsic_lround_v2i64_v2f32:
+; GFX942-GISEL:       ; %bb.0: ; %entry
+; GFX942-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-GISEL-NEXT:    v_trunc_f32_e32 v2, v0
+; GFX942-GISEL-NEXT:    v_sub_f32_e32 v3, v0, v2
+; GFX942-GISEL-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v3|, 0.5
+; GFX942-GISEL-NEXT:    v_bfrev_b32_e32 v4, 1
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v6, 0xcf800000
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v3, 0, 1.0, s[0:1]
+; GFX942-GISEL-NEXT:    v_and_or_b32 v0, v0, v4, v3
+; GFX942-GISEL-NEXT:    v_add_f32_e32 v0, v2, v0
+; GFX942-GISEL-NEXT:    v_trunc_f32_e32 v2, v0
+; GFX942-GISEL-NEXT:    v_mov_b32_e32 v3, 0x2f800000
+; GFX942-GISEL-NEXT:    v_mul_f32_e64 v5, |v2|, v3
+; GFX942-GISEL-NEXT:    v_floor_f32_e32 v5, v5
+; GFX942-GISEL-NEXT:    v_fma_f32 v2, v5, v6, |v2|
+; GFX942-GISEL-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; GFX942-GISEL-NEXT:    v_cvt_u32_f32_e32 v5, v5
+; GFX942-GISEL-NEXT:    v_ashrrev_i32_e32 v7, 31, v0
+; GFX942-GISEL-NEXT:    v_xor_b32_e32 v0, v2, v7
+; GFX942-GISEL-NEXT:    v_xor_b32_e32 v2, v5, v7
+; GFX942-GISEL-NEXT:    v_trunc_f32_e32 v5, v1
+; GFX942-GISEL-NEXT:    v_sub_f32_e32 v8, v1, v5
+; GFX942-GISEL-NEXT:    v_cmp_ge_f32_e64 s[0:1], |v8|, 0.5
+; GFX942-GISEL-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v7
+; GFX942-GISEL-NEXT:    s_nop 0
+; GFX942-GISEL-NEXT:    v_cndmask_b32_e64 v8, 0, 1.0, s[0:1]
+; GFX942-GISEL-NEXT:    v_and_or_b32 v1, v1, v4, v8
+; GFX942-GISEL-NEXT:    v_add_f32_e32 v4, v5, v1
+; GFX942-GISEL-NEXT:    v_trunc_f32_e32 v1, v4
+; GFX942-GISEL-NEXT:    v_mul_f32_e64 v3, |v1|, v3
+; GFX942-GISEL-NEXT:    v_floor_f32_e32 v3, v3
+; GFX942-GISEL-NEXT:    v_fma_f32 v1, v3, v6, |v1|
+; GFX942-GISEL-NEXT:    v_cvt_u32_f32_e32 v5, v1
+; GFX942-GISEL-NEXT:    v_cvt_u32_f32_e32 v3, v3
+; GFX942-GISEL-NEXT:    v_ashrrev_i32_e32 v4, 31, v4
+; GFX942-GISEL-NEXT:    v_subb_co_u32_e32 v1, vcc, v2, v7, vcc
+; GFX942-GISEL-NEXT:    v_xor_b32_e32 v2, v5, v4
+; GFX942-GISEL-NEXT:    v_xor_b32_e32 v3, v3, v4
+; GFX942-GISEL-NEXT:    v_sub_co_u32_e32 v2, vcc, v2, v4
+; GFX942-GISEL-NEXT:    s_nop 1
+; GFX942-GISEL-NEXT:    v_subb_co_u32_e32 v3, vcc, v3, v4, vcc
+; GFX942-GISEL-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX10-SDAG-LABEL: intrinsic_lround_v2i64_v2f32:
 ; GFX10-SDAG:       ; %bb.0: ; %entry
 ; GFX10-SDAG-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/module-lds-false-sharing.ll b/llvm/test/CodeGen/AMDGPU/module-lds-false-sharing.ll
index 05ff5c8bb0b3a..c11456793e1b6 100644
--- a/llvm/test/CodeGen/AMDGPU/module-lds-false-sharing.ll
+++ b/llvm/test/CodeGen/AMDGPU/module-lds-false-sharing.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s --amdgpu-lower-module-lds-strategy=module | FileCheck -enable-var-scope -check-prefixes=CHECK,GFX9 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 < %s --amdgpu-lower-module-lds-strategy=module | FileCheck -enable-var-scope -check-prefixes=CHECK,GFX942 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 < %s --amdgpu-lower-module-lds-strategy=module | FileCheck -enable-var-scope -check-prefixes=CHECK,GFX10 %s
 ; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx900 < %s --amdgpu-lower-module-lds-strategy=module | FileCheck -enable-var-scope -check-prefixes=CHECK,G_GFX9 %s
 ; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1010 < %s --amdgpu-lower-module-lds-strategy=module | FileCheck -enable-var-scope -check-prefixes=CHECK,G_GFX10 %s
@@ -35,6 +36,16 @@ define void @nonkernel() {
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
+; GFX942-LABEL: nonkernel:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    v_mov_b32_e32 v1, v0
+; GFX942-NEXT:    ds_write_b32 v0, v0 offset:8
+; GFX942-NEXT:    ds_write_b64 v0, v[0:1]
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
 ; GFX10-LABEL: nonkernel:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -107,6 +118,28 @@ define amdgpu_kernel void @withcall() {
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX9-NEXT:    s_endpgm
 ;
+; GFX942-LABEL: withcall:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_mov_b32 s12, s8
+; GFX942-NEXT:    s_add_u32 s8, s4, 36
+; GFX942-NEXT:    s_mov_b32 s13, s9
+; GFX942-NEXT:    s_addc_u32 s9, s5, 0
+; GFX942-NEXT:    s_getpc_b64 s[4:5]
+; GFX942-NEXT:    s_add_u32 s4, s4, nonkernel@gotpcrel32@lo+4
+; GFX942-NEXT:    s_addc_u32 s5, s5, nonkernel@gotpcrel32@hi+12
+; GFX942-NEXT:    s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX942-NEXT:    s_mov_b32 s14, s10
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX942-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX942-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX942-NEXT:    v_mov_b32_e32 v31, v0
+; GFX942-NEXT:    s_mov_b32 s32, 0
+; GFX942-NEXT:    ds_write_b32 v1, v1 offset:8
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_swappc_b64 s[30:31], s[16:17]
+; GFX942-NEXT:    s_endpgm
+;
 ; GFX10-LABEL: withcall:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_mov_b32 s20, SCRATCH_RSRC_DWORD0
diff --git a/llvm/test/CodeGen/AMDGPU/rem_i128.ll b/llvm/test/CodeGen/AMDGPU/rem_i128.ll
index f0c8fed925673..528b6b6dc1238 100644
--- a/llvm/test/CodeGen/AMDGPU/rem_i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/rem_i128.ll
@@ -1,10 +1,14 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
 ; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - %s | FileCheck -check-prefixes=GFX9,GFX9-SDAG %s
 ; RUN: llc -O0 -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - %s | FileCheck -check-prefixes=GFX9-O0,GFX9-SDAG-O0 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -o - %s | FileCheck -check-prefixes=GFX942,GFX942-SDAG %s
+; RUN: llc -O0 -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -o - %s | FileCheck -check-prefixes=GFX942-O0,GFX942-SDAG-O0 %s
 
 ; FIXME: GlobalISel missing the power-of-2 cases in legalization. https://github.com/llvm/llvm-project/issues/80671
-; xUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - %s | FileCheck -check-prefixes=GFX9,GFX9 %s
-; xUN: llc -O0 -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - %s | FileCheck -check-prefixes=GFX9-O0,GFX9-O0 %s}}
+; xUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - %s | FileCheck -check-prefixes=GFX9,GFX9-ISEL %s
+; xUN: llc -O0 -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - %s | FileCheck -check-prefixes=GFX9-O0,GFX9-ISEL-O0 %s
+; xUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -o - %s | FileCheck -check-prefixes=GFX942,GFX942-ISEL %s
+; xUN: llc -O0 -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -o - %s | FileCheck -check-prefixes=GFX942-O0,GFX942-ISEL-O0 %s
 
 define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-LABEL: v_srem_i128_vv:
@@ -1497,6 +1501,1355 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-O0-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: v_srem_i128_vv:
+; GFX942:       ; %bb.0: ; %_udiv-special-cases
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_sub_co_u32_e32 v10, vcc, 0, v0
+; GFX942-NEXT:    v_ashrrev_i32_e32 v8, 31, v3
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_subb_co_u32_e32 v12, vcc, 0, v1, vcc
+; GFX942-NEXT:    v_mov_b32_e32 v11, 0
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_subb_co_u32_e32 v13, vcc, 0, v2, vcc
+; GFX942-NEXT:    s_mov_b64 s[2:3], 0x7f
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_subb_co_u32_e32 v14, vcc, 0, v3, vcc
+; GFX942-NEXT:    v_cmp_gt_i64_e32 vcc, 0, v[2:3]
+; GFX942-NEXT:    v_mov_b32_e32 v9, v8
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_cndmask_b32_e32 v1, v1, v12, vcc
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, v0, v10, vcc
+; GFX942-NEXT:    v_cndmask_b32_e32 v3, v3, v14, vcc
+; GFX942-NEXT:    v_cndmask_b32_e32 v2, v2, v13, vcc
+; GFX942-NEXT:    v_sub_co_u32_e32 v10, vcc, 0, v4
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_subb_co_u32_e32 v12, vcc, 0, v5, vcc
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_subb_co_u32_e32 v13, vcc, 0, v6, vcc
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_subb_co_u32_e32 v14, vcc, 0, v7, vcc
+; GFX942-NEXT:    v_cmp_gt_i64_e32 vcc, 0, v[6:7]
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v5, v5, v12, vcc
+; GFX942-NEXT:    v_cndmask_b32_e32 v4, v4, v10, vcc
+; GFX942-NEXT:    v_cndmask_b32_e32 v7, v7, v14, vcc
+; GFX942-NEXT:    v_cndmask_b32_e32 v6, v6, v13, vcc
+; GFX942-NEXT:    v_or_b32_e32 v13, v5, v7
+; GFX942-NEXT:    v_or_b32_e32 v12, v4, v6
+; GFX942-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[12:13]
+; GFX942-NEXT:    v_or_b32_e32 v13, v1, v3
+; GFX942-NEXT:    v_or_b32_e32 v12, v0, v2
+; GFX942-NEXT:    v_ffbh_u32_e32 v10, v6
+; GFX942-NEXT:    v_cmp_eq_u64_e64 s[0:1], 0, v[12:13]
+; GFX942-NEXT:    v_add_u32_e32 v10, 32, v10
+; GFX942-NEXT:    v_ffbh_u32_e32 v12, v7
+; GFX942-NEXT:    v_min_u32_e32 v14, v10, v12
+; GFX942-NEXT:    v_ffbh_u32_e32 v10, v4
+; GFX942-NEXT:    v_add_u32_e32 v10, 32, v10
+; GFX942-NEXT:    v_ffbh_u32_e32 v12, v5
+; GFX942-NEXT:    v_min_u32_e32 v10, v10, v12
+; GFX942-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT:    v_lshl_add_u64 v[12:13], v[10:11], 0, 64
+; GFX942-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[6:7]
+; GFX942-NEXT:    v_ffbh_u32_e32 v10, v2
+; GFX942-NEXT:    v_add_u32_e32 v10, 32, v10
+; GFX942-NEXT:    v_cndmask_b32_e32 v14, v12, v14, vcc
+; GFX942-NEXT:    v_ffbh_u32_e32 v12, v3
+; GFX942-NEXT:    v_min_u32_e32 v16, v10, v12
+; GFX942-NEXT:    v_ffbh_u32_e32 v10, v0
+; GFX942-NEXT:    v_add_u32_e32 v10, 32, v10
+; GFX942-NEXT:    v_ffbh_u32_e32 v12, v1
+; GFX942-NEXT:    v_min_u32_e32 v10, v10, v12
+; GFX942-NEXT:    v_cndmask_b32_e64 v15, v13, 0, vcc
+; GFX942-NEXT:    v_lshl_add_u64 v[12:13], v[10:11], 0, 64
+; GFX942-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[2:3]
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v12, v12, v16, vcc
+; GFX942-NEXT:    v_cndmask_b32_e64 v10, v13, 0, vcc
+; GFX942-NEXT:    v_sub_co_u32_e32 v14, vcc, v14, v12
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_subb_co_u32_e32 v15, vcc, v15, v10, vcc
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_subb_co_u32_e32 v12, vcc, 0, v11, vcc
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_subb_co_u32_e32 v13, vcc, 0, v11, vcc
+; GFX942-NEXT:    v_cmp_lt_u64_e32 vcc, s[2:3], v[14:15]
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e64 v10, 0, 1, vcc
+; GFX942-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[12:13]
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e64 v11, 0, 1, vcc
+; GFX942-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[12:13]
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v10, v11, v10, vcc
+; GFX942-NEXT:    v_and_b32_e32 v10, 1, v10
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v10
+; GFX942-NEXT:    v_xor_b32_e32 v10, 0x7f, v14
+; GFX942-NEXT:    s_or_b64 s[0:1], s[0:1], vcc
+; GFX942-NEXT:    v_or_b32_e32 v11, v15, v13
+; GFX942-NEXT:    v_or_b32_e32 v10, v10, v12
+; GFX942-NEXT:    s_xor_b64 s[2:3], s[0:1], -1
+; GFX942-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[10:11]
+; GFX942-NEXT:    v_cndmask_b32_e64 v17, v3, 0, s[0:1]
+; GFX942-NEXT:    v_cndmask_b32_e64 v16, v2, 0, s[0:1]
+; GFX942-NEXT:    v_cndmask_b32_e64 v11, v1, 0, s[0:1]
+; GFX942-NEXT:    v_cndmask_b32_e64 v10, v0, 0, s[0:1]
+; GFX942-NEXT:    s_and_b64 s[0:1], s[2:3], vcc
+; GFX942-NEXT:    s_and_saveexec_b64 s[4:5], s[0:1]
+; GFX942-NEXT:    s_cbranch_execz .LBB0_6
+; GFX942-NEXT:  ; %bb.1: ; %udiv-bb1
+; GFX942-NEXT:    v_add_co_u32_e32 v10, vcc, 1, v14
+; GFX942-NEXT:    v_sub_u32_e32 v20, 0x7f, v14
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_addc_co_u32_e32 v11, vcc, 0, v15, vcc
+; GFX942-NEXT:    v_sub_u32_e32 v15, 64, v20
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_addc_co_u32_e32 v12, vcc, 0, v12, vcc
+; GFX942-NEXT:    v_or_b32_e32 v16, v10, v12
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_addc_co_u32_e32 v13, vcc, 0, v13, vcc
+; GFX942-NEXT:    v_or_b32_e32 v17, v11, v13
+; GFX942-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[16:17]
+; GFX942-NEXT:    v_lshlrev_b64 v[16:17], v20, v[2:3]
+; GFX942-NEXT:    v_lshrrev_b64 v[18:19], v15, v[0:1]
+; GFX942-NEXT:    v_sub_u32_e32 v14, 63, v14
+; GFX942-NEXT:    v_or_b32_e32 v17, v17, v19
+; GFX942-NEXT:    v_or_b32_e32 v16, v16, v18
+; GFX942-NEXT:    v_lshlrev_b64 v[14:15], v14, v[0:1]
+; GFX942-NEXT:    v_cmp_gt_u32_e64 s[0:1], 64, v20
+; GFX942-NEXT:    v_cmp_eq_u32_e64 s[2:3], 0, v20
+; GFX942-NEXT:    v_mov_b64_e32 v[18:19], 0
+; GFX942-NEXT:    v_cndmask_b32_e64 v15, v15, v17, s[0:1]
+; GFX942-NEXT:    v_cndmask_b32_e64 v14, v14, v16, s[0:1]
+; GFX942-NEXT:    v_lshlrev_b64 v[16:17], v20, v[0:1]
+; GFX942-NEXT:    v_cndmask_b32_e64 v15, v15, v3, s[2:3]
+; GFX942-NEXT:    v_cndmask_b32_e64 v14, v14, v2, s[2:3]
+; GFX942-NEXT:    v_cndmask_b32_e64 v17, 0, v17, s[0:1]
+; GFX942-NEXT:    v_cndmask_b32_e64 v16, 0, v16, s[0:1]
+; GFX942-NEXT:    v_mov_b64_e32 v[20:21], 0
+; GFX942-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942-NEXT:    s_xor_b64 s[2:3], exec, s[0:1]
+; GFX942-NEXT:    s_cbranch_execz .LBB0_5
+; GFX942-NEXT:  ; %bb.2: ; %udiv-preheader
+; GFX942-NEXT:    v_sub_u32_e32 v20, 64, v10
+; GFX942-NEXT:    v_lshrrev_b64 v[18:19], v10, v[0:1]
+; GFX942-NEXT:    v_lshlrev_b64 v[20:21], v20, v[2:3]
+; GFX942-NEXT:    v_or_b32_e32 v20, v18, v20
+; GFX942-NEXT:    v_subrev_u32_e32 v18, 64, v10
+; GFX942-NEXT:    v_or_b32_e32 v21, v19, v21
+; GFX942-NEXT:    v_lshrrev_b64 v[18:19], v18, v[2:3]
+; GFX942-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v10
+; GFX942-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v10
+; GFX942-NEXT:    v_mov_b64_e32 v[30:31], 0
+; GFX942-NEXT:    v_cndmask_b32_e32 v19, v19, v21, vcc
+; GFX942-NEXT:    v_cndmask_b32_e32 v18, v18, v20, vcc
+; GFX942-NEXT:    v_cndmask_b32_e64 v27, v19, v1, s[0:1]
+; GFX942-NEXT:    v_cndmask_b32_e64 v26, v18, v0, s[0:1]
+; GFX942-NEXT:    v_lshrrev_b64 v[18:19], v10, v[2:3]
+; GFX942-NEXT:    v_cndmask_b32_e32 v29, 0, v19, vcc
+; GFX942-NEXT:    v_cndmask_b32_e32 v28, 0, v18, vcc
+; GFX942-NEXT:    v_add_co_u32_e32 v22, vcc, -1, v4
+; GFX942-NEXT:    v_mov_b64_e32 v[20:21], 0
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_addc_co_u32_e32 v23, vcc, -1, v5, vcc
+; GFX942-NEXT:    s_mov_b64 s[0:1], 0
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_addc_co_u32_e32 v24, vcc, -1, v6, vcc
+; GFX942-NEXT:    v_mov_b32_e32 v19, 0
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_addc_co_u32_e32 v25, vcc, -1, v7, vcc
+; GFX942-NEXT:  .LBB0_3: ; %udiv-do-while
+; GFX942-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-NEXT:    v_lshrrev_b32_e32 v18, 31, v27
+; GFX942-NEXT:    v_lshlrev_b64 v[26:27], 1, v[26:27]
+; GFX942-NEXT:    v_lshrrev_b32_e32 v34, 31, v15
+; GFX942-NEXT:    v_lshlrev_b64 v[28:29], 1, v[28:29]
+; GFX942-NEXT:    v_or_b32_e32 v26, v26, v34
+; GFX942-NEXT:    v_or_b32_e32 v28, v28, v18
+; GFX942-NEXT:    v_sub_co_u32_e32 v18, vcc, v22, v26
+; GFX942-NEXT:    v_lshlrev_b64 v[32:33], 1, v[16:17]
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_subb_co_u32_e32 v18, vcc, v23, v27, vcc
+; GFX942-NEXT:    v_lshrrev_b32_e32 v16, 31, v17
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_subb_co_u32_e32 v18, vcc, v24, v28, vcc
+; GFX942-NEXT:    v_lshlrev_b64 v[14:15], 1, v[14:15]
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_subb_co_u32_e32 v18, vcc, v25, v29, vcc
+; GFX942-NEXT:    v_or3_b32 v14, v14, v16, v20
+; GFX942-NEXT:    v_or_b32_e32 v16, v30, v32
+; GFX942-NEXT:    v_ashrrev_i32_e32 v30, 31, v18
+; GFX942-NEXT:    v_or_b32_e32 v17, v31, v33
+; GFX942-NEXT:    v_and_b32_e32 v18, 1, v30
+; GFX942-NEXT:    v_and_b32_e32 v31, v30, v7
+; GFX942-NEXT:    v_and_b32_e32 v32, v30, v6
+; GFX942-NEXT:    v_and_b32_e32 v33, v30, v5
+; GFX942-NEXT:    v_and_b32_e32 v30, v30, v4
+; GFX942-NEXT:    v_sub_co_u32_e32 v26, vcc, v26, v30
+; GFX942-NEXT:    v_or3_b32 v15, v15, 0, v21
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_subb_co_u32_e32 v27, vcc, v27, v33, vcc
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_subb_co_u32_e32 v28, vcc, v28, v32, vcc
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_subb_co_u32_e32 v29, vcc, v29, v31, vcc
+; GFX942-NEXT:    v_add_co_u32_e32 v10, vcc, -1, v10
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_addc_co_u32_e32 v11, vcc, -1, v11, vcc
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_addc_co_u32_e32 v12, vcc, -1, v12, vcc
+; GFX942-NEXT:    v_or_b32_e32 v30, v10, v12
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_addc_co_u32_e32 v13, vcc, -1, v13, vcc
+; GFX942-NEXT:    v_or_b32_e32 v31, v11, v13
+; GFX942-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[30:31]
+; GFX942-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT:    v_mov_b64_e32 v[30:31], v[18:19]
+; GFX942-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; GFX942-NEXT:    s_cbranch_execnz .LBB0_3
+; GFX942-NEXT:  ; %bb.4: ; %Flow
+; GFX942-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX942-NEXT:  .LBB0_5: ; %Flow2
+; GFX942-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX942-NEXT:    v_lshlrev_b64 v[10:11], 1, v[16:17]
+; GFX942-NEXT:    v_lshrrev_b32_e32 v16, 31, v17
+; GFX942-NEXT:    v_lshlrev_b64 v[12:13], 1, v[14:15]
+; GFX942-NEXT:    v_or3_b32 v17, v13, 0, v21
+; GFX942-NEXT:    v_or3_b32 v16, v12, v16, v20
+; GFX942-NEXT:    v_or_b32_e32 v11, v19, v11
+; GFX942-NEXT:    v_or_b32_e32 v10, v18, v10
+; GFX942-NEXT:  .LBB0_6: ; %Flow3
+; GFX942-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX942-NEXT:    v_mul_lo_u32 v12, v11, v6
+; GFX942-NEXT:    v_mul_lo_u32 v13, v10, v7
+; GFX942-NEXT:    v_mad_u64_u32 v[6:7], s[0:1], v10, v6, 0
+; GFX942-NEXT:    v_add3_u32 v7, v7, v13, v12
+; GFX942-NEXT:    v_mad_u64_u32 v[6:7], s[0:1], v16, v4, v[6:7]
+; GFX942-NEXT:    v_mul_lo_u32 v12, v16, v5
+; GFX942-NEXT:    v_mul_lo_u32 v13, v17, v4
+; GFX942-NEXT:    v_add3_u32 v7, v13, v7, v12
+; GFX942-NEXT:    v_mad_u64_u32 v[12:13], s[0:1], v4, v10, 0
+; GFX942-NEXT:    v_mov_b32_e32 v15, 0
+; GFX942-NEXT:    v_mov_b32_e32 v14, v13
+; GFX942-NEXT:    v_mad_u64_u32 v[16:17], s[0:1], v5, v10, v[14:15]
+; GFX942-NEXT:    v_mov_b32_e32 v14, v17
+; GFX942-NEXT:    v_mov_b32_e32 v17, v15
+; GFX942-NEXT:    v_mad_u64_u32 v[16:17], s[0:1], v4, v11, v[16:17]
+; GFX942-NEXT:    v_mov_b32_e32 v18, v17
+; GFX942-NEXT:    v_mov_b32_e32 v19, v15
+; GFX942-NEXT:    v_lshl_add_u64 v[14:15], v[14:15], 0, v[18:19]
+; GFX942-NEXT:    v_mad_u64_u32 v[4:5], s[0:1], v5, v11, v[14:15]
+; GFX942-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v12
+; GFX942-NEXT:    v_lshl_add_u64 v[4:5], v[4:5], 0, v[6:7]
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v16, vcc
+; GFX942-NEXT:    v_xor_b32_e32 v0, v0, v8
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_subb_co_u32_e32 v2, vcc, v2, v4, vcc
+; GFX942-NEXT:    v_xor_b32_e32 v1, v1, v9
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_subb_co_u32_e32 v3, vcc, v3, v5, vcc
+; GFX942-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v8
+; GFX942-NEXT:    v_xor_b32_e32 v2, v2, v8
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v9, vcc
+; GFX942-NEXT:    v_xor_b32_e32 v3, v3, v9
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_subb_co_u32_e32 v2, vcc, v2, v8, vcc
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_subb_co_u32_e32 v3, vcc, v3, v9, vcc
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-O0-LABEL: v_srem_i128_vv:
+; GFX942-O0:       ; %bb.0: ; %_udiv-special-cases
+; GFX942-O0-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-O0-NEXT:    s_xor_saveexec_b64 s[0:1], -1
+; GFX942-O0-NEXT:    scratch_store_dword off, v29, s32 offset:216 ; 4-byte Folded Spill
+; GFX942-O0-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX942-O0-NEXT:    v_mov_b32_e32 v18, v6
+; GFX942-O0-NEXT:    v_mov_b32_e32 v8, v4
+; GFX942-O0-NEXT:    v_mov_b32_e32 v12, v2
+; GFX942-O0-NEXT:    v_mov_b32_e32 v2, v1
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0
+; GFX942-O0-NEXT:    ; kill: def $vgpr18 killed $vgpr18 def $vgpr18_vgpr19 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v19, v7
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0
+; GFX942-O0-NEXT:    ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v9, v5
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0
+; GFX942-O0-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v1, v2
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0
+; GFX942-O0-NEXT:    ; kill: def $vgpr12 killed $vgpr12 def $vgpr12_vgpr13 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v13, v3
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0_sgpr1
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0_sgpr1
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0_sgpr1
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0_sgpr1
+; GFX942-O0-NEXT:    s_mov_b32 s0, 63
+; GFX942-O0-NEXT:    v_mov_b64_e32 v[2:3], v[12:13]
+; GFX942-O0-NEXT:    v_ashrrev_i64 v[2:3], s0, v[2:3]
+; GFX942-O0-NEXT:    v_accvgpr_write_b32 a0, v3 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_write_b32 a1, v2 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_write_b32 a2, v3 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_write_b32 a3, v2 ; Reload Reuse
+; GFX942-O0-NEXT:    v_mov_b32_e32 v2, v0
+; GFX942-O0-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-O0-NEXT:    v_mov_b32_e32 v0, v12
+; GFX942-O0-NEXT:    v_mov_b32_e32 v1, v13
+; GFX942-O0-NEXT:    s_mov_b64 s[2:3], 0
+; GFX942-O0-NEXT:    ; implicit-def: $vgpr29 : SGPR spill to VGPR lane
+; GFX942-O0-NEXT:    v_writelane_b32 v29, s2, 0
+; GFX942-O0-NEXT:    s_nop 1
+; GFX942-O0-NEXT:    v_writelane_b32 v29, s3, 1
+; GFX942-O0-NEXT:    s_mov_b32 s6, s2
+; GFX942-O0-NEXT:    v_writelane_b32 v29, s6, 2
+; GFX942-O0-NEXT:    s_mov_b32 s7, s3
+; GFX942-O0-NEXT:    v_writelane_b32 v29, s7, 3
+; GFX942-O0-NEXT:    v_sub_co_u32_e32 v10, vcc, s6, v2
+; GFX942-O0-NEXT:    v_mov_b32_e32 v4, s7
+; GFX942-O0-NEXT:    s_nop 0
+; GFX942-O0-NEXT:    v_subb_co_u32_e32 v5, vcc, v4, v3, vcc
+; GFX942-O0-NEXT:    v_mov_b32_e32 v4, s6
+; GFX942-O0-NEXT:    s_nop 0
+; GFX942-O0-NEXT:    v_subb_co_u32_e32 v4, vcc, v4, v0, vcc
+; GFX942-O0-NEXT:    v_mov_b32_e32 v6, s7
+; GFX942-O0-NEXT:    s_nop 0
+; GFX942-O0-NEXT:    v_subb_co_u32_e32 v6, vcc, v6, v1, vcc
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0
+; GFX942-O0-NEXT:    ; kill: def $vgpr10 killed $vgpr10 def $vgpr10_vgpr11 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v11, v5
+; GFX942-O0-NEXT:    v_mov_b32_e32 v5, v11
+; GFX942-O0-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX942-O0-NEXT:    v_cmp_lt_i64_e64 s[0:1], v[12:13], s[0:1]
+; GFX942-O0-NEXT:    s_nop 1
+; GFX942-O0-NEXT:    v_cndmask_b32_e64 v3, v3, v5, s[0:1]
+; GFX942-O0-NEXT:    v_mov_b32_e32 v5, v10
+; GFX942-O0-NEXT:    v_cndmask_b32_e64 v2, v2, v5, s[0:1]
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr4
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr4
+; GFX942-O0-NEXT:    v_mov_b32_e32 v16, v2
+; GFX942-O0-NEXT:    v_mov_b32_e32 v17, v3
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr4
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr4
+; GFX942-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v5, v6
+; GFX942-O0-NEXT:    v_mov_b32_e32 v6, v5
+; GFX942-O0-NEXT:    v_cndmask_b32_e64 v1, v1, v6, s[0:1]
+; GFX942-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec
+; GFX942-O0-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s[0:1]
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0
+; GFX942-O0-NEXT:    v_mov_b32_e32 v4, v0
+; GFX942-O0-NEXT:    v_mov_b32_e32 v5, v1
+; GFX942-O0-NEXT:    v_mov_b32_e32 v6, v8
+; GFX942-O0-NEXT:    v_mov_b32_e32 v10, v9
+; GFX942-O0-NEXT:    v_mov_b32_e32 v7, v18
+; GFX942-O0-NEXT:    v_mov_b32_e32 v8, v19
+; GFX942-O0-NEXT:    v_sub_co_u32_e32 v14, vcc, s6, v6
+; GFX942-O0-NEXT:    v_mov_b32_e32 v9, s7
+; GFX942-O0-NEXT:    s_nop 0
+; GFX942-O0-NEXT:    v_subb_co_u32_e32 v11, vcc, v9, v10, vcc
+; GFX942-O0-NEXT:    v_mov_b32_e32 v9, s6
+; GFX942-O0-NEXT:    s_nop 0
+; GFX942-O0-NEXT:    v_subb_co_u32_e32 v12, vcc, v9, v7, vcc
+; GFX942-O0-NEXT:    v_mov_b32_e32 v9, s7
+; GFX942-O0-NEXT:    s_nop 0
+; GFX942-O0-NEXT:    v_subb_co_u32_e32 v9, vcc, v9, v8, vcc
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0
+; GFX942-O0-NEXT:    ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v15, v11
+; GFX942-O0-NEXT:    v_mov_b32_e32 v11, v15
+; GFX942-O0-NEXT:    s_mov_b64 s[0:1], s[2:3]
+; GFX942-O0-NEXT:    v_cmp_lt_i64_e64 s[0:1], v[18:19], s[0:1]
+; GFX942-O0-NEXT:    s_nop 1
+; GFX942-O0-NEXT:    v_cndmask_b32_e64 v10, v10, v11, s[0:1]
+; GFX942-O0-NEXT:    v_mov_b32_e32 v11, v14
+; GFX942-O0-NEXT:    v_cndmask_b32_e64 v6, v6, v11, s[0:1]
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr4
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr4
+; GFX942-O0-NEXT:    v_mov_b32_e32 v18, v6
+; GFX942-O0-NEXT:    v_mov_b32_e32 v19, v10
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr4
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr4
+; GFX942-O0-NEXT:    ; kill: def $vgpr12 killed $vgpr12 def $vgpr12_vgpr13 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v13, v9
+; GFX942-O0-NEXT:    v_mov_b32_e32 v9, v13
+; GFX942-O0-NEXT:    v_cndmask_b32_e64 v8, v8, v9, s[0:1]
+; GFX942-O0-NEXT:    v_mov_b32_e32 v9, v12
+; GFX942-O0-NEXT:    v_cndmask_b32_e64 v7, v7, v9, s[0:1]
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0
+; GFX942-O0-NEXT:    v_mov_b32_e32 v12, v7
+; GFX942-O0-NEXT:    v_mov_b32_e32 v13, v8
+; GFX942-O0-NEXT:    v_mov_b64_e32 v[14:15], v[4:5]
+; GFX942-O0-NEXT:    v_accvgpr_write_b32 a4, v15 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_write_b32 a5, v14 ; Reload Reuse
+; GFX942-O0-NEXT:    v_mov_b64_e32 v[14:15], v[16:17]
+; GFX942-O0-NEXT:    v_accvgpr_write_b32 a6, v15 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_write_b32 a7, v14 ; Reload Reuse
+; GFX942-O0-NEXT:    v_mov_b64_e32 v[14:15], v[12:13]
+; GFX942-O0-NEXT:    v_accvgpr_write_b32 a8, v15 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_write_b32 a9, v14 ; Reload Reuse
+; GFX942-O0-NEXT:    v_mov_b64_e32 v[14:15], v[18:19]
+; GFX942-O0-NEXT:    v_accvgpr_write_b32 a10, v15 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_write_b32 a11, v14 ; Reload Reuse
+; GFX942-O0-NEXT:    v_mov_b64_e32 v[14:15], v[12:13]
+; GFX942-O0-NEXT:    v_accvgpr_write_b32 a12, v15 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_write_b32 a13, v14 ; Reload Reuse
+; GFX942-O0-NEXT:    v_mov_b64_e32 v[14:15], v[18:19]
+; GFX942-O0-NEXT:    v_accvgpr_write_b32 a14, v15 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_write_b32 a15, v14 ; Reload Reuse
+; GFX942-O0-NEXT:    v_mov_b64_e32 v[14:15], v[4:5]
+; GFX942-O0-NEXT:    v_accvgpr_write_b32 a16, v15 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_write_b32 a17, v14 ; Reload Reuse
+; GFX942-O0-NEXT:    v_mov_b64_e32 v[14:15], v[16:17]
+; GFX942-O0-NEXT:    v_accvgpr_write_b32 a18, v15 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_write_b32 a19, v14 ; Reload Reuse
+; GFX942-O0-NEXT:    v_mov_b32_e32 v11, v13
+; GFX942-O0-NEXT:    v_mov_b32_e32 v9, v19
+; GFX942-O0-NEXT:    v_or_b32_e64 v9, v9, v11
+; GFX942-O0-NEXT:    v_mov_b32_e32 v14, v12
+; GFX942-O0-NEXT:    v_mov_b32_e32 v11, v18
+; GFX942-O0-NEXT:    v_or_b32_e64 v14, v11, v14
+; GFX942-O0-NEXT:    ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v15, v9
+; GFX942-O0-NEXT:    v_cmp_eq_u64_e64 s[0:1], v[14:15], s[2:3]
+; GFX942-O0-NEXT:    v_mov_b32_e32 v11, v5
+; GFX942-O0-NEXT:    v_mov_b32_e32 v9, v17
+; GFX942-O0-NEXT:    v_or_b32_e64 v9, v9, v11
+; GFX942-O0-NEXT:    v_mov_b32_e32 v14, v4
+; GFX942-O0-NEXT:    v_mov_b32_e32 v11, v16
+; GFX942-O0-NEXT:    v_or_b32_e64 v14, v11, v14
+; GFX942-O0-NEXT:    ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v15, v9
+; GFX942-O0-NEXT:    v_cmp_eq_u64_e64 s[4:5], v[14:15], s[2:3]
+; GFX942-O0-NEXT:    s_or_b64 s[0:1], s[0:1], s[4:5]
+; GFX942-O0-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX942-O0-NEXT:    v_cmp_ne_u64_e64 s[4:5], v[12:13], s[4:5]
+; GFX942-O0-NEXT:    v_ffbh_u32_e64 v7, v7
+; GFX942-O0-NEXT:    s_mov_b32 s11, 32
+; GFX942-O0-NEXT:    v_add_u32_e64 v7, v7, s11
+; GFX942-O0-NEXT:    v_ffbh_u32_e64 v8, v8
+; GFX942-O0-NEXT:    v_min_u32_e64 v8, v7, v8
+; GFX942-O0-NEXT:    s_mov_b32 s10, 0
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr8
+; GFX942-O0-NEXT:    v_mov_b32_e32 v7, s10
+; GFX942-O0-NEXT:    ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v9, v7
+; GFX942-O0-NEXT:    v_mov_b32_e32 v7, v9
+; GFX942-O0-NEXT:    v_ffbh_u32_e64 v6, v6
+; GFX942-O0-NEXT:    v_add_u32_e64 v6, v6, s11
+; GFX942-O0-NEXT:    v_ffbh_u32_e64 v10, v10
+; GFX942-O0-NEXT:    v_min_u32_e64 v10, v6, v10
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr8
+; GFX942-O0-NEXT:    v_mov_b32_e32 v6, s10
+; GFX942-O0-NEXT:    ; kill: def $vgpr10 killed $vgpr10 def $vgpr10_vgpr11 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v11, v6
+; GFX942-O0-NEXT:    s_mov_b64 s[8:9], 64
+; GFX942-O0-NEXT:    v_lshl_add_u64 v[10:11], v[10:11], 0, s[8:9]
+; GFX942-O0-NEXT:    v_mov_b32_e32 v6, v11
+; GFX942-O0-NEXT:    v_cndmask_b32_e64 v6, v6, v7, s[4:5]
+; GFX942-O0-NEXT:    ; kill: def $vgpr8 killed $vgpr8 killed $vgpr8_vgpr9 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v7, v10
+; GFX942-O0-NEXT:    v_cndmask_b32_e64 v8, v7, v8, s[4:5]
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr4
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr4
+; GFX942-O0-NEXT:    ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v9, v6
+; GFX942-O0-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX942-O0-NEXT:    v_cmp_ne_u64_e64 s[4:5], v[4:5], s[4:5]
+; GFX942-O0-NEXT:    v_ffbh_u32_e64 v4, v0
+; GFX942-O0-NEXT:    v_add_u32_e64 v4, v4, s11
+; GFX942-O0-NEXT:    v_ffbh_u32_e64 v5, v1
+; GFX942-O0-NEXT:    v_min_u32_e64 v6, v4, v5
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr12
+; GFX942-O0-NEXT:    v_mov_b32_e32 v4, s10
+; GFX942-O0-NEXT:    ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v7, v4
+; GFX942-O0-NEXT:    v_mov_b32_e32 v5, v7
+; GFX942-O0-NEXT:    v_ffbh_u32_e64 v4, v2
+; GFX942-O0-NEXT:    v_add_u32_e64 v4, v4, s11
+; GFX942-O0-NEXT:    v_ffbh_u32_e64 v10, v3
+; GFX942-O0-NEXT:    v_min_u32_e64 v10, v4, v10
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr11
+; GFX942-O0-NEXT:    v_mov_b32_e32 v4, s10
+; GFX942-O0-NEXT:    ; kill: def $vgpr10 killed $vgpr10 def $vgpr10_vgpr11 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v11, v4
+; GFX942-O0-NEXT:    v_lshl_add_u64 v[10:11], v[10:11], 0, s[8:9]
+; GFX942-O0-NEXT:    v_mov_b32_e32 v4, v11
+; GFX942-O0-NEXT:    v_cndmask_b32_e64 v4, v4, v5, s[4:5]
+; GFX942-O0-NEXT:    ; kill: def $vgpr6 killed $vgpr6 killed $vgpr6_vgpr7 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v5, v10
+; GFX942-O0-NEXT:    v_cndmask_b32_e64 v10, v5, v6, s[4:5]
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr4
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr4
+; GFX942-O0-NEXT:    ; kill: def $vgpr10 killed $vgpr10 def $vgpr10_vgpr11 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v11, v4
+; GFX942-O0-NEXT:    v_mov_b32_e32 v7, v10
+; GFX942-O0-NEXT:    v_mov_b32_e32 v4, v8
+; GFX942-O0-NEXT:    v_mov_b32_e32 v6, v11
+; GFX942-O0-NEXT:    v_mov_b32_e32 v5, v9
+; GFX942-O0-NEXT:    v_sub_co_u32_e32 v4, vcc, v4, v7
+; GFX942-O0-NEXT:    s_nop 1
+; GFX942-O0-NEXT:    v_subb_co_u32_e32 v7, vcc, v5, v6, vcc
+; GFX942-O0-NEXT:    v_mov_b32_e32 v6, s6
+; GFX942-O0-NEXT:    v_mov_b32_e32 v5, s6
+; GFX942-O0-NEXT:    v_subb_co_u32_e32 v8, vcc, v5, v6, vcc
+; GFX942-O0-NEXT:    v_mov_b32_e32 v6, s7
+; GFX942-O0-NEXT:    v_mov_b32_e32 v5, s7
+; GFX942-O0-NEXT:    v_subb_co_u32_e32 v6, vcc, v5, v6, vcc
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr4
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr4
+; GFX942-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v5, v7
+; GFX942-O0-NEXT:    v_accvgpr_write_b32 a20, v5 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_write_b32 a21, v4 ; Reload Reuse
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr4
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr4
+; GFX942-O0-NEXT:    ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v9, v6
+; GFX942-O0-NEXT:    v_accvgpr_write_b32 a22, v9 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_write_b32 a23, v8 ; Reload Reuse
+; GFX942-O0-NEXT:    v_cmp_eq_u64_e64 s[4:5], v[8:9], s[2:3]
+; GFX942-O0-NEXT:    s_mov_b64 s[8:9], 0x7f
+; GFX942-O0-NEXT:    v_cmp_gt_u64_e64 s[10:11], v[4:5], s[8:9]
+; GFX942-O0-NEXT:    s_nop 1
+; GFX942-O0-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s[10:11]
+; GFX942-O0-NEXT:    v_cmp_ne_u64_e64 s[10:11], v[8:9], s[2:3]
+; GFX942-O0-NEXT:    s_nop 1
+; GFX942-O0-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s[10:11]
+; GFX942-O0-NEXT:    v_cndmask_b32_e64 v6, v6, v7, s[4:5]
+; GFX942-O0-NEXT:    v_and_b32_e64 v6, 1, v6
+; GFX942-O0-NEXT:    v_cmp_eq_u32_e64 s[4:5], v6, 1
+; GFX942-O0-NEXT:    s_or_b64 s[4:5], s[0:1], s[4:5]
+; GFX942-O0-NEXT:    s_mov_b64 s[10:11], -1
+; GFX942-O0-NEXT:    s_mov_b64 s[0:1], s[4:5]
+; GFX942-O0-NEXT:    s_xor_b64 s[0:1], s[0:1], s[10:11]
+; GFX942-O0-NEXT:    v_mov_b32_e32 v6, v5
+; GFX942-O0-NEXT:    s_mov_b32 s10, s9
+; GFX942-O0-NEXT:    v_xor_b32_e64 v6, v6, s10
+; GFX942-O0-NEXT:    ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9
+; GFX942-O0-NEXT:    v_xor_b32_e64 v4, v4, s8
+; GFX942-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v5, v6
+; GFX942-O0-NEXT:    v_mov_b32_e32 v6, v5
+; GFX942-O0-NEXT:    v_mov_b32_e32 v7, v9
+; GFX942-O0-NEXT:    v_or_b32_e64 v6, v6, v7
+; GFX942-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v5, v8
+; GFX942-O0-NEXT:    v_or_b32_e64 v4, v4, v5
+; GFX942-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v5, v6
+; GFX942-O0-NEXT:    v_cmp_ne_u64_e64 s[2:3], v[4:5], s[2:3]
+; GFX942-O0-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-O0-NEXT:    v_mov_b32_e32 v4, s7
+; GFX942-O0-NEXT:    v_cndmask_b32_e64 v4, v1, v4, s[8:9]
+; GFX942-O0-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-O0-NEXT:    v_mov_b32_e32 v1, s6
+; GFX942-O0-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[8:9]
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr8
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr8
+; GFX942-O0-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v1, v4
+; GFX942-O0-NEXT:    s_mov_b64 s[8:9], s[4:5]
+; GFX942-O0-NEXT:    v_mov_b32_e32 v4, s7
+; GFX942-O0-NEXT:    v_cndmask_b32_e64 v4, v3, v4, s[8:9]
+; GFX942-O0-NEXT:    v_mov_b32_e32 v3, s6
+; GFX942-O0-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[4:5]
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr4
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr4
+; GFX942-O0-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v3, v4
+; GFX942-O0-NEXT:    s_and_b64 s[2:3], s[0:1], s[2:3]
+; GFX942-O0-NEXT:    v_accvgpr_write_b32 a24, v3 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_write_b32 a25, v2 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_write_b32 a26, v1 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_write_b32 a27, v0 ; Reload Reuse
+; GFX942-O0-NEXT:    s_mov_b64 s[0:1], exec
+; GFX942-O0-NEXT:    v_writelane_b32 v29, s0, 4
+; GFX942-O0-NEXT:    s_nop 1
+; GFX942-O0-NEXT:    v_writelane_b32 v29, s1, 5
+; GFX942-O0-NEXT:    s_or_saveexec_b64 s[16:17], -1
+; GFX942-O0-NEXT:    v_accvgpr_write_b32 a28, v29 ; Reload Reuse
+; GFX942-O0-NEXT:    s_mov_b64 exec, s[16:17]
+; GFX942-O0-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
+; GFX942-O0-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX942-O0-NEXT:    s_cbranch_execz .LBB0_3
+; GFX942-O0-NEXT:    s_branch .LBB0_8
+; GFX942-O0-NEXT:  .LBB0_1: ; %Flow
+; GFX942-O0-NEXT:    s_or_saveexec_b64 s[16:17], -1
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v29, a28 ; Reload Reuse
+; GFX942-O0-NEXT:    s_mov_b64 exec, s[16:17]
+; GFX942-O0-NEXT:    v_readlane_b32 s0, v29, 6
+; GFX942-O0-NEXT:    v_readlane_b32 s1, v29, 7
+; GFX942-O0-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX942-O0-NEXT:  ; %bb.2: ; %Flow
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v7, a29 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v6, a30 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v5, a31 ; Reload Reuse
+; GFX942-O0-NEXT:    scratch_load_dword v4, off, s32 offset:32 ; 4-byte Folded Reload
+; GFX942-O0-NEXT:    scratch_load_dwordx2 v[2:3], off, s32 offset:40 ; 8-byte Folded Reload
+; GFX942-O0-NEXT:    scratch_load_dwordx2 v[0:1], off, s32 offset:48 ; 8-byte Folded Reload
+; GFX942-O0-NEXT:    s_nop 0
+; GFX942-O0-NEXT:    scratch_store_dwordx2 off, v[6:7], s32 offset:24 ; 8-byte Folded Spill
+; GFX942-O0-NEXT:    s_waitcnt vmcnt(3)
+; GFX942-O0-NEXT:    scratch_store_dwordx2 off, v[4:5], s32 offset:16 ; 8-byte Folded Spill
+; GFX942-O0-NEXT:    s_waitcnt vmcnt(3)
+; GFX942-O0-NEXT:    scratch_store_dwordx2 off, v[2:3], s32 offset:8 ; 8-byte Folded Spill
+; GFX942-O0-NEXT:    s_waitcnt vmcnt(3)
+; GFX942-O0-NEXT:    scratch_store_dwordx2 off, v[0:1], s32 ; 8-byte Folded Spill
+; GFX942-O0-NEXT:    s_branch .LBB0_5
+; GFX942-O0-NEXT:  .LBB0_3: ; %Flow2
+; GFX942-O0-NEXT:    s_or_saveexec_b64 s[16:17], -1
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v29, a28 ; Reload Reuse
+; GFX942-O0-NEXT:    s_mov_b64 exec, s[16:17]
+; GFX942-O0-NEXT:    v_readlane_b32 s0, v29, 4
+; GFX942-O0-NEXT:    v_readlane_b32 s1, v29, 5
+; GFX942-O0-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v1, a24 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v0, a25 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v3, a26 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v2, a27 ; Reload Reuse
+; GFX942-O0-NEXT:    scratch_store_dwordx2 off, v[2:3], s32 offset:64 ; 8-byte Folded Spill
+; GFX942-O0-NEXT:    scratch_store_dwordx2 off, v[0:1], s32 offset:56 ; 8-byte Folded Spill
+; GFX942-O0-NEXT:    s_branch .LBB0_9
+; GFX942-O0-NEXT:  .LBB0_4: ; %udiv-loop-exit
+; GFX942-O0-NEXT:    scratch_load_dwordx2 v[6:7], off, s32 offset:72 ; 8-byte Folded Reload
+; GFX942-O0-NEXT:    scratch_load_dwordx2 v[8:9], off, s32 offset:80 ; 8-byte Folded Reload
+; GFX942-O0-NEXT:    scratch_load_dwordx2 v[0:1], off, s32 offset:88 ; 8-byte Folded Reload
+; GFX942-O0-NEXT:    scratch_load_dwordx2 v[4:5], off, s32 offset:96 ; 8-byte Folded Reload
+; GFX942-O0-NEXT:    s_mov_b32 s0, 1
+; GFX942-O0-NEXT:    s_waitcnt vmcnt(1)
+; GFX942-O0-NEXT:    v_lshlrev_b64 v[2:3], s0, v[0:1]
+; GFX942-O0-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-O0-NEXT:    v_lshlrev_b64 v[10:11], s0, v[4:5]
+; GFX942-O0-NEXT:    s_mov_b32 s0, 63
+; GFX942-O0-NEXT:    v_lshrrev_b64 v[0:1], s0, v[0:1]
+; GFX942-O0-NEXT:    v_mov_b32_e32 v5, v1
+; GFX942-O0-NEXT:    v_mov_b32_e32 v4, v11
+; GFX942-O0-NEXT:    v_mov_b32_e32 v12, v9
+; GFX942-O0-NEXT:    v_or3_b32 v4, v4, v5, v12
+; GFX942-O0-NEXT:    v_mov_b32_e32 v1, v0
+; GFX942-O0-NEXT:    v_mov_b32_e32 v0, v10
+; GFX942-O0-NEXT:    v_mov_b32_e32 v5, v8
+; GFX942-O0-NEXT:    v_or3_b32 v0, v0, v1, v5
+; GFX942-O0-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v1, v4
+; GFX942-O0-NEXT:    v_mov_b32_e32 v5, v3
+; GFX942-O0-NEXT:    v_mov_b32_e32 v4, v7
+; GFX942-O0-NEXT:    v_or_b32_e64 v4, v4, v5
+; GFX942-O0-NEXT:    v_mov_b32_e32 v3, v2
+; GFX942-O0-NEXT:    v_mov_b32_e32 v2, v6
+; GFX942-O0-NEXT:    v_or_b32_e64 v2, v2, v3
+; GFX942-O0-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v3, v4
+; GFX942-O0-NEXT:    v_accvgpr_write_b32 a24, v3 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_write_b32 a25, v2 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_write_b32 a26, v1 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_write_b32 a27, v0 ; Reload Reuse
+; GFX942-O0-NEXT:    s_branch .LBB0_3
+; GFX942-O0-NEXT:  .LBB0_5: ; %Flow1
+; GFX942-O0-NEXT:    s_or_saveexec_b64 s[16:17], -1
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v29, a28 ; Reload Reuse
+; GFX942-O0-NEXT:    s_mov_b64 exec, s[16:17]
+; GFX942-O0-NEXT:    v_readlane_b32 s0, v29, 8
+; GFX942-O0-NEXT:    v_readlane_b32 s1, v29, 9
+; GFX942-O0-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX942-O0-NEXT:    scratch_load_dwordx2 v[0:1], off, s32 offset:24 ; 8-byte Folded Reload
+; GFX942-O0-NEXT:    scratch_load_dwordx2 v[2:3], off, s32 offset:16 ; 8-byte Folded Reload
+; GFX942-O0-NEXT:    scratch_load_dwordx2 v[4:5], off, s32 offset:8 ; 8-byte Folded Reload
+; GFX942-O0-NEXT:    scratch_load_dwordx2 v[6:7], off, s32 ; 8-byte Folded Reload
+; GFX942-O0-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-O0-NEXT:    scratch_store_dwordx2 off, v[6:7], s32 offset:80 ; 8-byte Folded Spill
+; GFX942-O0-NEXT:    scratch_store_dwordx2 off, v[4:5], s32 offset:72 ; 8-byte Folded Spill
+; GFX942-O0-NEXT:    scratch_store_dwordx2 off, v[2:3], s32 offset:96 ; 8-byte Folded Spill
+; GFX942-O0-NEXT:    scratch_store_dwordx2 off, v[0:1], s32 offset:88 ; 8-byte Folded Spill
+; GFX942-O0-NEXT:    s_branch .LBB0_4
+; GFX942-O0-NEXT:  .LBB0_6: ; %udiv-do-while
+; GFX942-O0-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-O0-NEXT:    s_or_saveexec_b64 s[16:17], -1
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v29, a28 ; Reload Reuse
+; GFX942-O0-NEXT:    s_mov_b64 exec, s[16:17]
+; GFX942-O0-NEXT:    v_readlane_b32 s2, v29, 10
+; GFX942-O0-NEXT:    v_readlane_b32 s3, v29, 11
+; GFX942-O0-NEXT:    scratch_load_dwordx2 v[6:7], off, s32 offset:104 ; 8-byte Folded Reload
+; GFX942-O0-NEXT:    scratch_load_dwordx2 v[0:1], off, s32 offset:112 ; 8-byte Folded Reload
+; GFX942-O0-NEXT:    scratch_load_dwordx2 v[26:27], off, s32 offset:120 ; 8-byte Folded Reload
+; GFX942-O0-NEXT:    scratch_load_dwordx2 v[2:3], off, s32 offset:128 ; 8-byte Folded Reload
+; GFX942-O0-NEXT:    scratch_load_dwordx2 v[16:17], off, s32 offset:136 ; 8-byte Folded Reload
+; GFX942-O0-NEXT:    scratch_load_dwordx2 v[8:9], off, s32 offset:144 ; 8-byte Folded Reload
+; GFX942-O0-NEXT:    scratch_load_dwordx2 v[24:25], off, s32 offset:152 ; 8-byte Folded Reload
+; GFX942-O0-NEXT:    scratch_load_dwordx2 v[10:11], off, s32 offset:160 ; 8-byte Folded Reload
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v19, a14 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v18, a15 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v21, a12 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v20, a13 ; Reload Reuse
+; GFX942-O0-NEXT:    scratch_load_dwordx2 v[14:15], off, s32 offset:168 ; 8-byte Folded Reload
+; GFX942-O0-NEXT:    scratch_load_dwordx2 v[22:23], off, s32 offset:176 ; 8-byte Folded Reload
+; GFX942-O0-NEXT:    s_mov_b32 s0, 63
+; GFX942-O0-NEXT:    s_waitcnt vmcnt(6)
+; GFX942-O0-NEXT:    v_lshrrev_b64 v[12:13], s0, v[2:3]
+; GFX942-O0-NEXT:    v_mov_b32_e32 v5, v13
+; GFX942-O0-NEXT:    s_mov_b32 s1, 1
+; GFX942-O0-NEXT:    v_lshlrev_b64 v[26:27], s1, v[26:27]
+; GFX942-O0-NEXT:    v_mov_b32_e32 v4, v27
+; GFX942-O0-NEXT:    v_or_b32_e64 v4, v4, v5
+; GFX942-O0-NEXT:    ; kill: def $vgpr12 killed $vgpr12 killed $vgpr12_vgpr13 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v5, v26
+; GFX942-O0-NEXT:    v_or_b32_e64 v12, v5, v12
+; GFX942-O0-NEXT:    ; kill: def $vgpr12 killed $vgpr12 def $vgpr12_vgpr13 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v13, v4
+; GFX942-O0-NEXT:    v_lshlrev_b64 v[26:27], s1, v[2:3]
+; GFX942-O0-NEXT:    v_lshrrev_b64 v[4:5], s0, v[6:7]
+; GFX942-O0-NEXT:    v_mov_b32_e32 v2, v27
+; GFX942-O0-NEXT:    v_mov_b32_e32 v3, v5
+; GFX942-O0-NEXT:    v_or_b32_e64 v2, v2, v3
+; GFX942-O0-NEXT:    v_mov_b32_e32 v3, v26
+; GFX942-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec
+; GFX942-O0-NEXT:    v_or_b32_e64 v4, v3, v4
+; GFX942-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v5, v2
+; GFX942-O0-NEXT:    v_lshlrev_b64 v[2:3], s1, v[0:1]
+; GFX942-O0-NEXT:    v_lshlrev_b64 v[26:27], s1, v[6:7]
+; GFX942-O0-NEXT:    v_lshrrev_b64 v[0:1], s0, v[0:1]
+; GFX942-O0-NEXT:    v_mov_b32_e32 v7, v1
+; GFX942-O0-NEXT:    v_mov_b32_e32 v6, v27
+; GFX942-O0-NEXT:    s_waitcnt vmcnt(3)
+; GFX942-O0-NEXT:    v_mov_b32_e32 v28, v25
+; GFX942-O0-NEXT:    v_or3_b32 v6, v6, v7, v28
+; GFX942-O0-NEXT:    v_mov_b32_e32 v1, v0
+; GFX942-O0-NEXT:    v_mov_b32_e32 v0, v26
+; GFX942-O0-NEXT:    v_mov_b32_e32 v7, v24
+; GFX942-O0-NEXT:    v_or3_b32 v0, v0, v1, v7
+; GFX942-O0-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v1, v6
+; GFX942-O0-NEXT:    v_mov_b32_e32 v7, v3
+; GFX942-O0-NEXT:    s_waitcnt vmcnt(2)
+; GFX942-O0-NEXT:    v_mov_b32_e32 v6, v11
+; GFX942-O0-NEXT:    v_or_b32_e64 v6, v6, v7
+; GFX942-O0-NEXT:    v_mov_b32_e32 v3, v2
+; GFX942-O0-NEXT:    v_mov_b32_e32 v2, v10
+; GFX942-O0-NEXT:    v_or_b32_e64 v2, v2, v3
+; GFX942-O0-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v3, v6
+; GFX942-O0-NEXT:    v_mov_b32_e32 v6, v4
+; GFX942-O0-NEXT:    v_mov_b32_e32 v10, v5
+; GFX942-O0-NEXT:    v_mov_b32_e32 v4, v12
+; GFX942-O0-NEXT:    v_mov_b32_e32 v5, v13
+; GFX942-O0-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-O0-NEXT:    v_mov_b32_e32 v13, v22
+; GFX942-O0-NEXT:    v_mov_b32_e32 v12, v23
+; GFX942-O0-NEXT:    v_mov_b32_e32 v11, v14
+; GFX942-O0-NEXT:    v_mov_b32_e32 v7, v15
+; GFX942-O0-NEXT:    v_sub_co_u32_e32 v13, vcc, v13, v6
+; GFX942-O0-NEXT:    s_nop 1
+; GFX942-O0-NEXT:    v_subb_co_u32_e32 v12, vcc, v12, v10, vcc
+; GFX942-O0-NEXT:    s_nop 1
+; GFX942-O0-NEXT:    v_subb_co_u32_e32 v12, vcc, v11, v4, vcc
+; GFX942-O0-NEXT:    s_nop 1
+; GFX942-O0-NEXT:    v_subb_co_u32_e32 v7, vcc, v7, v5, vcc
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr1
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr1
+; GFX942-O0-NEXT:    ; kill: def $vgpr12 killed $vgpr12 def $vgpr12_vgpr13 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v13, v7
+; GFX942-O0-NEXT:    v_ashrrev_i64 v[14:15], s0, v[12:13]
+; GFX942-O0-NEXT:    v_mov_b32_e32 v7, v15
+; GFX942-O0-NEXT:    s_mov_b64 s[0:1], 1
+; GFX942-O0-NEXT:    s_mov_b32 s4, s1
+; GFX942-O0-NEXT:    v_and_b32_e64 v12, v7, s4
+; GFX942-O0-NEXT:    v_mov_b32_e32 v11, v14
+; GFX942-O0-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; GFX942-O0-NEXT:    v_and_b32_e64 v14, v11, s0
+; GFX942-O0-NEXT:    ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v15, v12
+; GFX942-O0-NEXT:    v_mov_b64_e32 v[12:13], 0
+; GFX942-O0-NEXT:    v_mov_b32_e32 v22, v21
+; GFX942-O0-NEXT:    v_and_b32_e64 v22, v7, v22
+; GFX942-O0-NEXT:    v_and_b32_e64 v20, v11, v20
+; GFX942-O0-NEXT:    ; kill: def $vgpr20 killed $vgpr20 def $vgpr20_vgpr21 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v21, v22
+; GFX942-O0-NEXT:    v_mov_b32_e32 v22, v19
+; GFX942-O0-NEXT:    v_and_b32_e64 v7, v7, v22
+; GFX942-O0-NEXT:    v_and_b32_e64 v22, v11, v18
+; GFX942-O0-NEXT:    ; kill: def $vgpr22 killed $vgpr22 def $vgpr22_vgpr23 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v23, v7
+; GFX942-O0-NEXT:    v_mov_b32_e32 v19, v22
+; GFX942-O0-NEXT:    v_mov_b32_e32 v18, v23
+; GFX942-O0-NEXT:    v_mov_b32_e32 v11, v20
+; GFX942-O0-NEXT:    v_mov_b32_e32 v7, v21
+; GFX942-O0-NEXT:    v_sub_co_u32_e32 v6, vcc, v6, v19
+; GFX942-O0-NEXT:    s_nop 1
+; GFX942-O0-NEXT:    v_subb_co_u32_e32 v10, vcc, v10, v18, vcc
+; GFX942-O0-NEXT:    s_nop 1
+; GFX942-O0-NEXT:    v_subb_co_u32_e32 v4, vcc, v4, v11, vcc
+; GFX942-O0-NEXT:    s_nop 1
+; GFX942-O0-NEXT:    v_subb_co_u32_e32 v7, vcc, v5, v7, vcc
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0
+; GFX942-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v5, v7
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0
+; GFX942-O0-NEXT:    ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v7, v10
+; GFX942-O0-NEXT:    v_mov_b32_e32 v11, v8
+; GFX942-O0-NEXT:    ; kill: def $vgpr9 killed $vgpr9 killed $vgpr8_vgpr9 killed $exec
+; GFX942-O0-NEXT:    s_mov_b64 s[4:5], -1
+; GFX942-O0-NEXT:    s_mov_b32 s1, s4
+; GFX942-O0-NEXT:    s_mov_b32 s0, s5
+; GFX942-O0-NEXT:    v_mov_b32_e32 v10, v16
+; GFX942-O0-NEXT:    v_mov_b32_e32 v8, v17
+; GFX942-O0-NEXT:    v_mov_b32_e32 v16, s1
+; GFX942-O0-NEXT:    v_add_co_u32_e32 v20, vcc, v11, v16
+; GFX942-O0-NEXT:    v_mov_b32_e32 v11, s0
+; GFX942-O0-NEXT:    s_nop 0
+; GFX942-O0-NEXT:    v_addc_co_u32_e32 v9, vcc, v9, v11, vcc
+; GFX942-O0-NEXT:    v_mov_b32_e32 v11, s1
+; GFX942-O0-NEXT:    s_nop 0
+; GFX942-O0-NEXT:    v_addc_co_u32_e32 v16, vcc, v10, v11, vcc
+; GFX942-O0-NEXT:    v_mov_b32_e32 v10, s0
+; GFX942-O0-NEXT:    s_nop 0
+; GFX942-O0-NEXT:    v_addc_co_u32_e32 v8, vcc, v8, v10, vcc
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0
+; GFX942-O0-NEXT:    ; kill: def $vgpr20 killed $vgpr20 def $vgpr20_vgpr21 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v21, v9
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0
+; GFX942-O0-NEXT:    ; kill: def $vgpr16 killed $vgpr16 def $vgpr16_vgpr17 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v17, v8
+; GFX942-O0-NEXT:    v_mov_b64_e32 v[8:9], v[16:17]
+; GFX942-O0-NEXT:    v_mov_b64_e32 v[10:11], v[20:21]
+; GFX942-O0-NEXT:    v_mov_b32_e32 v19, v17
+; GFX942-O0-NEXT:    v_mov_b32_e32 v18, v21
+; GFX942-O0-NEXT:    v_or_b32_e64 v18, v18, v19
+; GFX942-O0-NEXT:    v_mov_b32_e32 v17, v16
+; GFX942-O0-NEXT:    v_mov_b32_e32 v16, v20
+; GFX942-O0-NEXT:    v_or_b32_e64 v16, v16, v17
+; GFX942-O0-NEXT:    ; kill: def $vgpr16 killed $vgpr16 def $vgpr16_vgpr17 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v17, v18
+; GFX942-O0-NEXT:    v_cmp_eq_u64_e64 s[0:1], v[16:17], v[12:13]
+; GFX942-O0-NEXT:    s_or_b64 s[0:1], s[0:1], s[2:3]
+; GFX942-O0-NEXT:    v_mov_b64_e32 v[16:17], v[2:3]
+; GFX942-O0-NEXT:    v_accvgpr_write_b32 a29, v17 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_write_b32 a30, v16 ; Reload Reuse
+; GFX942-O0-NEXT:    v_mov_b64_e32 v[16:17], v[0:1]
+; GFX942-O0-NEXT:    v_accvgpr_write_b32 a31, v17 ; Reload Reuse
+; GFX942-O0-NEXT:    scratch_store_dword off, v16, s32 offset:32 ; 4-byte Folded Spill
+; GFX942-O0-NEXT:    v_mov_b64_e32 v[16:17], v[14:15]
+; GFX942-O0-NEXT:    scratch_store_dwordx2 off, v[16:17], s32 offset:40 ; 8-byte Folded Spill
+; GFX942-O0-NEXT:    v_mov_b64_e32 v[16:17], v[12:13]
+; GFX942-O0-NEXT:    scratch_store_dwordx2 off, v[16:17], s32 offset:48 ; 8-byte Folded Spill
+; GFX942-O0-NEXT:    s_mov_b64 s[2:3], s[0:1]
+; GFX942-O0-NEXT:    v_writelane_b32 v29, s2, 6
+; GFX942-O0-NEXT:    s_nop 1
+; GFX942-O0-NEXT:    v_writelane_b32 v29, s3, 7
+; GFX942-O0-NEXT:    s_mov_b64 s[2:3], s[0:1]
+; GFX942-O0-NEXT:    v_writelane_b32 v29, s2, 10
+; GFX942-O0-NEXT:    s_nop 1
+; GFX942-O0-NEXT:    v_writelane_b32 v29, s3, 11
+; GFX942-O0-NEXT:    s_or_saveexec_b64 s[16:17], -1
+; GFX942-O0-NEXT:    v_accvgpr_write_b32 a28, v29 ; Reload Reuse
+; GFX942-O0-NEXT:    s_mov_b64 exec, s[16:17]
+; GFX942-O0-NEXT:    scratch_store_dwordx2 off, v[14:15], s32 offset:160 ; 8-byte Folded Spill
+; GFX942-O0-NEXT:    scratch_store_dwordx2 off, v[12:13], s32 offset:152 ; 8-byte Folded Spill
+; GFX942-O0-NEXT:    scratch_store_dwordx2 off, v[10:11], s32 offset:144 ; 8-byte Folded Spill
+; GFX942-O0-NEXT:    scratch_store_dwordx2 off, v[8:9], s32 offset:136 ; 8-byte Folded Spill
+; GFX942-O0-NEXT:    scratch_store_dwordx2 off, v[6:7], s32 offset:128 ; 8-byte Folded Spill
+; GFX942-O0-NEXT:    scratch_store_dwordx2 off, v[4:5], s32 offset:120 ; 8-byte Folded Spill
+; GFX942-O0-NEXT:    scratch_store_dwordx2 off, v[2:3], s32 offset:112 ; 8-byte Folded Spill
+; GFX942-O0-NEXT:    scratch_store_dwordx2 off, v[0:1], s32 offset:104 ; 8-byte Folded Spill
+; GFX942-O0-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; GFX942-O0-NEXT:    s_cbranch_execnz .LBB0_6
+; GFX942-O0-NEXT:    s_branch .LBB0_1
+; GFX942-O0-NEXT:  .LBB0_7: ; %udiv-preheader
+; GFX942-O0-NEXT:    s_or_saveexec_b64 s[16:17], -1
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v29, a28 ; Reload Reuse
+; GFX942-O0-NEXT:    s_mov_b64 exec, s[16:17]
+; GFX942-O0-NEXT:    scratch_load_dwordx2 v[0:1], off, s32 offset:184 ; 8-byte Folded Reload
+; GFX942-O0-NEXT:    scratch_load_dwordx2 v[2:3], off, s32 offset:192 ; 8-byte Folded Reload
+; GFX942-O0-NEXT:    scratch_load_dwordx2 v[8:9], off, s32 offset:200 ; 8-byte Folded Reload
+; GFX942-O0-NEXT:    scratch_load_dwordx2 v[10:11], off, s32 offset:208 ; 8-byte Folded Reload
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v17, a12 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v16, a13 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v13, a14 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v12, a15 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v15, a16 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v14, a17 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v19, a18 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v18, a19 ; Reload Reuse
+; GFX942-O0-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-O0-NEXT:    v_mov_b32_e32 v4, v10
+; GFX942-O0-NEXT:    v_lshrrev_b64 v[6:7], v4, v[18:19]
+; GFX942-O0-NEXT:    v_mov_b32_e32 v5, v7
+; GFX942-O0-NEXT:    s_mov_b32 s0, 64
+; GFX942-O0-NEXT:    v_sub_u32_e64 v20, s0, v4
+; GFX942-O0-NEXT:    v_lshlrev_b64 v[20:21], v20, v[14:15]
+; GFX942-O0-NEXT:    v_mov_b32_e32 v22, v21
+; GFX942-O0-NEXT:    v_or_b32_e64 v5, v5, v22
+; GFX942-O0-NEXT:    ; kill: def $vgpr6 killed $vgpr6 killed $vgpr6_vgpr7 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v7, v20
+; GFX942-O0-NEXT:    v_or_b32_e64 v6, v6, v7
+; GFX942-O0-NEXT:    ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v7, v5
+; GFX942-O0-NEXT:    v_mov_b32_e32 v22, v7
+; GFX942-O0-NEXT:    v_cmp_lt_u32_e64 s[2:3], v4, s0
+; GFX942-O0-NEXT:    v_sub_u32_e64 v5, v4, s0
+; GFX942-O0-NEXT:    v_lshrrev_b64 v[20:21], v5, v[14:15]
+; GFX942-O0-NEXT:    v_mov_b32_e32 v5, v21
+; GFX942-O0-NEXT:    v_cndmask_b32_e64 v5, v5, v22, s[2:3]
+; GFX942-O0-NEXT:    s_mov_b32 s0, 0
+; GFX942-O0-NEXT:    v_cmp_eq_u32_e64 s[0:1], v4, s0
+; GFX942-O0-NEXT:    v_mov_b32_e32 v22, v19
+; GFX942-O0-NEXT:    s_nop 0
+; GFX942-O0-NEXT:    v_cndmask_b32_e64 v5, v5, v22, s[0:1]
+; GFX942-O0-NEXT:    v_mov_b32_e32 v7, v6
+; GFX942-O0-NEXT:    v_mov_b32_e32 v6, v20
+; GFX942-O0-NEXT:    v_cndmask_b32_e64 v6, v6, v7, s[2:3]
+; GFX942-O0-NEXT:    v_mov_b32_e32 v7, v18
+; GFX942-O0-NEXT:    v_cndmask_b32_e64 v6, v6, v7, s[0:1]
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0
+; GFX942-O0-NEXT:    ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v7, v5
+; GFX942-O0-NEXT:    v_lshrrev_b64 v[4:5], v4, v[14:15]
+; GFX942-O0-NEXT:    v_mov_b32_e32 v15, v5
+; GFX942-O0-NEXT:    s_mov_b64 s[0:1], 0
+; GFX942-O0-NEXT:    s_mov_b32 s4, s1
+; GFX942-O0-NEXT:    v_mov_b32_e32 v14, s4
+; GFX942-O0-NEXT:    v_cndmask_b32_e64 v14, v14, v15, s[2:3]
+; GFX942-O0-NEXT:    v_mov_b32_e32 v5, v4
+; GFX942-O0-NEXT:    s_mov_b32 s4, s0
+; GFX942-O0-NEXT:    v_mov_b32_e32 v4, s4
+; GFX942-O0-NEXT:    v_cndmask_b32_e64 v4, v4, v5, s[2:3]
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr2
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr2
+; GFX942-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v5, v14
+; GFX942-O0-NEXT:    v_mov_b32_e32 v15, v12
+; GFX942-O0-NEXT:    v_mov_b32_e32 v12, v13
+; GFX942-O0-NEXT:    s_mov_b64 s[4:5], -1
+; GFX942-O0-NEXT:    s_mov_b32 s3, s4
+; GFX942-O0-NEXT:    s_mov_b32 s2, s5
+; GFX942-O0-NEXT:    v_mov_b32_e32 v14, v16
+; GFX942-O0-NEXT:    v_mov_b32_e32 v13, v17
+; GFX942-O0-NEXT:    v_mov_b32_e32 v16, s3
+; GFX942-O0-NEXT:    v_add_co_u32_e32 v16, vcc, v15, v16
+; GFX942-O0-NEXT:    v_mov_b32_e32 v15, s2
+; GFX942-O0-NEXT:    s_nop 0
+; GFX942-O0-NEXT:    v_addc_co_u32_e32 v12, vcc, v12, v15, vcc
+; GFX942-O0-NEXT:    v_mov_b32_e32 v15, s3
+; GFX942-O0-NEXT:    s_nop 0
+; GFX942-O0-NEXT:    v_addc_co_u32_e32 v18, vcc, v14, v15, vcc
+; GFX942-O0-NEXT:    v_mov_b32_e32 v14, s2
+; GFX942-O0-NEXT:    s_nop 0
+; GFX942-O0-NEXT:    v_addc_co_u32_e32 v13, vcc, v13, v14, vcc
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr2
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr2
+; GFX942-O0-NEXT:    ; kill: def $vgpr18 killed $vgpr18 def $vgpr18_vgpr19 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v19, v13
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr2
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr2
+; GFX942-O0-NEXT:    ; kill: def $vgpr16 killed $vgpr16 def $vgpr16_vgpr17 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v17, v12
+; GFX942-O0-NEXT:    v_mov_b64_e32 v[12:13], s[0:1]
+; GFX942-O0-NEXT:    v_mov_b64_e32 v[14:15], s[0:1]
+; GFX942-O0-NEXT:    scratch_store_dwordx2 off, v[18:19], s32 offset:168 ; 8-byte Folded Spill
+; GFX942-O0-NEXT:    scratch_store_dwordx2 off, v[16:17], s32 offset:176 ; 8-byte Folded Spill
+; GFX942-O0-NEXT:    v_writelane_b32 v29, s0, 10
+; GFX942-O0-NEXT:    s_nop 1
+; GFX942-O0-NEXT:    v_writelane_b32 v29, s1, 11
+; GFX942-O0-NEXT:    s_or_saveexec_b64 s[16:17], -1
+; GFX942-O0-NEXT:    v_accvgpr_write_b32 a28, v29 ; Reload Reuse
+; GFX942-O0-NEXT:    s_mov_b64 exec, s[16:17]
+; GFX942-O0-NEXT:    scratch_store_dwordx2 off, v[14:15], s32 offset:160 ; 8-byte Folded Spill
+; GFX942-O0-NEXT:    scratch_store_dwordx2 off, v[12:13], s32 offset:152 ; 8-byte Folded Spill
+; GFX942-O0-NEXT:    scratch_store_dwordx2 off, v[10:11], s32 offset:144 ; 8-byte Folded Spill
+; GFX942-O0-NEXT:    scratch_store_dwordx2 off, v[8:9], s32 offset:136 ; 8-byte Folded Spill
+; GFX942-O0-NEXT:    scratch_store_dwordx2 off, v[6:7], s32 offset:128 ; 8-byte Folded Spill
+; GFX942-O0-NEXT:    scratch_store_dwordx2 off, v[4:5], s32 offset:120 ; 8-byte Folded Spill
+; GFX942-O0-NEXT:    scratch_store_dwordx2 off, v[2:3], s32 offset:112 ; 8-byte Folded Spill
+; GFX942-O0-NEXT:    scratch_store_dwordx2 off, v[0:1], s32 offset:104 ; 8-byte Folded Spill
+; GFX942-O0-NEXT:    s_branch .LBB0_6
+; GFX942-O0-NEXT:  .LBB0_8: ; %udiv-bb1
+; GFX942-O0-NEXT:    s_or_saveexec_b64 s[16:17], -1
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v29, a28 ; Reload Reuse
+; GFX942-O0-NEXT:    s_mov_b64 exec, s[16:17]
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v7, a18 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v6, a19 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v11, a16 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v10, a17 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v5, a22 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v4, a23 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v1, a20 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v0, a21 ; Reload Reuse
+; GFX942-O0-NEXT:    s_mov_b64 s[2:3], 1
+; GFX942-O0-NEXT:    s_mov_b32 s1, s2
+; GFX942-O0-NEXT:    v_mov_b32_e32 v3, v0
+; GFX942-O0-NEXT:    s_mov_b32 s0, s3
+; GFX942-O0-NEXT:    s_mov_b64 s[2:3], 0
+; GFX942-O0-NEXT:    s_mov_b32 s4, s2
+; GFX942-O0-NEXT:    s_mov_b32 s5, s3
+; GFX942-O0-NEXT:    v_mov_b32_e32 v0, v4
+; GFX942-O0-NEXT:    v_mov_b32_e32 v2, v5
+; GFX942-O0-NEXT:    v_mov_b32_e32 v4, s1
+; GFX942-O0-NEXT:    v_add_co_u32_e32 v8, vcc, v3, v4
+; GFX942-O0-NEXT:    v_mov_b32_e32 v4, s0
+; GFX942-O0-NEXT:    s_nop 0
+; GFX942-O0-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v4, vcc
+; GFX942-O0-NEXT:    v_mov_b32_e32 v4, s4
+; GFX942-O0-NEXT:    s_nop 0
+; GFX942-O0-NEXT:    v_addc_co_u32_e32 v0, vcc, v0, v4, vcc
+; GFX942-O0-NEXT:    v_mov_b32_e32 v4, s5
+; GFX942-O0-NEXT:    s_nop 0
+; GFX942-O0-NEXT:    v_addc_co_u32_e32 v2, vcc, v2, v4, vcc
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0
+; GFX942-O0-NEXT:    ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v9, v1
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0
+; GFX942-O0-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v1, v2
+; GFX942-O0-NEXT:    v_mov_b64_e32 v[4:5], v[0:1]
+; GFX942-O0-NEXT:    scratch_store_dwordx2 off, v[4:5], s32 offset:200 ; 8-byte Folded Spill
+; GFX942-O0-NEXT:    v_mov_b64_e32 v[4:5], v[8:9]
+; GFX942-O0-NEXT:    scratch_store_dwordx2 off, v[4:5], s32 offset:208 ; 8-byte Folded Spill
+; GFX942-O0-NEXT:    s_mov_b32 s0, 0x7f
+; GFX942-O0-NEXT:    v_sub_u32_e64 v2, s0, v3
+; GFX942-O0-NEXT:    v_lshlrev_b64 v[4:5], v2, v[10:11]
+; GFX942-O0-NEXT:    v_mov_b32_e32 v12, v5
+; GFX942-O0-NEXT:    s_mov_b32 s0, 64
+; GFX942-O0-NEXT:    v_sub_u32_e64 v13, s0, v2
+; GFX942-O0-NEXT:    v_lshrrev_b64 v[14:15], v13, v[6:7]
+; GFX942-O0-NEXT:    v_mov_b32_e32 v13, v15
+; GFX942-O0-NEXT:    v_or_b32_e64 v12, v12, v13
+; GFX942-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v5, v14
+; GFX942-O0-NEXT:    v_or_b32_e64 v4, v4, v5
+; GFX942-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v5, v12
+; GFX942-O0-NEXT:    v_mov_b32_e32 v14, v5
+; GFX942-O0-NEXT:    v_cmp_lt_u32_e64 s[0:1], v2, s0
+; GFX942-O0-NEXT:    s_mov_b32 s6, 63
+; GFX942-O0-NEXT:    v_sub_u32_e64 v3, s6, v3
+; GFX942-O0-NEXT:    v_lshlrev_b64 v[12:13], v3, v[6:7]
+; GFX942-O0-NEXT:    v_mov_b32_e32 v3, v13
+; GFX942-O0-NEXT:    v_cndmask_b32_e64 v3, v3, v14, s[0:1]
+; GFX942-O0-NEXT:    s_mov_b32 s6, 0
+; GFX942-O0-NEXT:    v_cmp_eq_u32_e64 s[6:7], v2, s6
+; GFX942-O0-NEXT:    v_mov_b32_e32 v14, v11
+; GFX942-O0-NEXT:    s_nop 0
+; GFX942-O0-NEXT:    v_cndmask_b32_e64 v3, v3, v14, s[6:7]
+; GFX942-O0-NEXT:    v_mov_b32_e32 v5, v4
+; GFX942-O0-NEXT:    v_mov_b32_e32 v4, v12
+; GFX942-O0-NEXT:    v_cndmask_b32_e64 v4, v4, v5, s[0:1]
+; GFX942-O0-NEXT:    v_mov_b32_e32 v5, v10
+; GFX942-O0-NEXT:    v_cndmask_b32_e64 v4, v4, v5, s[6:7]
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr6
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr6
+; GFX942-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v5, v3
+; GFX942-O0-NEXT:    v_lshlrev_b64 v[6:7], v2, v[6:7]
+; GFX942-O0-NEXT:    v_mov_b32_e32 v3, v7
+; GFX942-O0-NEXT:    v_mov_b32_e32 v2, s5
+; GFX942-O0-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[0:1]
+; GFX942-O0-NEXT:    ; kill: def $vgpr6 killed $vgpr6 killed $vgpr6_vgpr7 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v3, s4
+; GFX942-O0-NEXT:    v_cndmask_b32_e64 v6, v3, v6, s[0:1]
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0
+; GFX942-O0-NEXT:    ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v7, v2
+; GFX942-O0-NEXT:    scratch_store_dwordx2 off, v[6:7], s32 offset:192 ; 8-byte Folded Spill
+; GFX942-O0-NEXT:    scratch_store_dwordx2 off, v[4:5], s32 offset:184 ; 8-byte Folded Spill
+; GFX942-O0-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-O0-NEXT:    v_mov_b32_e32 v2, v9
+; GFX942-O0-NEXT:    v_or_b32_e64 v2, v2, v3
+; GFX942-O0-NEXT:    v_mov_b32_e32 v1, v0
+; GFX942-O0-NEXT:    v_mov_b32_e32 v0, v8
+; GFX942-O0-NEXT:    v_or_b32_e64 v0, v0, v1
+; GFX942-O0-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v1, v2
+; GFX942-O0-NEXT:    v_cmp_ne_u64_e64 s[0:1], v[0:1], s[2:3]
+; GFX942-O0-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX942-O0-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
+; GFX942-O0-NEXT:    scratch_store_dwordx2 off, v[6:7], s32 offset:24 ; 8-byte Folded Spill
+; GFX942-O0-NEXT:    scratch_store_dwordx2 off, v[4:5], s32 offset:16 ; 8-byte Folded Spill
+; GFX942-O0-NEXT:    scratch_store_dwordx2 off, v[2:3], s32 offset:8 ; 8-byte Folded Spill
+; GFX942-O0-NEXT:    scratch_store_dwordx2 off, v[0:1], s32 ; 8-byte Folded Spill
+; GFX942-O0-NEXT:    s_mov_b64 s[2:3], exec
+; GFX942-O0-NEXT:    s_and_b64 s[0:1], s[2:3], s[0:1]
+; GFX942-O0-NEXT:    s_xor_b64 s[2:3], s[0:1], s[2:3]
+; GFX942-O0-NEXT:    v_writelane_b32 v29, s2, 8
+; GFX942-O0-NEXT:    s_nop 1
+; GFX942-O0-NEXT:    v_writelane_b32 v29, s3, 9
+; GFX942-O0-NEXT:    s_or_saveexec_b64 s[16:17], -1
+; GFX942-O0-NEXT:    v_accvgpr_write_b32 a28, v29 ; Reload Reuse
+; GFX942-O0-NEXT:    s_mov_b64 exec, s[16:17]
+; GFX942-O0-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX942-O0-NEXT:    s_cbranch_execz .LBB0_5
+; GFX942-O0-NEXT:    s_branch .LBB0_7
+; GFX942-O0-NEXT:  .LBB0_9: ; %udiv-end
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v7, a0 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v6, a1 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v9, a2 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v8, a3 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v13, a4 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v12, a5 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v15, a6 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v14, a7 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v19, a10 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v18, a11 ; Reload Reuse
+; GFX942-O0-NEXT:    scratch_load_dwordx2 v[16:17], off, s32 offset:64 ; 8-byte Folded Reload
+; GFX942-O0-NEXT:    scratch_load_dwordx2 v[4:5], off, s32 offset:56 ; 8-byte Folded Reload
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v11, a8 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v10, a9 ; Reload Reuse
+; GFX942-O0-NEXT:    s_mov_b32 s0, 32
+; GFX942-O0-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-O0-NEXT:    v_lshrrev_b64 v[0:1], s0, v[4:5]
+; GFX942-O0-NEXT:    v_mov_b32_e32 v1, v0
+; GFX942-O0-NEXT:    v_mov_b32_e32 v0, v10
+; GFX942-O0-NEXT:    v_mul_lo_u32 v3, v1, v0
+; GFX942-O0-NEXT:    v_lshrrev_b64 v[10:11], s0, v[10:11]
+; GFX942-O0-NEXT:    v_mov_b32_e32 v2, v10
+; GFX942-O0-NEXT:    v_mov_b32_e32 v10, v4
+; GFX942-O0-NEXT:    v_mul_lo_u32 v2, v10, v2
+; GFX942-O0-NEXT:    v_mad_u64_u32 v[4:5], s[2:3], v10, v0, 0
+; GFX942-O0-NEXT:    v_mov_b32_e32 v0, v5
+; GFX942-O0-NEXT:    v_add3_u32 v2, v0, v2, v3
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr1
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr2
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr2
+; GFX942-O0-NEXT:    v_mov_b32_e32 v0, s1
+; GFX942-O0-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v3, v0
+; GFX942-O0-NEXT:    v_lshlrev_b64 v[2:3], s0, v[2:3]
+; GFX942-O0-NEXT:    v_mov_b32_e32 v11, v3
+; GFX942-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec
+; GFX942-O0-NEXT:    s_mov_b32 s1, 0
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr2
+; GFX942-O0-NEXT:    v_mov_b32_e32 v0, s1
+; GFX942-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v5, v0
+; GFX942-O0-NEXT:    v_mov_b32_e32 v0, v5
+; GFX942-O0-NEXT:    v_or_b32_e64 v0, v0, v11
+; GFX942-O0-NEXT:    v_mov_b32_e32 v3, v2
+; GFX942-O0-NEXT:    v_mov_b32_e32 v2, v4
+; GFX942-O0-NEXT:    v_or_b32_e64 v4, v2, v3
+; GFX942-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v5, v0
+; GFX942-O0-NEXT:    v_lshrrev_b64 v[2:3], s0, v[18:19]
+; GFX942-O0-NEXT:    v_mov_b32_e32 v11, v2
+; GFX942-O0-NEXT:    v_mov_b32_e32 v2, v16
+; GFX942-O0-NEXT:    v_mul_lo_u32 v3, v2, v11
+; GFX942-O0-NEXT:    v_lshrrev_b64 v[16:17], s0, v[16:17]
+; GFX942-O0-NEXT:    ; kill: def $vgpr16 killed $vgpr16 killed $vgpr16_vgpr17 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v0, v18
+; GFX942-O0-NEXT:    v_mul_lo_u32 v16, v16, v0
+; GFX942-O0-NEXT:    v_mad_u64_u32 v[18:19], s[2:3], v2, v0, 0
+; GFX942-O0-NEXT:    v_mov_b32_e32 v2, v19
+; GFX942-O0-NEXT:    v_add3_u32 v2, v2, v3, v16
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr2
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr3
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr3
+; GFX942-O0-NEXT:    v_mov_b32_e32 v16, s2
+; GFX942-O0-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v3, v16
+; GFX942-O0-NEXT:    v_lshlrev_b64 v[2:3], s0, v[2:3]
+; GFX942-O0-NEXT:    v_mov_b32_e32 v17, v3
+; GFX942-O0-NEXT:    ; kill: def $vgpr18 killed $vgpr18 killed $vgpr18_vgpr19 killed $exec
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr2
+; GFX942-O0-NEXT:    v_mov_b32_e32 v16, s1
+; GFX942-O0-NEXT:    ; kill: def $vgpr18 killed $vgpr18 def $vgpr18_vgpr19 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v19, v16
+; GFX942-O0-NEXT:    v_mov_b32_e32 v16, v19
+; GFX942-O0-NEXT:    v_or_b32_e64 v16, v16, v17
+; GFX942-O0-NEXT:    v_mov_b32_e32 v3, v2
+; GFX942-O0-NEXT:    v_mov_b32_e32 v2, v18
+; GFX942-O0-NEXT:    v_or_b32_e64 v2, v2, v3
+; GFX942-O0-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v3, v16
+; GFX942-O0-NEXT:    v_lshl_add_u64 v[4:5], v[2:3], 0, v[4:5]
+; GFX942-O0-NEXT:    v_mad_u64_u32 v[18:19], s[2:3], v11, v1, 0
+; GFX942-O0-NEXT:    v_mov_b32_e32 v2, v18
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr2
+; GFX942-O0-NEXT:    v_mov_b32_e32 v16, s1
+; GFX942-O0-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v3, v16
+; GFX942-O0-NEXT:    v_mov_b32_e32 v16, v3
+; GFX942-O0-NEXT:    v_mov_b32_e32 v18, v19
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr2
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr3
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr3
+; GFX942-O0-NEXT:    v_mov_b32_e32 v17, s2
+; GFX942-O0-NEXT:    ; kill: def $vgpr18 killed $vgpr18 def $vgpr18_vgpr19 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v19, v17
+; GFX942-O0-NEXT:    v_lshlrev_b64 v[18:19], s0, v[18:19]
+; GFX942-O0-NEXT:    v_mov_b32_e32 v17, v19
+; GFX942-O0-NEXT:    v_or_b32_e64 v16, v16, v17
+; GFX942-O0-NEXT:    ; kill: def $vgpr2 killed $vgpr2 killed $vgpr2_vgpr3 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v3, v18
+; GFX942-O0-NEXT:    v_or_b32_e64 v2, v2, v3
+; GFX942-O0-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v3, v16
+; GFX942-O0-NEXT:    v_mad_u64_u32 v[18:19], s[2:3], v11, v10, 0
+; GFX942-O0-NEXT:    v_mov_b32_e32 v16, v18
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr2
+; GFX942-O0-NEXT:    v_mov_b32_e32 v11, s1
+; GFX942-O0-NEXT:    ; kill: def $vgpr16 killed $vgpr16 def $vgpr16_vgpr17 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v17, v11
+; GFX942-O0-NEXT:    v_mov_b32_e32 v11, v17
+; GFX942-O0-NEXT:    v_mov_b32_e32 v18, v19
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr2
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr3
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr3
+; GFX942-O0-NEXT:    v_mov_b32_e32 v20, s2
+; GFX942-O0-NEXT:    ; kill: def $vgpr18 killed $vgpr18 def $vgpr18_vgpr19 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v19, v20
+; GFX942-O0-NEXT:    v_lshlrev_b64 v[18:19], s0, v[18:19]
+; GFX942-O0-NEXT:    v_mov_b32_e32 v20, v19
+; GFX942-O0-NEXT:    v_or_b32_e64 v11, v11, v20
+; GFX942-O0-NEXT:    ; kill: def $vgpr16 killed $vgpr16 killed $vgpr16_vgpr17 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v17, v18
+; GFX942-O0-NEXT:    v_or_b32_e64 v16, v16, v17
+; GFX942-O0-NEXT:    ; kill: def $vgpr16 killed $vgpr16 def $vgpr16_vgpr17 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v17, v11
+; GFX942-O0-NEXT:    v_mad_u64_u32 v[10:11], s[2:3], v0, v10, 0
+; GFX942-O0-NEXT:    v_mov_b32_e32 v18, v11
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr2
+; GFX942-O0-NEXT:    v_mov_b32_e32 v20, s1
+; GFX942-O0-NEXT:    ; kill: def $vgpr18 killed $vgpr18 def $vgpr18_vgpr19 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v19, v20
+; GFX942-O0-NEXT:    v_lshl_add_u64 v[16:17], v[16:17], 0, v[18:19]
+; GFX942-O0-NEXT:    v_mov_b32_e32 v18, v17
+; GFX942-O0-NEXT:    s_mov_b64 s[2:3], 0xffffffff
+; GFX942-O0-NEXT:    s_mov_b32 s4, s3
+; GFX942-O0-NEXT:    v_and_b32_e64 v20, v18, s4
+; GFX942-O0-NEXT:    v_mov_b32_e32 v18, v16
+; GFX942-O0-NEXT:    ; kill: def $sgpr2 killed $sgpr2 killed $sgpr2_sgpr3
+; GFX942-O0-NEXT:    v_and_b32_e64 v18, v18, s2
+; GFX942-O0-NEXT:    ; kill: def $vgpr18 killed $vgpr18 def $vgpr18_vgpr19 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v19, v20
+; GFX942-O0-NEXT:    v_mad_u64_u32 v[22:23], s[2:3], v0, v1, 0
+; GFX942-O0-NEXT:    v_mov_b32_e32 v0, v22
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr2
+; GFX942-O0-NEXT:    v_mov_b32_e32 v20, s1
+; GFX942-O0-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v1, v20
+; GFX942-O0-NEXT:    v_mov_b32_e32 v20, v1
+; GFX942-O0-NEXT:    v_mov_b32_e32 v22, v23
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr2
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr3
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr3
+; GFX942-O0-NEXT:    v_mov_b32_e32 v21, s2
+; GFX942-O0-NEXT:    ; kill: def $vgpr22 killed $vgpr22 def $vgpr22_vgpr23 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v23, v21
+; GFX942-O0-NEXT:    v_lshlrev_b64 v[22:23], s0, v[22:23]
+; GFX942-O0-NEXT:    v_mov_b32_e32 v21, v23
+; GFX942-O0-NEXT:    v_or_b32_e64 v20, v20, v21
+; GFX942-O0-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v1, v22
+; GFX942-O0-NEXT:    v_or_b32_e64 v0, v0, v1
+; GFX942-O0-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v1, v20
+; GFX942-O0-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, v[18:19]
+; GFX942-O0-NEXT:    v_lshrrev_b64 v[18:19], s0, v[0:1]
+; GFX942-O0-NEXT:    v_lshrrev_b64 v[16:17], s0, v[16:17]
+; GFX942-O0-NEXT:    v_lshl_add_u64 v[16:17], v[16:17], 0, v[18:19]
+; GFX942-O0-NEXT:    v_lshl_add_u64 v[2:3], v[2:3], 0, v[16:17]
+; GFX942-O0-NEXT:    v_lshl_add_u64 v[2:3], v[2:3], 0, v[4:5]
+; GFX942-O0-NEXT:    v_lshlrev_b64 v[0:1], s0, v[0:1]
+; GFX942-O0-NEXT:    v_mov_b32_e32 v5, v1
+; GFX942-O0-NEXT:    ; kill: def $vgpr10 killed $vgpr10 killed $vgpr10_vgpr11 killed $exec
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr2
+; GFX942-O0-NEXT:    v_mov_b32_e32 v4, s1
+; GFX942-O0-NEXT:    ; kill: def $vgpr10 killed $vgpr10 def $vgpr10_vgpr11 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v11, v4
+; GFX942-O0-NEXT:    v_mov_b32_e32 v4, v11
+; GFX942-O0-NEXT:    v_or_b32_e64 v4, v4, v5
+; GFX942-O0-NEXT:    v_mov_b32_e32 v1, v0
+; GFX942-O0-NEXT:    v_mov_b32_e32 v0, v10
+; GFX942-O0-NEXT:    v_or_b32_e64 v0, v0, v1
+; GFX942-O0-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v1, v4
+; GFX942-O0-NEXT:    v_mov_b32_e32 v5, v2
+; GFX942-O0-NEXT:    v_mov_b32_e32 v2, v3
+; GFX942-O0-NEXT:    v_mov_b32_e32 v11, v0
+; GFX942-O0-NEXT:    v_mov_b32_e32 v10, v1
+; GFX942-O0-NEXT:    v_mov_b32_e32 v0, v14
+; GFX942-O0-NEXT:    v_mov_b32_e32 v3, v15
+; GFX942-O0-NEXT:    v_mov_b32_e32 v4, v12
+; GFX942-O0-NEXT:    v_mov_b32_e32 v1, v13
+; GFX942-O0-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v11
+; GFX942-O0-NEXT:    s_nop 1
+; GFX942-O0-NEXT:    v_subb_co_u32_e32 v3, vcc, v3, v10, vcc
+; GFX942-O0-NEXT:    s_nop 1
+; GFX942-O0-NEXT:    v_subb_co_u32_e32 v4, vcc, v4, v5, vcc
+; GFX942-O0-NEXT:    s_nop 1
+; GFX942-O0-NEXT:    v_subb_co_u32_e32 v2, vcc, v1, v2, vcc
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr1
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr1
+; GFX942-O0-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v1, v3
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr1
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr1
+; GFX942-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v5, v2
+; GFX942-O0-NEXT:    v_mov_b32_e32 v3, v5
+; GFX942-O0-NEXT:    v_mov_b32_e32 v2, v9
+; GFX942-O0-NEXT:    v_xor_b32_e64 v3, v3, v2
+; GFX942-O0-NEXT:    v_mov_b32_e32 v5, v4
+; GFX942-O0-NEXT:    v_mov_b32_e32 v4, v8
+; GFX942-O0-NEXT:    v_xor_b32_e64 v8, v5, v4
+; GFX942-O0-NEXT:    ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v9, v3
+; GFX942-O0-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-O0-NEXT:    v_mov_b32_e32 v5, v7
+; GFX942-O0-NEXT:    v_xor_b32_e64 v3, v3, v5
+; GFX942-O0-NEXT:    ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v7, v6
+; GFX942-O0-NEXT:    v_xor_b32_e64 v0, v0, v7
+; GFX942-O0-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v1, v3
+; GFX942-O0-NEXT:    v_mov_b32_e32 v6, v0
+; GFX942-O0-NEXT:    v_mov_b32_e32 v0, v1
+; GFX942-O0-NEXT:    v_mov_b32_e32 v3, v8
+; GFX942-O0-NEXT:    v_mov_b32_e32 v1, v9
+; GFX942-O0-NEXT:    v_sub_co_u32_e32 v6, vcc, v6, v7
+; GFX942-O0-NEXT:    s_nop 1
+; GFX942-O0-NEXT:    v_subb_co_u32_e32 v0, vcc, v0, v5, vcc
+; GFX942-O0-NEXT:    s_nop 1
+; GFX942-O0-NEXT:    v_subb_co_u32_e32 v4, vcc, v3, v4, vcc
+; GFX942-O0-NEXT:    s_nop 1
+; GFX942-O0-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v2, vcc
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr1
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr1
+; GFX942-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v5, v1
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr1
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr1
+; GFX942-O0-NEXT:    ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v7, v0
+; GFX942-O0-NEXT:    v_mov_b32_e32 v0, v6
+; GFX942-O0-NEXT:    v_mov_b32_e32 v2, v4
+; GFX942-O0-NEXT:    v_lshrrev_b64 v[6:7], s0, v[6:7]
+; GFX942-O0-NEXT:    v_mov_b32_e32 v1, v6
+; GFX942-O0-NEXT:    v_lshrrev_b64 v[4:5], s0, v[4:5]
+; GFX942-O0-NEXT:    v_mov_b32_e32 v3, v4
+; GFX942-O0-NEXT:    s_xor_saveexec_b64 s[0:1], -1
+; GFX942-O0-NEXT:    scratch_load_dword v29, off, s32 offset:216 ; 4-byte Folded Reload
+; GFX942-O0-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX942-O0-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-O0-NEXT:    s_setpc_b64 s[30:31]
   %div = srem i128 %lhs, %rhs
   ret i128 %div
 }
@@ -2837,6 +4190,1185 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) {
 ; GFX9-O0-NEXT:    s_mov_b64 exec, s[4:5]
 ; GFX9-O0-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-O0-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: v_urem_i128_vv:
+; GFX942:       ; %bb.0: ; %_udiv-special-cases
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_or_b32_e32 v9, v5, v7
+; GFX942-NEXT:    v_or_b32_e32 v8, v4, v6
+; GFX942-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[8:9]
+; GFX942-NEXT:    v_or_b32_e32 v9, v1, v3
+; GFX942-NEXT:    v_or_b32_e32 v8, v0, v2
+; GFX942-NEXT:    v_cmp_eq_u64_e64 s[0:1], 0, v[8:9]
+; GFX942-NEXT:    v_ffbh_u32_e32 v8, v6
+; GFX942-NEXT:    v_add_u32_e32 v8, 32, v8
+; GFX942-NEXT:    v_ffbh_u32_e32 v9, v7
+; GFX942-NEXT:    v_min_u32_e32 v12, v8, v9
+; GFX942-NEXT:    v_ffbh_u32_e32 v8, v4
+; GFX942-NEXT:    v_add_u32_e32 v8, 32, v8
+; GFX942-NEXT:    v_ffbh_u32_e32 v10, v5
+; GFX942-NEXT:    v_mov_b32_e32 v9, 0
+; GFX942-NEXT:    v_min_u32_e32 v8, v8, v10
+; GFX942-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT:    v_lshl_add_u64 v[10:11], v[8:9], 0, 64
+; GFX942-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[6:7]
+; GFX942-NEXT:    v_ffbh_u32_e32 v8, v2
+; GFX942-NEXT:    v_add_u32_e32 v8, 32, v8
+; GFX942-NEXT:    v_cndmask_b32_e32 v12, v10, v12, vcc
+; GFX942-NEXT:    v_ffbh_u32_e32 v10, v3
+; GFX942-NEXT:    v_min_u32_e32 v14, v8, v10
+; GFX942-NEXT:    v_ffbh_u32_e32 v8, v0
+; GFX942-NEXT:    v_add_u32_e32 v8, 32, v8
+; GFX942-NEXT:    v_ffbh_u32_e32 v10, v1
+; GFX942-NEXT:    v_min_u32_e32 v8, v8, v10
+; GFX942-NEXT:    v_cndmask_b32_e64 v13, v11, 0, vcc
+; GFX942-NEXT:    v_lshl_add_u64 v[10:11], v[8:9], 0, 64
+; GFX942-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[2:3]
+; GFX942-NEXT:    s_mov_b64 s[2:3], 0x7f
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_cndmask_b32_e32 v10, v10, v14, vcc
+; GFX942-NEXT:    v_cndmask_b32_e64 v8, v11, 0, vcc
+; GFX942-NEXT:    v_sub_co_u32_e32 v12, vcc, v12, v10
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_subb_co_u32_e32 v13, vcc, v13, v8, vcc
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_subb_co_u32_e32 v10, vcc, 0, v9, vcc
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_subb_co_u32_e32 v11, vcc, 0, v9, vcc
+; GFX942-NEXT:    v_cmp_lt_u64_e32 vcc, s[2:3], v[12:13]
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e64 v8, 0, 1, vcc
+; GFX942-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[10:11]
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e64 v9, 0, 1, vcc
+; GFX942-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[10:11]
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v8, v9, v8, vcc
+; GFX942-NEXT:    v_and_b32_e32 v8, 1, v8
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v8
+; GFX942-NEXT:    v_xor_b32_e32 v8, 0x7f, v12
+; GFX942-NEXT:    s_or_b64 s[0:1], s[0:1], vcc
+; GFX942-NEXT:    v_or_b32_e32 v9, v13, v11
+; GFX942-NEXT:    v_or_b32_e32 v8, v8, v10
+; GFX942-NEXT:    s_xor_b64 s[2:3], s[0:1], -1
+; GFX942-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[8:9]
+; GFX942-NEXT:    v_cndmask_b32_e64 v15, v3, 0, s[0:1]
+; GFX942-NEXT:    v_cndmask_b32_e64 v14, v2, 0, s[0:1]
+; GFX942-NEXT:    v_cndmask_b32_e64 v9, v1, 0, s[0:1]
+; GFX942-NEXT:    v_cndmask_b32_e64 v8, v0, 0, s[0:1]
+; GFX942-NEXT:    s_and_b64 s[0:1], s[2:3], vcc
+; GFX942-NEXT:    s_and_saveexec_b64 s[4:5], s[0:1]
+; GFX942-NEXT:    s_cbranch_execz .LBB1_6
+; GFX942-NEXT:  ; %bb.1: ; %udiv-bb1
+; GFX942-NEXT:    v_add_co_u32_e32 v8, vcc, 1, v12
+; GFX942-NEXT:    v_sub_u32_e32 v18, 0x7f, v12
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_addc_co_u32_e32 v9, vcc, 0, v13, vcc
+; GFX942-NEXT:    v_sub_u32_e32 v13, 64, v18
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_addc_co_u32_e32 v10, vcc, 0, v10, vcc
+; GFX942-NEXT:    v_or_b32_e32 v14, v8, v10
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_addc_co_u32_e32 v11, vcc, 0, v11, vcc
+; GFX942-NEXT:    v_or_b32_e32 v15, v9, v11
+; GFX942-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[14:15]
+; GFX942-NEXT:    v_lshlrev_b64 v[14:15], v18, v[2:3]
+; GFX942-NEXT:    v_lshrrev_b64 v[16:17], v13, v[0:1]
+; GFX942-NEXT:    v_sub_u32_e32 v12, 63, v12
+; GFX942-NEXT:    v_or_b32_e32 v15, v15, v17
+; GFX942-NEXT:    v_or_b32_e32 v14, v14, v16
+; GFX942-NEXT:    v_lshlrev_b64 v[12:13], v12, v[0:1]
+; GFX942-NEXT:    v_cmp_gt_u32_e64 s[0:1], 64, v18
+; GFX942-NEXT:    v_cmp_eq_u32_e64 s[2:3], 0, v18
+; GFX942-NEXT:    v_mov_b64_e32 v[16:17], 0
+; GFX942-NEXT:    v_cndmask_b32_e64 v13, v13, v15, s[0:1]
+; GFX942-NEXT:    v_cndmask_b32_e64 v12, v12, v14, s[0:1]
+; GFX942-NEXT:    v_lshlrev_b64 v[14:15], v18, v[0:1]
+; GFX942-NEXT:    v_cndmask_b32_e64 v13, v13, v3, s[2:3]
+; GFX942-NEXT:    v_cndmask_b32_e64 v12, v12, v2, s[2:3]
+; GFX942-NEXT:    v_cndmask_b32_e64 v15, 0, v15, s[0:1]
+; GFX942-NEXT:    v_cndmask_b32_e64 v14, 0, v14, s[0:1]
+; GFX942-NEXT:    v_mov_b64_e32 v[18:19], 0
+; GFX942-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942-NEXT:    s_xor_b64 s[2:3], exec, s[0:1]
+; GFX942-NEXT:    s_cbranch_execz .LBB1_5
+; GFX942-NEXT:  ; %bb.2: ; %udiv-preheader
+; GFX942-NEXT:    v_sub_u32_e32 v18, 64, v8
+; GFX942-NEXT:    v_lshrrev_b64 v[16:17], v8, v[0:1]
+; GFX942-NEXT:    v_lshlrev_b64 v[18:19], v18, v[2:3]
+; GFX942-NEXT:    v_or_b32_e32 v18, v16, v18
+; GFX942-NEXT:    v_subrev_u32_e32 v16, 64, v8
+; GFX942-NEXT:    v_or_b32_e32 v19, v17, v19
+; GFX942-NEXT:    v_lshrrev_b64 v[16:17], v16, v[2:3]
+; GFX942-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v8
+; GFX942-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v8
+; GFX942-NEXT:    v_mov_b64_e32 v[28:29], 0
+; GFX942-NEXT:    v_cndmask_b32_e32 v17, v17, v19, vcc
+; GFX942-NEXT:    v_cndmask_b32_e32 v16, v16, v18, vcc
+; GFX942-NEXT:    v_cndmask_b32_e64 v25, v17, v1, s[0:1]
+; GFX942-NEXT:    v_cndmask_b32_e64 v24, v16, v0, s[0:1]
+; GFX942-NEXT:    v_lshrrev_b64 v[16:17], v8, v[2:3]
+; GFX942-NEXT:    v_cndmask_b32_e32 v27, 0, v17, vcc
+; GFX942-NEXT:    v_cndmask_b32_e32 v26, 0, v16, vcc
+; GFX942-NEXT:    v_add_co_u32_e32 v20, vcc, -1, v4
+; GFX942-NEXT:    v_mov_b64_e32 v[18:19], 0
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_addc_co_u32_e32 v21, vcc, -1, v5, vcc
+; GFX942-NEXT:    s_mov_b64 s[0:1], 0
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_addc_co_u32_e32 v22, vcc, -1, v6, vcc
+; GFX942-NEXT:    v_mov_b32_e32 v17, 0
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_addc_co_u32_e32 v23, vcc, -1, v7, vcc
+; GFX942-NEXT:  .LBB1_3: ; %udiv-do-while
+; GFX942-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-NEXT:    v_lshrrev_b32_e32 v16, 31, v25
+; GFX942-NEXT:    v_lshlrev_b64 v[24:25], 1, v[24:25]
+; GFX942-NEXT:    v_lshrrev_b32_e32 v32, 31, v13
+; GFX942-NEXT:    v_lshlrev_b64 v[26:27], 1, v[26:27]
+; GFX942-NEXT:    v_or_b32_e32 v24, v24, v32
+; GFX942-NEXT:    v_or_b32_e32 v26, v26, v16
+; GFX942-NEXT:    v_sub_co_u32_e32 v16, vcc, v20, v24
+; GFX942-NEXT:    v_lshlrev_b64 v[30:31], 1, v[14:15]
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_subb_co_u32_e32 v16, vcc, v21, v25, vcc
+; GFX942-NEXT:    v_lshrrev_b32_e32 v14, 31, v15
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_subb_co_u32_e32 v16, vcc, v22, v26, vcc
+; GFX942-NEXT:    v_lshlrev_b64 v[12:13], 1, v[12:13]
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_subb_co_u32_e32 v16, vcc, v23, v27, vcc
+; GFX942-NEXT:    v_or3_b32 v12, v12, v14, v18
+; GFX942-NEXT:    v_or_b32_e32 v14, v28, v30
+; GFX942-NEXT:    v_ashrrev_i32_e32 v28, 31, v16
+; GFX942-NEXT:    v_or_b32_e32 v15, v29, v31
+; GFX942-NEXT:    v_and_b32_e32 v16, 1, v28
+; GFX942-NEXT:    v_and_b32_e32 v29, v28, v7
+; GFX942-NEXT:    v_and_b32_e32 v30, v28, v6
+; GFX942-NEXT:    v_and_b32_e32 v31, v28, v5
+; GFX942-NEXT:    v_and_b32_e32 v28, v28, v4
+; GFX942-NEXT:    v_sub_co_u32_e32 v24, vcc, v24, v28
+; GFX942-NEXT:    v_or3_b32 v13, v13, 0, v19
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_subb_co_u32_e32 v25, vcc, v25, v31, vcc
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_subb_co_u32_e32 v26, vcc, v26, v30, vcc
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_subb_co_u32_e32 v27, vcc, v27, v29, vcc
+; GFX942-NEXT:    v_add_co_u32_e32 v8, vcc, -1, v8
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_addc_co_u32_e32 v9, vcc, -1, v9, vcc
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_addc_co_u32_e32 v10, vcc, -1, v10, vcc
+; GFX942-NEXT:    v_or_b32_e32 v28, v8, v10
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_addc_co_u32_e32 v11, vcc, -1, v11, vcc
+; GFX942-NEXT:    v_or_b32_e32 v29, v9, v11
+; GFX942-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[28:29]
+; GFX942-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT:    v_mov_b64_e32 v[28:29], v[16:17]
+; GFX942-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; GFX942-NEXT:    s_cbranch_execnz .LBB1_3
+; GFX942-NEXT:  ; %bb.4: ; %Flow
+; GFX942-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX942-NEXT:  .LBB1_5: ; %Flow2
+; GFX942-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX942-NEXT:    v_lshlrev_b64 v[8:9], 1, v[14:15]
+; GFX942-NEXT:    v_lshrrev_b32_e32 v14, 31, v15
+; GFX942-NEXT:    v_lshlrev_b64 v[10:11], 1, v[12:13]
+; GFX942-NEXT:    v_or3_b32 v15, v11, 0, v19
+; GFX942-NEXT:    v_or3_b32 v14, v10, v14, v18
+; GFX942-NEXT:    v_or_b32_e32 v9, v17, v9
+; GFX942-NEXT:    v_or_b32_e32 v8, v16, v8
+; GFX942-NEXT:  .LBB1_6: ; %Flow3
+; GFX942-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX942-NEXT:    v_mul_lo_u32 v10, v9, v6
+; GFX942-NEXT:    v_mul_lo_u32 v11, v8, v7
+; GFX942-NEXT:    v_mad_u64_u32 v[6:7], s[0:1], v8, v6, 0
+; GFX942-NEXT:    v_add3_u32 v7, v7, v11, v10
+; GFX942-NEXT:    v_mad_u64_u32 v[6:7], s[0:1], v14, v4, v[6:7]
+; GFX942-NEXT:    v_mul_lo_u32 v10, v14, v5
+; GFX942-NEXT:    v_mul_lo_u32 v11, v15, v4
+; GFX942-NEXT:    v_add3_u32 v7, v11, v7, v10
+; GFX942-NEXT:    v_mad_u64_u32 v[10:11], s[0:1], v4, v8, 0
+; GFX942-NEXT:    v_mov_b32_e32 v13, 0
+; GFX942-NEXT:    v_mov_b32_e32 v12, v11
+; GFX942-NEXT:    v_mad_u64_u32 v[14:15], s[0:1], v5, v8, v[12:13]
+; GFX942-NEXT:    v_mov_b32_e32 v12, v15
+; GFX942-NEXT:    v_mov_b32_e32 v15, v13
+; GFX942-NEXT:    v_mad_u64_u32 v[14:15], s[0:1], v4, v9, v[14:15]
+; GFX942-NEXT:    v_mov_b32_e32 v16, v15
+; GFX942-NEXT:    v_mov_b32_e32 v17, v13
+; GFX942-NEXT:    v_lshl_add_u64 v[12:13], v[12:13], 0, v[16:17]
+; GFX942-NEXT:    v_mad_u64_u32 v[4:5], s[0:1], v5, v9, v[12:13]
+; GFX942-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v10
+; GFX942-NEXT:    v_lshl_add_u64 v[4:5], v[4:5], 0, v[6:7]
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v14, vcc
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_subb_co_u32_e32 v2, vcc, v2, v4, vcc
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_subb_co_u32_e32 v3, vcc, v3, v5, vcc
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-O0-LABEL: v_urem_i128_vv:
+; GFX942-O0:       ; %bb.0: ; %_udiv-special-cases
+; GFX942-O0-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-O0-NEXT:    s_xor_saveexec_b64 s[0:1], -1
+; GFX942-O0-NEXT:    scratch_store_dword off, v29, s32 offset:200 ; 4-byte Folded Spill
+; GFX942-O0-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX942-O0-NEXT:    v_mov_b32_e32 v10, v6
+; GFX942-O0-NEXT:    v_accvgpr_write_b32 a0, v4 ; Reload Reuse
+; GFX942-O0-NEXT:    v_mov_b32_e32 v12, v2
+; GFX942-O0-NEXT:    v_mov_b32_e32 v14, v0
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v0, a0 ; Reload Reuse
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0
+; GFX942-O0-NEXT:    ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v15, v1
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0
+; GFX942-O0-NEXT:    ; kill: def $vgpr12 killed $vgpr12 def $vgpr12_vgpr13 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v13, v3
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0
+; GFX942-O0-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v1, v5
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0
+; GFX942-O0-NEXT:    ; kill: def $vgpr10 killed $vgpr10 def $vgpr10_vgpr11 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v11, v7
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0_sgpr1
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0_sgpr1
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0_sgpr1
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0_sgpr1
+; GFX942-O0-NEXT:    v_mov_b64_e32 v[2:3], v[12:13]
+; GFX942-O0-NEXT:    v_accvgpr_write_b32 a1, v3 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_write_b32 a2, v2 ; Reload Reuse
+; GFX942-O0-NEXT:    v_mov_b64_e32 v[2:3], v[14:15]
+; GFX942-O0-NEXT:    v_accvgpr_write_b32 a3, v3 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_write_b32 a4, v2 ; Reload Reuse
+; GFX942-O0-NEXT:    v_mov_b64_e32 v[2:3], v[10:11]
+; GFX942-O0-NEXT:    v_accvgpr_write_b32 a5, v3 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_write_b32 a6, v2 ; Reload Reuse
+; GFX942-O0-NEXT:    v_mov_b64_e32 v[2:3], v[0:1]
+; GFX942-O0-NEXT:    v_accvgpr_write_b32 a7, v3 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_write_b32 a8, v2 ; Reload Reuse
+; GFX942-O0-NEXT:    v_mov_b64_e32 v[2:3], v[10:11]
+; GFX942-O0-NEXT:    v_accvgpr_write_b32 a9, v3 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_write_b32 a10, v2 ; Reload Reuse
+; GFX942-O0-NEXT:    v_mov_b64_e32 v[2:3], v[0:1]
+; GFX942-O0-NEXT:    v_accvgpr_write_b32 a11, v3 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_write_b32 a12, v2 ; Reload Reuse
+; GFX942-O0-NEXT:    v_mov_b64_e32 v[2:3], v[12:13]
+; GFX942-O0-NEXT:    v_accvgpr_write_b32 a13, v3 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_write_b32 a14, v2 ; Reload Reuse
+; GFX942-O0-NEXT:    v_mov_b64_e32 v[2:3], v[14:15]
+; GFX942-O0-NEXT:    v_accvgpr_write_b32 a15, v3 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_write_b32 a16, v2 ; Reload Reuse
+; GFX942-O0-NEXT:    v_mov_b32_e32 v6, v11
+; GFX942-O0-NEXT:    v_mov_b32_e32 v8, v1
+; GFX942-O0-NEXT:    v_or_b32_e64 v2, v8, v6
+; GFX942-O0-NEXT:    v_mov_b32_e32 v5, v10
+; GFX942-O0-NEXT:    v_mov_b32_e32 v4, v0
+; GFX942-O0-NEXT:    v_or_b32_e64 v0, v4, v5
+; GFX942-O0-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v1, v2
+; GFX942-O0-NEXT:    s_mov_b64 s[2:3], 0
+; GFX942-O0-NEXT:    ; implicit-def: $vgpr29 : SGPR spill to VGPR lane
+; GFX942-O0-NEXT:    v_writelane_b32 v29, s2, 0
+; GFX942-O0-NEXT:    s_nop 1
+; GFX942-O0-NEXT:    v_writelane_b32 v29, s3, 1
+; GFX942-O0-NEXT:    v_cmp_eq_u64_e64 s[0:1], v[0:1], s[2:3]
+; GFX942-O0-NEXT:    v_mov_b32_e32 v1, v13
+; GFX942-O0-NEXT:    v_mov_b32_e32 v3, v15
+; GFX942-O0-NEXT:    v_or_b32_e64 v7, v3, v1
+; GFX942-O0-NEXT:    v_mov_b32_e32 v0, v12
+; GFX942-O0-NEXT:    v_mov_b32_e32 v2, v14
+; GFX942-O0-NEXT:    v_or_b32_e64 v14, v2, v0
+; GFX942-O0-NEXT:    ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v15, v7
+; GFX942-O0-NEXT:    v_cmp_eq_u64_e64 s[4:5], v[14:15], s[2:3]
+; GFX942-O0-NEXT:    s_or_b64 s[0:1], s[0:1], s[4:5]
+; GFX942-O0-NEXT:    v_ffbh_u32_e64 v5, v5
+; GFX942-O0-NEXT:    s_mov_b32 s7, 32
+; GFX942-O0-NEXT:    v_add_u32_e64 v5, v5, s7
+; GFX942-O0-NEXT:    v_ffbh_u32_e64 v6, v6
+; GFX942-O0-NEXT:    v_min_u32_e64 v6, v5, v6
+; GFX942-O0-NEXT:    s_mov_b32 s6, 0
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr4
+; GFX942-O0-NEXT:    v_mov_b32_e32 v5, s6
+; GFX942-O0-NEXT:    ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v7, v5
+; GFX942-O0-NEXT:    v_mov_b32_e32 v5, v7
+; GFX942-O0-NEXT:    v_ffbh_u32_e64 v4, v4
+; GFX942-O0-NEXT:    v_add_u32_e64 v4, v4, s7
+; GFX942-O0-NEXT:    v_ffbh_u32_e64 v8, v8
+; GFX942-O0-NEXT:    v_min_u32_e64 v8, v4, v8
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr4
+; GFX942-O0-NEXT:    v_mov_b32_e32 v4, s6
+; GFX942-O0-NEXT:    ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v9, v4
+; GFX942-O0-NEXT:    s_mov_b64 s[4:5], 64
+; GFX942-O0-NEXT:    v_lshl_add_u64 v[8:9], v[8:9], 0, s[4:5]
+; GFX942-O0-NEXT:    v_mov_b32_e32 v4, v9
+; GFX942-O0-NEXT:    s_mov_b64 s[8:9], s[2:3]
+; GFX942-O0-NEXT:    v_cmp_ne_u64_e64 s[8:9], v[10:11], s[8:9]
+; GFX942-O0-NEXT:    s_nop 1
+; GFX942-O0-NEXT:    v_cndmask_b32_e64 v4, v4, v5, s[8:9]
+; GFX942-O0-NEXT:    ; kill: def $vgpr6 killed $vgpr6 killed $vgpr6_vgpr7 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v5, v8
+; GFX942-O0-NEXT:    v_cndmask_b32_e64 v8, v5, v6, s[8:9]
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr8
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr8
+; GFX942-O0-NEXT:    ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v9, v4
+; GFX942-O0-NEXT:    v_ffbh_u32_e64 v4, v0
+; GFX942-O0-NEXT:    v_add_u32_e64 v4, v4, s7
+; GFX942-O0-NEXT:    v_ffbh_u32_e64 v5, v1
+; GFX942-O0-NEXT:    v_min_u32_e64 v6, v4, v5
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr8
+; GFX942-O0-NEXT:    v_mov_b32_e32 v4, s6
+; GFX942-O0-NEXT:    ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v7, v4
+; GFX942-O0-NEXT:    v_mov_b32_e32 v5, v7
+; GFX942-O0-NEXT:    v_ffbh_u32_e64 v4, v2
+; GFX942-O0-NEXT:    v_add_u32_e64 v4, v4, s7
+; GFX942-O0-NEXT:    v_ffbh_u32_e64 v10, v3
+; GFX942-O0-NEXT:    v_min_u32_e64 v10, v4, v10
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr7
+; GFX942-O0-NEXT:    v_mov_b32_e32 v4, s6
+; GFX942-O0-NEXT:    ; kill: def $vgpr10 killed $vgpr10 def $vgpr10_vgpr11 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v11, v4
+; GFX942-O0-NEXT:    v_lshl_add_u64 v[10:11], v[10:11], 0, s[4:5]
+; GFX942-O0-NEXT:    v_mov_b32_e32 v4, v11
+; GFX942-O0-NEXT:    s_mov_b64 s[4:5], s[2:3]
+; GFX942-O0-NEXT:    v_cmp_ne_u64_e64 s[4:5], v[12:13], s[4:5]
+; GFX942-O0-NEXT:    s_nop 1
+; GFX942-O0-NEXT:    v_cndmask_b32_e64 v4, v4, v5, s[4:5]
+; GFX942-O0-NEXT:    ; kill: def $vgpr6 killed $vgpr6 killed $vgpr6_vgpr7 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v5, v10
+; GFX942-O0-NEXT:    v_cndmask_b32_e64 v10, v5, v6, s[4:5]
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr4
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr4
+; GFX942-O0-NEXT:    ; kill: def $vgpr10 killed $vgpr10 def $vgpr10_vgpr11 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v11, v4
+; GFX942-O0-NEXT:    v_mov_b32_e32 v7, v10
+; GFX942-O0-NEXT:    v_mov_b32_e32 v4, v8
+; GFX942-O0-NEXT:    v_mov_b32_e32 v6, v11
+; GFX942-O0-NEXT:    v_mov_b32_e32 v5, v9
+; GFX942-O0-NEXT:    s_mov_b32 s6, s2
+; GFX942-O0-NEXT:    s_mov_b32 s7, s3
+; GFX942-O0-NEXT:    v_sub_co_u32_e32 v4, vcc, v4, v7
+; GFX942-O0-NEXT:    s_nop 1
+; GFX942-O0-NEXT:    v_subb_co_u32_e32 v7, vcc, v5, v6, vcc
+; GFX942-O0-NEXT:    v_mov_b32_e32 v6, s6
+; GFX942-O0-NEXT:    v_mov_b32_e32 v5, s6
+; GFX942-O0-NEXT:    v_subb_co_u32_e32 v8, vcc, v5, v6, vcc
+; GFX942-O0-NEXT:    v_mov_b32_e32 v6, s7
+; GFX942-O0-NEXT:    v_mov_b32_e32 v5, s7
+; GFX942-O0-NEXT:    v_subb_co_u32_e32 v6, vcc, v5, v6, vcc
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr4
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr4
+; GFX942-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v5, v7
+; GFX942-O0-NEXT:    v_accvgpr_write_b32 a17, v5 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_write_b32 a18, v4 ; Reload Reuse
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr4
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr4
+; GFX942-O0-NEXT:    ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v9, v6
+; GFX942-O0-NEXT:    v_accvgpr_write_b32 a19, v9 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_write_b32 a20, v8 ; Reload Reuse
+; GFX942-O0-NEXT:    v_cmp_eq_u64_e64 s[4:5], v[8:9], s[2:3]
+; GFX942-O0-NEXT:    s_mov_b64 s[8:9], 0x7f
+; GFX942-O0-NEXT:    v_cmp_gt_u64_e64 s[10:11], v[4:5], s[8:9]
+; GFX942-O0-NEXT:    s_nop 1
+; GFX942-O0-NEXT:    v_cndmask_b32_e64 v7, 0, 1, s[10:11]
+; GFX942-O0-NEXT:    v_cmp_ne_u64_e64 s[10:11], v[8:9], s[2:3]
+; GFX942-O0-NEXT:    s_nop 1
+; GFX942-O0-NEXT:    v_cndmask_b32_e64 v6, 0, 1, s[10:11]
+; GFX942-O0-NEXT:    v_cndmask_b32_e64 v6, v6, v7, s[4:5]
+; GFX942-O0-NEXT:    v_and_b32_e64 v6, 1, v6
+; GFX942-O0-NEXT:    v_cmp_eq_u32_e64 s[4:5], v6, 1
+; GFX942-O0-NEXT:    s_or_b64 s[4:5], s[0:1], s[4:5]
+; GFX942-O0-NEXT:    s_mov_b64 s[0:1], -1
+; GFX942-O0-NEXT:    s_xor_b64 s[0:1], s[4:5], s[0:1]
+; GFX942-O0-NEXT:    v_mov_b32_e32 v6, v5
+; GFX942-O0-NEXT:    s_mov_b32 s10, s9
+; GFX942-O0-NEXT:    v_xor_b32_e64 v6, v6, s10
+; GFX942-O0-NEXT:    ; kill: def $sgpr8 killed $sgpr8 killed $sgpr8_sgpr9
+; GFX942-O0-NEXT:    v_xor_b32_e64 v4, v4, s8
+; GFX942-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v5, v6
+; GFX942-O0-NEXT:    v_mov_b32_e32 v6, v5
+; GFX942-O0-NEXT:    v_mov_b32_e32 v7, v9
+; GFX942-O0-NEXT:    v_or_b32_e64 v6, v6, v7
+; GFX942-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v5, v8
+; GFX942-O0-NEXT:    v_or_b32_e64 v4, v4, v5
+; GFX942-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v5, v6
+; GFX942-O0-NEXT:    v_cmp_ne_u64_e64 s[2:3], v[4:5], s[2:3]
+; GFX942-O0-NEXT:    v_mov_b32_e32 v4, s7
+; GFX942-O0-NEXT:    v_cndmask_b32_e64 v4, v1, v4, s[4:5]
+; GFX942-O0-NEXT:    v_mov_b32_e32 v1, s6
+; GFX942-O0-NEXT:    v_cndmask_b32_e64 v0, v0, v1, s[4:5]
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr8
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr8
+; GFX942-O0-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v1, v4
+; GFX942-O0-NEXT:    v_mov_b32_e32 v4, s7
+; GFX942-O0-NEXT:    v_cndmask_b32_e64 v4, v3, v4, s[4:5]
+; GFX942-O0-NEXT:    v_mov_b32_e32 v3, s6
+; GFX942-O0-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[4:5]
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr4
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr4
+; GFX942-O0-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v3, v4
+; GFX942-O0-NEXT:    s_and_b64 s[2:3], s[0:1], s[2:3]
+; GFX942-O0-NEXT:    v_accvgpr_write_b32 a21, v3 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_write_b32 a22, v2 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_write_b32 a23, v1 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_write_b32 a24, v0 ; Reload Reuse
+; GFX942-O0-NEXT:    s_mov_b64 s[0:1], exec
+; GFX942-O0-NEXT:    v_writelane_b32 v29, s0, 2
+; GFX942-O0-NEXT:    s_nop 1
+; GFX942-O0-NEXT:    v_writelane_b32 v29, s1, 3
+; GFX942-O0-NEXT:    s_or_saveexec_b64 s[14:15], -1
+; GFX942-O0-NEXT:    v_accvgpr_write_b32 a25, v29 ; Reload Reuse
+; GFX942-O0-NEXT:    s_mov_b64 exec, s[14:15]
+; GFX942-O0-NEXT:    s_and_b64 s[0:1], s[0:1], s[2:3]
+; GFX942-O0-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX942-O0-NEXT:    s_cbranch_execz .LBB1_3
+; GFX942-O0-NEXT:    s_branch .LBB1_8
+; GFX942-O0-NEXT:  .LBB1_1: ; %Flow
+; GFX942-O0-NEXT:    s_or_saveexec_b64 s[14:15], -1
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v29, a25 ; Reload Reuse
+; GFX942-O0-NEXT:    s_mov_b64 exec, s[14:15]
+; GFX942-O0-NEXT:    v_readlane_b32 s0, v29, 4
+; GFX942-O0-NEXT:    v_readlane_b32 s1, v29, 5
+; GFX942-O0-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX942-O0-NEXT:  ; %bb.2: ; %Flow
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v7, a26 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v6, a27 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v5, a28 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v4, a29 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v3, a30 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v2, a31 ; Reload Reuse
+; GFX942-O0-NEXT:    scratch_load_dwordx2 v[0:1], off, s32 offset:32 ; 8-byte Folded Reload
+; GFX942-O0-NEXT:    s_nop 0
+; GFX942-O0-NEXT:    scratch_store_dwordx2 off, v[6:7], s32 offset:24 ; 8-byte Folded Spill
+; GFX942-O0-NEXT:    scratch_store_dwordx2 off, v[4:5], s32 offset:16 ; 8-byte Folded Spill
+; GFX942-O0-NEXT:    scratch_store_dwordx2 off, v[2:3], s32 offset:8 ; 8-byte Folded Spill
+; GFX942-O0-NEXT:    s_waitcnt vmcnt(3)
+; GFX942-O0-NEXT:    scratch_store_dwordx2 off, v[0:1], s32 ; 8-byte Folded Spill
+; GFX942-O0-NEXT:    s_branch .LBB1_5
+; GFX942-O0-NEXT:  .LBB1_3: ; %Flow2
+; GFX942-O0-NEXT:    s_or_saveexec_b64 s[14:15], -1
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v29, a25 ; Reload Reuse
+; GFX942-O0-NEXT:    s_mov_b64 exec, s[14:15]
+; GFX942-O0-NEXT:    v_readlane_b32 s0, v29, 2
+; GFX942-O0-NEXT:    v_readlane_b32 s1, v29, 3
+; GFX942-O0-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v1, a21 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v0, a22 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v3, a23 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v2, a24 ; Reload Reuse
+; GFX942-O0-NEXT:    scratch_store_dwordx2 off, v[2:3], s32 offset:48 ; 8-byte Folded Spill
+; GFX942-O0-NEXT:    scratch_store_dwordx2 off, v[0:1], s32 offset:40 ; 8-byte Folded Spill
+; GFX942-O0-NEXT:    s_branch .LBB1_9
+; GFX942-O0-NEXT:  .LBB1_4: ; %udiv-loop-exit
+; GFX942-O0-NEXT:    scratch_load_dwordx2 v[6:7], off, s32 offset:56 ; 8-byte Folded Reload
+; GFX942-O0-NEXT:    scratch_load_dwordx2 v[8:9], off, s32 offset:64 ; 8-byte Folded Reload
+; GFX942-O0-NEXT:    scratch_load_dwordx2 v[0:1], off, s32 offset:72 ; 8-byte Folded Reload
+; GFX942-O0-NEXT:    scratch_load_dwordx2 v[4:5], off, s32 offset:80 ; 8-byte Folded Reload
+; GFX942-O0-NEXT:    s_mov_b32 s0, 1
+; GFX942-O0-NEXT:    s_waitcnt vmcnt(1)
+; GFX942-O0-NEXT:    v_lshlrev_b64 v[2:3], s0, v[0:1]
+; GFX942-O0-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-O0-NEXT:    v_lshlrev_b64 v[10:11], s0, v[4:5]
+; GFX942-O0-NEXT:    s_mov_b32 s0, 63
+; GFX942-O0-NEXT:    v_lshrrev_b64 v[0:1], s0, v[0:1]
+; GFX942-O0-NEXT:    v_mov_b32_e32 v5, v1
+; GFX942-O0-NEXT:    v_mov_b32_e32 v4, v11
+; GFX942-O0-NEXT:    v_mov_b32_e32 v12, v9
+; GFX942-O0-NEXT:    v_or3_b32 v4, v4, v5, v12
+; GFX942-O0-NEXT:    v_mov_b32_e32 v1, v0
+; GFX942-O0-NEXT:    v_mov_b32_e32 v0, v10
+; GFX942-O0-NEXT:    v_mov_b32_e32 v5, v8
+; GFX942-O0-NEXT:    v_or3_b32 v0, v0, v1, v5
+; GFX942-O0-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v1, v4
+; GFX942-O0-NEXT:    v_mov_b32_e32 v5, v3
+; GFX942-O0-NEXT:    v_mov_b32_e32 v4, v7
+; GFX942-O0-NEXT:    v_or_b32_e64 v4, v4, v5
+; GFX942-O0-NEXT:    v_mov_b32_e32 v3, v2
+; GFX942-O0-NEXT:    v_mov_b32_e32 v2, v6
+; GFX942-O0-NEXT:    v_or_b32_e64 v2, v2, v3
+; GFX942-O0-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v3, v4
+; GFX942-O0-NEXT:    v_accvgpr_write_b32 a21, v3 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_write_b32 a22, v2 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_write_b32 a23, v1 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_write_b32 a24, v0 ; Reload Reuse
+; GFX942-O0-NEXT:    s_branch .LBB1_3
+; GFX942-O0-NEXT:  .LBB1_5: ; %Flow1
+; GFX942-O0-NEXT:    s_or_saveexec_b64 s[14:15], -1
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v29, a25 ; Reload Reuse
+; GFX942-O0-NEXT:    s_mov_b64 exec, s[14:15]
+; GFX942-O0-NEXT:    v_readlane_b32 s0, v29, 6
+; GFX942-O0-NEXT:    v_readlane_b32 s1, v29, 7
+; GFX942-O0-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX942-O0-NEXT:    scratch_load_dwordx2 v[0:1], off, s32 offset:24 ; 8-byte Folded Reload
+; GFX942-O0-NEXT:    scratch_load_dwordx2 v[2:3], off, s32 offset:16 ; 8-byte Folded Reload
+; GFX942-O0-NEXT:    scratch_load_dwordx2 v[4:5], off, s32 offset:8 ; 8-byte Folded Reload
+; GFX942-O0-NEXT:    scratch_load_dwordx2 v[6:7], off, s32 ; 8-byte Folded Reload
+; GFX942-O0-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-O0-NEXT:    scratch_store_dwordx2 off, v[6:7], s32 offset:64 ; 8-byte Folded Spill
+; GFX942-O0-NEXT:    scratch_store_dwordx2 off, v[4:5], s32 offset:56 ; 8-byte Folded Spill
+; GFX942-O0-NEXT:    scratch_store_dwordx2 off, v[2:3], s32 offset:80 ; 8-byte Folded Spill
+; GFX942-O0-NEXT:    scratch_store_dwordx2 off, v[0:1], s32 offset:72 ; 8-byte Folded Spill
+; GFX942-O0-NEXT:    s_branch .LBB1_4
+; GFX942-O0-NEXT:  .LBB1_6: ; %udiv-do-while
+; GFX942-O0-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-O0-NEXT:    s_or_saveexec_b64 s[14:15], -1
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v29, a25 ; Reload Reuse
+; GFX942-O0-NEXT:    s_mov_b64 exec, s[14:15]
+; GFX942-O0-NEXT:    v_readlane_b32 s2, v29, 8
+; GFX942-O0-NEXT:    v_readlane_b32 s3, v29, 9
+; GFX942-O0-NEXT:    scratch_load_dwordx2 v[6:7], off, s32 offset:88 ; 8-byte Folded Reload
+; GFX942-O0-NEXT:    scratch_load_dwordx2 v[0:1], off, s32 offset:96 ; 8-byte Folded Reload
+; GFX942-O0-NEXT:    scratch_load_dwordx2 v[26:27], off, s32 offset:104 ; 8-byte Folded Reload
+; GFX942-O0-NEXT:    scratch_load_dwordx2 v[2:3], off, s32 offset:112 ; 8-byte Folded Reload
+; GFX942-O0-NEXT:    scratch_load_dwordx2 v[16:17], off, s32 offset:120 ; 8-byte Folded Reload
+; GFX942-O0-NEXT:    scratch_load_dwordx2 v[8:9], off, s32 offset:128 ; 8-byte Folded Reload
+; GFX942-O0-NEXT:    scratch_load_dwordx2 v[24:25], off, s32 offset:136 ; 8-byte Folded Reload
+; GFX942-O0-NEXT:    scratch_load_dwordx2 v[10:11], off, s32 offset:144 ; 8-byte Folded Reload
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v19, a11 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v18, a12 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v21, a9 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v20, a10 ; Reload Reuse
+; GFX942-O0-NEXT:    scratch_load_dwordx2 v[14:15], off, s32 offset:152 ; 8-byte Folded Reload
+; GFX942-O0-NEXT:    scratch_load_dwordx2 v[22:23], off, s32 offset:160 ; 8-byte Folded Reload
+; GFX942-O0-NEXT:    s_mov_b32 s0, 63
+; GFX942-O0-NEXT:    s_waitcnt vmcnt(6)
+; GFX942-O0-NEXT:    v_lshrrev_b64 v[12:13], s0, v[2:3]
+; GFX942-O0-NEXT:    v_mov_b32_e32 v5, v13
+; GFX942-O0-NEXT:    s_mov_b32 s1, 1
+; GFX942-O0-NEXT:    v_lshlrev_b64 v[26:27], s1, v[26:27]
+; GFX942-O0-NEXT:    v_mov_b32_e32 v4, v27
+; GFX942-O0-NEXT:    v_or_b32_e64 v4, v4, v5
+; GFX942-O0-NEXT:    ; kill: def $vgpr12 killed $vgpr12 killed $vgpr12_vgpr13 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v5, v26
+; GFX942-O0-NEXT:    v_or_b32_e64 v12, v5, v12
+; GFX942-O0-NEXT:    ; kill: def $vgpr12 killed $vgpr12 def $vgpr12_vgpr13 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v13, v4
+; GFX942-O0-NEXT:    v_lshlrev_b64 v[26:27], s1, v[2:3]
+; GFX942-O0-NEXT:    v_lshrrev_b64 v[4:5], s0, v[6:7]
+; GFX942-O0-NEXT:    v_mov_b32_e32 v2, v27
+; GFX942-O0-NEXT:    v_mov_b32_e32 v3, v5
+; GFX942-O0-NEXT:    v_or_b32_e64 v2, v2, v3
+; GFX942-O0-NEXT:    v_mov_b32_e32 v3, v26
+; GFX942-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec
+; GFX942-O0-NEXT:    v_or_b32_e64 v4, v3, v4
+; GFX942-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v5, v2
+; GFX942-O0-NEXT:    v_lshlrev_b64 v[2:3], s1, v[0:1]
+; GFX942-O0-NEXT:    v_lshlrev_b64 v[26:27], s1, v[6:7]
+; GFX942-O0-NEXT:    v_lshrrev_b64 v[0:1], s0, v[0:1]
+; GFX942-O0-NEXT:    v_mov_b32_e32 v7, v1
+; GFX942-O0-NEXT:    v_mov_b32_e32 v6, v27
+; GFX942-O0-NEXT:    s_waitcnt vmcnt(3)
+; GFX942-O0-NEXT:    v_mov_b32_e32 v28, v25
+; GFX942-O0-NEXT:    v_or3_b32 v6, v6, v7, v28
+; GFX942-O0-NEXT:    v_mov_b32_e32 v1, v0
+; GFX942-O0-NEXT:    v_mov_b32_e32 v0, v26
+; GFX942-O0-NEXT:    v_mov_b32_e32 v7, v24
+; GFX942-O0-NEXT:    v_or3_b32 v0, v0, v1, v7
+; GFX942-O0-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v1, v6
+; GFX942-O0-NEXT:    v_mov_b32_e32 v7, v3
+; GFX942-O0-NEXT:    s_waitcnt vmcnt(2)
+; GFX942-O0-NEXT:    v_mov_b32_e32 v6, v11
+; GFX942-O0-NEXT:    v_or_b32_e64 v6, v6, v7
+; GFX942-O0-NEXT:    v_mov_b32_e32 v3, v2
+; GFX942-O0-NEXT:    v_mov_b32_e32 v2, v10
+; GFX942-O0-NEXT:    v_or_b32_e64 v2, v2, v3
+; GFX942-O0-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v3, v6
+; GFX942-O0-NEXT:    v_mov_b32_e32 v6, v4
+; GFX942-O0-NEXT:    v_mov_b32_e32 v10, v5
+; GFX942-O0-NEXT:    v_mov_b32_e32 v4, v12
+; GFX942-O0-NEXT:    v_mov_b32_e32 v5, v13
+; GFX942-O0-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-O0-NEXT:    v_mov_b32_e32 v13, v22
+; GFX942-O0-NEXT:    v_mov_b32_e32 v12, v23
+; GFX942-O0-NEXT:    v_mov_b32_e32 v11, v14
+; GFX942-O0-NEXT:    v_mov_b32_e32 v7, v15
+; GFX942-O0-NEXT:    v_sub_co_u32_e32 v13, vcc, v13, v6
+; GFX942-O0-NEXT:    s_nop 1
+; GFX942-O0-NEXT:    v_subb_co_u32_e32 v12, vcc, v12, v10, vcc
+; GFX942-O0-NEXT:    s_nop 1
+; GFX942-O0-NEXT:    v_subb_co_u32_e32 v12, vcc, v11, v4, vcc
+; GFX942-O0-NEXT:    s_nop 1
+; GFX942-O0-NEXT:    v_subb_co_u32_e32 v7, vcc, v7, v5, vcc
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr1
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr1
+; GFX942-O0-NEXT:    ; kill: def $vgpr12 killed $vgpr12 def $vgpr12_vgpr13 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v13, v7
+; GFX942-O0-NEXT:    v_ashrrev_i64 v[14:15], s0, v[12:13]
+; GFX942-O0-NEXT:    v_mov_b32_e32 v7, v15
+; GFX942-O0-NEXT:    s_mov_b64 s[0:1], 1
+; GFX942-O0-NEXT:    s_mov_b32 s4, s1
+; GFX942-O0-NEXT:    v_and_b32_e64 v12, v7, s4
+; GFX942-O0-NEXT:    v_mov_b32_e32 v11, v14
+; GFX942-O0-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; GFX942-O0-NEXT:    v_and_b32_e64 v14, v11, s0
+; GFX942-O0-NEXT:    ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v15, v12
+; GFX942-O0-NEXT:    v_mov_b64_e32 v[12:13], 0
+; GFX942-O0-NEXT:    v_mov_b32_e32 v22, v21
+; GFX942-O0-NEXT:    v_and_b32_e64 v22, v7, v22
+; GFX942-O0-NEXT:    v_and_b32_e64 v20, v11, v20
+; GFX942-O0-NEXT:    ; kill: def $vgpr20 killed $vgpr20 def $vgpr20_vgpr21 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v21, v22
+; GFX942-O0-NEXT:    v_mov_b32_e32 v22, v19
+; GFX942-O0-NEXT:    v_and_b32_e64 v7, v7, v22
+; GFX942-O0-NEXT:    v_and_b32_e64 v22, v11, v18
+; GFX942-O0-NEXT:    ; kill: def $vgpr22 killed $vgpr22 def $vgpr22_vgpr23 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v23, v7
+; GFX942-O0-NEXT:    v_mov_b32_e32 v19, v22
+; GFX942-O0-NEXT:    v_mov_b32_e32 v18, v23
+; GFX942-O0-NEXT:    v_mov_b32_e32 v11, v20
+; GFX942-O0-NEXT:    v_mov_b32_e32 v7, v21
+; GFX942-O0-NEXT:    v_sub_co_u32_e32 v6, vcc, v6, v19
+; GFX942-O0-NEXT:    s_nop 1
+; GFX942-O0-NEXT:    v_subb_co_u32_e32 v10, vcc, v10, v18, vcc
+; GFX942-O0-NEXT:    s_nop 1
+; GFX942-O0-NEXT:    v_subb_co_u32_e32 v4, vcc, v4, v11, vcc
+; GFX942-O0-NEXT:    s_nop 1
+; GFX942-O0-NEXT:    v_subb_co_u32_e32 v7, vcc, v5, v7, vcc
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0
+; GFX942-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v5, v7
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0
+; GFX942-O0-NEXT:    ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v7, v10
+; GFX942-O0-NEXT:    v_mov_b32_e32 v11, v8
+; GFX942-O0-NEXT:    ; kill: def $vgpr9 killed $vgpr9 killed $vgpr8_vgpr9 killed $exec
+; GFX942-O0-NEXT:    s_mov_b64 s[4:5], -1
+; GFX942-O0-NEXT:    s_mov_b32 s1, s4
+; GFX942-O0-NEXT:    s_mov_b32 s0, s5
+; GFX942-O0-NEXT:    v_mov_b32_e32 v10, v16
+; GFX942-O0-NEXT:    v_mov_b32_e32 v8, v17
+; GFX942-O0-NEXT:    v_mov_b32_e32 v16, s1
+; GFX942-O0-NEXT:    v_add_co_u32_e32 v20, vcc, v11, v16
+; GFX942-O0-NEXT:    v_mov_b32_e32 v11, s0
+; GFX942-O0-NEXT:    s_nop 0
+; GFX942-O0-NEXT:    v_addc_co_u32_e32 v9, vcc, v9, v11, vcc
+; GFX942-O0-NEXT:    v_mov_b32_e32 v11, s1
+; GFX942-O0-NEXT:    s_nop 0
+; GFX942-O0-NEXT:    v_addc_co_u32_e32 v16, vcc, v10, v11, vcc
+; GFX942-O0-NEXT:    v_mov_b32_e32 v10, s0
+; GFX942-O0-NEXT:    s_nop 0
+; GFX942-O0-NEXT:    v_addc_co_u32_e32 v8, vcc, v8, v10, vcc
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0
+; GFX942-O0-NEXT:    ; kill: def $vgpr20 killed $vgpr20 def $vgpr20_vgpr21 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v21, v9
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0
+; GFX942-O0-NEXT:    ; kill: def $vgpr16 killed $vgpr16 def $vgpr16_vgpr17 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v17, v8
+; GFX942-O0-NEXT:    v_mov_b64_e32 v[8:9], v[16:17]
+; GFX942-O0-NEXT:    v_mov_b64_e32 v[10:11], v[20:21]
+; GFX942-O0-NEXT:    v_mov_b32_e32 v19, v17
+; GFX942-O0-NEXT:    v_mov_b32_e32 v18, v21
+; GFX942-O0-NEXT:    v_or_b32_e64 v18, v18, v19
+; GFX942-O0-NEXT:    v_mov_b32_e32 v17, v16
+; GFX942-O0-NEXT:    v_mov_b32_e32 v16, v20
+; GFX942-O0-NEXT:    v_or_b32_e64 v16, v16, v17
+; GFX942-O0-NEXT:    ; kill: def $vgpr16 killed $vgpr16 def $vgpr16_vgpr17 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v17, v18
+; GFX942-O0-NEXT:    v_cmp_eq_u64_e64 s[0:1], v[16:17], v[12:13]
+; GFX942-O0-NEXT:    s_or_b64 s[0:1], s[0:1], s[2:3]
+; GFX942-O0-NEXT:    v_mov_b64_e32 v[16:17], v[2:3]
+; GFX942-O0-NEXT:    v_accvgpr_write_b32 a26, v17 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_write_b32 a27, v16 ; Reload Reuse
+; GFX942-O0-NEXT:    v_mov_b64_e32 v[16:17], v[0:1]
+; GFX942-O0-NEXT:    v_accvgpr_write_b32 a28, v17 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_write_b32 a29, v16 ; Reload Reuse
+; GFX942-O0-NEXT:    v_mov_b64_e32 v[16:17], v[14:15]
+; GFX942-O0-NEXT:    v_accvgpr_write_b32 a30, v17 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_write_b32 a31, v16 ; Reload Reuse
+; GFX942-O0-NEXT:    v_mov_b64_e32 v[16:17], v[12:13]
+; GFX942-O0-NEXT:    scratch_store_dwordx2 off, v[16:17], s32 offset:32 ; 8-byte Folded Spill
+; GFX942-O0-NEXT:    s_mov_b64 s[2:3], s[0:1]
+; GFX942-O0-NEXT:    v_writelane_b32 v29, s2, 4
+; GFX942-O0-NEXT:    s_nop 1
+; GFX942-O0-NEXT:    v_writelane_b32 v29, s3, 5
+; GFX942-O0-NEXT:    s_mov_b64 s[2:3], s[0:1]
+; GFX942-O0-NEXT:    v_writelane_b32 v29, s2, 8
+; GFX942-O0-NEXT:    s_nop 1
+; GFX942-O0-NEXT:    v_writelane_b32 v29, s3, 9
+; GFX942-O0-NEXT:    s_or_saveexec_b64 s[14:15], -1
+; GFX942-O0-NEXT:    v_accvgpr_write_b32 a25, v29 ; Reload Reuse
+; GFX942-O0-NEXT:    s_mov_b64 exec, s[14:15]
+; GFX942-O0-NEXT:    scratch_store_dwordx2 off, v[14:15], s32 offset:144 ; 8-byte Folded Spill
+; GFX942-O0-NEXT:    scratch_store_dwordx2 off, v[12:13], s32 offset:136 ; 8-byte Folded Spill
+; GFX942-O0-NEXT:    scratch_store_dwordx2 off, v[10:11], s32 offset:128 ; 8-byte Folded Spill
+; GFX942-O0-NEXT:    scratch_store_dwordx2 off, v[8:9], s32 offset:120 ; 8-byte Folded Spill
+; GFX942-O0-NEXT:    scratch_store_dwordx2 off, v[6:7], s32 offset:112 ; 8-byte Folded Spill
+; GFX942-O0-NEXT:    scratch_store_dwordx2 off, v[4:5], s32 offset:104 ; 8-byte Folded Spill
+; GFX942-O0-NEXT:    scratch_store_dwordx2 off, v[2:3], s32 offset:96 ; 8-byte Folded Spill
+; GFX942-O0-NEXT:    scratch_store_dwordx2 off, v[0:1], s32 offset:88 ; 8-byte Folded Spill
+; GFX942-O0-NEXT:    s_andn2_b64 exec, exec, s[0:1]
+; GFX942-O0-NEXT:    s_cbranch_execnz .LBB1_6
+; GFX942-O0-NEXT:    s_branch .LBB1_1
+; GFX942-O0-NEXT:  .LBB1_7: ; %udiv-preheader
+; GFX942-O0-NEXT:    s_or_saveexec_b64 s[14:15], -1
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v29, a25 ; Reload Reuse
+; GFX942-O0-NEXT:    s_mov_b64 exec, s[14:15]
+; GFX942-O0-NEXT:    scratch_load_dwordx2 v[0:1], off, s32 offset:168 ; 8-byte Folded Reload
+; GFX942-O0-NEXT:    scratch_load_dwordx2 v[2:3], off, s32 offset:176 ; 8-byte Folded Reload
+; GFX942-O0-NEXT:    scratch_load_dwordx2 v[8:9], off, s32 offset:184 ; 8-byte Folded Reload
+; GFX942-O0-NEXT:    scratch_load_dwordx2 v[10:11], off, s32 offset:192 ; 8-byte Folded Reload
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v17, a9 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v16, a10 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v13, a11 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v12, a12 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v15, a13 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v14, a14 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v19, a15 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v18, a16 ; Reload Reuse
+; GFX942-O0-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-O0-NEXT:    v_mov_b32_e32 v4, v10
+; GFX942-O0-NEXT:    v_lshrrev_b64 v[6:7], v4, v[18:19]
+; GFX942-O0-NEXT:    v_mov_b32_e32 v5, v7
+; GFX942-O0-NEXT:    s_mov_b32 s0, 64
+; GFX942-O0-NEXT:    v_sub_u32_e64 v20, s0, v4
+; GFX942-O0-NEXT:    v_lshlrev_b64 v[20:21], v20, v[14:15]
+; GFX942-O0-NEXT:    v_mov_b32_e32 v22, v21
+; GFX942-O0-NEXT:    v_or_b32_e64 v5, v5, v22
+; GFX942-O0-NEXT:    ; kill: def $vgpr6 killed $vgpr6 killed $vgpr6_vgpr7 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v7, v20
+; GFX942-O0-NEXT:    v_or_b32_e64 v6, v6, v7
+; GFX942-O0-NEXT:    ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v7, v5
+; GFX942-O0-NEXT:    v_mov_b32_e32 v22, v7
+; GFX942-O0-NEXT:    v_cmp_lt_u32_e64 s[2:3], v4, s0
+; GFX942-O0-NEXT:    v_sub_u32_e64 v5, v4, s0
+; GFX942-O0-NEXT:    v_lshrrev_b64 v[20:21], v5, v[14:15]
+; GFX942-O0-NEXT:    v_mov_b32_e32 v5, v21
+; GFX942-O0-NEXT:    v_cndmask_b32_e64 v5, v5, v22, s[2:3]
+; GFX942-O0-NEXT:    s_mov_b32 s0, 0
+; GFX942-O0-NEXT:    v_cmp_eq_u32_e64 s[0:1], v4, s0
+; GFX942-O0-NEXT:    v_mov_b32_e32 v22, v19
+; GFX942-O0-NEXT:    s_nop 0
+; GFX942-O0-NEXT:    v_cndmask_b32_e64 v5, v5, v22, s[0:1]
+; GFX942-O0-NEXT:    v_mov_b32_e32 v7, v6
+; GFX942-O0-NEXT:    v_mov_b32_e32 v6, v20
+; GFX942-O0-NEXT:    v_cndmask_b32_e64 v6, v6, v7, s[2:3]
+; GFX942-O0-NEXT:    v_mov_b32_e32 v7, v18
+; GFX942-O0-NEXT:    v_cndmask_b32_e64 v6, v6, v7, s[0:1]
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0
+; GFX942-O0-NEXT:    ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v7, v5
+; GFX942-O0-NEXT:    v_lshrrev_b64 v[4:5], v4, v[14:15]
+; GFX942-O0-NEXT:    v_mov_b32_e32 v15, v5
+; GFX942-O0-NEXT:    s_mov_b64 s[0:1], 0
+; GFX942-O0-NEXT:    s_mov_b32 s4, s1
+; GFX942-O0-NEXT:    v_mov_b32_e32 v14, s4
+; GFX942-O0-NEXT:    v_cndmask_b32_e64 v14, v14, v15, s[2:3]
+; GFX942-O0-NEXT:    v_mov_b32_e32 v5, v4
+; GFX942-O0-NEXT:    s_mov_b32 s4, s0
+; GFX942-O0-NEXT:    v_mov_b32_e32 v4, s4
+; GFX942-O0-NEXT:    v_cndmask_b32_e64 v4, v4, v5, s[2:3]
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr2
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr2
+; GFX942-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v5, v14
+; GFX942-O0-NEXT:    v_mov_b32_e32 v15, v12
+; GFX942-O0-NEXT:    v_mov_b32_e32 v12, v13
+; GFX942-O0-NEXT:    s_mov_b64 s[4:5], -1
+; GFX942-O0-NEXT:    s_mov_b32 s3, s4
+; GFX942-O0-NEXT:    s_mov_b32 s2, s5
+; GFX942-O0-NEXT:    v_mov_b32_e32 v14, v16
+; GFX942-O0-NEXT:    v_mov_b32_e32 v13, v17
+; GFX942-O0-NEXT:    v_mov_b32_e32 v16, s3
+; GFX942-O0-NEXT:    v_add_co_u32_e32 v16, vcc, v15, v16
+; GFX942-O0-NEXT:    v_mov_b32_e32 v15, s2
+; GFX942-O0-NEXT:    s_nop 0
+; GFX942-O0-NEXT:    v_addc_co_u32_e32 v12, vcc, v12, v15, vcc
+; GFX942-O0-NEXT:    v_mov_b32_e32 v15, s3
+; GFX942-O0-NEXT:    s_nop 0
+; GFX942-O0-NEXT:    v_addc_co_u32_e32 v18, vcc, v14, v15, vcc
+; GFX942-O0-NEXT:    v_mov_b32_e32 v14, s2
+; GFX942-O0-NEXT:    s_nop 0
+; GFX942-O0-NEXT:    v_addc_co_u32_e32 v13, vcc, v13, v14, vcc
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr2
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr2
+; GFX942-O0-NEXT:    ; kill: def $vgpr18 killed $vgpr18 def $vgpr18_vgpr19 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v19, v13
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr2
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr2
+; GFX942-O0-NEXT:    ; kill: def $vgpr16 killed $vgpr16 def $vgpr16_vgpr17 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v17, v12
+; GFX942-O0-NEXT:    v_mov_b64_e32 v[12:13], s[0:1]
+; GFX942-O0-NEXT:    v_mov_b64_e32 v[14:15], s[0:1]
+; GFX942-O0-NEXT:    scratch_store_dwordx2 off, v[18:19], s32 offset:152 ; 8-byte Folded Spill
+; GFX942-O0-NEXT:    scratch_store_dwordx2 off, v[16:17], s32 offset:160 ; 8-byte Folded Spill
+; GFX942-O0-NEXT:    v_writelane_b32 v29, s0, 8
+; GFX942-O0-NEXT:    s_nop 1
+; GFX942-O0-NEXT:    v_writelane_b32 v29, s1, 9
+; GFX942-O0-NEXT:    s_or_saveexec_b64 s[14:15], -1
+; GFX942-O0-NEXT:    v_accvgpr_write_b32 a25, v29 ; Reload Reuse
+; GFX942-O0-NEXT:    s_mov_b64 exec, s[14:15]
+; GFX942-O0-NEXT:    scratch_store_dwordx2 off, v[14:15], s32 offset:144 ; 8-byte Folded Spill
+; GFX942-O0-NEXT:    scratch_store_dwordx2 off, v[12:13], s32 offset:136 ; 8-byte Folded Spill
+; GFX942-O0-NEXT:    scratch_store_dwordx2 off, v[10:11], s32 offset:128 ; 8-byte Folded Spill
+; GFX942-O0-NEXT:    scratch_store_dwordx2 off, v[8:9], s32 offset:120 ; 8-byte Folded Spill
+; GFX942-O0-NEXT:    scratch_store_dwordx2 off, v[6:7], s32 offset:112 ; 8-byte Folded Spill
+; GFX942-O0-NEXT:    scratch_store_dwordx2 off, v[4:5], s32 offset:104 ; 8-byte Folded Spill
+; GFX942-O0-NEXT:    scratch_store_dwordx2 off, v[2:3], s32 offset:96 ; 8-byte Folded Spill
+; GFX942-O0-NEXT:    scratch_store_dwordx2 off, v[0:1], s32 offset:88 ; 8-byte Folded Spill
+; GFX942-O0-NEXT:    s_branch .LBB1_6
+; GFX942-O0-NEXT:  .LBB1_8: ; %udiv-bb1
+; GFX942-O0-NEXT:    s_or_saveexec_b64 s[14:15], -1
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v29, a25 ; Reload Reuse
+; GFX942-O0-NEXT:    s_mov_b64 exec, s[14:15]
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v7, a15 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v6, a16 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v11, a13 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v10, a14 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v5, a19 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v4, a20 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v1, a17 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v0, a18 ; Reload Reuse
+; GFX942-O0-NEXT:    s_mov_b64 s[2:3], 1
+; GFX942-O0-NEXT:    s_mov_b32 s1, s2
+; GFX942-O0-NEXT:    v_mov_b32_e32 v3, v0
+; GFX942-O0-NEXT:    s_mov_b32 s0, s3
+; GFX942-O0-NEXT:    s_mov_b64 s[2:3], 0
+; GFX942-O0-NEXT:    s_mov_b32 s4, s2
+; GFX942-O0-NEXT:    s_mov_b32 s5, s3
+; GFX942-O0-NEXT:    v_mov_b32_e32 v0, v4
+; GFX942-O0-NEXT:    v_mov_b32_e32 v2, v5
+; GFX942-O0-NEXT:    v_mov_b32_e32 v4, s1
+; GFX942-O0-NEXT:    v_add_co_u32_e32 v8, vcc, v3, v4
+; GFX942-O0-NEXT:    v_mov_b32_e32 v4, s0
+; GFX942-O0-NEXT:    s_nop 0
+; GFX942-O0-NEXT:    v_addc_co_u32_e32 v1, vcc, v1, v4, vcc
+; GFX942-O0-NEXT:    v_mov_b32_e32 v4, s4
+; GFX942-O0-NEXT:    s_nop 0
+; GFX942-O0-NEXT:    v_addc_co_u32_e32 v0, vcc, v0, v4, vcc
+; GFX942-O0-NEXT:    v_mov_b32_e32 v4, s5
+; GFX942-O0-NEXT:    s_nop 0
+; GFX942-O0-NEXT:    v_addc_co_u32_e32 v2, vcc, v2, v4, vcc
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0
+; GFX942-O0-NEXT:    ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v9, v1
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0
+; GFX942-O0-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v1, v2
+; GFX942-O0-NEXT:    v_mov_b64_e32 v[4:5], v[0:1]
+; GFX942-O0-NEXT:    scratch_store_dwordx2 off, v[4:5], s32 offset:184 ; 8-byte Folded Spill
+; GFX942-O0-NEXT:    v_mov_b64_e32 v[4:5], v[8:9]
+; GFX942-O0-NEXT:    scratch_store_dwordx2 off, v[4:5], s32 offset:192 ; 8-byte Folded Spill
+; GFX942-O0-NEXT:    s_mov_b32 s0, 0x7f
+; GFX942-O0-NEXT:    v_sub_u32_e64 v2, s0, v3
+; GFX942-O0-NEXT:    v_lshlrev_b64 v[4:5], v2, v[10:11]
+; GFX942-O0-NEXT:    v_mov_b32_e32 v12, v5
+; GFX942-O0-NEXT:    s_mov_b32 s0, 64
+; GFX942-O0-NEXT:    v_sub_u32_e64 v13, s0, v2
+; GFX942-O0-NEXT:    v_lshrrev_b64 v[14:15], v13, v[6:7]
+; GFX942-O0-NEXT:    v_mov_b32_e32 v13, v15
+; GFX942-O0-NEXT:    v_or_b32_e64 v12, v12, v13
+; GFX942-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v5, v14
+; GFX942-O0-NEXT:    v_or_b32_e64 v4, v4, v5
+; GFX942-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v5, v12
+; GFX942-O0-NEXT:    v_mov_b32_e32 v14, v5
+; GFX942-O0-NEXT:    v_cmp_lt_u32_e64 s[0:1], v2, s0
+; GFX942-O0-NEXT:    s_mov_b32 s6, 63
+; GFX942-O0-NEXT:    v_sub_u32_e64 v3, s6, v3
+; GFX942-O0-NEXT:    v_lshlrev_b64 v[12:13], v3, v[6:7]
+; GFX942-O0-NEXT:    v_mov_b32_e32 v3, v13
+; GFX942-O0-NEXT:    v_cndmask_b32_e64 v3, v3, v14, s[0:1]
+; GFX942-O0-NEXT:    s_mov_b32 s6, 0
+; GFX942-O0-NEXT:    v_cmp_eq_u32_e64 s[6:7], v2, s6
+; GFX942-O0-NEXT:    v_mov_b32_e32 v14, v11
+; GFX942-O0-NEXT:    s_nop 0
+; GFX942-O0-NEXT:    v_cndmask_b32_e64 v3, v3, v14, s[6:7]
+; GFX942-O0-NEXT:    v_mov_b32_e32 v5, v4
+; GFX942-O0-NEXT:    v_mov_b32_e32 v4, v12
+; GFX942-O0-NEXT:    v_cndmask_b32_e64 v4, v4, v5, s[0:1]
+; GFX942-O0-NEXT:    v_mov_b32_e32 v5, v10
+; GFX942-O0-NEXT:    v_cndmask_b32_e64 v4, v4, v5, s[6:7]
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr6
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr6
+; GFX942-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v5, v3
+; GFX942-O0-NEXT:    v_lshlrev_b64 v[6:7], v2, v[6:7]
+; GFX942-O0-NEXT:    v_mov_b32_e32 v3, v7
+; GFX942-O0-NEXT:    v_mov_b32_e32 v2, s5
+; GFX942-O0-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[0:1]
+; GFX942-O0-NEXT:    ; kill: def $vgpr6 killed $vgpr6 killed $vgpr6_vgpr7 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v3, s4
+; GFX942-O0-NEXT:    v_cndmask_b32_e64 v6, v3, v6, s[0:1]
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0
+; GFX942-O0-NEXT:    ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v7, v2
+; GFX942-O0-NEXT:    scratch_store_dwordx2 off, v[6:7], s32 offset:176 ; 8-byte Folded Spill
+; GFX942-O0-NEXT:    scratch_store_dwordx2 off, v[4:5], s32 offset:168 ; 8-byte Folded Spill
+; GFX942-O0-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-O0-NEXT:    v_mov_b32_e32 v2, v9
+; GFX942-O0-NEXT:    v_or_b32_e64 v2, v2, v3
+; GFX942-O0-NEXT:    v_mov_b32_e32 v1, v0
+; GFX942-O0-NEXT:    v_mov_b32_e32 v0, v8
+; GFX942-O0-NEXT:    v_or_b32_e64 v0, v0, v1
+; GFX942-O0-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v1, v2
+; GFX942-O0-NEXT:    v_cmp_ne_u64_e64 s[0:1], v[0:1], s[2:3]
+; GFX942-O0-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX942-O0-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
+; GFX942-O0-NEXT:    scratch_store_dwordx2 off, v[6:7], s32 offset:24 ; 8-byte Folded Spill
+; GFX942-O0-NEXT:    scratch_store_dwordx2 off, v[4:5], s32 offset:16 ; 8-byte Folded Spill
+; GFX942-O0-NEXT:    scratch_store_dwordx2 off, v[2:3], s32 offset:8 ; 8-byte Folded Spill
+; GFX942-O0-NEXT:    scratch_store_dwordx2 off, v[0:1], s32 ; 8-byte Folded Spill
+; GFX942-O0-NEXT:    s_mov_b64 s[2:3], exec
+; GFX942-O0-NEXT:    s_and_b64 s[0:1], s[2:3], s[0:1]
+; GFX942-O0-NEXT:    s_xor_b64 s[2:3], s[0:1], s[2:3]
+; GFX942-O0-NEXT:    v_writelane_b32 v29, s2, 6
+; GFX942-O0-NEXT:    s_nop 1
+; GFX942-O0-NEXT:    v_writelane_b32 v29, s3, 7
+; GFX942-O0-NEXT:    s_or_saveexec_b64 s[14:15], -1
+; GFX942-O0-NEXT:    v_accvgpr_write_b32 a25, v29 ; Reload Reuse
+; GFX942-O0-NEXT:    s_mov_b64 exec, s[14:15]
+; GFX942-O0-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX942-O0-NEXT:    s_cbranch_execz .LBB1_5
+; GFX942-O0-NEXT:    s_branch .LBB1_7
+; GFX942-O0-NEXT:  .LBB1_9: ; %udiv-end
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v9, a1 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v8, a2 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v1, a3 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v0, a4 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v15, a7 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v14, a8 ; Reload Reuse
+; GFX942-O0-NEXT:    scratch_load_dwordx2 v[12:13], off, s32 offset:48 ; 8-byte Folded Reload
+; GFX942-O0-NEXT:    scratch_load_dwordx2 v[6:7], off, s32 offset:40 ; 8-byte Folded Reload
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v11, a5 ; Reload Reuse
+; GFX942-O0-NEXT:    v_accvgpr_read_b32 v10, a6 ; Reload Reuse
+; GFX942-O0-NEXT:    s_mov_b32 s0, 32
+; GFX942-O0-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-O0-NEXT:    v_lshrrev_b64 v[2:3], s0, v[6:7]
+; GFX942-O0-NEXT:    v_mov_b32_e32 v5, v2
+; GFX942-O0-NEXT:    v_mov_b32_e32 v2, v10
+; GFX942-O0-NEXT:    v_mul_lo_u32 v4, v5, v2
+; GFX942-O0-NEXT:    v_lshrrev_b64 v[10:11], s0, v[10:11]
+; GFX942-O0-NEXT:    v_mov_b32_e32 v3, v10
+; GFX942-O0-NEXT:    v_mov_b32_e32 v10, v6
+; GFX942-O0-NEXT:    v_mul_lo_u32 v3, v10, v3
+; GFX942-O0-NEXT:    v_mad_u64_u32 v[6:7], s[2:3], v10, v2, 0
+; GFX942-O0-NEXT:    v_mov_b32_e32 v2, v7
+; GFX942-O0-NEXT:    v_add3_u32 v2, v2, v3, v4
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr1
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr2
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr2
+; GFX942-O0-NEXT:    v_mov_b32_e32 v4, s1
+; GFX942-O0-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v3, v4
+; GFX942-O0-NEXT:    v_lshlrev_b64 v[16:17], s0, v[2:3]
+; GFX942-O0-NEXT:    v_mov_b32_e32 v3, v17
+; GFX942-O0-NEXT:    ; kill: def $vgpr6 killed $vgpr6 killed $vgpr6_vgpr7 killed $exec
+; GFX942-O0-NEXT:    s_mov_b32 s1, 0
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr2
+; GFX942-O0-NEXT:    v_mov_b32_e32 v2, s1
+; GFX942-O0-NEXT:    ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v7, v2
+; GFX942-O0-NEXT:    v_mov_b32_e32 v2, v7
+; GFX942-O0-NEXT:    v_or_b32_e64 v2, v2, v3
+; GFX942-O0-NEXT:    v_mov_b32_e32 v4, v16
+; GFX942-O0-NEXT:    v_mov_b32_e32 v3, v6
+; GFX942-O0-NEXT:    v_or_b32_e64 v6, v3, v4
+; GFX942-O0-NEXT:    ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v7, v2
+; GFX942-O0-NEXT:    v_lshrrev_b64 v[2:3], s0, v[14:15]
+; GFX942-O0-NEXT:    v_mov_b32_e32 v11, v2
+; GFX942-O0-NEXT:    v_mov_b32_e32 v2, v12
+; GFX942-O0-NEXT:    v_mul_lo_u32 v3, v2, v11
+; GFX942-O0-NEXT:    v_lshrrev_b64 v[12:13], s0, v[12:13]
+; GFX942-O0-NEXT:    ; kill: def $vgpr12 killed $vgpr12 killed $vgpr12_vgpr13 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v4, v14
+; GFX942-O0-NEXT:    v_mul_lo_u32 v12, v12, v4
+; GFX942-O0-NEXT:    v_mad_u64_u32 v[14:15], s[2:3], v2, v4, 0
+; GFX942-O0-NEXT:    v_mov_b32_e32 v2, v15
+; GFX942-O0-NEXT:    v_add3_u32 v2, v2, v3, v12
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr2
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr3
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr3
+; GFX942-O0-NEXT:    v_mov_b32_e32 v12, s2
+; GFX942-O0-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v3, v12
+; GFX942-O0-NEXT:    v_lshlrev_b64 v[2:3], s0, v[2:3]
+; GFX942-O0-NEXT:    v_mov_b32_e32 v13, v3
+; GFX942-O0-NEXT:    ; kill: def $vgpr14 killed $vgpr14 killed $vgpr14_vgpr15 killed $exec
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr2
+; GFX942-O0-NEXT:    v_mov_b32_e32 v12, s1
+; GFX942-O0-NEXT:    ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v15, v12
+; GFX942-O0-NEXT:    v_mov_b32_e32 v12, v15
+; GFX942-O0-NEXT:    v_or_b32_e64 v12, v12, v13
+; GFX942-O0-NEXT:    v_mov_b32_e32 v3, v2
+; GFX942-O0-NEXT:    v_mov_b32_e32 v2, v14
+; GFX942-O0-NEXT:    v_or_b32_e64 v2, v2, v3
+; GFX942-O0-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v3, v12
+; GFX942-O0-NEXT:    v_lshl_add_u64 v[6:7], v[2:3], 0, v[6:7]
+; GFX942-O0-NEXT:    v_mad_u64_u32 v[14:15], s[2:3], v11, v5, 0
+; GFX942-O0-NEXT:    v_mov_b32_e32 v2, v14
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr2
+; GFX942-O0-NEXT:    v_mov_b32_e32 v12, s1
+; GFX942-O0-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v3, v12
+; GFX942-O0-NEXT:    v_mov_b32_e32 v12, v3
+; GFX942-O0-NEXT:    v_mov_b32_e32 v14, v15
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr2
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr3
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr3
+; GFX942-O0-NEXT:    v_mov_b32_e32 v13, s2
+; GFX942-O0-NEXT:    ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v15, v13
+; GFX942-O0-NEXT:    v_lshlrev_b64 v[14:15], s0, v[14:15]
+; GFX942-O0-NEXT:    v_mov_b32_e32 v13, v15
+; GFX942-O0-NEXT:    v_or_b32_e64 v12, v12, v13
+; GFX942-O0-NEXT:    ; kill: def $vgpr2 killed $vgpr2 killed $vgpr2_vgpr3 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v3, v14
+; GFX942-O0-NEXT:    v_or_b32_e64 v2, v2, v3
+; GFX942-O0-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v3, v12
+; GFX942-O0-NEXT:    v_mad_u64_u32 v[14:15], s[2:3], v11, v10, 0
+; GFX942-O0-NEXT:    v_mov_b32_e32 v12, v14
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr2
+; GFX942-O0-NEXT:    v_mov_b32_e32 v11, s1
+; GFX942-O0-NEXT:    ; kill: def $vgpr12 killed $vgpr12 def $vgpr12_vgpr13 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v13, v11
+; GFX942-O0-NEXT:    v_mov_b32_e32 v11, v13
+; GFX942-O0-NEXT:    v_mov_b32_e32 v14, v15
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr2
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr3
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr3
+; GFX942-O0-NEXT:    v_mov_b32_e32 v16, s2
+; GFX942-O0-NEXT:    ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v15, v16
+; GFX942-O0-NEXT:    v_lshlrev_b64 v[14:15], s0, v[14:15]
+; GFX942-O0-NEXT:    v_mov_b32_e32 v16, v15
+; GFX942-O0-NEXT:    v_or_b32_e64 v11, v11, v16
+; GFX942-O0-NEXT:    ; kill: def $vgpr12 killed $vgpr12 killed $vgpr12_vgpr13 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v13, v14
+; GFX942-O0-NEXT:    v_or_b32_e64 v12, v12, v13
+; GFX942-O0-NEXT:    ; kill: def $vgpr12 killed $vgpr12 def $vgpr12_vgpr13 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v13, v11
+; GFX942-O0-NEXT:    v_mad_u64_u32 v[10:11], s[2:3], v4, v10, 0
+; GFX942-O0-NEXT:    v_mov_b32_e32 v14, v11
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr2
+; GFX942-O0-NEXT:    v_mov_b32_e32 v16, s1
+; GFX942-O0-NEXT:    ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v15, v16
+; GFX942-O0-NEXT:    v_lshl_add_u64 v[12:13], v[12:13], 0, v[14:15]
+; GFX942-O0-NEXT:    v_mov_b32_e32 v14, v13
+; GFX942-O0-NEXT:    s_mov_b64 s[2:3], 0xffffffff
+; GFX942-O0-NEXT:    s_mov_b32 s4, s3
+; GFX942-O0-NEXT:    v_and_b32_e64 v16, v14, s4
+; GFX942-O0-NEXT:    v_mov_b32_e32 v14, v12
+; GFX942-O0-NEXT:    ; kill: def $sgpr2 killed $sgpr2 killed $sgpr2_sgpr3
+; GFX942-O0-NEXT:    v_and_b32_e64 v14, v14, s2
+; GFX942-O0-NEXT:    ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v15, v16
+; GFX942-O0-NEXT:    v_mad_u64_u32 v[18:19], s[2:3], v4, v5, 0
+; GFX942-O0-NEXT:    v_mov_b32_e32 v4, v18
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr2
+; GFX942-O0-NEXT:    v_mov_b32_e32 v16, s1
+; GFX942-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v5, v16
+; GFX942-O0-NEXT:    v_mov_b32_e32 v16, v5
+; GFX942-O0-NEXT:    v_mov_b32_e32 v18, v19
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr2
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr3
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr3
+; GFX942-O0-NEXT:    v_mov_b32_e32 v17, s2
+; GFX942-O0-NEXT:    ; kill: def $vgpr18 killed $vgpr18 def $vgpr18_vgpr19 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v19, v17
+; GFX942-O0-NEXT:    v_lshlrev_b64 v[18:19], s0, v[18:19]
+; GFX942-O0-NEXT:    v_mov_b32_e32 v17, v19
+; GFX942-O0-NEXT:    v_or_b32_e64 v16, v16, v17
+; GFX942-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v5, v18
+; GFX942-O0-NEXT:    v_or_b32_e64 v4, v4, v5
+; GFX942-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v5, v16
+; GFX942-O0-NEXT:    v_lshl_add_u64 v[4:5], v[4:5], 0, v[14:15]
+; GFX942-O0-NEXT:    v_lshrrev_b64 v[14:15], s0, v[4:5]
+; GFX942-O0-NEXT:    v_lshrrev_b64 v[12:13], s0, v[12:13]
+; GFX942-O0-NEXT:    v_lshl_add_u64 v[12:13], v[12:13], 0, v[14:15]
+; GFX942-O0-NEXT:    v_lshl_add_u64 v[2:3], v[2:3], 0, v[12:13]
+; GFX942-O0-NEXT:    v_lshl_add_u64 v[2:3], v[2:3], 0, v[6:7]
+; GFX942-O0-NEXT:    v_lshlrev_b64 v[6:7], s0, v[4:5]
+; GFX942-O0-NEXT:    v_mov_b32_e32 v5, v7
+; GFX942-O0-NEXT:    ; kill: def $vgpr10 killed $vgpr10 killed $vgpr10_vgpr11 killed $exec
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr2
+; GFX942-O0-NEXT:    v_mov_b32_e32 v4, s1
+; GFX942-O0-NEXT:    ; kill: def $vgpr10 killed $vgpr10 def $vgpr10_vgpr11 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v11, v4
+; GFX942-O0-NEXT:    v_mov_b32_e32 v4, v11
+; GFX942-O0-NEXT:    v_or_b32_e64 v4, v4, v5
+; GFX942-O0-NEXT:    ; kill: def $vgpr6 killed $vgpr6 killed $vgpr6_vgpr7 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v5, v10
+; GFX942-O0-NEXT:    v_or_b32_e64 v10, v5, v6
+; GFX942-O0-NEXT:    ; kill: def $vgpr10 killed $vgpr10 def $vgpr10_vgpr11 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v11, v4
+; GFX942-O0-NEXT:    v_mov_b32_e32 v4, v2
+; GFX942-O0-NEXT:    v_mov_b32_e32 v2, v3
+; GFX942-O0-NEXT:    v_mov_b32_e32 v7, v10
+; GFX942-O0-NEXT:    v_mov_b32_e32 v5, v11
+; GFX942-O0-NEXT:    v_mov_b32_e32 v6, v0
+; GFX942-O0-NEXT:    v_mov_b32_e32 v0, v1
+; GFX942-O0-NEXT:    v_mov_b32_e32 v3, v8
+; GFX942-O0-NEXT:    v_mov_b32_e32 v1, v9
+; GFX942-O0-NEXT:    v_sub_co_u32_e32 v6, vcc, v6, v7
+; GFX942-O0-NEXT:    s_nop 1
+; GFX942-O0-NEXT:    v_subb_co_u32_e32 v0, vcc, v0, v5, vcc
+; GFX942-O0-NEXT:    s_nop 1
+; GFX942-O0-NEXT:    v_subb_co_u32_e32 v4, vcc, v3, v4, vcc
+; GFX942-O0-NEXT:    s_nop 1
+; GFX942-O0-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v2, vcc
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr1
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr1
+; GFX942-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v5, v1
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr1
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr1
+; GFX942-O0-NEXT:    ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v7, v0
+; GFX942-O0-NEXT:    v_mov_b32_e32 v0, v6
+; GFX942-O0-NEXT:    v_mov_b32_e32 v2, v4
+; GFX942-O0-NEXT:    v_lshrrev_b64 v[6:7], s0, v[6:7]
+; GFX942-O0-NEXT:    v_mov_b32_e32 v1, v6
+; GFX942-O0-NEXT:    v_lshrrev_b64 v[4:5], s0, v[4:5]
+; GFX942-O0-NEXT:    v_mov_b32_e32 v3, v4
+; GFX942-O0-NEXT:    s_xor_saveexec_b64 s[0:1], -1
+; GFX942-O0-NEXT:    scratch_load_dword v29, off, s32 offset:200 ; 4-byte Folded Reload
+; GFX942-O0-NEXT:    s_mov_b64 exec, s[0:1]
+; GFX942-O0-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-O0-NEXT:    s_setpc_b64 s[30:31]
   %div = urem i128 %lhs, %rhs
   ret i128 %div
 }
@@ -2937,6 +5469,114 @@ define i128 @v_srem_i128_v_pow2k(i128 %lhs) {
 ; GFX9-O0-NEXT:    v_lshrrev_b64 v[3:4], s4, v[3:4]
 ; GFX9-O0-NEXT:    ; kill: def $vgpr3 killed $vgpr3 killed $vgpr3_vgpr4 killed $exec
 ; GFX9-O0-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: v_srem_i128_v_pow2k:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_ashrrev_i32_e32 v4, 31, v3
+; GFX942-NEXT:    v_mov_b32_e32 v5, v4
+; GFX942-NEXT:    v_lshrrev_b64 v[4:5], 31, v[4:5]
+; GFX942-NEXT:    v_add_co_u32_e32 v4, vcc, v0, v4
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_addc_co_u32_e32 v4, vcc, v1, v5, vcc
+; GFX942-NEXT:    v_and_b32_e32 v4, -2, v4
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v2, vcc
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_addc_co_u32_e32 v6, vcc, 0, v3, vcc
+; GFX942-NEXT:    v_subrev_co_u32_e32 v0, vcc, 0, v0
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v4, vcc
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_subb_co_u32_e32 v2, vcc, v2, v5, vcc
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_subb_co_u32_e32 v3, vcc, v3, v6, vcc
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-O0-LABEL: v_srem_i128_v_pow2k:
+; GFX942-O0:       ; %bb.0:
+; GFX942-O0-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-O0-NEXT:    v_mov_b32_e32 v4, v2
+; GFX942-O0-NEXT:    v_mov_b32_e32 v2, v1
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0
+; GFX942-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v5, v3
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0
+; GFX942-O0-NEXT:    ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v1, v2
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0_sgpr1
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0_sgpr1
+; GFX942-O0-NEXT:    v_mov_b32_e32 v6, v0
+; GFX942-O0-NEXT:    v_mov_b32_e32 v0, v1
+; GFX942-O0-NEXT:    v_mov_b32_e32 v3, v4
+; GFX942-O0-NEXT:    v_mov_b32_e32 v1, v5
+; GFX942-O0-NEXT:    s_mov_b32 s0, 63
+; GFX942-O0-NEXT:    v_ashrrev_i64 v[4:5], s0, v[4:5]
+; GFX942-O0-NEXT:    s_mov_b32 s0, 31
+; GFX942-O0-NEXT:    v_lshrrev_b64 v[8:9], s0, v[4:5]
+; GFX942-O0-NEXT:    v_mov_b32_e32 v4, v8
+; GFX942-O0-NEXT:    v_mov_b32_e32 v2, v9
+; GFX942-O0-NEXT:    s_mov_b64 s[2:3], 0
+; GFX942-O0-NEXT:    s_mov_b32 s1, s2
+; GFX942-O0-NEXT:    s_mov_b32 s0, s3
+; GFX942-O0-NEXT:    v_add_co_u32_e32 v4, vcc, v6, v4
+; GFX942-O0-NEXT:    s_nop 1
+; GFX942-O0-NEXT:    v_addc_co_u32_e32 v7, vcc, v0, v2, vcc
+; GFX942-O0-NEXT:    v_mov_b32_e32 v2, s1
+; GFX942-O0-NEXT:    s_nop 0
+; GFX942-O0-NEXT:    v_addc_co_u32_e32 v8, vcc, v3, v2, vcc
+; GFX942-O0-NEXT:    v_mov_b32_e32 v2, s0
+; GFX942-O0-NEXT:    s_nop 0
+; GFX942-O0-NEXT:    v_addc_co_u32_e32 v2, vcc, v1, v2, vcc
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0
+; GFX942-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v5, v7
+; GFX942-O0-NEXT:    v_mov_b32_e32 v7, v5
+; GFX942-O0-NEXT:    s_mov_b32 s2, -2
+; GFX942-O0-NEXT:    s_mov_b32 s0, 0
+; GFX942-O0-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; GFX942-O0-NEXT:    s_mov_b32 s1, s2
+; GFX942-O0-NEXT:    s_mov_b32 s2, s1
+; GFX942-O0-NEXT:    v_and_b32_e64 v7, v7, s2
+; GFX942-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec
+; GFX942-O0-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; GFX942-O0-NEXT:    v_and_b32_e64 v4, v4, s0
+; GFX942-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v5, v7
+; GFX942-O0-NEXT:    v_mov_b32_e32 v7, v4
+; GFX942-O0-NEXT:    ; kill: def $vgpr5 killed $vgpr5 killed $vgpr4_vgpr5 killed $exec
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0
+; GFX942-O0-NEXT:    ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v9, v2
+; GFX942-O0-NEXT:    v_mov_b32_e32 v4, v8
+; GFX942-O0-NEXT:    v_mov_b32_e32 v2, v9
+; GFX942-O0-NEXT:    v_sub_co_u32_e32 v6, vcc, v6, v7
+; GFX942-O0-NEXT:    s_nop 1
+; GFX942-O0-NEXT:    v_subb_co_u32_e32 v0, vcc, v0, v5, vcc
+; GFX942-O0-NEXT:    s_nop 1
+; GFX942-O0-NEXT:    v_subb_co_u32_e32 v4, vcc, v3, v4, vcc
+; GFX942-O0-NEXT:    s_nop 1
+; GFX942-O0-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v2, vcc
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0
+; GFX942-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v5, v1
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0
+; GFX942-O0-NEXT:    ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v7, v0
+; GFX942-O0-NEXT:    v_mov_b32_e32 v0, v6
+; GFX942-O0-NEXT:    v_mov_b32_e32 v2, v4
+; GFX942-O0-NEXT:    s_mov_b32 s0, 32
+; GFX942-O0-NEXT:    v_lshrrev_b64 v[6:7], s0, v[6:7]
+; GFX942-O0-NEXT:    v_mov_b32_e32 v1, v6
+; GFX942-O0-NEXT:    v_lshrrev_b64 v[4:5], s0, v[4:5]
+; GFX942-O0-NEXT:    v_mov_b32_e32 v3, v4
+; GFX942-O0-NEXT:    s_setpc_b64 s[30:31]
   %div = srem i128 %lhs, 8589934592
   ret i128 %div
 }
@@ -2986,6 +5626,48 @@ define i128 @v_urem_i128_v_pow2k(i128 %lhs) {
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX9-O0-NEXT:    v_mov_b32_e32 v2, v3
 ; GFX9-O0-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: v_urem_i128_v_pow2k:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_and_b32_e32 v1, 1, v1
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-NEXT:    v_mov_b32_e32 v3, 0
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-O0-LABEL: v_urem_i128_v_pow2k:
+; GFX942-O0:       ; %bb.0:
+; GFX942-O0-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-O0-NEXT:    v_mov_b32_e32 v4, v2
+; GFX942-O0-NEXT:    v_mov_b32_e32 v2, v0
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0
+; GFX942-O0-NEXT:    ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
+; GFX942-O0-NEXT:    ; kill: def $vgpr5 killed $vgpr3 killed $exec
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0
+; GFX942-O0-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0_sgpr1
+; GFX942-O0-NEXT:    ; implicit-def: $sgpr0_sgpr1
+; GFX942-O0-NEXT:    s_mov_b32 s2, 1
+; GFX942-O0-NEXT:    s_mov_b32 s0, -1
+; GFX942-O0-NEXT:    ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1
+; GFX942-O0-NEXT:    s_mov_b32 s1, s2
+; GFX942-O0-NEXT:    s_mov_b32 s2, s1
+; GFX942-O0-NEXT:    v_mov_b32_e32 v0, v3
+; GFX942-O0-NEXT:    v_and_b32_e64 v1, v0, s2
+; GFX942-O0-NEXT:    ; kill: def $sgpr0 killed $sgpr0 killed $sgpr0_sgpr1
+; GFX942-O0-NEXT:    v_mov_b32_e32 v0, v2
+; GFX942-O0-NEXT:    v_and_b32_e64 v2, v0, s0
+; GFX942-O0-NEXT:    ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec
+; GFX942-O0-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-O0-NEXT:    s_mov_b32 s0, 32
+; GFX942-O0-NEXT:    v_lshrrev_b64 v[2:3], s0, v[2:3]
+; GFX942-O0-NEXT:    v_mov_b32_e32 v1, v2
+; GFX942-O0-NEXT:    v_mov_b32_e32 v3, 0
+; GFX942-O0-NEXT:    v_mov_b32_e32 v2, v3
+; GFX942-O0-NEXT:    s_setpc_b64 s[30:31]
   %div = urem i128 %lhs, 8589934592
   ret i128 %div
 }
@@ -2993,3 +5675,5 @@ define i128 @v_urem_i128_v_pow2k(i128 %lhs) {
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
 ; GFX9-SDAG: {{.*}}
 ; GFX9-SDAG-O0: {{.*}}
+; GFX942-SDAG: {{.*}}
+; GFX942-SDAG-O0: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/sdiv64.ll b/llvm/test/CodeGen/AMDGPU/sdiv64.ll
index 4addf42b27984..7391a909ff5eb 100644
--- a/llvm/test/CodeGen/AMDGPU/sdiv64.ll
+++ b/llvm/test/CodeGen/AMDGPU/sdiv64.ll
@@ -1,6 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx600 -amdgpu-bypass-slow-div=0 < %s | FileCheck -check-prefix=GCN %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx600 -amdgpu-bypass-slow-div=0 -amdgpu-codegenprepare-expand-div64 < %s | FileCheck -check-prefix=GCN-IR %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -amdgpu-bypass-slow-div=0 < %s | FileCheck -check-prefix=GFX942 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -amdgpu-bypass-slow-div=0 -amdgpu-codegenprepare-expand-div64 < %s | FileCheck -check-prefix=GFX942-IR %s
 
 define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) {
 ; GCN-LABEL: s_test_sdiv:
@@ -223,6 +225,244 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) {
 ; GCN-IR-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN-IR-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GCN-IR-NEXT:    s_endpgm
+;
+; GFX942-LABEL: s_test_sdiv:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
+; GFX942-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x24
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_ashr_i32 s2, s1, 31
+; GFX942-NEXT:    s_add_u32 s0, s0, s2
+; GFX942-NEXT:    s_mov_b32 s3, s2
+; GFX942-NEXT:    s_addc_u32 s1, s1, s2
+; GFX942-NEXT:    s_xor_b64 s[6:7], s[0:1], s[2:3]
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v0, s6
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v1, s7
+; GFX942-NEXT:    s_sub_u32 s0, 0, s6
+; GFX942-NEXT:    s_subb_u32 s1, 0, s7
+; GFX942-NEXT:    v_fmamk_f32 v0, v1, 0x4f800000, v0
+; GFX942-NEXT:    v_rcp_f32_e32 v1, v0
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    v_mul_f32_e32 v1, 0x5f7ffffc, v1
+; GFX942-NEXT:    v_mul_f32_e32 v2, 0x2f800000, v1
+; GFX942-NEXT:    v_trunc_f32_e32 v2, v2
+; GFX942-NEXT:    v_fmamk_f32 v1, v2, 0xcf800000, v1
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX942-NEXT:    v_readfirstlane_b32 s4, v2
+; GFX942-NEXT:    v_readfirstlane_b32 s5, v1
+; GFX942-NEXT:    s_mul_i32 s12, s0, s4
+; GFX942-NEXT:    s_mul_hi_u32 s14, s0, s5
+; GFX942-NEXT:    s_mul_i32 s13, s1, s5
+; GFX942-NEXT:    s_add_i32 s12, s14, s12
+; GFX942-NEXT:    s_mul_i32 s15, s0, s5
+; GFX942-NEXT:    s_add_i32 s12, s12, s13
+; GFX942-NEXT:    s_mul_hi_u32 s14, s5, s15
+; GFX942-NEXT:    s_mul_hi_u32 s13, s5, s12
+; GFX942-NEXT:    s_mul_i32 s5, s5, s12
+; GFX942-NEXT:    s_add_u32 s5, s14, s5
+; GFX942-NEXT:    s_addc_u32 s13, 0, s13
+; GFX942-NEXT:    s_mul_hi_u32 s16, s4, s15
+; GFX942-NEXT:    s_mul_i32 s15, s4, s15
+; GFX942-NEXT:    s_add_u32 s5, s5, s15
+; GFX942-NEXT:    s_mul_hi_u32 s14, s4, s12
+; GFX942-NEXT:    s_addc_u32 s5, s13, s16
+; GFX942-NEXT:    s_addc_u32 s13, s14, 0
+; GFX942-NEXT:    s_mul_i32 s12, s4, s12
+; GFX942-NEXT:    s_add_u32 s5, s5, s12
+; GFX942-NEXT:    s_addc_u32 s12, 0, s13
+; GFX942-NEXT:    v_add_co_u32_e32 v1, vcc, s5, v1
+; GFX942-NEXT:    s_cmp_lg_u64 vcc, 0
+; GFX942-NEXT:    s_addc_u32 s4, s4, s12
+; GFX942-NEXT:    v_readfirstlane_b32 s12, v1
+; GFX942-NEXT:    s_mul_i32 s5, s0, s4
+; GFX942-NEXT:    s_mul_hi_u32 s13, s0, s12
+; GFX942-NEXT:    s_add_i32 s5, s13, s5
+; GFX942-NEXT:    s_mul_i32 s1, s1, s12
+; GFX942-NEXT:    s_add_i32 s5, s5, s1
+; GFX942-NEXT:    s_mul_i32 s0, s0, s12
+; GFX942-NEXT:    s_mul_hi_u32 s13, s4, s0
+; GFX942-NEXT:    s_mul_i32 s14, s4, s0
+; GFX942-NEXT:    s_mul_i32 s16, s12, s5
+; GFX942-NEXT:    s_mul_hi_u32 s0, s12, s0
+; GFX942-NEXT:    s_mul_hi_u32 s15, s12, s5
+; GFX942-NEXT:    s_add_u32 s0, s0, s16
+; GFX942-NEXT:    s_addc_u32 s12, 0, s15
+; GFX942-NEXT:    s_add_u32 s0, s0, s14
+; GFX942-NEXT:    s_mul_hi_u32 s1, s4, s5
+; GFX942-NEXT:    s_addc_u32 s0, s12, s13
+; GFX942-NEXT:    s_addc_u32 s1, s1, 0
+; GFX942-NEXT:    s_mul_i32 s5, s4, s5
+; GFX942-NEXT:    s_add_u32 s0, s0, s5
+; GFX942-NEXT:    s_addc_u32 s1, 0, s1
+; GFX942-NEXT:    v_add_co_u32_e32 v1, vcc, s0, v1
+; GFX942-NEXT:    s_cmp_lg_u64 vcc, 0
+; GFX942-NEXT:    s_addc_u32 s12, s4, s1
+; GFX942-NEXT:    s_ashr_i32 s4, s11, 31
+; GFX942-NEXT:    s_add_u32 s0, s10, s4
+; GFX942-NEXT:    s_mov_b32 s5, s4
+; GFX942-NEXT:    s_addc_u32 s1, s11, s4
+; GFX942-NEXT:    s_xor_b64 s[10:11], s[0:1], s[4:5]
+; GFX942-NEXT:    v_readfirstlane_b32 s13, v1
+; GFX942-NEXT:    s_mul_i32 s1, s10, s12
+; GFX942-NEXT:    s_mul_hi_u32 s14, s10, s13
+; GFX942-NEXT:    s_mul_hi_u32 s0, s10, s12
+; GFX942-NEXT:    s_add_u32 s1, s14, s1
+; GFX942-NEXT:    s_addc_u32 s0, 0, s0
+; GFX942-NEXT:    s_mul_hi_u32 s15, s11, s13
+; GFX942-NEXT:    s_mul_i32 s13, s11, s13
+; GFX942-NEXT:    s_add_u32 s1, s1, s13
+; GFX942-NEXT:    s_mul_hi_u32 s14, s11, s12
+; GFX942-NEXT:    s_addc_u32 s0, s0, s15
+; GFX942-NEXT:    s_addc_u32 s1, s14, 0
+; GFX942-NEXT:    s_mul_i32 s12, s11, s12
+; GFX942-NEXT:    s_add_u32 s12, s0, s12
+; GFX942-NEXT:    s_addc_u32 s13, 0, s1
+; GFX942-NEXT:    s_mul_i32 s0, s6, s13
+; GFX942-NEXT:    s_mul_hi_u32 s1, s6, s12
+; GFX942-NEXT:    s_add_i32 s0, s1, s0
+; GFX942-NEXT:    s_mul_i32 s1, s7, s12
+; GFX942-NEXT:    s_add_i32 s14, s0, s1
+; GFX942-NEXT:    s_mul_i32 s1, s6, s12
+; GFX942-NEXT:    v_mov_b32_e32 v1, s1
+; GFX942-NEXT:    s_sub_i32 s0, s11, s14
+; GFX942-NEXT:    v_sub_co_u32_e32 v1, vcc, s10, v1
+; GFX942-NEXT:    s_cmp_lg_u64 vcc, 0
+; GFX942-NEXT:    s_subb_u32 s10, s0, s7
+; GFX942-NEXT:    v_subrev_co_u32_e64 v2, s[0:1], s6, v1
+; GFX942-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX942-NEXT:    s_subb_u32 s10, s10, 0
+; GFX942-NEXT:    s_cmp_ge_u32 s10, s7
+; GFX942-NEXT:    s_cselect_b32 s15, -1, 0
+; GFX942-NEXT:    v_cmp_le_u32_e64 s[0:1], s6, v2
+; GFX942-NEXT:    s_cmp_eq_u32 s10, s7
+; GFX942-NEXT:    v_mov_b32_e32 v3, s15
+; GFX942-NEXT:    v_cndmask_b32_e64 v2, 0, -1, s[0:1]
+; GFX942-NEXT:    s_cselect_b64 s[0:1], -1, 0
+; GFX942-NEXT:    v_cndmask_b32_e64 v2, v3, v2, s[0:1]
+; GFX942-NEXT:    s_add_u32 s0, s12, 1
+; GFX942-NEXT:    s_addc_u32 s10, s13, 0
+; GFX942-NEXT:    s_add_u32 s1, s12, 2
+; GFX942-NEXT:    s_addc_u32 s15, s13, 0
+; GFX942-NEXT:    v_mov_b32_e32 v3, s0
+; GFX942-NEXT:    v_mov_b32_e32 v4, s1
+; GFX942-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v2
+; GFX942-NEXT:    s_cmp_lg_u64 vcc, 0
+; GFX942-NEXT:    v_cmp_le_u32_e32 vcc, s6, v1
+; GFX942-NEXT:    v_cndmask_b32_e64 v2, v3, v4, s[0:1]
+; GFX942-NEXT:    v_mov_b32_e32 v3, s10
+; GFX942-NEXT:    v_mov_b32_e32 v4, s15
+; GFX942-NEXT:    v_cndmask_b32_e64 v3, v3, v4, s[0:1]
+; GFX942-NEXT:    s_subb_u32 s0, s11, s14
+; GFX942-NEXT:    s_cmp_ge_u32 s0, s7
+; GFX942-NEXT:    s_cselect_b32 s1, -1, 0
+; GFX942-NEXT:    s_cmp_eq_u32 s0, s7
+; GFX942-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
+; GFX942-NEXT:    v_mov_b32_e32 v4, s1
+; GFX942-NEXT:    s_cselect_b64 vcc, -1, 0
+; GFX942-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
+; GFX942-NEXT:    v_mov_b32_e32 v4, s13
+; GFX942-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
+; GFX942-NEXT:    s_xor_b64 s[0:1], s[4:5], s[2:3]
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
+; GFX942-NEXT:    v_mov_b32_e32 v3, s12
+; GFX942-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
+; GFX942-NEXT:    v_xor_b32_e32 v2, s0, v2
+; GFX942-NEXT:    v_xor_b32_e32 v1, s1, v1
+; GFX942-NEXT:    v_mov_b32_e32 v3, s1
+; GFX942-NEXT:    v_subrev_co_u32_e32 v2, vcc, s0, v2
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_subb_co_u32_e32 v3, vcc, v1, v3, vcc
+; GFX942-NEXT:    global_store_dwordx2 v0, v[2:3], s[8:9]
+; GFX942-NEXT:    s_endpgm
+;
+; GFX942-IR-LABEL: s_test_sdiv:
+; GFX942-IR:       ; %bb.0: ; %_udiv-special-cases
+; GFX942-IR-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-IR-NEXT:    s_load_dwordx2 s[8:9], s[4:5], 0x34
+; GFX942-IR-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-IR-NEXT:    s_ashr_i32 s4, s3, 31
+; GFX942-IR-NEXT:    s_mov_b32 s5, s4
+; GFX942-IR-NEXT:    s_ashr_i32 s6, s9, 31
+; GFX942-IR-NEXT:    s_xor_b64 s[2:3], s[2:3], s[4:5]
+; GFX942-IR-NEXT:    s_mov_b32 s7, s6
+; GFX942-IR-NEXT:    s_sub_u32 s12, s2, s4
+; GFX942-IR-NEXT:    s_subb_u32 s13, s3, s4
+; GFX942-IR-NEXT:    s_xor_b64 s[2:3], s[8:9], s[6:7]
+; GFX942-IR-NEXT:    s_sub_u32 s2, s2, s6
+; GFX942-IR-NEXT:    s_subb_u32 s3, s3, s6
+; GFX942-IR-NEXT:    s_cmp_eq_u64 s[2:3], 0
+; GFX942-IR-NEXT:    s_cselect_b64 s[10:11], -1, 0
+; GFX942-IR-NEXT:    s_cmp_eq_u64 s[12:13], 0
+; GFX942-IR-NEXT:    s_cselect_b64 s[14:15], -1, 0
+; GFX942-IR-NEXT:    s_or_b64 s[16:17], s[10:11], s[14:15]
+; GFX942-IR-NEXT:    s_flbit_i32_b64 s14, s[2:3]
+; GFX942-IR-NEXT:    s_flbit_i32_b64 s20, s[12:13]
+; GFX942-IR-NEXT:    s_sub_u32 s10, s14, s20
+; GFX942-IR-NEXT:    s_subb_u32 s11, 0, 0
+; GFX942-IR-NEXT:    v_cmp_gt_u64_e64 s[18:19], s[10:11], 63
+; GFX942-IR-NEXT:    s_or_b64 s[18:19], s[16:17], s[18:19]
+; GFX942-IR-NEXT:    s_cmp_eq_u64 s[10:11], 63
+; GFX942-IR-NEXT:    s_cselect_b64 s[22:23], -1, 0
+; GFX942-IR-NEXT:    s_and_b64 s[16:17], s[18:19], exec
+; GFX942-IR-NEXT:    s_cselect_b32 s17, 0, s13
+; GFX942-IR-NEXT:    s_cselect_b32 s16, 0, s12
+; GFX942-IR-NEXT:    s_or_b64 s[18:19], s[18:19], s[22:23]
+; GFX942-IR-NEXT:    s_mov_b64 s[8:9], 0
+; GFX942-IR-NEXT:    s_andn2_b64 vcc, exec, s[18:19]
+; GFX942-IR-NEXT:    s_mov_b32 s15, 0
+; GFX942-IR-NEXT:    s_cbranch_vccz .LBB0_5
+; GFX942-IR-NEXT:  ; %bb.1: ; %udiv-bb1
+; GFX942-IR-NEXT:    s_add_u32 s16, s10, 1
+; GFX942-IR-NEXT:    s_addc_u32 s17, s11, 0
+; GFX942-IR-NEXT:    s_cmp_eq_u64 s[16:17], 0
+; GFX942-IR-NEXT:    s_cselect_b64 s[18:19], -1, 0
+; GFX942-IR-NEXT:    s_sub_i32 s10, 63, s10
+; GFX942-IR-NEXT:    s_andn2_b64 vcc, exec, s[18:19]
+; GFX942-IR-NEXT:    s_lshl_b64 s[10:11], s[12:13], s10
+; GFX942-IR-NEXT:    s_cbranch_vccz .LBB0_4
+; GFX942-IR-NEXT:  ; %bb.2: ; %udiv-preheader
+; GFX942-IR-NEXT:    s_lshr_b64 s[16:17], s[12:13], s16
+; GFX942-IR-NEXT:    s_add_u32 s18, s2, -1
+; GFX942-IR-NEXT:    s_addc_u32 s19, s3, -1
+; GFX942-IR-NEXT:    s_not_b64 s[8:9], s[14:15]
+; GFX942-IR-NEXT:    s_add_u32 s12, s8, s20
+; GFX942-IR-NEXT:    s_addc_u32 s13, s9, 0
+; GFX942-IR-NEXT:    s_mov_b64 s[14:15], 0
+; GFX942-IR-NEXT:    s_mov_b32 s9, 0
+; GFX942-IR-NEXT:  .LBB0_3: ; %udiv-do-while
+; GFX942-IR-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-IR-NEXT:    s_lshl_b64 s[16:17], s[16:17], 1
+; GFX942-IR-NEXT:    s_lshr_b32 s8, s11, 31
+; GFX942-IR-NEXT:    s_lshl_b64 s[10:11], s[10:11], 1
+; GFX942-IR-NEXT:    s_or_b64 s[16:17], s[16:17], s[8:9]
+; GFX942-IR-NEXT:    s_or_b64 s[10:11], s[14:15], s[10:11]
+; GFX942-IR-NEXT:    s_sub_u32 s8, s18, s16
+; GFX942-IR-NEXT:    s_subb_u32 s8, s19, s17
+; GFX942-IR-NEXT:    s_ashr_i32 s14, s8, 31
+; GFX942-IR-NEXT:    s_mov_b32 s15, s14
+; GFX942-IR-NEXT:    s_and_b32 s8, s14, 1
+; GFX942-IR-NEXT:    s_and_b64 s[14:15], s[14:15], s[2:3]
+; GFX942-IR-NEXT:    s_sub_u32 s16, s16, s14
+; GFX942-IR-NEXT:    s_subb_u32 s17, s17, s15
+; GFX942-IR-NEXT:    s_add_u32 s12, s12, 1
+; GFX942-IR-NEXT:    s_addc_u32 s13, s13, 0
+; GFX942-IR-NEXT:    s_cmp_eq_u64 s[12:13], 0
+; GFX942-IR-NEXT:    s_mov_b64 s[14:15], s[8:9]
+; GFX942-IR-NEXT:    s_cbranch_scc0 .LBB0_3
+; GFX942-IR-NEXT:  .LBB0_4: ; %Flow7
+; GFX942-IR-NEXT:    s_lshl_b64 s[2:3], s[10:11], 1
+; GFX942-IR-NEXT:    s_or_b64 s[16:17], s[8:9], s[2:3]
+; GFX942-IR-NEXT:  .LBB0_5: ; %udiv-end
+; GFX942-IR-NEXT:    s_xor_b64 s[2:3], s[6:7], s[4:5]
+; GFX942-IR-NEXT:    s_xor_b64 s[4:5], s[16:17], s[2:3]
+; GFX942-IR-NEXT:    s_sub_u32 s2, s4, s2
+; GFX942-IR-NEXT:    s_subb_u32 s3, s5, s3
+; GFX942-IR-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-IR-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX942-IR-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX942-IR-NEXT:    s_endpgm
   %result = sdiv i64 %x, %y
   store i64 %result, ptr addrspace(1) %out
   ret void
@@ -452,6 +692,227 @@ define i64 @v_test_sdiv(i64 %x, i64 %y) {
 ; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, v3, v0
 ; GCN-IR-NEXT:    v_subb_u32_e32 v1, vcc, v2, v1, vcc
 ; GCN-IR-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: v_test_sdiv:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_ashrrev_i32_e32 v4, 31, v3
+; GFX942-NEXT:    v_mov_b32_e32 v5, v4
+; GFX942-NEXT:    v_lshl_add_u64 v[2:3], v[2:3], 0, v[4:5]
+; GFX942-NEXT:    v_xor_b32_e32 v5, v3, v4
+; GFX942-NEXT:    v_xor_b32_e32 v14, v2, v4
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v2, v14
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v3, v5
+; GFX942-NEXT:    v_sub_co_u32_e32 v13, vcc, 0, v14
+; GFX942-NEXT:    v_mov_b32_e32 v9, 0
+; GFX942-NEXT:    v_fmamk_f32 v2, v3, 0x4f800000, v2
+; GFX942-NEXT:    v_rcp_f32_e32 v2, v2
+; GFX942-NEXT:    v_subb_co_u32_e32 v15, vcc, 0, v5, vcc
+; GFX942-NEXT:    v_mul_f32_e32 v2, 0x5f7ffffc, v2
+; GFX942-NEXT:    v_mul_f32_e32 v3, 0x2f800000, v2
+; GFX942-NEXT:    v_trunc_f32_e32 v3, v3
+; GFX942-NEXT:    v_fmamk_f32 v2, v3, 0xcf800000, v2
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v12, v2
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v16, v3
+; GFX942-NEXT:    v_mul_lo_u32 v6, v15, v12
+; GFX942-NEXT:    v_mad_u64_u32 v[2:3], s[0:1], v13, v12, 0
+; GFX942-NEXT:    v_mul_lo_u32 v7, v13, v16
+; GFX942-NEXT:    v_add3_u32 v3, v3, v7, v6
+; GFX942-NEXT:    v_mad_u64_u32 v[6:7], s[0:1], v12, v3, 0
+; GFX942-NEXT:    v_mul_hi_u32 v8, v12, v2
+; GFX942-NEXT:    v_lshl_add_u64 v[6:7], v[8:9], 0, v[6:7]
+; GFX942-NEXT:    v_mad_u64_u32 v[10:11], s[0:1], v16, v3, 0
+; GFX942-NEXT:    v_mad_u64_u32 v[2:3], s[0:1], v16, v2, 0
+; GFX942-NEXT:    v_add_co_u32_e32 v2, vcc, v6, v2
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_addc_co_u32_e32 v8, vcc, v7, v3, vcc
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_addc_co_u32_e32 v11, vcc, 0, v11, vcc
+; GFX942-NEXT:    v_lshl_add_u64 v[2:3], v[8:9], 0, v[10:11]
+; GFX942-NEXT:    v_add_co_u32_e32 v17, vcc, v12, v2
+; GFX942-NEXT:    v_mul_lo_u32 v7, v15, v17
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_addc_co_u32_e32 v16, vcc, v16, v3, vcc
+; GFX942-NEXT:    v_mul_lo_u32 v6, v13, v16
+; GFX942-NEXT:    v_mad_u64_u32 v[2:3], s[0:1], v13, v17, 0
+; GFX942-NEXT:    v_add3_u32 v3, v3, v6, v7
+; GFX942-NEXT:    v_mad_u64_u32 v[12:13], s[0:1], v17, v3, 0
+; GFX942-NEXT:    v_mul_hi_u32 v8, v17, v2
+; GFX942-NEXT:    v_mad_u64_u32 v[6:7], s[0:1], v16, v3, 0
+; GFX942-NEXT:    v_mad_u64_u32 v[10:11], s[0:1], v16, v2, 0
+; GFX942-NEXT:    v_lshl_add_u64 v[2:3], v[8:9], 0, v[12:13]
+; GFX942-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v10
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_addc_co_u32_e32 v8, vcc, v3, v11, vcc
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v7, vcc
+; GFX942-NEXT:    v_lshl_add_u64 v[2:3], v[8:9], 0, v[6:7]
+; GFX942-NEXT:    v_add_co_u32_e32 v10, vcc, v17, v2
+; GFX942-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_addc_co_u32_e32 v6, vcc, v16, v3, vcc
+; GFX942-NEXT:    v_mov_b32_e32 v3, v2
+; GFX942-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
+; GFX942-NEXT:    v_xor_b32_e32 v12, v0, v2
+; GFX942-NEXT:    v_xor_b32_e32 v3, v1, v2
+; GFX942-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], v12, v6, 0
+; GFX942-NEXT:    v_mul_hi_u32 v8, v12, v10
+; GFX942-NEXT:    v_lshl_add_u64 v[0:1], v[8:9], 0, v[0:1]
+; GFX942-NEXT:    v_mad_u64_u32 v[10:11], s[0:1], v3, v10, 0
+; GFX942-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v10
+; GFX942-NEXT:    v_mad_u64_u32 v[6:7], s[0:1], v3, v6, 0
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_addc_co_u32_e32 v8, vcc, v1, v11, vcc
+; GFX942-NEXT:    v_xor_b32_e32 v2, v2, v4
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v7, vcc
+; GFX942-NEXT:    v_lshl_add_u64 v[0:1], v[8:9], 0, v[6:7]
+; GFX942-NEXT:    v_mul_lo_u32 v8, v5, v0
+; GFX942-NEXT:    v_mul_lo_u32 v9, v14, v1
+; GFX942-NEXT:    v_mad_u64_u32 v[6:7], s[0:1], v14, v0, 0
+; GFX942-NEXT:    v_add3_u32 v10, v7, v9, v8
+; GFX942-NEXT:    v_sub_u32_e32 v7, v3, v10
+; GFX942-NEXT:    v_sub_co_u32_e32 v11, vcc, v12, v6
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_subb_co_u32_e64 v6, s[0:1], v7, v5, vcc
+; GFX942-NEXT:    v_sub_co_u32_e64 v7, s[0:1], v11, v14
+; GFX942-NEXT:    v_subb_co_u32_e32 v3, vcc, v3, v10, vcc
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_subbrev_co_u32_e64 v6, s[0:1], 0, v6, s[0:1]
+; GFX942-NEXT:    v_cmp_ge_u32_e64 s[0:1], v6, v5
+; GFX942-NEXT:    v_cmp_ge_u32_e32 vcc, v3, v5
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[0:1]
+; GFX942-NEXT:    v_cmp_ge_u32_e64 s[0:1], v7, v14
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[0:1]
+; GFX942-NEXT:    v_cmp_eq_u32_e64 s[0:1], v6, v5
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e64 v12, v8, v7, s[0:1]
+; GFX942-NEXT:    v_lshl_add_u64 v[6:7], v[0:1], 0, 2
+; GFX942-NEXT:    v_lshl_add_u64 v[8:9], v[0:1], 0, 1
+; GFX942-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v12
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e64 v7, v9, v7, s[0:1]
+; GFX942-NEXT:    v_cndmask_b32_e64 v9, 0, -1, vcc
+; GFX942-NEXT:    v_cmp_ge_u32_e32 vcc, v11, v14
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e64 v10, 0, -1, vcc
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, v3, v5
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v3, v9, v10, vcc
+; GFX942-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
+; GFX942-NEXT:    v_cndmask_b32_e64 v3, v8, v6, s[0:1]
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
+; GFX942-NEXT:    v_cndmask_b32_e32 v1, v1, v7, vcc
+; GFX942-NEXT:    v_xor_b32_e32 v0, v0, v2
+; GFX942-NEXT:    v_xor_b32_e32 v1, v1, v2
+; GFX942-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v2
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v2, vcc
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-IR-LABEL: v_test_sdiv:
+; GFX942-IR:       ; %bb.0: ; %_udiv-special-cases
+; GFX942-IR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-IR-NEXT:    v_ashrrev_i32_e32 v4, 31, v1
+; GFX942-IR-NEXT:    v_xor_b32_e32 v0, v0, v4
+; GFX942-IR-NEXT:    v_ashrrev_i32_e32 v6, 31, v3
+; GFX942-IR-NEXT:    v_xor_b32_e32 v1, v1, v4
+; GFX942-IR-NEXT:    v_sub_co_u32_e32 v10, vcc, v0, v4
+; GFX942-IR-NEXT:    v_xor_b32_e32 v0, v2, v6
+; GFX942-IR-NEXT:    s_nop 0
+; GFX942-IR-NEXT:    v_subb_co_u32_e32 v11, vcc, v1, v4, vcc
+; GFX942-IR-NEXT:    v_xor_b32_e32 v1, v3, v6
+; GFX942-IR-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v6
+; GFX942-IR-NEXT:    v_ffbh_u32_e32 v2, v0
+; GFX942-IR-NEXT:    s_nop 0
+; GFX942-IR-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v6, vcc
+; GFX942-IR-NEXT:    v_add_u32_e32 v2, 32, v2
+; GFX942-IR-NEXT:    v_ffbh_u32_e32 v3, v1
+; GFX942-IR-NEXT:    v_min_u32_e32 v14, v2, v3
+; GFX942-IR-NEXT:    v_ffbh_u32_e32 v2, v10
+; GFX942-IR-NEXT:    v_add_u32_e32 v2, 32, v2
+; GFX942-IR-NEXT:    v_ffbh_u32_e32 v3, v11
+; GFX942-IR-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[0:1]
+; GFX942-IR-NEXT:    v_cmp_eq_u64_e64 s[0:1], 0, v[10:11]
+; GFX942-IR-NEXT:    v_min_u32_e32 v12, v2, v3
+; GFX942-IR-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-IR-NEXT:    v_sub_co_u32_e32 v16, vcc, v14, v12
+; GFX942-IR-NEXT:    v_mov_b32_e32 v5, v4
+; GFX942-IR-NEXT:    s_nop 0
+; GFX942-IR-NEXT:    v_subb_co_u32_e64 v17, s[2:3], 0, 0, vcc
+; GFX942-IR-NEXT:    v_cmp_lt_u64_e32 vcc, 63, v[16:17]
+; GFX942-IR-NEXT:    s_or_b64 s[0:1], s[0:1], vcc
+; GFX942-IR-NEXT:    s_xor_b64 s[2:3], s[0:1], -1
+; GFX942-IR-NEXT:    v_cmp_ne_u64_e32 vcc, 63, v[16:17]
+; GFX942-IR-NEXT:    v_mov_b32_e32 v7, v6
+; GFX942-IR-NEXT:    v_mov_b32_e32 v13, 0
+; GFX942-IR-NEXT:    v_cndmask_b32_e64 v3, v11, 0, s[0:1]
+; GFX942-IR-NEXT:    v_cndmask_b32_e64 v2, v10, 0, s[0:1]
+; GFX942-IR-NEXT:    s_and_b64 s[2:3], s[2:3], vcc
+; GFX942-IR-NEXT:    s_and_saveexec_b64 s[0:1], s[2:3]
+; GFX942-IR-NEXT:    s_cbranch_execz .LBB1_6
+; GFX942-IR-NEXT:  ; %bb.1: ; %udiv-bb1
+; GFX942-IR-NEXT:    v_lshl_add_u64 v[18:19], v[16:17], 0, 1
+; GFX942-IR-NEXT:    v_sub_u32_e32 v2, 63, v16
+; GFX942-IR-NEXT:    v_mov_b64_e32 v[8:9], 0
+; GFX942-IR-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[18:19]
+; GFX942-IR-NEXT:    v_lshlrev_b64 v[2:3], v2, v[10:11]
+; GFX942-IR-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX942-IR-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
+; GFX942-IR-NEXT:    s_cbranch_execz .LBB1_5
+; GFX942-IR-NEXT:  ; %bb.2: ; %udiv-preheader
+; GFX942-IR-NEXT:    v_mov_b32_e32 v9, -1
+; GFX942-IR-NEXT:    v_not_b32_e32 v8, v14
+; GFX942-IR-NEXT:    v_lshrrev_b64 v[16:17], v18, v[10:11]
+; GFX942-IR-NEXT:    v_lshl_add_u64 v[10:11], v[0:1], 0, -1
+; GFX942-IR-NEXT:    v_lshl_add_u64 v[12:13], v[8:9], 0, v[12:13]
+; GFX942-IR-NEXT:    s_mov_b64 s[4:5], 0
+; GFX942-IR-NEXT:    v_mov_b64_e32 v[14:15], 0
+; GFX942-IR-NEXT:    v_mov_b32_e32 v9, 0
+; GFX942-IR-NEXT:  .LBB1_3: ; %udiv-do-while
+; GFX942-IR-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-IR-NEXT:    v_lshlrev_b64 v[16:17], 1, v[16:17]
+; GFX942-IR-NEXT:    v_lshrrev_b32_e32 v8, 31, v3
+; GFX942-IR-NEXT:    v_or_b32_e32 v16, v16, v8
+; GFX942-IR-NEXT:    v_sub_co_u32_e32 v8, vcc, v10, v16
+; GFX942-IR-NEXT:    v_lshlrev_b64 v[2:3], 1, v[2:3]
+; GFX942-IR-NEXT:    s_nop 0
+; GFX942-IR-NEXT:    v_subb_co_u32_e32 v8, vcc, v11, v17, vcc
+; GFX942-IR-NEXT:    v_or_b32_e32 v2, v14, v2
+; GFX942-IR-NEXT:    v_ashrrev_i32_e32 v14, 31, v8
+; GFX942-IR-NEXT:    v_or_b32_e32 v3, v15, v3
+; GFX942-IR-NEXT:    v_and_b32_e32 v8, 1, v14
+; GFX942-IR-NEXT:    v_and_b32_e32 v15, v14, v1
+; GFX942-IR-NEXT:    v_and_b32_e32 v14, v14, v0
+; GFX942-IR-NEXT:    v_sub_co_u32_e32 v16, vcc, v16, v14
+; GFX942-IR-NEXT:    v_lshl_add_u64 v[12:13], v[12:13], 0, 1
+; GFX942-IR-NEXT:    s_nop 0
+; GFX942-IR-NEXT:    v_subb_co_u32_e32 v17, vcc, v17, v15, vcc
+; GFX942-IR-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[12:13]
+; GFX942-IR-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX942-IR-NEXT:    v_mov_b64_e32 v[14:15], v[8:9]
+; GFX942-IR-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX942-IR-NEXT:    s_cbranch_execnz .LBB1_3
+; GFX942-IR-NEXT:  ; %bb.4: ; %Flow
+; GFX942-IR-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX942-IR-NEXT:  .LBB1_5: ; %Flow4
+; GFX942-IR-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX942-IR-NEXT:    v_lshlrev_b64 v[0:1], 1, v[2:3]
+; GFX942-IR-NEXT:    v_or_b32_e32 v3, v9, v1
+; GFX942-IR-NEXT:    v_or_b32_e32 v2, v8, v0
+; GFX942-IR-NEXT:  .LBB1_6: ; %Flow5
+; GFX942-IR-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX942-IR-NEXT:    v_xor_b32_e32 v0, v6, v4
+; GFX942-IR-NEXT:    v_xor_b32_e32 v1, v7, v5
+; GFX942-IR-NEXT:    v_xor_b32_e32 v2, v2, v0
+; GFX942-IR-NEXT:    v_xor_b32_e32 v3, v3, v1
+; GFX942-IR-NEXT:    v_sub_co_u32_e32 v0, vcc, v2, v0
+; GFX942-IR-NEXT:    s_nop 1
+; GFX942-IR-NEXT:    v_subb_co_u32_e32 v1, vcc, v3, v1, vcc
+; GFX942-IR-NEXT:    s_setpc_b64 s[30:31]
   %result = sdiv i64 %x, %y
   ret i64 %result
 }
@@ -518,6 +979,60 @@ define amdgpu_kernel void @s_test_sdiv24_64(ptr addrspace(1) %out, i64 %x, i64 %
 ; GCN-IR-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; GCN-IR-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GCN-IR-NEXT:    s_endpgm
+;
+; GFX942-LABEL: s_test_sdiv24_64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_ashr_i32 s2, s7, 8
+; GFX942-NEXT:    v_cvt_f32_i32_e32 v0, s2
+; GFX942-NEXT:    s_ashr_i32 s3, s3, 8
+; GFX942-NEXT:    v_cvt_f32_i32_e32 v1, s3
+; GFX942-NEXT:    s_xor_b32 s2, s3, s2
+; GFX942-NEXT:    v_rcp_iflag_f32_e32 v3, v0
+; GFX942-NEXT:    s_ashr_i32 s2, s2, 30
+; GFX942-NEXT:    s_or_b32 s4, s2, 1
+; GFX942-NEXT:    v_mul_f32_e32 v3, v1, v3
+; GFX942-NEXT:    v_trunc_f32_e32 v3, v3
+; GFX942-NEXT:    v_fma_f32 v1, -v3, v0, v1
+; GFX942-NEXT:    v_cvt_i32_f32_e32 v3, v3
+; GFX942-NEXT:    v_cmp_ge_f32_e64 s[2:3], |v1|, |v0|
+; GFX942-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GFX942-NEXT:    s_cselect_b32 s2, s4, 0
+; GFX942-NEXT:    v_add_u32_e32 v0, s2, v3
+; GFX942-NEXT:    v_bfe_i32 v0, v0, 0, 24
+; GFX942-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; GFX942-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX942-NEXT:    s_endpgm
+;
+; GFX942-IR-LABEL: s_test_sdiv24_64:
+; GFX942-IR:       ; %bb.0:
+; GFX942-IR-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX942-IR-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-IR-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-IR-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-IR-NEXT:    s_ashr_i32 s2, s7, 8
+; GFX942-IR-NEXT:    v_cvt_f32_i32_e32 v0, s2
+; GFX942-IR-NEXT:    s_ashr_i32 s3, s3, 8
+; GFX942-IR-NEXT:    v_cvt_f32_i32_e32 v1, s3
+; GFX942-IR-NEXT:    s_xor_b32 s2, s3, s2
+; GFX942-IR-NEXT:    v_rcp_iflag_f32_e32 v3, v0
+; GFX942-IR-NEXT:    s_ashr_i32 s2, s2, 30
+; GFX942-IR-NEXT:    s_or_b32 s4, s2, 1
+; GFX942-IR-NEXT:    v_mul_f32_e32 v3, v1, v3
+; GFX942-IR-NEXT:    v_trunc_f32_e32 v3, v3
+; GFX942-IR-NEXT:    v_fma_f32 v1, -v3, v0, v1
+; GFX942-IR-NEXT:    v_cvt_i32_f32_e32 v3, v3
+; GFX942-IR-NEXT:    v_cmp_ge_f32_e64 s[2:3], |v1|, |v0|
+; GFX942-IR-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GFX942-IR-NEXT:    s_cselect_b32 s2, s4, 0
+; GFX942-IR-NEXT:    v_add_u32_e32 v0, s2, v3
+; GFX942-IR-NEXT:    v_bfe_i32 v0, v0, 0, 24
+; GFX942-IR-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; GFX942-IR-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX942-IR-NEXT:    s_endpgm
   %1 = ashr i64 %x, 40
   %2 = ashr i64 %y, 40
   %result = sdiv i64 %1, %2
@@ -579,6 +1094,66 @@ define i64 @v_test_sdiv24_64(i64 %x, i64 %y) {
 ; GCN-IR-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
 ; GCN-IR-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; GCN-IR-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: v_test_sdiv24_64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_lshrrev_b32_e32 v0, 8, v3
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v2, v0
+; GFX942-NEXT:    v_sub_u32_e32 v3, 0, v0
+; GFX942-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
+; GFX942-NEXT:    v_rcp_iflag_f32_e32 v2, v2
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v2
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; GFX942-NEXT:    v_mul_lo_u32 v3, v3, v2
+; GFX942-NEXT:    v_mul_hi_u32 v3, v2, v3
+; GFX942-NEXT:    v_add_u32_e32 v2, v2, v3
+; GFX942-NEXT:    v_mul_hi_u32 v2, v1, v2
+; GFX942-NEXT:    v_mul_u32_u24_e32 v3, v2, v0
+; GFX942-NEXT:    v_sub_u32_e32 v1, v1, v3
+; GFX942-NEXT:    v_add_u32_e32 v4, 1, v2
+; GFX942-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v0
+; GFX942-NEXT:    v_sub_u32_e32 v3, v1, v0
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
+; GFX942-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX942-NEXT:    v_add_u32_e32 v3, 1, v2
+; GFX942-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v0
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
+; GFX942-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-IR-LABEL: v_test_sdiv24_64:
+; GFX942-IR:       ; %bb.0:
+; GFX942-IR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-IR-NEXT:    v_lshrrev_b32_e32 v0, 8, v3
+; GFX942-IR-NEXT:    v_cvt_f32_u32_e32 v2, v0
+; GFX942-IR-NEXT:    v_sub_u32_e32 v3, 0, v0
+; GFX942-IR-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
+; GFX942-IR-NEXT:    v_rcp_iflag_f32_e32 v2, v2
+; GFX942-IR-NEXT:    s_nop 0
+; GFX942-IR-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v2
+; GFX942-IR-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; GFX942-IR-NEXT:    v_mul_lo_u32 v3, v3, v2
+; GFX942-IR-NEXT:    v_mul_hi_u32 v3, v2, v3
+; GFX942-IR-NEXT:    v_add_u32_e32 v2, v2, v3
+; GFX942-IR-NEXT:    v_mul_hi_u32 v2, v1, v2
+; GFX942-IR-NEXT:    v_mul_u32_u24_e32 v3, v2, v0
+; GFX942-IR-NEXT:    v_sub_u32_e32 v1, v1, v3
+; GFX942-IR-NEXT:    v_add_u32_e32 v4, 1, v2
+; GFX942-IR-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v0
+; GFX942-IR-NEXT:    v_sub_u32_e32 v3, v1, v0
+; GFX942-IR-NEXT:    s_nop 0
+; GFX942-IR-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
+; GFX942-IR-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX942-IR-NEXT:    v_add_u32_e32 v3, 1, v2
+; GFX942-IR-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v0
+; GFX942-IR-NEXT:    s_nop 1
+; GFX942-IR-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
+; GFX942-IR-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; GFX942-IR-NEXT:    s_setpc_b64 s[30:31]
   %1 = lshr i64 %x, 40
   %2 = lshr i64 %y, 40
   %result = sdiv i64 %1, %2
@@ -667,6 +1242,84 @@ define amdgpu_kernel void @s_test_sdiv32_64(ptr addrspace(1) %out, i64 %x, i64 %
 ; GCN-IR-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN-IR-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GCN-IR-NEXT:    s_endpgm
+;
+; GFX942-LABEL: s_test_sdiv32_64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dword s6, s[4:5], 0x38
+; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_abs_i32 s7, s6
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v0, s7
+; GFX942-NEXT:    s_sub_i32 s4, 0, s7
+; GFX942-NEXT:    s_xor_b32 s2, s3, s6
+; GFX942-NEXT:    s_abs_i32 s3, s3
+; GFX942-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; GFX942-NEXT:    s_ashr_i32 s2, s2, 31
+; GFX942-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_readfirstlane_b32 s5, v0
+; GFX942-NEXT:    s_mul_i32 s4, s4, s5
+; GFX942-NEXT:    s_mul_hi_u32 s4, s5, s4
+; GFX942-NEXT:    s_add_i32 s5, s5, s4
+; GFX942-NEXT:    s_mul_hi_u32 s4, s3, s5
+; GFX942-NEXT:    s_mul_i32 s5, s4, s7
+; GFX942-NEXT:    s_sub_i32 s3, s3, s5
+; GFX942-NEXT:    s_add_i32 s6, s4, 1
+; GFX942-NEXT:    s_sub_i32 s5, s3, s7
+; GFX942-NEXT:    s_cmp_ge_u32 s3, s7
+; GFX942-NEXT:    s_cselect_b32 s4, s6, s4
+; GFX942-NEXT:    s_cselect_b32 s3, s5, s3
+; GFX942-NEXT:    s_add_i32 s5, s4, 1
+; GFX942-NEXT:    s_cmp_ge_u32 s3, s7
+; GFX942-NEXT:    s_cselect_b32 s3, s5, s4
+; GFX942-NEXT:    s_xor_b32 s3, s3, s2
+; GFX942-NEXT:    s_sub_i32 s2, s3, s2
+; GFX942-NEXT:    s_ashr_i32 s3, s2, 31
+; GFX942-NEXT:    v_mov_b32_e32 v2, s2
+; GFX942-NEXT:    v_mov_b32_e32 v3, s3
+; GFX942-NEXT:    global_store_dwordx2 v1, v[2:3], s[0:1]
+; GFX942-NEXT:    s_endpgm
+;
+; GFX942-IR-LABEL: s_test_sdiv32_64:
+; GFX942-IR:       ; %bb.0:
+; GFX942-IR-NEXT:    s_load_dword s6, s[4:5], 0x38
+; GFX942-IR-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-IR-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-IR-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-IR-NEXT:    s_abs_i32 s7, s6
+; GFX942-IR-NEXT:    v_cvt_f32_u32_e32 v0, s7
+; GFX942-IR-NEXT:    s_sub_i32 s4, 0, s7
+; GFX942-IR-NEXT:    s_xor_b32 s2, s3, s6
+; GFX942-IR-NEXT:    s_abs_i32 s3, s3
+; GFX942-IR-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; GFX942-IR-NEXT:    s_ashr_i32 s2, s2, 31
+; GFX942-IR-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX942-IR-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX942-IR-NEXT:    s_nop 0
+; GFX942-IR-NEXT:    v_readfirstlane_b32 s5, v0
+; GFX942-IR-NEXT:    s_mul_i32 s4, s4, s5
+; GFX942-IR-NEXT:    s_mul_hi_u32 s4, s5, s4
+; GFX942-IR-NEXT:    s_add_i32 s5, s5, s4
+; GFX942-IR-NEXT:    s_mul_hi_u32 s4, s3, s5
+; GFX942-IR-NEXT:    s_mul_i32 s5, s4, s7
+; GFX942-IR-NEXT:    s_sub_i32 s3, s3, s5
+; GFX942-IR-NEXT:    s_add_i32 s6, s4, 1
+; GFX942-IR-NEXT:    s_sub_i32 s5, s3, s7
+; GFX942-IR-NEXT:    s_cmp_ge_u32 s3, s7
+; GFX942-IR-NEXT:    s_cselect_b32 s4, s6, s4
+; GFX942-IR-NEXT:    s_cselect_b32 s3, s5, s3
+; GFX942-IR-NEXT:    s_add_i32 s5, s4, 1
+; GFX942-IR-NEXT:    s_cmp_ge_u32 s3, s7
+; GFX942-IR-NEXT:    s_cselect_b32 s3, s5, s4
+; GFX942-IR-NEXT:    s_xor_b32 s3, s3, s2
+; GFX942-IR-NEXT:    s_sub_i32 s2, s3, s2
+; GFX942-IR-NEXT:    s_ashr_i32 s3, s2, 31
+; GFX942-IR-NEXT:    v_mov_b32_e32 v2, s2
+; GFX942-IR-NEXT:    v_mov_b32_e32 v3, s3
+; GFX942-IR-NEXT:    global_store_dwordx2 v1, v[2:3], s[0:1]
+; GFX942-IR-NEXT:    s_endpgm
   %1 = ashr i64 %x, 32
   %2 = ashr i64 %y, 32
   %result = sdiv i64 %1, %2
@@ -762,6 +1415,88 @@ define amdgpu_kernel void @s_test_sdiv31_64(ptr addrspace(1) %out, i64 %x, i64 %
 ; GCN-IR-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN-IR-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GCN-IR-NEXT:    s_endpgm
+;
+; GFX942-LABEL: s_test_sdiv31_64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
+; GFX942-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_ashr_i32 s6, s1, 1
+; GFX942-NEXT:    s_abs_i32 s7, s6
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v0, s7
+; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-NEXT:    s_sub_i32 s4, 0, s7
+; GFX942-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_ashr_i32 s2, s3, 1
+; GFX942-NEXT:    s_xor_b32 s3, s2, s6
+; GFX942-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX942-NEXT:    s_abs_i32 s2, s2
+; GFX942-NEXT:    s_ashr_i32 s3, s3, 31
+; GFX942-NEXT:    v_readfirstlane_b32 s5, v0
+; GFX942-NEXT:    s_mul_i32 s4, s4, s5
+; GFX942-NEXT:    s_mul_hi_u32 s4, s5, s4
+; GFX942-NEXT:    s_add_i32 s5, s5, s4
+; GFX942-NEXT:    s_mul_hi_u32 s4, s2, s5
+; GFX942-NEXT:    s_mul_i32 s5, s4, s7
+; GFX942-NEXT:    s_sub_i32 s2, s2, s5
+; GFX942-NEXT:    s_add_i32 s6, s4, 1
+; GFX942-NEXT:    s_sub_i32 s5, s2, s7
+; GFX942-NEXT:    s_cmp_ge_u32 s2, s7
+; GFX942-NEXT:    s_cselect_b32 s4, s6, s4
+; GFX942-NEXT:    s_cselect_b32 s2, s5, s2
+; GFX942-NEXT:    s_add_i32 s5, s4, 1
+; GFX942-NEXT:    s_cmp_ge_u32 s2, s7
+; GFX942-NEXT:    s_cselect_b32 s2, s5, s4
+; GFX942-NEXT:    s_xor_b32 s2, s2, s3
+; GFX942-NEXT:    s_sub_i32 s2, s2, s3
+; GFX942-NEXT:    s_ashr_i32 s3, s2, 31
+; GFX942-NEXT:    v_mov_b32_e32 v2, s2
+; GFX942-NEXT:    v_mov_b32_e32 v3, s3
+; GFX942-NEXT:    global_store_dwordx2 v1, v[2:3], s[0:1]
+; GFX942-NEXT:    s_endpgm
+;
+; GFX942-IR-LABEL: s_test_sdiv31_64:
+; GFX942-IR:       ; %bb.0:
+; GFX942-IR-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
+; GFX942-IR-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-IR-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-IR-NEXT:    s_ashr_i32 s6, s1, 1
+; GFX942-IR-NEXT:    s_abs_i32 s7, s6
+; GFX942-IR-NEXT:    v_cvt_f32_u32_e32 v0, s7
+; GFX942-IR-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-IR-NEXT:    s_sub_i32 s4, 0, s7
+; GFX942-IR-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; GFX942-IR-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-IR-NEXT:    s_ashr_i32 s2, s3, 1
+; GFX942-IR-NEXT:    s_xor_b32 s3, s2, s6
+; GFX942-IR-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX942-IR-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX942-IR-NEXT:    s_abs_i32 s2, s2
+; GFX942-IR-NEXT:    s_ashr_i32 s3, s3, 31
+; GFX942-IR-NEXT:    v_readfirstlane_b32 s5, v0
+; GFX942-IR-NEXT:    s_mul_i32 s4, s4, s5
+; GFX942-IR-NEXT:    s_mul_hi_u32 s4, s5, s4
+; GFX942-IR-NEXT:    s_add_i32 s5, s5, s4
+; GFX942-IR-NEXT:    s_mul_hi_u32 s4, s2, s5
+; GFX942-IR-NEXT:    s_mul_i32 s5, s4, s7
+; GFX942-IR-NEXT:    s_sub_i32 s2, s2, s5
+; GFX942-IR-NEXT:    s_add_i32 s6, s4, 1
+; GFX942-IR-NEXT:    s_sub_i32 s5, s2, s7
+; GFX942-IR-NEXT:    s_cmp_ge_u32 s2, s7
+; GFX942-IR-NEXT:    s_cselect_b32 s4, s6, s4
+; GFX942-IR-NEXT:    s_cselect_b32 s2, s5, s2
+; GFX942-IR-NEXT:    s_add_i32 s5, s4, 1
+; GFX942-IR-NEXT:    s_cmp_ge_u32 s2, s7
+; GFX942-IR-NEXT:    s_cselect_b32 s2, s5, s4
+; GFX942-IR-NEXT:    s_xor_b32 s2, s2, s3
+; GFX942-IR-NEXT:    s_sub_i32 s2, s2, s3
+; GFX942-IR-NEXT:    s_ashr_i32 s3, s2, 31
+; GFX942-IR-NEXT:    v_mov_b32_e32 v2, s2
+; GFX942-IR-NEXT:    v_mov_b32_e32 v3, s3
+; GFX942-IR-NEXT:    global_store_dwordx2 v1, v[2:3], s[0:1]
+; GFX942-IR-NEXT:    s_endpgm
   %1 = ashr i64 %x, 33
   %2 = ashr i64 %y, 33
   %result = sdiv i64 %1, %2
@@ -831,6 +1566,60 @@ define amdgpu_kernel void @s_test_sdiv23_64(ptr addrspace(1) %out, i64 %x, i64 %
 ; GCN-IR-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; GCN-IR-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GCN-IR-NEXT:    s_endpgm
+;
+; GFX942-LABEL: s_test_sdiv23_64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_ashr_i32 s2, s7, 9
+; GFX942-NEXT:    v_cvt_f32_i32_e32 v0, s2
+; GFX942-NEXT:    s_ashr_i32 s3, s3, 9
+; GFX942-NEXT:    v_cvt_f32_i32_e32 v1, s3
+; GFX942-NEXT:    s_xor_b32 s2, s3, s2
+; GFX942-NEXT:    v_rcp_iflag_f32_e32 v3, v0
+; GFX942-NEXT:    s_ashr_i32 s2, s2, 30
+; GFX942-NEXT:    s_or_b32 s4, s2, 1
+; GFX942-NEXT:    v_mul_f32_e32 v3, v1, v3
+; GFX942-NEXT:    v_trunc_f32_e32 v3, v3
+; GFX942-NEXT:    v_fma_f32 v1, -v3, v0, v1
+; GFX942-NEXT:    v_cvt_i32_f32_e32 v3, v3
+; GFX942-NEXT:    v_cmp_ge_f32_e64 s[2:3], |v1|, |v0|
+; GFX942-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GFX942-NEXT:    s_cselect_b32 s2, s4, 0
+; GFX942-NEXT:    v_add_u32_e32 v0, s2, v3
+; GFX942-NEXT:    v_bfe_i32 v0, v0, 0, 23
+; GFX942-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; GFX942-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX942-NEXT:    s_endpgm
+;
+; GFX942-IR-LABEL: s_test_sdiv23_64:
+; GFX942-IR:       ; %bb.0:
+; GFX942-IR-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX942-IR-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-IR-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-IR-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-IR-NEXT:    s_ashr_i32 s2, s7, 9
+; GFX942-IR-NEXT:    v_cvt_f32_i32_e32 v0, s2
+; GFX942-IR-NEXT:    s_ashr_i32 s3, s3, 9
+; GFX942-IR-NEXT:    v_cvt_f32_i32_e32 v1, s3
+; GFX942-IR-NEXT:    s_xor_b32 s2, s3, s2
+; GFX942-IR-NEXT:    v_rcp_iflag_f32_e32 v3, v0
+; GFX942-IR-NEXT:    s_ashr_i32 s2, s2, 30
+; GFX942-IR-NEXT:    s_or_b32 s4, s2, 1
+; GFX942-IR-NEXT:    v_mul_f32_e32 v3, v1, v3
+; GFX942-IR-NEXT:    v_trunc_f32_e32 v3, v3
+; GFX942-IR-NEXT:    v_fma_f32 v1, -v3, v0, v1
+; GFX942-IR-NEXT:    v_cvt_i32_f32_e32 v3, v3
+; GFX942-IR-NEXT:    v_cmp_ge_f32_e64 s[2:3], |v1|, |v0|
+; GFX942-IR-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GFX942-IR-NEXT:    s_cselect_b32 s2, s4, 0
+; GFX942-IR-NEXT:    v_add_u32_e32 v0, s2, v3
+; GFX942-IR-NEXT:    v_bfe_i32 v0, v0, 0, 23
+; GFX942-IR-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; GFX942-IR-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX942-IR-NEXT:    s_endpgm
   %1 = ashr i64 %x, 41
   %2 = ashr i64 %y, 41
   %result = sdiv i64 %1, %2
@@ -926,6 +1715,88 @@ define amdgpu_kernel void @s_test_sdiv25_64(ptr addrspace(1) %out, i64 %x, i64 %
 ; GCN-IR-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN-IR-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GCN-IR-NEXT:    s_endpgm
+;
+; GFX942-LABEL: s_test_sdiv25_64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
+; GFX942-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_ashr_i32 s6, s1, 7
+; GFX942-NEXT:    s_abs_i32 s7, s6
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v0, s7
+; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-NEXT:    s_sub_i32 s4, 0, s7
+; GFX942-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_ashr_i32 s2, s3, 7
+; GFX942-NEXT:    s_xor_b32 s3, s2, s6
+; GFX942-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX942-NEXT:    s_abs_i32 s2, s2
+; GFX942-NEXT:    s_ashr_i32 s3, s3, 31
+; GFX942-NEXT:    v_readfirstlane_b32 s5, v0
+; GFX942-NEXT:    s_mul_i32 s4, s4, s5
+; GFX942-NEXT:    s_mul_hi_u32 s4, s5, s4
+; GFX942-NEXT:    s_add_i32 s5, s5, s4
+; GFX942-NEXT:    s_mul_hi_u32 s4, s2, s5
+; GFX942-NEXT:    s_mul_i32 s5, s4, s7
+; GFX942-NEXT:    s_sub_i32 s2, s2, s5
+; GFX942-NEXT:    s_add_i32 s6, s4, 1
+; GFX942-NEXT:    s_sub_i32 s5, s2, s7
+; GFX942-NEXT:    s_cmp_ge_u32 s2, s7
+; GFX942-NEXT:    s_cselect_b32 s4, s6, s4
+; GFX942-NEXT:    s_cselect_b32 s2, s5, s2
+; GFX942-NEXT:    s_add_i32 s5, s4, 1
+; GFX942-NEXT:    s_cmp_ge_u32 s2, s7
+; GFX942-NEXT:    s_cselect_b32 s2, s5, s4
+; GFX942-NEXT:    s_xor_b32 s2, s2, s3
+; GFX942-NEXT:    s_sub_i32 s2, s2, s3
+; GFX942-NEXT:    s_ashr_i32 s3, s2, 31
+; GFX942-NEXT:    v_mov_b32_e32 v2, s2
+; GFX942-NEXT:    v_mov_b32_e32 v3, s3
+; GFX942-NEXT:    global_store_dwordx2 v1, v[2:3], s[0:1]
+; GFX942-NEXT:    s_endpgm
+;
+; GFX942-IR-LABEL: s_test_sdiv25_64:
+; GFX942-IR:       ; %bb.0:
+; GFX942-IR-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
+; GFX942-IR-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-IR-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-IR-NEXT:    s_ashr_i32 s6, s1, 7
+; GFX942-IR-NEXT:    s_abs_i32 s7, s6
+; GFX942-IR-NEXT:    v_cvt_f32_u32_e32 v0, s7
+; GFX942-IR-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-IR-NEXT:    s_sub_i32 s4, 0, s7
+; GFX942-IR-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; GFX942-IR-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-IR-NEXT:    s_ashr_i32 s2, s3, 7
+; GFX942-IR-NEXT:    s_xor_b32 s3, s2, s6
+; GFX942-IR-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX942-IR-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX942-IR-NEXT:    s_abs_i32 s2, s2
+; GFX942-IR-NEXT:    s_ashr_i32 s3, s3, 31
+; GFX942-IR-NEXT:    v_readfirstlane_b32 s5, v0
+; GFX942-IR-NEXT:    s_mul_i32 s4, s4, s5
+; GFX942-IR-NEXT:    s_mul_hi_u32 s4, s5, s4
+; GFX942-IR-NEXT:    s_add_i32 s5, s5, s4
+; GFX942-IR-NEXT:    s_mul_hi_u32 s4, s2, s5
+; GFX942-IR-NEXT:    s_mul_i32 s5, s4, s7
+; GFX942-IR-NEXT:    s_sub_i32 s2, s2, s5
+; GFX942-IR-NEXT:    s_add_i32 s6, s4, 1
+; GFX942-IR-NEXT:    s_sub_i32 s5, s2, s7
+; GFX942-IR-NEXT:    s_cmp_ge_u32 s2, s7
+; GFX942-IR-NEXT:    s_cselect_b32 s4, s6, s4
+; GFX942-IR-NEXT:    s_cselect_b32 s2, s5, s2
+; GFX942-IR-NEXT:    s_add_i32 s5, s4, 1
+; GFX942-IR-NEXT:    s_cmp_ge_u32 s2, s7
+; GFX942-IR-NEXT:    s_cselect_b32 s2, s5, s4
+; GFX942-IR-NEXT:    s_xor_b32 s2, s2, s3
+; GFX942-IR-NEXT:    s_sub_i32 s2, s2, s3
+; GFX942-IR-NEXT:    s_ashr_i32 s3, s2, 31
+; GFX942-IR-NEXT:    v_mov_b32_e32 v2, s2
+; GFX942-IR-NEXT:    v_mov_b32_e32 v3, s3
+; GFX942-IR-NEXT:    global_store_dwordx2 v1, v[2:3], s[0:1]
+; GFX942-IR-NEXT:    s_endpgm
   %1 = ashr i64 %x, 39
   %2 = ashr i64 %y, 39
   %result = sdiv i64 %1, %2
@@ -1025,6 +1896,96 @@ define amdgpu_kernel void @s_test_sdiv24_v2i64(ptr addrspace(1) %out, <2 x i64>
 ; GCN-IR-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
 ; GCN-IR-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; GCN-IR-NEXT:    s_endpgm
+;
+; GFX942-LABEL: s_test_sdiv24_v2i64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x34
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v4, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_ashr_i32 s2, s13, 8
+; GFX942-NEXT:    v_cvt_f32_i32_e32 v0, s2
+; GFX942-NEXT:    s_ashr_i32 s3, s9, 8
+; GFX942-NEXT:    v_cvt_f32_i32_e32 v1, s3
+; GFX942-NEXT:    s_xor_b32 s2, s3, s2
+; GFX942-NEXT:    v_rcp_iflag_f32_e32 v2, v0
+; GFX942-NEXT:    s_ashr_i32 s2, s2, 30
+; GFX942-NEXT:    s_ashr_i32 s4, s11, 8
+; GFX942-NEXT:    s_ashr_i32 s5, s15, 8
+; GFX942-NEXT:    v_mul_f32_e32 v2, v1, v2
+; GFX942-NEXT:    v_trunc_f32_e32 v2, v2
+; GFX942-NEXT:    v_fma_f32 v1, -v2, v0, v1
+; GFX942-NEXT:    v_cvt_i32_f32_e32 v2, v2
+; GFX942-NEXT:    s_or_b32 s6, s2, 1
+; GFX942-NEXT:    v_cmp_ge_f32_e64 s[2:3], |v1|, |v0|
+; GFX942-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GFX942-NEXT:    s_cselect_b32 s2, s6, 0
+; GFX942-NEXT:    v_add_u32_e32 v0, s2, v2
+; GFX942-NEXT:    v_cvt_f32_i32_e32 v2, s5
+; GFX942-NEXT:    v_cvt_f32_i32_e32 v3, s4
+; GFX942-NEXT:    s_xor_b32 s2, s4, s5
+; GFX942-NEXT:    s_ashr_i32 s2, s2, 30
+; GFX942-NEXT:    v_rcp_iflag_f32_e32 v5, v2
+; GFX942-NEXT:    s_or_b32 s4, s2, 1
+; GFX942-NEXT:    v_bfe_i32 v0, v0, 0, 24
+; GFX942-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; GFX942-NEXT:    v_mul_f32_e32 v5, v3, v5
+; GFX942-NEXT:    v_trunc_f32_e32 v5, v5
+; GFX942-NEXT:    v_fma_f32 v3, -v5, v2, v3
+; GFX942-NEXT:    v_cvt_i32_f32_e32 v5, v5
+; GFX942-NEXT:    v_cmp_ge_f32_e64 s[2:3], |v3|, |v2|
+; GFX942-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GFX942-NEXT:    s_cselect_b32 s2, s4, 0
+; GFX942-NEXT:    v_add_u32_e32 v2, s2, v5
+; GFX942-NEXT:    v_bfe_i32 v2, v2, 0, 24
+; GFX942-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
+; GFX942-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX942-NEXT:    s_endpgm
+;
+; GFX942-IR-LABEL: s_test_sdiv24_v2i64:
+; GFX942-IR:       ; %bb.0:
+; GFX942-IR-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x34
+; GFX942-IR-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-IR-NEXT:    v_mov_b32_e32 v4, 0
+; GFX942-IR-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-IR-NEXT:    s_ashr_i32 s2, s13, 8
+; GFX942-IR-NEXT:    v_cvt_f32_i32_e32 v0, s2
+; GFX942-IR-NEXT:    s_ashr_i32 s3, s9, 8
+; GFX942-IR-NEXT:    v_cvt_f32_i32_e32 v1, s3
+; GFX942-IR-NEXT:    s_xor_b32 s2, s3, s2
+; GFX942-IR-NEXT:    v_rcp_iflag_f32_e32 v2, v0
+; GFX942-IR-NEXT:    s_ashr_i32 s2, s2, 30
+; GFX942-IR-NEXT:    s_ashr_i32 s4, s11, 8
+; GFX942-IR-NEXT:    s_ashr_i32 s5, s15, 8
+; GFX942-IR-NEXT:    v_mul_f32_e32 v2, v1, v2
+; GFX942-IR-NEXT:    v_trunc_f32_e32 v2, v2
+; GFX942-IR-NEXT:    v_fma_f32 v1, -v2, v0, v1
+; GFX942-IR-NEXT:    v_cvt_i32_f32_e32 v2, v2
+; GFX942-IR-NEXT:    s_or_b32 s6, s2, 1
+; GFX942-IR-NEXT:    v_cmp_ge_f32_e64 s[2:3], |v1|, |v0|
+; GFX942-IR-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GFX942-IR-NEXT:    s_cselect_b32 s2, s6, 0
+; GFX942-IR-NEXT:    v_add_u32_e32 v0, s2, v2
+; GFX942-IR-NEXT:    v_cvt_f32_i32_e32 v2, s5
+; GFX942-IR-NEXT:    v_cvt_f32_i32_e32 v3, s4
+; GFX942-IR-NEXT:    s_xor_b32 s2, s4, s5
+; GFX942-IR-NEXT:    s_ashr_i32 s2, s2, 30
+; GFX942-IR-NEXT:    v_rcp_iflag_f32_e32 v5, v2
+; GFX942-IR-NEXT:    s_or_b32 s4, s2, 1
+; GFX942-IR-NEXT:    v_bfe_i32 v0, v0, 0, 24
+; GFX942-IR-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; GFX942-IR-NEXT:    v_mul_f32_e32 v5, v3, v5
+; GFX942-IR-NEXT:    v_trunc_f32_e32 v5, v5
+; GFX942-IR-NEXT:    v_fma_f32 v3, -v5, v2, v3
+; GFX942-IR-NEXT:    v_cvt_i32_f32_e32 v5, v5
+; GFX942-IR-NEXT:    v_cmp_ge_f32_e64 s[2:3], |v3|, |v2|
+; GFX942-IR-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GFX942-IR-NEXT:    s_cselect_b32 s2, s4, 0
+; GFX942-IR-NEXT:    v_add_u32_e32 v2, s2, v5
+; GFX942-IR-NEXT:    v_bfe_i32 v2, v2, 0, 24
+; GFX942-IR-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
+; GFX942-IR-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX942-IR-NEXT:    s_endpgm
   %1 = ashr <2 x i64> %x, <i64 40, i64 40>
   %2 = ashr <2 x i64> %y, <i64 40, i64 40>
   %result = sdiv <2 x i64> %1, %2
@@ -1100,6 +2061,74 @@ define amdgpu_kernel void @s_test_sdiv24_48(ptr addrspace(1) %out, i48 %x, i48 %
 ; GCN-IR-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GCN-IR-NEXT:    buffer_store_short v1, off, s[4:7], 0 offset:4
 ; GCN-IR-NEXT:    s_endpgm
+;
+; GFX942-LABEL: s_test_sdiv24_48:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_mov_b32 s4, s3
+; GFX942-NEXT:    s_mov_b32 s8, s7
+; GFX942-NEXT:    s_bfe_i64 s[8:9], s[8:9], 0x100000
+; GFX942-NEXT:    v_mov_b32_e32 v1, s6
+; GFX942-NEXT:    v_alignbit_b32 v1, s8, v1, 24
+; GFX942-NEXT:    v_cvt_f32_i32_e32 v2, v1
+; GFX942-NEXT:    s_bfe_i64 s[4:5], s[4:5], 0x100000
+; GFX942-NEXT:    v_mov_b32_e32 v3, s2
+; GFX942-NEXT:    v_alignbit_b32 v3, s4, v3, 24
+; GFX942-NEXT:    v_cvt_f32_i32_e32 v4, v3
+; GFX942-NEXT:    v_rcp_iflag_f32_e32 v5, v2
+; GFX942-NEXT:    v_xor_b32_e32 v1, v3, v1
+; GFX942-NEXT:    v_ashrrev_i32_e32 v1, 30, v1
+; GFX942-NEXT:    v_or_b32_e32 v1, 1, v1
+; GFX942-NEXT:    v_mul_f32_e32 v3, v4, v5
+; GFX942-NEXT:    v_trunc_f32_e32 v3, v3
+; GFX942-NEXT:    v_cvt_i32_f32_e32 v5, v3
+; GFX942-NEXT:    v_fma_f32 v3, -v3, v2, v4
+; GFX942-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, |v2|
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX942-NEXT:    v_add_u32_e32 v1, v5, v1
+; GFX942-NEXT:    v_bfe_i32 v1, v1, 0, 24
+; GFX942-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
+; GFX942-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX942-NEXT:    global_store_short v0, v2, s[0:1] offset:4
+; GFX942-NEXT:    s_endpgm
+;
+; GFX942-IR-LABEL: s_test_sdiv24_48:
+; GFX942-IR:       ; %bb.0:
+; GFX942-IR-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-IR-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX942-IR-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-IR-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-IR-NEXT:    s_mov_b32 s4, s3
+; GFX942-IR-NEXT:    s_mov_b32 s8, s7
+; GFX942-IR-NEXT:    s_bfe_i64 s[8:9], s[8:9], 0x100000
+; GFX942-IR-NEXT:    v_mov_b32_e32 v1, s6
+; GFX942-IR-NEXT:    v_alignbit_b32 v1, s8, v1, 24
+; GFX942-IR-NEXT:    v_cvt_f32_i32_e32 v2, v1
+; GFX942-IR-NEXT:    s_bfe_i64 s[4:5], s[4:5], 0x100000
+; GFX942-IR-NEXT:    v_mov_b32_e32 v3, s2
+; GFX942-IR-NEXT:    v_alignbit_b32 v3, s4, v3, 24
+; GFX942-IR-NEXT:    v_cvt_f32_i32_e32 v4, v3
+; GFX942-IR-NEXT:    v_rcp_iflag_f32_e32 v5, v2
+; GFX942-IR-NEXT:    v_xor_b32_e32 v1, v3, v1
+; GFX942-IR-NEXT:    v_ashrrev_i32_e32 v1, 30, v1
+; GFX942-IR-NEXT:    v_or_b32_e32 v1, 1, v1
+; GFX942-IR-NEXT:    v_mul_f32_e32 v3, v4, v5
+; GFX942-IR-NEXT:    v_trunc_f32_e32 v3, v3
+; GFX942-IR-NEXT:    v_cvt_i32_f32_e32 v5, v3
+; GFX942-IR-NEXT:    v_fma_f32 v3, -v3, v2, v4
+; GFX942-IR-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, |v2|
+; GFX942-IR-NEXT:    s_nop 1
+; GFX942-IR-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX942-IR-NEXT:    v_add_u32_e32 v1, v5, v1
+; GFX942-IR-NEXT:    v_bfe_i32 v1, v1, 0, 24
+; GFX942-IR-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
+; GFX942-IR-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX942-IR-NEXT:    global_store_short v0, v2, s[0:1] offset:4
+; GFX942-IR-NEXT:    s_endpgm
   %1 = ashr i48 %x, 24
   %2 = ashr i48 %y, 24
   %result = sdiv i48 %1, %2
@@ -1296,6 +2325,214 @@ define amdgpu_kernel void @s_test_sdiv_k_num_i64(ptr addrspace(1) %out, i64 %x)
 ; GCN-IR-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN-IR-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GCN-IR-NEXT:    s_endpgm
+;
+; GFX942-LABEL: s_test_sdiv_k_num_i64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-NEXT:    s_mov_b32 s7, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_ashr_i32 s4, s3, 31
+; GFX942-NEXT:    s_add_u32 s2, s2, s4
+; GFX942-NEXT:    s_mov_b32 s5, s4
+; GFX942-NEXT:    s_addc_u32 s3, s3, s4
+; GFX942-NEXT:    s_xor_b64 s[2:3], s[2:3], s[4:5]
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v0, s2
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v1, s3
+; GFX942-NEXT:    s_sub_u32 s5, 0, s2
+; GFX942-NEXT:    s_subb_u32 s6, 0, s3
+; GFX942-NEXT:    v_fmamk_f32 v0, v1, 0x4f800000, v0
+; GFX942-NEXT:    v_rcp_f32_e32 v1, v0
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    v_mul_f32_e32 v1, 0x5f7ffffc, v1
+; GFX942-NEXT:    v_mul_f32_e32 v2, 0x2f800000, v1
+; GFX942-NEXT:    v_trunc_f32_e32 v2, v2
+; GFX942-NEXT:    v_fmamk_f32 v1, v2, 0xcf800000, v1
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX942-NEXT:    v_readfirstlane_b32 s8, v2
+; GFX942-NEXT:    v_readfirstlane_b32 s9, v1
+; GFX942-NEXT:    s_mul_i32 s10, s5, s8
+; GFX942-NEXT:    s_mul_hi_u32 s12, s5, s9
+; GFX942-NEXT:    s_mul_i32 s11, s6, s9
+; GFX942-NEXT:    s_add_i32 s10, s12, s10
+; GFX942-NEXT:    s_mul_i32 s13, s5, s9
+; GFX942-NEXT:    s_add_i32 s10, s10, s11
+; GFX942-NEXT:    s_mul_hi_u32 s12, s9, s13
+; GFX942-NEXT:    s_mul_hi_u32 s11, s9, s10
+; GFX942-NEXT:    s_mul_i32 s9, s9, s10
+; GFX942-NEXT:    s_add_u32 s9, s12, s9
+; GFX942-NEXT:    s_addc_u32 s11, 0, s11
+; GFX942-NEXT:    s_mul_hi_u32 s14, s8, s13
+; GFX942-NEXT:    s_mul_i32 s13, s8, s13
+; GFX942-NEXT:    s_add_u32 s9, s9, s13
+; GFX942-NEXT:    s_mul_hi_u32 s12, s8, s10
+; GFX942-NEXT:    s_addc_u32 s9, s11, s14
+; GFX942-NEXT:    s_addc_u32 s11, s12, 0
+; GFX942-NEXT:    s_mul_i32 s10, s8, s10
+; GFX942-NEXT:    s_add_u32 s9, s9, s10
+; GFX942-NEXT:    s_addc_u32 s10, 0, s11
+; GFX942-NEXT:    v_add_co_u32_e32 v1, vcc, s9, v1
+; GFX942-NEXT:    s_cmp_lg_u64 vcc, 0
+; GFX942-NEXT:    s_addc_u32 s8, s8, s10
+; GFX942-NEXT:    v_readfirstlane_b32 s10, v1
+; GFX942-NEXT:    s_mul_i32 s9, s5, s8
+; GFX942-NEXT:    s_mul_hi_u32 s11, s5, s10
+; GFX942-NEXT:    s_add_i32 s9, s11, s9
+; GFX942-NEXT:    s_mul_i32 s6, s6, s10
+; GFX942-NEXT:    s_add_i32 s9, s9, s6
+; GFX942-NEXT:    s_mul_i32 s5, s5, s10
+; GFX942-NEXT:    s_mul_hi_u32 s11, s8, s5
+; GFX942-NEXT:    s_mul_i32 s12, s8, s5
+; GFX942-NEXT:    s_mul_i32 s14, s10, s9
+; GFX942-NEXT:    s_mul_hi_u32 s5, s10, s5
+; GFX942-NEXT:    s_mul_hi_u32 s13, s10, s9
+; GFX942-NEXT:    s_add_u32 s5, s5, s14
+; GFX942-NEXT:    s_addc_u32 s10, 0, s13
+; GFX942-NEXT:    s_add_u32 s5, s5, s12
+; GFX942-NEXT:    s_mul_hi_u32 s6, s8, s9
+; GFX942-NEXT:    s_addc_u32 s5, s10, s11
+; GFX942-NEXT:    s_addc_u32 s6, s6, 0
+; GFX942-NEXT:    s_mul_i32 s9, s8, s9
+; GFX942-NEXT:    s_add_u32 s5, s5, s9
+; GFX942-NEXT:    s_addc_u32 s6, 0, s6
+; GFX942-NEXT:    v_add_co_u32_e32 v1, vcc, s5, v1
+; GFX942-NEXT:    s_cmp_lg_u64 vcc, 0
+; GFX942-NEXT:    s_addc_u32 s5, s8, s6
+; GFX942-NEXT:    v_readfirstlane_b32 s8, v1
+; GFX942-NEXT:    s_mul_hi_u32 s6, s5, 24
+; GFX942-NEXT:    s_mul_i32 s5, s5, 24
+; GFX942-NEXT:    s_mul_hi_u32 s8, s8, 24
+; GFX942-NEXT:    s_add_u32 s5, s8, s5
+; GFX942-NEXT:    s_addc_u32 s6, 0, s6
+; GFX942-NEXT:    s_mul_i32 s5, s3, s6
+; GFX942-NEXT:    s_mul_hi_u32 s8, s2, s6
+; GFX942-NEXT:    s_add_i32 s5, s8, s5
+; GFX942-NEXT:    s_mul_i32 s8, s2, s6
+; GFX942-NEXT:    s_sub_i32 s10, 0, s5
+; GFX942-NEXT:    v_sub_co_u32_e64 v1, s[8:9], 24, s8
+; GFX942-NEXT:    s_cmp_lg_u64 s[8:9], 0
+; GFX942-NEXT:    s_subb_u32 s10, s10, s3
+; GFX942-NEXT:    v_subrev_co_u32_e32 v2, vcc, s2, v1
+; GFX942-NEXT:    s_cmp_lg_u64 vcc, 0
+; GFX942-NEXT:    s_subb_u32 s10, s10, 0
+; GFX942-NEXT:    s_cmp_ge_u32 s10, s3
+; GFX942-NEXT:    s_cselect_b32 s11, -1, 0
+; GFX942-NEXT:    v_cmp_le_u32_e32 vcc, s2, v2
+; GFX942-NEXT:    s_cmp_eq_u32 s10, s3
+; GFX942-NEXT:    v_mov_b32_e32 v3, s11
+; GFX942-NEXT:    v_cndmask_b32_e64 v2, 0, -1, vcc
+; GFX942-NEXT:    s_cselect_b64 vcc, -1, 0
+; GFX942-NEXT:    s_or_b64 s[10:11], s[6:7], 0
+; GFX942-NEXT:    s_add_u32 s7, s6, 1
+; GFX942-NEXT:    s_addc_u32 s10, s11, 0
+; GFX942-NEXT:    s_add_u32 s12, s6, 2
+; GFX942-NEXT:    s_addc_u32 s13, s11, 0
+; GFX942-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
+; GFX942-NEXT:    s_cmp_lg_u64 s[8:9], 0
+; GFX942-NEXT:    v_mov_b32_e32 v3, s7
+; GFX942-NEXT:    v_mov_b32_e32 v4, s12
+; GFX942-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
+; GFX942-NEXT:    s_subb_u32 s5, 0, s5
+; GFX942-NEXT:    s_cmp_ge_u32 s5, s3
+; GFX942-NEXT:    v_cndmask_b32_e32 v2, v3, v4, vcc
+; GFX942-NEXT:    v_mov_b32_e32 v3, s10
+; GFX942-NEXT:    v_mov_b32_e32 v4, s13
+; GFX942-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
+; GFX942-NEXT:    s_cselect_b32 s7, -1, 0
+; GFX942-NEXT:    v_cmp_le_u32_e32 vcc, s2, v1
+; GFX942-NEXT:    s_cmp_eq_u32 s5, s3
+; GFX942-NEXT:    v_mov_b32_e32 v4, s7
+; GFX942-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
+; GFX942-NEXT:    s_cselect_b64 vcc, -1, 0
+; GFX942-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
+; GFX942-NEXT:    v_mov_b32_e32 v4, s11
+; GFX942-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v1, v4, v3, vcc
+; GFX942-NEXT:    v_mov_b32_e32 v3, s6
+; GFX942-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
+; GFX942-NEXT:    v_xor_b32_e32 v2, s4, v2
+; GFX942-NEXT:    v_xor_b32_e32 v1, s4, v1
+; GFX942-NEXT:    v_mov_b32_e32 v3, s4
+; GFX942-NEXT:    v_subrev_co_u32_e32 v2, vcc, s4, v2
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_subb_co_u32_e32 v3, vcc, v1, v3, vcc
+; GFX942-NEXT:    global_store_dwordx2 v0, v[2:3], s[0:1]
+; GFX942-NEXT:    s_endpgm
+;
+; GFX942-IR-LABEL: s_test_sdiv_k_num_i64:
+; GFX942-IR:       ; %bb.0: ; %_udiv-special-cases
+; GFX942-IR-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-IR-NEXT:    s_mov_b64 s[6:7], 0
+; GFX942-IR-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-IR-NEXT:    s_ashr_i32 s4, s3, 31
+; GFX942-IR-NEXT:    s_mov_b32 s5, s4
+; GFX942-IR-NEXT:    s_xor_b64 s[2:3], s[2:3], s[4:5]
+; GFX942-IR-NEXT:    s_sub_u32 s2, s2, s4
+; GFX942-IR-NEXT:    s_subb_u32 s3, s3, s4
+; GFX942-IR-NEXT:    s_cmp_eq_u64 s[2:3], 0
+; GFX942-IR-NEXT:    s_flbit_i32_b64 s14, s[2:3]
+; GFX942-IR-NEXT:    s_cselect_b64 s[10:11], -1, 0
+; GFX942-IR-NEXT:    s_add_u32 s8, s14, 0xffffffc5
+; GFX942-IR-NEXT:    s_addc_u32 s9, 0, -1
+; GFX942-IR-NEXT:    v_cmp_gt_u64_e64 s[12:13], s[8:9], 63
+; GFX942-IR-NEXT:    s_or_b64 s[12:13], s[10:11], s[12:13]
+; GFX942-IR-NEXT:    s_cmp_eq_u64 s[8:9], 63
+; GFX942-IR-NEXT:    s_cselect_b64 s[16:17], -1, 0
+; GFX942-IR-NEXT:    s_and_b64 s[10:11], s[12:13], exec
+; GFX942-IR-NEXT:    s_cselect_b32 s10, 0, 24
+; GFX942-IR-NEXT:    s_or_b64 s[12:13], s[12:13], s[16:17]
+; GFX942-IR-NEXT:    s_andn2_b64 vcc, exec, s[12:13]
+; GFX942-IR-NEXT:    s_mov_b32 s11, 0
+; GFX942-IR-NEXT:    s_cbranch_vccz .LBB10_5
+; GFX942-IR-NEXT:  ; %bb.1: ; %udiv-bb1
+; GFX942-IR-NEXT:    s_add_u32 s10, s8, 1
+; GFX942-IR-NEXT:    s_addc_u32 s11, s9, 0
+; GFX942-IR-NEXT:    s_cmp_eq_u64 s[10:11], 0
+; GFX942-IR-NEXT:    s_cselect_b64 s[12:13], -1, 0
+; GFX942-IR-NEXT:    s_sub_i32 s8, 63, s8
+; GFX942-IR-NEXT:    s_andn2_b64 vcc, exec, s[12:13]
+; GFX942-IR-NEXT:    s_lshl_b64 s[8:9], 24, s8
+; GFX942-IR-NEXT:    s_cbranch_vccz .LBB10_4
+; GFX942-IR-NEXT:  ; %bb.2: ; %udiv-preheader
+; GFX942-IR-NEXT:    s_lshr_b64 s[12:13], 24, s10
+; GFX942-IR-NEXT:    s_add_u32 s16, s2, -1
+; GFX942-IR-NEXT:    s_addc_u32 s17, s3, -1
+; GFX942-IR-NEXT:    s_sub_u32 s10, 58, s14
+; GFX942-IR-NEXT:    s_subb_u32 s11, 0, 0
+; GFX942-IR-NEXT:    s_mov_b64 s[14:15], 0
+; GFX942-IR-NEXT:    s_mov_b32 s7, 0
+; GFX942-IR-NEXT:  .LBB10_3: ; %udiv-do-while
+; GFX942-IR-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-IR-NEXT:    s_lshl_b64 s[12:13], s[12:13], 1
+; GFX942-IR-NEXT:    s_lshr_b32 s6, s9, 31
+; GFX942-IR-NEXT:    s_lshl_b64 s[8:9], s[8:9], 1
+; GFX942-IR-NEXT:    s_or_b64 s[12:13], s[12:13], s[6:7]
+; GFX942-IR-NEXT:    s_or_b64 s[8:9], s[14:15], s[8:9]
+; GFX942-IR-NEXT:    s_sub_u32 s6, s16, s12
+; GFX942-IR-NEXT:    s_subb_u32 s6, s17, s13
+; GFX942-IR-NEXT:    s_ashr_i32 s14, s6, 31
+; GFX942-IR-NEXT:    s_mov_b32 s15, s14
+; GFX942-IR-NEXT:    s_and_b32 s6, s14, 1
+; GFX942-IR-NEXT:    s_and_b64 s[14:15], s[14:15], s[2:3]
+; GFX942-IR-NEXT:    s_sub_u32 s12, s12, s14
+; GFX942-IR-NEXT:    s_subb_u32 s13, s13, s15
+; GFX942-IR-NEXT:    s_add_u32 s10, s10, 1
+; GFX942-IR-NEXT:    s_addc_u32 s11, s11, 0
+; GFX942-IR-NEXT:    s_cmp_eq_u64 s[10:11], 0
+; GFX942-IR-NEXT:    s_mov_b64 s[14:15], s[6:7]
+; GFX942-IR-NEXT:    s_cbranch_scc0 .LBB10_3
+; GFX942-IR-NEXT:  .LBB10_4: ; %Flow6
+; GFX942-IR-NEXT:    s_lshl_b64 s[2:3], s[8:9], 1
+; GFX942-IR-NEXT:    s_or_b64 s[10:11], s[6:7], s[2:3]
+; GFX942-IR-NEXT:  .LBB10_5: ; %udiv-end
+; GFX942-IR-NEXT:    s_xor_b64 s[2:3], s[10:11], s[4:5]
+; GFX942-IR-NEXT:    s_sub_u32 s2, s2, s4
+; GFX942-IR-NEXT:    s_subb_u32 s3, s3, s5
+; GFX942-IR-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-IR-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX942-IR-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX942-IR-NEXT:    s_endpgm
   %result = sdiv i64 24, %x
   store i64 %result, ptr addrspace(1) %out
   ret void
@@ -1494,6 +2731,198 @@ define i64 @v_test_sdiv_k_num_i64(i64 %x) {
 ; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, v0, v12
 ; GCN-IR-NEXT:    v_subb_u32_e32 v1, vcc, v1, v13, vcc
 ; GCN-IR-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: v_test_sdiv_k_num_i64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
+; GFX942-NEXT:    v_mov_b32_e32 v3, v2
+; GFX942-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
+; GFX942-NEXT:    v_xor_b32_e32 v3, v1, v2
+; GFX942-NEXT:    v_xor_b32_e32 v12, v0, v2
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v0, v12
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v1, v3
+; GFX942-NEXT:    v_sub_co_u32_e32 v11, vcc, 0, v12
+; GFX942-NEXT:    v_fmamk_f32 v0, v1, 0x4f800000, v0
+; GFX942-NEXT:    v_rcp_f32_e32 v0, v0
+; GFX942-NEXT:    v_subb_co_u32_e32 v13, vcc, 0, v3, vcc
+; GFX942-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
+; GFX942-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
+; GFX942-NEXT:    v_trunc_f32_e32 v1, v1
+; GFX942-NEXT:    v_fmamk_f32 v0, v1, 0xcf800000, v0
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v10, v0
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v14, v1
+; GFX942-NEXT:    v_mul_lo_u32 v0, v13, v10
+; GFX942-NEXT:    v_mad_u64_u32 v[4:5], s[0:1], v11, v10, 0
+; GFX942-NEXT:    v_mul_lo_u32 v1, v11, v14
+; GFX942-NEXT:    v_add3_u32 v5, v5, v1, v0
+; GFX942-NEXT:    v_mad_u64_u32 v[6:7], s[0:1], v10, v5, 0
+; GFX942-NEXT:    v_mul_hi_u32 v0, v10, v4
+; GFX942-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-NEXT:    v_lshl_add_u64 v[6:7], v[0:1], 0, v[6:7]
+; GFX942-NEXT:    v_mad_u64_u32 v[8:9], s[0:1], v14, v5, 0
+; GFX942-NEXT:    v_mad_u64_u32 v[4:5], s[0:1], v14, v4, 0
+; GFX942-NEXT:    v_add_co_u32_e32 v0, vcc, v6, v4
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_addc_co_u32_e32 v0, vcc, v7, v5, vcc
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_addc_co_u32_e32 v9, vcc, 0, v9, vcc
+; GFX942-NEXT:    v_lshl_add_u64 v[4:5], v[0:1], 0, v[8:9]
+; GFX942-NEXT:    v_add_co_u32_e32 v15, vcc, v10, v4
+; GFX942-NEXT:    v_mul_lo_u32 v6, v13, v15
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_addc_co_u32_e32 v14, vcc, v14, v5, vcc
+; GFX942-NEXT:    v_mul_lo_u32 v0, v11, v14
+; GFX942-NEXT:    v_mad_u64_u32 v[4:5], s[0:1], v11, v15, 0
+; GFX942-NEXT:    v_add3_u32 v0, v5, v0, v6
+; GFX942-NEXT:    v_mad_u64_u32 v[6:7], s[0:1], v14, v0, 0
+; GFX942-NEXT:    v_mad_u64_u32 v[10:11], s[0:1], v15, v0, 0
+; GFX942-NEXT:    v_mul_hi_u32 v0, v15, v4
+; GFX942-NEXT:    v_mad_u64_u32 v[8:9], s[0:1], v14, v4, 0
+; GFX942-NEXT:    v_lshl_add_u64 v[4:5], v[0:1], 0, v[10:11]
+; GFX942-NEXT:    v_add_co_u32_e32 v0, vcc, v4, v8
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_addc_co_u32_e32 v0, vcc, v5, v9, vcc
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v7, vcc
+; GFX942-NEXT:    v_lshl_add_u64 v[4:5], v[0:1], 0, v[6:7]
+; GFX942-NEXT:    v_add_co_u32_e32 v0, vcc, v15, v4
+; GFX942-NEXT:    v_mul_hi_u32 v0, v0, 24
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_addc_co_u32_e32 v4, vcc, v14, v5, vcc
+; GFX942-NEXT:    v_mad_u64_u32 v[4:5], s[0:1], v4, 24, 0
+; GFX942-NEXT:    v_lshl_add_u64 v[4:5], v[0:1], 0, v[4:5]
+; GFX942-NEXT:    v_mad_u64_u32 v[6:7], s[0:1], v12, v5, 0
+; GFX942-NEXT:    v_mov_b32_e32 v0, v7
+; GFX942-NEXT:    v_mad_u64_u32 v[8:9], s[0:1], v3, v5, v[0:1]
+; GFX942-NEXT:    v_sub_u32_e32 v0, 0, v8
+; GFX942-NEXT:    v_sub_co_u32_e32 v4, vcc, 24, v6
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_subb_co_u32_e64 v0, s[0:1], v0, v3, vcc
+; GFX942-NEXT:    v_sub_co_u32_e64 v6, s[0:1], v4, v12
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_subbrev_co_u32_e64 v0, s[0:1], 0, v0, s[0:1]
+; GFX942-NEXT:    v_cmp_ge_u32_e64 s[0:1], v0, v3
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[0:1]
+; GFX942-NEXT:    v_cmp_ge_u32_e64 s[0:1], v6, v12
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[0:1]
+; GFX942-NEXT:    v_cmp_eq_u32_e64 s[0:1], v0, v3
+; GFX942-NEXT:    v_mov_b32_e32 v0, v5
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_cndmask_b32_e64 v9, v7, v6, s[0:1]
+; GFX942-NEXT:    v_lshl_add_u64 v[6:7], v[0:1], 0, 2
+; GFX942-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, 1
+; GFX942-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v9
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e64 v1, v1, v7, s[0:1]
+; GFX942-NEXT:    v_subb_co_u32_e32 v7, vcc, 0, v8, vcc
+; GFX942-NEXT:    v_cmp_ge_u32_e32 vcc, v7, v3
+; GFX942-NEXT:    v_cndmask_b32_e64 v0, v0, v6, s[0:1]
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_cndmask_b32_e64 v8, 0, -1, vcc
+; GFX942-NEXT:    v_cmp_ge_u32_e32 vcc, v4, v12
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e64 v4, 0, -1, vcc
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, v7, v3
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v3, v8, v4, vcc
+; GFX942-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, v5, v0, vcc
+; GFX942-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX942-NEXT:    v_xor_b32_e32 v0, v0, v2
+; GFX942-NEXT:    v_xor_b32_e32 v1, v1, v2
+; GFX942-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v2
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v2, vcc
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-IR-LABEL: v_test_sdiv_k_num_i64:
+; GFX942-IR:       ; %bb.0: ; %_udiv-special-cases
+; GFX942-IR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-IR-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
+; GFX942-IR-NEXT:    v_xor_b32_e32 v0, v0, v2
+; GFX942-IR-NEXT:    v_xor_b32_e32 v1, v1, v2
+; GFX942-IR-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v2
+; GFX942-IR-NEXT:    v_ffbh_u32_e32 v4, v0
+; GFX942-IR-NEXT:    s_nop 0
+; GFX942-IR-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v2, vcc
+; GFX942-IR-NEXT:    v_add_u32_e32 v4, 32, v4
+; GFX942-IR-NEXT:    v_ffbh_u32_e32 v5, v1
+; GFX942-IR-NEXT:    s_movk_i32 s0, 0xffc5
+; GFX942-IR-NEXT:    v_min_u32_e32 v10, v4, v5
+; GFX942-IR-NEXT:    v_mov_b32_e32 v11, 0
+; GFX942-IR-NEXT:    s_mov_b32 s1, -1
+; GFX942-IR-NEXT:    v_lshl_add_u64 v[8:9], v[10:11], 0, s[0:1]
+; GFX942-IR-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[0:1]
+; GFX942-IR-NEXT:    v_cmp_lt_u64_e64 s[0:1], 63, v[8:9]
+; GFX942-IR-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-IR-NEXT:    v_cmp_ne_u64_e32 vcc, 63, v[8:9]
+; GFX942-IR-NEXT:    v_cndmask_b32_e64 v4, 24, 0, s[0:1]
+; GFX942-IR-NEXT:    s_xor_b64 s[0:1], s[0:1], -1
+; GFX942-IR-NEXT:    v_mov_b32_e32 v3, v2
+; GFX942-IR-NEXT:    s_and_b64 s[2:3], s[0:1], vcc
+; GFX942-IR-NEXT:    s_and_saveexec_b64 s[0:1], s[2:3]
+; GFX942-IR-NEXT:    s_cbranch_execz .LBB11_6
+; GFX942-IR-NEXT:  ; %bb.1: ; %udiv-bb1
+; GFX942-IR-NEXT:    v_lshl_add_u64 v[12:13], v[8:9], 0, 1
+; GFX942-IR-NEXT:    v_sub_u32_e32 v4, 63, v8
+; GFX942-IR-NEXT:    v_mov_b64_e32 v[6:7], 0
+; GFX942-IR-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[12:13]
+; GFX942-IR-NEXT:    v_lshlrev_b64 v[4:5], v4, 24
+; GFX942-IR-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX942-IR-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
+; GFX942-IR-NEXT:    s_cbranch_execz .LBB11_5
+; GFX942-IR-NEXT:  ; %bb.2: ; %udiv-preheader
+; GFX942-IR-NEXT:    v_sub_co_u32_e32 v10, vcc, 58, v10
+; GFX942-IR-NEXT:    v_lshrrev_b64 v[12:13], v12, 24
+; GFX942-IR-NEXT:    s_nop 0
+; GFX942-IR-NEXT:    v_subb_co_u32_e64 v11, s[4:5], 0, 0, vcc
+; GFX942-IR-NEXT:    v_lshl_add_u64 v[8:9], v[0:1], 0, -1
+; GFX942-IR-NEXT:    s_mov_b64 s[4:5], 0
+; GFX942-IR-NEXT:    v_mov_b64_e32 v[14:15], 0
+; GFX942-IR-NEXT:    v_mov_b32_e32 v7, 0
+; GFX942-IR-NEXT:  .LBB11_3: ; %udiv-do-while
+; GFX942-IR-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-IR-NEXT:    v_lshlrev_b64 v[12:13], 1, v[12:13]
+; GFX942-IR-NEXT:    v_lshrrev_b32_e32 v6, 31, v5
+; GFX942-IR-NEXT:    v_or_b32_e32 v12, v12, v6
+; GFX942-IR-NEXT:    v_sub_co_u32_e32 v6, vcc, v8, v12
+; GFX942-IR-NEXT:    v_lshlrev_b64 v[4:5], 1, v[4:5]
+; GFX942-IR-NEXT:    s_nop 0
+; GFX942-IR-NEXT:    v_subb_co_u32_e32 v6, vcc, v9, v13, vcc
+; GFX942-IR-NEXT:    v_or_b32_e32 v4, v14, v4
+; GFX942-IR-NEXT:    v_ashrrev_i32_e32 v14, 31, v6
+; GFX942-IR-NEXT:    v_or_b32_e32 v5, v15, v5
+; GFX942-IR-NEXT:    v_and_b32_e32 v6, 1, v14
+; GFX942-IR-NEXT:    v_and_b32_e32 v15, v14, v1
+; GFX942-IR-NEXT:    v_and_b32_e32 v14, v14, v0
+; GFX942-IR-NEXT:    v_sub_co_u32_e32 v12, vcc, v12, v14
+; GFX942-IR-NEXT:    v_lshl_add_u64 v[10:11], v[10:11], 0, 1
+; GFX942-IR-NEXT:    s_nop 0
+; GFX942-IR-NEXT:    v_subb_co_u32_e32 v13, vcc, v13, v15, vcc
+; GFX942-IR-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[10:11]
+; GFX942-IR-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX942-IR-NEXT:    v_mov_b64_e32 v[14:15], v[6:7]
+; GFX942-IR-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX942-IR-NEXT:    s_cbranch_execnz .LBB11_3
+; GFX942-IR-NEXT:  ; %bb.4: ; %Flow
+; GFX942-IR-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX942-IR-NEXT:  .LBB11_5: ; %Flow4
+; GFX942-IR-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX942-IR-NEXT:    v_lshlrev_b64 v[0:1], 1, v[4:5]
+; GFX942-IR-NEXT:    v_or_b32_e32 v11, v7, v1
+; GFX942-IR-NEXT:    v_or_b32_e32 v4, v6, v0
+; GFX942-IR-NEXT:  .LBB11_6: ; %Flow5
+; GFX942-IR-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX942-IR-NEXT:    v_xor_b32_e32 v0, v4, v2
+; GFX942-IR-NEXT:    v_xor_b32_e32 v1, v11, v3
+; GFX942-IR-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v2
+; GFX942-IR-NEXT:    s_nop 1
+; GFX942-IR-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX942-IR-NEXT:    s_setpc_b64 s[30:31]
   %result = sdiv i64 24, %x
   ret i64 %result
 }
@@ -1689,6 +3118,198 @@ define i64 @v_test_sdiv_pow2_k_num_i64(i64 %x) {
 ; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, v0, v12
 ; GCN-IR-NEXT:    v_subb_u32_e32 v1, vcc, v1, v13, vcc
 ; GCN-IR-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: v_test_sdiv_pow2_k_num_i64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
+; GFX942-NEXT:    v_mov_b32_e32 v3, v2
+; GFX942-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
+; GFX942-NEXT:    v_xor_b32_e32 v3, v1, v2
+; GFX942-NEXT:    v_xor_b32_e32 v12, v0, v2
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v0, v12
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v1, v3
+; GFX942-NEXT:    v_sub_co_u32_e32 v11, vcc, 0, v12
+; GFX942-NEXT:    v_fmamk_f32 v0, v1, 0x4f800000, v0
+; GFX942-NEXT:    v_rcp_f32_e32 v0, v0
+; GFX942-NEXT:    v_subb_co_u32_e32 v13, vcc, 0, v3, vcc
+; GFX942-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
+; GFX942-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
+; GFX942-NEXT:    v_trunc_f32_e32 v1, v1
+; GFX942-NEXT:    v_fmamk_f32 v0, v1, 0xcf800000, v0
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v10, v0
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v14, v1
+; GFX942-NEXT:    v_mul_lo_u32 v0, v13, v10
+; GFX942-NEXT:    v_mad_u64_u32 v[4:5], s[0:1], v11, v10, 0
+; GFX942-NEXT:    v_mul_lo_u32 v1, v11, v14
+; GFX942-NEXT:    v_add3_u32 v5, v5, v1, v0
+; GFX942-NEXT:    v_mad_u64_u32 v[6:7], s[0:1], v10, v5, 0
+; GFX942-NEXT:    v_mul_hi_u32 v0, v10, v4
+; GFX942-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-NEXT:    v_lshl_add_u64 v[6:7], v[0:1], 0, v[6:7]
+; GFX942-NEXT:    v_mad_u64_u32 v[8:9], s[0:1], v14, v5, 0
+; GFX942-NEXT:    v_mad_u64_u32 v[4:5], s[0:1], v14, v4, 0
+; GFX942-NEXT:    v_add_co_u32_e32 v0, vcc, v6, v4
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_addc_co_u32_e32 v0, vcc, v7, v5, vcc
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_addc_co_u32_e32 v9, vcc, 0, v9, vcc
+; GFX942-NEXT:    v_lshl_add_u64 v[4:5], v[0:1], 0, v[8:9]
+; GFX942-NEXT:    v_add_co_u32_e32 v15, vcc, v10, v4
+; GFX942-NEXT:    v_mul_lo_u32 v6, v13, v15
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_addc_co_u32_e32 v14, vcc, v14, v5, vcc
+; GFX942-NEXT:    v_mul_lo_u32 v0, v11, v14
+; GFX942-NEXT:    v_mad_u64_u32 v[4:5], s[0:1], v11, v15, 0
+; GFX942-NEXT:    v_add3_u32 v0, v5, v0, v6
+; GFX942-NEXT:    v_mad_u64_u32 v[6:7], s[0:1], v14, v0, 0
+; GFX942-NEXT:    v_mad_u64_u32 v[10:11], s[0:1], v15, v0, 0
+; GFX942-NEXT:    v_mul_hi_u32 v0, v15, v4
+; GFX942-NEXT:    v_mad_u64_u32 v[8:9], s[0:1], v14, v4, 0
+; GFX942-NEXT:    v_lshl_add_u64 v[4:5], v[0:1], 0, v[10:11]
+; GFX942-NEXT:    v_add_co_u32_e32 v0, vcc, v4, v8
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_addc_co_u32_e32 v0, vcc, v5, v9, vcc
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v7, vcc
+; GFX942-NEXT:    v_lshl_add_u64 v[4:5], v[0:1], 0, v[6:7]
+; GFX942-NEXT:    v_add_co_u32_e32 v0, vcc, v15, v4
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_addc_co_u32_e32 v0, vcc, v14, v5, vcc
+; GFX942-NEXT:    v_lshrrev_b32_e32 v0, 17, v0
+; GFX942-NEXT:    v_mad_u64_u32 v[4:5], s[0:1], v12, v0, 0
+; GFX942-NEXT:    v_mov_b32_e32 v6, v5
+; GFX942-NEXT:    v_mad_u64_u32 v[6:7], s[0:1], v3, v0, v[6:7]
+; GFX942-NEXT:    s_mov_b32 s0, 0x8000
+; GFX942-NEXT:    v_sub_u32_e32 v5, 0, v6
+; GFX942-NEXT:    v_sub_co_u32_e32 v7, vcc, s0, v4
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_subb_co_u32_e64 v4, s[0:1], v5, v3, vcc
+; GFX942-NEXT:    v_sub_co_u32_e64 v5, s[0:1], v7, v12
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_subbrev_co_u32_e64 v4, s[0:1], 0, v4, s[0:1]
+; GFX942-NEXT:    v_cmp_ge_u32_e64 s[0:1], v4, v3
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[0:1]
+; GFX942-NEXT:    v_cmp_ge_u32_e64 s[0:1], v5, v12
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e64 v5, 0, -1, s[0:1]
+; GFX942-NEXT:    v_cmp_eq_u32_e64 s[0:1], v4, v3
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e64 v10, v8, v5, s[0:1]
+; GFX942-NEXT:    v_lshl_add_u64 v[4:5], v[0:1], 0, 2
+; GFX942-NEXT:    v_lshl_add_u64 v[8:9], v[0:1], 0, 1
+; GFX942-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v10
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e64 v1, v9, v5, s[0:1]
+; GFX942-NEXT:    v_subb_co_u32_e32 v5, vcc, 0, v6, vcc
+; GFX942-NEXT:    v_cmp_ge_u32_e32 vcc, v5, v3
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
+; GFX942-NEXT:    v_cmp_ge_u32_e32 vcc, v7, v12
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v3
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v3, v6, v7, vcc
+; GFX942-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
+; GFX942-NEXT:    v_cndmask_b32_e64 v3, v8, v4, s[0:1]
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
+; GFX942-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX942-NEXT:    v_xor_b32_e32 v0, v0, v2
+; GFX942-NEXT:    v_xor_b32_e32 v1, v1, v2
+; GFX942-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v2
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v2, vcc
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-IR-LABEL: v_test_sdiv_pow2_k_num_i64:
+; GFX942-IR:       ; %bb.0: ; %_udiv-special-cases
+; GFX942-IR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-IR-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
+; GFX942-IR-NEXT:    v_xor_b32_e32 v0, v0, v2
+; GFX942-IR-NEXT:    v_xor_b32_e32 v1, v1, v2
+; GFX942-IR-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v2
+; GFX942-IR-NEXT:    v_ffbh_u32_e32 v4, v0
+; GFX942-IR-NEXT:    s_nop 0
+; GFX942-IR-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v2, vcc
+; GFX942-IR-NEXT:    v_add_u32_e32 v4, 32, v4
+; GFX942-IR-NEXT:    v_ffbh_u32_e32 v5, v1
+; GFX942-IR-NEXT:    s_movk_i32 s0, 0xffd0
+; GFX942-IR-NEXT:    v_min_u32_e32 v10, v4, v5
+; GFX942-IR-NEXT:    v_mov_b32_e32 v11, 0
+; GFX942-IR-NEXT:    s_mov_b32 s1, -1
+; GFX942-IR-NEXT:    v_lshl_add_u64 v[8:9], v[10:11], 0, s[0:1]
+; GFX942-IR-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[0:1]
+; GFX942-IR-NEXT:    v_cmp_lt_u64_e64 s[0:1], 63, v[8:9]
+; GFX942-IR-NEXT:    v_mov_b32_e32 v4, 0x8000
+; GFX942-IR-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-IR-NEXT:    v_cndmask_b32_e64 v4, v4, 0, s[0:1]
+; GFX942-IR-NEXT:    s_xor_b64 s[0:1], s[0:1], -1
+; GFX942-IR-NEXT:    v_cmp_ne_u64_e32 vcc, 63, v[8:9]
+; GFX942-IR-NEXT:    v_mov_b32_e32 v3, v2
+; GFX942-IR-NEXT:    s_and_b64 s[2:3], s[0:1], vcc
+; GFX942-IR-NEXT:    s_and_saveexec_b64 s[0:1], s[2:3]
+; GFX942-IR-NEXT:    s_cbranch_execz .LBB12_6
+; GFX942-IR-NEXT:  ; %bb.1: ; %udiv-bb1
+; GFX942-IR-NEXT:    v_lshl_add_u64 v[12:13], v[8:9], 0, 1
+; GFX942-IR-NEXT:    v_sub_u32_e32 v4, 63, v8
+; GFX942-IR-NEXT:    s_mov_b64 s[4:5], 0x8000
+; GFX942-IR-NEXT:    v_mov_b64_e32 v[6:7], 0
+; GFX942-IR-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[12:13]
+; GFX942-IR-NEXT:    v_lshlrev_b64 v[4:5], v4, s[4:5]
+; GFX942-IR-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX942-IR-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
+; GFX942-IR-NEXT:    s_cbranch_execz .LBB12_5
+; GFX942-IR-NEXT:  ; %bb.2: ; %udiv-preheader
+; GFX942-IR-NEXT:    v_sub_co_u32_e32 v10, vcc, 47, v10
+; GFX942-IR-NEXT:    v_lshrrev_b64 v[12:13], v12, s[4:5]
+; GFX942-IR-NEXT:    s_nop 0
+; GFX942-IR-NEXT:    v_subb_co_u32_e64 v11, s[4:5], 0, 0, vcc
+; GFX942-IR-NEXT:    v_lshl_add_u64 v[8:9], v[0:1], 0, -1
+; GFX942-IR-NEXT:    s_mov_b64 s[4:5], 0
+; GFX942-IR-NEXT:    v_mov_b64_e32 v[14:15], 0
+; GFX942-IR-NEXT:    v_mov_b32_e32 v7, 0
+; GFX942-IR-NEXT:  .LBB12_3: ; %udiv-do-while
+; GFX942-IR-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-IR-NEXT:    v_lshlrev_b64 v[12:13], 1, v[12:13]
+; GFX942-IR-NEXT:    v_lshrrev_b32_e32 v6, 31, v5
+; GFX942-IR-NEXT:    v_or_b32_e32 v12, v12, v6
+; GFX942-IR-NEXT:    v_sub_co_u32_e32 v6, vcc, v8, v12
+; GFX942-IR-NEXT:    v_lshlrev_b64 v[4:5], 1, v[4:5]
+; GFX942-IR-NEXT:    s_nop 0
+; GFX942-IR-NEXT:    v_subb_co_u32_e32 v6, vcc, v9, v13, vcc
+; GFX942-IR-NEXT:    v_or_b32_e32 v4, v14, v4
+; GFX942-IR-NEXT:    v_ashrrev_i32_e32 v14, 31, v6
+; GFX942-IR-NEXT:    v_or_b32_e32 v5, v15, v5
+; GFX942-IR-NEXT:    v_and_b32_e32 v6, 1, v14
+; GFX942-IR-NEXT:    v_and_b32_e32 v15, v14, v1
+; GFX942-IR-NEXT:    v_and_b32_e32 v14, v14, v0
+; GFX942-IR-NEXT:    v_sub_co_u32_e32 v12, vcc, v12, v14
+; GFX942-IR-NEXT:    v_lshl_add_u64 v[10:11], v[10:11], 0, 1
+; GFX942-IR-NEXT:    s_nop 0
+; GFX942-IR-NEXT:    v_subb_co_u32_e32 v13, vcc, v13, v15, vcc
+; GFX942-IR-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[10:11]
+; GFX942-IR-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX942-IR-NEXT:    v_mov_b64_e32 v[14:15], v[6:7]
+; GFX942-IR-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX942-IR-NEXT:    s_cbranch_execnz .LBB12_3
+; GFX942-IR-NEXT:  ; %bb.4: ; %Flow
+; GFX942-IR-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX942-IR-NEXT:  .LBB12_5: ; %Flow4
+; GFX942-IR-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX942-IR-NEXT:    v_lshlrev_b64 v[0:1], 1, v[4:5]
+; GFX942-IR-NEXT:    v_or_b32_e32 v11, v7, v1
+; GFX942-IR-NEXT:    v_or_b32_e32 v4, v6, v0
+; GFX942-IR-NEXT:  .LBB12_6: ; %Flow5
+; GFX942-IR-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX942-IR-NEXT:    v_xor_b32_e32 v0, v4, v2
+; GFX942-IR-NEXT:    v_xor_b32_e32 v1, v11, v3
+; GFX942-IR-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v2
+; GFX942-IR-NEXT:    s_nop 1
+; GFX942-IR-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX942-IR-NEXT:    s_setpc_b64 s[30:31]
   %result = sdiv i64 32768, %x
   ret i64 %result
 }
@@ -1786,6 +3407,101 @@ define i64 @v_test_sdiv_pow2_k_den_i64(i64 %x) {
 ; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, v0, v10
 ; GCN-IR-NEXT:    v_subb_u32_e32 v1, vcc, v1, v11, vcc
 ; GCN-IR-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: v_test_sdiv_pow2_k_den_i64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
+; GFX942-NEXT:    v_lshrrev_b32_e32 v2, 17, v2
+; GFX942-NEXT:    v_mov_b32_e32 v3, 0
+; GFX942-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
+; GFX942-NEXT:    v_ashrrev_i64 v[0:1], 15, v[0:1]
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-IR-LABEL: v_test_sdiv_pow2_k_den_i64:
+; GFX942-IR:       ; %bb.0: ; %_udiv-special-cases
+; GFX942-IR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-IR-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
+; GFX942-IR-NEXT:    v_xor_b32_e32 v0, v0, v2
+; GFX942-IR-NEXT:    v_xor_b32_e32 v1, v1, v2
+; GFX942-IR-NEXT:    v_sub_co_u32_e32 v6, vcc, v0, v2
+; GFX942-IR-NEXT:    v_ffbh_u32_e32 v0, v6
+; GFX942-IR-NEXT:    s_nop 0
+; GFX942-IR-NEXT:    v_subb_co_u32_e32 v7, vcc, v1, v2, vcc
+; GFX942-IR-NEXT:    v_add_u32_e32 v0, 32, v0
+; GFX942-IR-NEXT:    v_ffbh_u32_e32 v1, v7
+; GFX942-IR-NEXT:    v_min_u32_e32 v8, v0, v1
+; GFX942-IR-NEXT:    v_sub_co_u32_e64 v0, s[0:1], 48, v8
+; GFX942-IR-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[6:7]
+; GFX942-IR-NEXT:    s_nop 0
+; GFX942-IR-NEXT:    v_subb_co_u32_e64 v1, s[0:1], 0, 0, s[0:1]
+; GFX942-IR-NEXT:    v_cmp_lt_u64_e64 s[0:1], 63, v[0:1]
+; GFX942-IR-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-IR-NEXT:    s_xor_b64 s[2:3], s[0:1], -1
+; GFX942-IR-NEXT:    v_cmp_ne_u64_e32 vcc, 63, v[0:1]
+; GFX942-IR-NEXT:    v_mov_b32_e32 v3, v2
+; GFX942-IR-NEXT:    v_mov_b32_e32 v9, 0
+; GFX942-IR-NEXT:    v_cndmask_b32_e64 v5, v7, 0, s[0:1]
+; GFX942-IR-NEXT:    v_cndmask_b32_e64 v4, v6, 0, s[0:1]
+; GFX942-IR-NEXT:    s_and_b64 s[2:3], s[2:3], vcc
+; GFX942-IR-NEXT:    s_and_saveexec_b64 s[0:1], s[2:3]
+; GFX942-IR-NEXT:    s_cbranch_execz .LBB13_6
+; GFX942-IR-NEXT:  ; %bb.1: ; %udiv-bb1
+; GFX942-IR-NEXT:    v_lshl_add_u64 v[10:11], v[0:1], 0, 1
+; GFX942-IR-NEXT:    v_sub_u32_e32 v0, 63, v0
+; GFX942-IR-NEXT:    v_mov_b64_e32 v[4:5], 0
+; GFX942-IR-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[10:11]
+; GFX942-IR-NEXT:    v_lshlrev_b64 v[0:1], v0, v[6:7]
+; GFX942-IR-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX942-IR-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
+; GFX942-IR-NEXT:    s_cbranch_execz .LBB13_5
+; GFX942-IR-NEXT:  ; %bb.2: ; %udiv-preheader
+; GFX942-IR-NEXT:    s_movk_i32 s4, 0xffcf
+; GFX942-IR-NEXT:    s_mov_b32 s5, -1
+; GFX942-IR-NEXT:    v_lshrrev_b64 v[10:11], v10, v[6:7]
+; GFX942-IR-NEXT:    v_lshl_add_u64 v[6:7], v[8:9], 0, s[4:5]
+; GFX942-IR-NEXT:    s_mov_b64 s[4:5], 0
+; GFX942-IR-NEXT:    v_mov_b64_e32 v[8:9], 0
+; GFX942-IR-NEXT:    s_movk_i32 s6, 0x7fff
+; GFX942-IR-NEXT:    v_mov_b32_e32 v5, 0
+; GFX942-IR-NEXT:  .LBB13_3: ; %udiv-do-while
+; GFX942-IR-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-IR-NEXT:    v_lshlrev_b64 v[10:11], 1, v[10:11]
+; GFX942-IR-NEXT:    v_lshrrev_b32_e32 v4, 31, v1
+; GFX942-IR-NEXT:    v_or_b32_e32 v10, v10, v4
+; GFX942-IR-NEXT:    v_sub_co_u32_e32 v4, vcc, s6, v10
+; GFX942-IR-NEXT:    v_lshlrev_b64 v[0:1], 1, v[0:1]
+; GFX942-IR-NEXT:    s_nop 0
+; GFX942-IR-NEXT:    v_subb_co_u32_e32 v4, vcc, 0, v11, vcc
+; GFX942-IR-NEXT:    v_or_b32_e32 v0, v8, v0
+; GFX942-IR-NEXT:    v_ashrrev_i32_e32 v8, 31, v4
+; GFX942-IR-NEXT:    v_and_b32_e32 v4, 1, v8
+; GFX942-IR-NEXT:    v_and_b32_e32 v8, 0x8000, v8
+; GFX942-IR-NEXT:    v_sub_co_u32_e32 v10, vcc, v10, v8
+; GFX942-IR-NEXT:    v_lshl_add_u64 v[6:7], v[6:7], 0, 1
+; GFX942-IR-NEXT:    s_nop 0
+; GFX942-IR-NEXT:    v_subbrev_co_u32_e32 v11, vcc, 0, v11, vcc
+; GFX942-IR-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[6:7]
+; GFX942-IR-NEXT:    v_or_b32_e32 v1, v9, v1
+; GFX942-IR-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX942-IR-NEXT:    v_mov_b64_e32 v[8:9], v[4:5]
+; GFX942-IR-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX942-IR-NEXT:    s_cbranch_execnz .LBB13_3
+; GFX942-IR-NEXT:  ; %bb.4: ; %Flow
+; GFX942-IR-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX942-IR-NEXT:  .LBB13_5: ; %Flow4
+; GFX942-IR-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX942-IR-NEXT:    v_lshlrev_b64 v[0:1], 1, v[0:1]
+; GFX942-IR-NEXT:    v_or_b32_e32 v5, v5, v1
+; GFX942-IR-NEXT:    v_or_b32_e32 v4, v4, v0
+; GFX942-IR-NEXT:  .LBB13_6: ; %Flow5
+; GFX942-IR-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX942-IR-NEXT:    v_xor_b32_e32 v0, v4, v2
+; GFX942-IR-NEXT:    v_xor_b32_e32 v1, v5, v3
+; GFX942-IR-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v2
+; GFX942-IR-NEXT:    s_nop 1
+; GFX942-IR-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX942-IR-NEXT:    s_setpc_b64 s[30:31]
   %result = sdiv i64 %x, 32768
   ret i64 %result
 }
@@ -1844,6 +3560,56 @@ define amdgpu_kernel void @s_test_sdiv24_k_num_i64(ptr addrspace(1) %out, i64 %x
 ; GCN-IR-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; GCN-IR-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GCN-IR-NEXT:    s_endpgm
+;
+; GFX942-LABEL: s_test_sdiv24_k_num_i64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_ashr_i32 s2, s3, 8
+; GFX942-NEXT:    v_cvt_f32_i32_e32 v0, s2
+; GFX942-NEXT:    s_mov_b32 s2, 0x41c00000
+; GFX942-NEXT:    s_ashr_i32 s3, s3, 31
+; GFX942-NEXT:    s_or_b32 s4, s3, 1
+; GFX942-NEXT:    v_rcp_iflag_f32_e32 v1, v0
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_mul_f32_e32 v1, 0x41c00000, v1
+; GFX942-NEXT:    v_trunc_f32_e32 v1, v1
+; GFX942-NEXT:    v_fma_f32 v3, -v1, v0, s2
+; GFX942-NEXT:    v_cvt_i32_f32_e32 v1, v1
+; GFX942-NEXT:    v_cmp_ge_f32_e64 s[2:3], |v3|, |v0|
+; GFX942-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GFX942-NEXT:    s_cselect_b32 s2, s4, 0
+; GFX942-NEXT:    v_add_u32_e32 v0, s2, v1
+; GFX942-NEXT:    v_bfe_i32 v0, v0, 0, 24
+; GFX942-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; GFX942-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX942-NEXT:    s_endpgm
+;
+; GFX942-IR-LABEL: s_test_sdiv24_k_num_i64:
+; GFX942-IR:       ; %bb.0:
+; GFX942-IR-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-IR-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-IR-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-IR-NEXT:    s_ashr_i32 s2, s3, 8
+; GFX942-IR-NEXT:    v_cvt_f32_i32_e32 v0, s2
+; GFX942-IR-NEXT:    s_mov_b32 s2, 0x41c00000
+; GFX942-IR-NEXT:    s_ashr_i32 s3, s3, 31
+; GFX942-IR-NEXT:    s_or_b32 s4, s3, 1
+; GFX942-IR-NEXT:    v_rcp_iflag_f32_e32 v1, v0
+; GFX942-IR-NEXT:    s_nop 0
+; GFX942-IR-NEXT:    v_mul_f32_e32 v1, 0x41c00000, v1
+; GFX942-IR-NEXT:    v_trunc_f32_e32 v1, v1
+; GFX942-IR-NEXT:    v_fma_f32 v3, -v1, v0, s2
+; GFX942-IR-NEXT:    v_cvt_i32_f32_e32 v1, v1
+; GFX942-IR-NEXT:    v_cmp_ge_f32_e64 s[2:3], |v3|, |v0|
+; GFX942-IR-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GFX942-IR-NEXT:    s_cselect_b32 s2, s4, 0
+; GFX942-IR-NEXT:    v_add_u32_e32 v0, s2, v1
+; GFX942-IR-NEXT:    v_bfe_i32 v0, v0, 0, 24
+; GFX942-IR-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; GFX942-IR-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX942-IR-NEXT:    s_endpgm
   %x.shr = ashr i64 %x, 40
   %result = sdiv i64 24, %x.shr
   store i64 %result, ptr addrspace(1) %out
@@ -1902,6 +3668,52 @@ define amdgpu_kernel void @s_test_sdiv24_k_den_i64(ptr addrspace(1) %out, i64 %x
 ; GCN-IR-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; GCN-IR-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GCN-IR-NEXT:    s_endpgm
+;
+; GFX942-LABEL: s_test_sdiv24_k_den_i64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_ashr_i32 s2, s3, 8
+; GFX942-NEXT:    v_cvt_f32_i32_e32 v0, s2
+; GFX942-NEXT:    s_ashr_i32 s2, s3, 31
+; GFX942-NEXT:    s_or_b32 s4, s2, 1
+; GFX942-NEXT:    s_mov_b32 s2, 0x46b6fe00
+; GFX942-NEXT:    v_mul_f32_e32 v1, 0x38331158, v0
+; GFX942-NEXT:    v_trunc_f32_e32 v1, v1
+; GFX942-NEXT:    v_fmamk_f32 v0, v1, 0xc6b6fe00, v0
+; GFX942-NEXT:    v_cvt_i32_f32_e32 v1, v1
+; GFX942-NEXT:    v_cmp_ge_f32_e64 s[2:3], |v0|, s2
+; GFX942-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GFX942-NEXT:    s_cselect_b32 s2, s4, 0
+; GFX942-NEXT:    v_add_u32_e32 v0, s2, v1
+; GFX942-NEXT:    v_bfe_i32 v0, v0, 0, 24
+; GFX942-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; GFX942-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX942-NEXT:    s_endpgm
+;
+; GFX942-IR-LABEL: s_test_sdiv24_k_den_i64:
+; GFX942-IR:       ; %bb.0:
+; GFX942-IR-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-IR-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-IR-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-IR-NEXT:    s_ashr_i32 s2, s3, 8
+; GFX942-IR-NEXT:    v_cvt_f32_i32_e32 v0, s2
+; GFX942-IR-NEXT:    s_ashr_i32 s2, s3, 31
+; GFX942-IR-NEXT:    s_or_b32 s4, s2, 1
+; GFX942-IR-NEXT:    s_mov_b32 s2, 0x46b6fe00
+; GFX942-IR-NEXT:    v_mul_f32_e32 v1, 0x38331158, v0
+; GFX942-IR-NEXT:    v_trunc_f32_e32 v1, v1
+; GFX942-IR-NEXT:    v_fmamk_f32 v0, v1, 0xc6b6fe00, v0
+; GFX942-IR-NEXT:    v_cvt_i32_f32_e32 v1, v1
+; GFX942-IR-NEXT:    v_cmp_ge_f32_e64 s[2:3], |v0|, s2
+; GFX942-IR-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GFX942-IR-NEXT:    s_cselect_b32 s2, s4, 0
+; GFX942-IR-NEXT:    v_add_u32_e32 v0, s2, v1
+; GFX942-IR-NEXT:    v_bfe_i32 v0, v0, 0, 24
+; GFX942-IR-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; GFX942-IR-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX942-IR-NEXT:    s_endpgm
   %x.shr = ashr i64 %x, 40
   %result = sdiv i64 %x.shr, 23423
   store i64 %result, ptr addrspace(1) %out
@@ -1948,6 +3760,50 @@ define i64 @v_test_sdiv24_k_num_i64(i64 %x) {
 ; GCN-IR-NEXT:    v_bfe_i32 v0, v0, 0, 24
 ; GCN-IR-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; GCN-IR-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: v_test_sdiv24_k_num_i64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_ashrrev_i32_e32 v0, 8, v1
+; GFX942-NEXT:    v_cvt_f32_i32_e32 v0, v0
+; GFX942-NEXT:    s_mov_b32 s0, 0x41c00000
+; GFX942-NEXT:    v_ashrrev_i32_e32 v1, 31, v1
+; GFX942-NEXT:    v_or_b32_e32 v1, 1, v1
+; GFX942-NEXT:    v_rcp_iflag_f32_e32 v2, v0
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_mul_f32_e32 v2, 0x41c00000, v2
+; GFX942-NEXT:    v_trunc_f32_e32 v2, v2
+; GFX942-NEXT:    v_cvt_i32_f32_e32 v3, v2
+; GFX942-NEXT:    v_fma_f32 v2, -v2, v0, s0
+; GFX942-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, |v0|
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, 0, v1, vcc
+; GFX942-NEXT:    v_add_u32_e32 v0, v3, v0
+; GFX942-NEXT:    v_bfe_i32 v0, v0, 0, 24
+; GFX942-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-IR-LABEL: v_test_sdiv24_k_num_i64:
+; GFX942-IR:       ; %bb.0:
+; GFX942-IR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-IR-NEXT:    v_ashrrev_i32_e32 v0, 8, v1
+; GFX942-IR-NEXT:    v_cvt_f32_i32_e32 v0, v0
+; GFX942-IR-NEXT:    s_mov_b32 s0, 0x41c00000
+; GFX942-IR-NEXT:    v_ashrrev_i32_e32 v1, 31, v1
+; GFX942-IR-NEXT:    v_or_b32_e32 v1, 1, v1
+; GFX942-IR-NEXT:    v_rcp_iflag_f32_e32 v2, v0
+; GFX942-IR-NEXT:    s_nop 0
+; GFX942-IR-NEXT:    v_mul_f32_e32 v2, 0x41c00000, v2
+; GFX942-IR-NEXT:    v_trunc_f32_e32 v2, v2
+; GFX942-IR-NEXT:    v_cvt_i32_f32_e32 v3, v2
+; GFX942-IR-NEXT:    v_fma_f32 v2, -v2, v0, s0
+; GFX942-IR-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, |v0|
+; GFX942-IR-NEXT:    s_nop 1
+; GFX942-IR-NEXT:    v_cndmask_b32_e32 v0, 0, v1, vcc
+; GFX942-IR-NEXT:    v_add_u32_e32 v0, v3, v0
+; GFX942-IR-NEXT:    v_bfe_i32 v0, v0, 0, 24
+; GFX942-IR-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; GFX942-IR-NEXT:    s_setpc_b64 s[30:31]
   %x.shr = ashr i64 %x, 40
   %result = sdiv i64 24, %x.shr
   ret i64 %result
@@ -1993,6 +3849,50 @@ define i64 @v_test_sdiv24_pow2_k_num_i64(i64 %x) {
 ; GCN-IR-NEXT:    v_bfe_i32 v0, v0, 0, 24
 ; GCN-IR-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; GCN-IR-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: v_test_sdiv24_pow2_k_num_i64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_ashrrev_i32_e32 v0, 8, v1
+; GFX942-NEXT:    v_cvt_f32_i32_e32 v0, v0
+; GFX942-NEXT:    s_mov_b32 s0, 0x47000000
+; GFX942-NEXT:    v_ashrrev_i32_e32 v1, 31, v1
+; GFX942-NEXT:    v_or_b32_e32 v1, 1, v1
+; GFX942-NEXT:    v_rcp_iflag_f32_e32 v2, v0
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_mul_f32_e32 v2, 0x47000000, v2
+; GFX942-NEXT:    v_trunc_f32_e32 v2, v2
+; GFX942-NEXT:    v_cvt_i32_f32_e32 v3, v2
+; GFX942-NEXT:    v_fma_f32 v2, -v2, v0, s0
+; GFX942-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, |v0|
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, 0, v1, vcc
+; GFX942-NEXT:    v_add_u32_e32 v0, v3, v0
+; GFX942-NEXT:    v_bfe_i32 v0, v0, 0, 24
+; GFX942-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-IR-LABEL: v_test_sdiv24_pow2_k_num_i64:
+; GFX942-IR:       ; %bb.0:
+; GFX942-IR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-IR-NEXT:    v_ashrrev_i32_e32 v0, 8, v1
+; GFX942-IR-NEXT:    v_cvt_f32_i32_e32 v0, v0
+; GFX942-IR-NEXT:    s_mov_b32 s0, 0x47000000
+; GFX942-IR-NEXT:    v_ashrrev_i32_e32 v1, 31, v1
+; GFX942-IR-NEXT:    v_or_b32_e32 v1, 1, v1
+; GFX942-IR-NEXT:    v_rcp_iflag_f32_e32 v2, v0
+; GFX942-IR-NEXT:    s_nop 0
+; GFX942-IR-NEXT:    v_mul_f32_e32 v2, 0x47000000, v2
+; GFX942-IR-NEXT:    v_trunc_f32_e32 v2, v2
+; GFX942-IR-NEXT:    v_cvt_i32_f32_e32 v3, v2
+; GFX942-IR-NEXT:    v_fma_f32 v2, -v2, v0, s0
+; GFX942-IR-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, |v0|
+; GFX942-IR-NEXT:    s_nop 1
+; GFX942-IR-NEXT:    v_cndmask_b32_e32 v0, 0, v1, vcc
+; GFX942-IR-NEXT:    v_add_u32_e32 v0, v3, v0
+; GFX942-IR-NEXT:    v_bfe_i32 v0, v0, 0, 24
+; GFX942-IR-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; GFX942-IR-NEXT:    s_setpc_b64 s[30:31]
   %x.shr = ashr i64 %x, 40
   %result = sdiv i64 32768, %x.shr
   ret i64 %result
@@ -2028,6 +3928,37 @@ define i64 @v_test_sdiv24_pow2_k_den_i64(i64 %x) {
 ; GCN-IR-NEXT:    v_bfe_i32 v0, v0, 0, 24
 ; GCN-IR-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; GCN-IR-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: v_test_sdiv24_pow2_k_den_i64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_ashrrev_i32_e32 v3, 31, v1
+; GFX942-NEXT:    v_ashrrev_i32_e32 v2, 8, v1
+; GFX942-NEXT:    v_lshrrev_b32_e32 v0, 17, v3
+; GFX942-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-NEXT:    v_lshl_add_u64 v[0:1], v[2:3], 0, v[0:1]
+; GFX942-NEXT:    v_ashrrev_i64 v[0:1], 15, v[0:1]
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-IR-LABEL: v_test_sdiv24_pow2_k_den_i64:
+; GFX942-IR:       ; %bb.0:
+; GFX942-IR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-IR-NEXT:    v_ashrrev_i32_e32 v0, 8, v1
+; GFX942-IR-NEXT:    v_cvt_f32_i32_e32 v0, v0
+; GFX942-IR-NEXT:    s_mov_b32 s0, 0x47000000
+; GFX942-IR-NEXT:    v_ashrrev_i32_e32 v1, 31, v1
+; GFX942-IR-NEXT:    v_or_b32_e32 v1, 1, v1
+; GFX942-IR-NEXT:    v_mul_f32_e32 v2, 0x38000000, v0
+; GFX942-IR-NEXT:    v_trunc_f32_e32 v2, v2
+; GFX942-IR-NEXT:    v_cvt_i32_f32_e32 v3, v2
+; GFX942-IR-NEXT:    v_fmamk_f32 v0, v2, 0xc7000000, v0
+; GFX942-IR-NEXT:    v_cmp_ge_f32_e64 vcc, |v0|, s0
+; GFX942-IR-NEXT:    s_nop 1
+; GFX942-IR-NEXT:    v_cndmask_b32_e32 v0, 0, v1, vcc
+; GFX942-IR-NEXT:    v_add_u32_e32 v0, v3, v0
+; GFX942-IR-NEXT:    v_bfe_i32 v0, v0, 0, 24
+; GFX942-IR-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; GFX942-IR-NEXT:    s_setpc_b64 s[30:31]
   %x.shr = ashr i64 %x, 40
   %result = sdiv i64 %x.shr, 32768
   ret i64 %result
diff --git a/llvm/test/CodeGen/AMDGPU/shift-i128.ll b/llvm/test/CodeGen/AMDGPU/shift-i128.ll
index 3a2d056dc504a..76ff3a6a23a90 100644
--- a/llvm/test/CodeGen/AMDGPU/shift-i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/shift-i128.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=kaveri < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 < %s | FileCheck -check-prefix=GFX942 %s
 
 define i128 @v_shl_i128_vv(i128 %lhs, i128 %rhs) {
 ; GCN-LABEL: v_shl_i128_vv:
@@ -22,6 +23,27 @@ define i128 @v_shl_i128_vv(i128 %lhs, i128 %rhs) {
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
 ; GCN-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: v_shl_i128_vv:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_sub_u32_e32 v5, 64, v4
+; GFX942-NEXT:    v_lshlrev_b64 v[6:7], v4, v[2:3]
+; GFX942-NEXT:    v_lshrrev_b64 v[8:9], v5, v[0:1]
+; GFX942-NEXT:    v_or_b32_e32 v8, v6, v8
+; GFX942-NEXT:    v_subrev_u32_e32 v6, 64, v4
+; GFX942-NEXT:    v_or_b32_e32 v5, v7, v9
+; GFX942-NEXT:    v_lshlrev_b64 v[6:7], v6, v[0:1]
+; GFX942-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v4
+; GFX942-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v4
+; GFX942-NEXT:    v_lshlrev_b64 v[0:1], v4, v[0:1]
+; GFX942-NEXT:    v_cndmask_b32_e32 v6, v6, v8, vcc
+; GFX942-NEXT:    v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX942-NEXT:    v_cndmask_b32_e64 v2, v6, v2, s[0:1]
+; GFX942-NEXT:    v_cndmask_b32_e64 v3, v5, v3, s[0:1]
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
+; GFX942-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %shl = shl i128 %lhs, %rhs
   ret i128 %shl
 }
@@ -47,6 +69,27 @@ define i128 @v_lshr_i128_vv(i128 %lhs, i128 %rhs) {
 ; GCN-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
 ; GCN-NEXT:    v_cndmask_b32_e32 v3, 0, v3, vcc
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: v_lshr_i128_vv:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_sub_u32_e32 v5, 64, v4
+; GFX942-NEXT:    v_lshrrev_b64 v[6:7], v4, v[0:1]
+; GFX942-NEXT:    v_lshlrev_b64 v[8:9], v5, v[2:3]
+; GFX942-NEXT:    v_or_b32_e32 v8, v6, v8
+; GFX942-NEXT:    v_subrev_u32_e32 v6, 64, v4
+; GFX942-NEXT:    v_or_b32_e32 v5, v7, v9
+; GFX942-NEXT:    v_lshrrev_b64 v[6:7], v6, v[2:3]
+; GFX942-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v4
+; GFX942-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v4
+; GFX942-NEXT:    v_lshrrev_b64 v[2:3], v4, v[2:3]
+; GFX942-NEXT:    v_cndmask_b32_e32 v6, v6, v8, vcc
+; GFX942-NEXT:    v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX942-NEXT:    v_cndmask_b32_e64 v0, v6, v0, s[0:1]
+; GFX942-NEXT:    v_cndmask_b32_e64 v1, v5, v1, s[0:1]
+; GFX942-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; GFX942-NEXT:    v_cndmask_b32_e32 v3, 0, v3, vcc
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
 
   %shl = lshr i128 %lhs, %rhs
   ret i128 %shl
@@ -74,6 +117,29 @@ define i128 @v_ashr_i128_vv(i128 %lhs, i128 %rhs) {
 ; GCN-NEXT:    v_cndmask_b32_e32 v2, v3, v4, vcc
 ; GCN-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: v_ashr_i128_vv:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_sub_u32_e32 v5, 64, v4
+; GFX942-NEXT:    v_lshrrev_b64 v[6:7], v4, v[0:1]
+; GFX942-NEXT:    v_lshlrev_b64 v[8:9], v5, v[2:3]
+; GFX942-NEXT:    v_or_b32_e32 v8, v6, v8
+; GFX942-NEXT:    v_subrev_u32_e32 v6, 64, v4
+; GFX942-NEXT:    v_or_b32_e32 v5, v7, v9
+; GFX942-NEXT:    v_ashrrev_i64 v[6:7], v6, v[2:3]
+; GFX942-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v4
+; GFX942-NEXT:    v_cmp_eq_u32_e64 s[0:1], 0, v4
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_cndmask_b32_e32 v5, v7, v5, vcc
+; GFX942-NEXT:    v_cndmask_b32_e32 v6, v6, v8, vcc
+; GFX942-NEXT:    v_cndmask_b32_e64 v1, v5, v1, s[0:1]
+; GFX942-NEXT:    v_ashrrev_i64 v[4:5], v4, v[2:3]
+; GFX942-NEXT:    v_ashrrev_i32_e32 v3, 31, v3
+; GFX942-NEXT:    v_cndmask_b32_e64 v0, v6, v0, s[0:1]
+; GFX942-NEXT:    v_cndmask_b32_e32 v2, v3, v4, vcc
+; GFX942-NEXT:    v_cndmask_b32_e32 v3, v3, v5, vcc
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %shl = ashr i128 %lhs, %rhs
   ret i128 %shl
 }
@@ -89,6 +155,16 @@ define i128 @v_shl_i128_vk(i128 %lhs) {
 ; GCN-NEXT:    v_alignbit_b32 v1, v1, v0, 15
 ; GCN-NEXT:    v_lshlrev_b32_e32 v0, 17, v0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: v_shl_i128_vk:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_lshrrev_b32_e32 v4, 15, v1
+; GFX942-NEXT:    v_lshlrev_b64 v[2:3], 17, v[2:3]
+; GFX942-NEXT:    v_or_b32_e32 v2, v2, v4
+; GFX942-NEXT:    v_alignbit_b32 v1, v1, v0, 15
+; GFX942-NEXT:    v_lshlrev_b32_e32 v0, 17, v0
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %shl = shl i128 %lhs, 17
   ret i128 %shl
 }
@@ -102,6 +178,15 @@ define i128 @v_lshr_i128_vk(i128 %lhs) {
 ; GCN-NEXT:    v_mov_b32_e32 v2, 0
 ; GCN-NEXT:    v_mov_b32_e32 v3, 0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: v_lshr_i128_vk:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_alignbit_b32 v0, v3, v2, 1
+; GFX942-NEXT:    v_lshrrev_b32_e32 v1, 1, v3
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-NEXT:    v_mov_b32_e32 v3, 0
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %shl = lshr i128 %lhs, 65
   ret i128 %shl
 }
@@ -117,6 +202,16 @@ define i128 @v_ashr_i128_vk(i128 %lhs) {
 ; GCN-NEXT:    v_ashrrev_i32_e32 v2, 1, v3
 ; GCN-NEXT:    v_ashrrev_i32_e32 v3, 31, v3
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: v_ashr_i128_vk:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_lshrrev_b32_e32 v4, 1, v1
+; GFX942-NEXT:    v_lshlrev_b64 v[0:1], 31, v[2:3]
+; GFX942-NEXT:    v_or_b32_e32 v0, v4, v0
+; GFX942-NEXT:    v_ashrrev_i32_e32 v2, 1, v3
+; GFX942-NEXT:    v_ashrrev_i32_e32 v3, 31, v3
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %shl = ashr i128 %lhs, 33
   ret i128 %shl
 }
@@ -139,6 +234,25 @@ define i128 @v_shl_i128_kv(i128 %rhs) {
 ; GCN-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
 ; GCN-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: v_shl_i128_kv:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_sub_u32_e32 v1, 64, v0
+; GFX942-NEXT:    v_lshrrev_b64 v[2:3], v1, 17
+; GFX942-NEXT:    v_subrev_u32_e32 v1, 64, v0
+; GFX942-NEXT:    v_lshlrev_b64 v[4:5], v1, 17
+; GFX942-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v0
+; GFX942-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v0
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_cndmask_b32_e32 v1, v4, v2, vcc
+; GFX942-NEXT:    v_cndmask_b32_e64 v2, 0, v1, s[0:1]
+; GFX942-NEXT:    v_cndmask_b32_e32 v1, v5, v3, vcc
+; GFX942-NEXT:    v_cndmask_b32_e64 v3, 0, v1, s[0:1]
+; GFX942-NEXT:    v_lshlrev_b64 v[0:1], v0, 17
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
+; GFX942-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %shl = shl i128 17, %rhs
   ret i128 %shl
 }
@@ -158,6 +272,23 @@ define i128 @v_lshr_i128_kv(i128 %rhs) {
 ; GCN-NEXT:    v_mov_b32_e32 v2, 0
 ; GCN-NEXT:    v_mov_b32_e32 v3, 0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: v_lshr_i128_kv:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    s_mov_b64 s[0:1], 0x41
+; GFX942-NEXT:    v_lshrrev_b64 v[2:3], v0, s[0:1]
+; GFX942-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v0
+; GFX942-NEXT:    v_mov_b32_e32 v3, 0
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0x41
+; GFX942-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
+; GFX942-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %shl = lshr i128 65, %rhs
   ret i128 %shl
 }
@@ -175,6 +306,21 @@ define i128 @v_ashr_i128_kv(i128 %rhs) {
 ; GCN-NEXT:    v_mov_b32_e32 v2, 0
 ; GCN-NEXT:    v_mov_b32_e32 v3, 0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: v_ashr_i128_kv:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_lshrrev_b64 v[2:3], v0, 33
+; GFX942-NEXT:    v_cmp_gt_u32_e32 vcc, 64, v0
+; GFX942-NEXT:    v_mov_b32_e32 v3, 0
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_cndmask_b32_e32 v1, 0, v2, vcc
+; GFX942-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, 33, v1, vcc
+; GFX942-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %shl = ashr i128 33, %rhs
   ret i128 %shl
 }
@@ -210,6 +356,33 @@ define amdgpu_kernel void @s_shl_i128_ss(i128 %lhs, i128 %rhs) {
 ; GCN-NEXT:    v_mov_b32_e32 v3, s1
 ; GCN-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GCN-NEXT:    s_endpgm
+;
+; GFX942-LABEL: s_shl_i128_ss:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x0
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_sub_i32 s6, 64, s12
+; GFX942-NEXT:    s_sub_i32 s4, s12, 64
+; GFX942-NEXT:    s_lshl_b64 s[0:1], s[10:11], s12
+; GFX942-NEXT:    s_lshr_b64 s[6:7], s[8:9], s6
+; GFX942-NEXT:    s_lshl_b64 s[2:3], s[8:9], s12
+; GFX942-NEXT:    s_lshl_b64 s[4:5], s[8:9], s4
+; GFX942-NEXT:    s_or_b64 s[0:1], s[0:1], s[6:7]
+; GFX942-NEXT:    s_cmp_lt_u32 s12, 64
+; GFX942-NEXT:    s_cselect_b32 s0, s0, s4
+; GFX942-NEXT:    s_cselect_b32 s1, s1, s5
+; GFX942-NEXT:    s_cselect_b32 s3, s3, 0
+; GFX942-NEXT:    s_cselect_b32 s2, s2, 0
+; GFX942-NEXT:    s_cmp_eq_u32 s12, 0
+; GFX942-NEXT:    s_cselect_b32 s1, s11, s1
+; GFX942-NEXT:    s_cselect_b32 s0, s10, s0
+; GFX942-NEXT:    v_mov_b32_e32 v2, s2
+; GFX942-NEXT:    v_mov_b32_e32 v3, s3
+; GFX942-NEXT:    v_mov_b32_e32 v4, s0
+; GFX942-NEXT:    v_mov_b32_e32 v5, s1
+; GFX942-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off
+; GFX942-NEXT:    s_endpgm
   %shift = shl i128 %lhs, %rhs
   store i128 %shift, ptr addrspace(1) null
   ret void
@@ -246,6 +419,33 @@ define amdgpu_kernel void @s_lshr_i128_ss(i128 %lhs, i128 %rhs) {
 ; GCN-NEXT:    v_mov_b32_e32 v3, s5
 ; GCN-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GCN-NEXT:    s_endpgm
+;
+; GFX942-LABEL: s_lshr_i128_ss:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x0
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_sub_i32 s6, 64, s12
+; GFX942-NEXT:    s_sub_i32 s4, s12, 64
+; GFX942-NEXT:    s_lshr_b64 s[0:1], s[8:9], s12
+; GFX942-NEXT:    s_lshl_b64 s[6:7], s[10:11], s6
+; GFX942-NEXT:    s_lshr_b64 s[2:3], s[10:11], s12
+; GFX942-NEXT:    s_lshr_b64 s[4:5], s[10:11], s4
+; GFX942-NEXT:    s_or_b64 s[0:1], s[0:1], s[6:7]
+; GFX942-NEXT:    s_cmp_lt_u32 s12, 64
+; GFX942-NEXT:    s_cselect_b32 s0, s0, s4
+; GFX942-NEXT:    s_cselect_b32 s1, s1, s5
+; GFX942-NEXT:    s_cselect_b32 s3, s3, 0
+; GFX942-NEXT:    s_cselect_b32 s2, s2, 0
+; GFX942-NEXT:    s_cmp_eq_u32 s12, 0
+; GFX942-NEXT:    s_cselect_b32 s1, s9, s1
+; GFX942-NEXT:    s_cselect_b32 s0, s8, s0
+; GFX942-NEXT:    v_mov_b32_e32 v2, s0
+; GFX942-NEXT:    v_mov_b32_e32 v3, s1
+; GFX942-NEXT:    v_mov_b32_e32 v4, s2
+; GFX942-NEXT:    v_mov_b32_e32 v5, s3
+; GFX942-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off
+; GFX942-NEXT:    s_endpgm
   %shift = lshr i128 %lhs, %rhs
   store i128 %shift, ptr addrspace(1) null
   ret void
@@ -283,6 +483,34 @@ define amdgpu_kernel void @s_ashr_i128_ss(i128 %lhs, i128 %rhs) {
 ; GCN-NEXT:    v_mov_b32_e32 v3, s3
 ; GCN-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GCN-NEXT:    s_endpgm
+;
+; GFX942-LABEL: s_ashr_i128_ss:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x0
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_sub_i32 s2, 64, s12
+; GFX942-NEXT:    s_lshr_b64 s[0:1], s[8:9], s12
+; GFX942-NEXT:    s_sub_i32 s4, s12, 64
+; GFX942-NEXT:    s_lshl_b64 s[2:3], s[10:11], s2
+; GFX942-NEXT:    s_ashr_i32 s6, s11, 31
+; GFX942-NEXT:    s_ashr_i64 s[4:5], s[10:11], s4
+; GFX942-NEXT:    s_or_b64 s[0:1], s[0:1], s[2:3]
+; GFX942-NEXT:    s_ashr_i64 s[2:3], s[10:11], s12
+; GFX942-NEXT:    s_cmp_lt_u32 s12, 64
+; GFX942-NEXT:    s_cselect_b32 s3, s3, s6
+; GFX942-NEXT:    s_cselect_b32 s2, s2, s6
+; GFX942-NEXT:    s_cselect_b32 s0, s0, s4
+; GFX942-NEXT:    s_cselect_b32 s1, s1, s5
+; GFX942-NEXT:    s_cmp_eq_u32 s12, 0
+; GFX942-NEXT:    s_cselect_b32 s1, s9, s1
+; GFX942-NEXT:    s_cselect_b32 s0, s8, s0
+; GFX942-NEXT:    v_mov_b32_e32 v2, s0
+; GFX942-NEXT:    v_mov_b32_e32 v3, s1
+; GFX942-NEXT:    v_mov_b32_e32 v4, s2
+; GFX942-NEXT:    v_mov_b32_e32 v5, s3
+; GFX942-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off
+; GFX942-NEXT:    s_endpgm
   %shift = ashr i128 %lhs, %rhs
   store i128 %shift, ptr addrspace(1) null
   ret void
@@ -333,6 +561,51 @@ define <2 x i128> @v_shl_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GCN-NEXT:    v_cndmask_b32_e64 v4, 0, v4, s[4:5]
 ; GCN-NEXT:    v_cndmask_b32_e64 v5, 0, v5, s[4:5]
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: v_shl_v2i128_vv:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_sub_u32_e32 v16, 64, v8
+; GFX942-NEXT:    v_lshrrev_b64 v[16:17], v16, v[0:1]
+; GFX942-NEXT:    v_lshlrev_b64 v[18:19], v8, v[2:3]
+; GFX942-NEXT:    v_or_b32_e32 v18, v18, v16
+; GFX942-NEXT:    v_cmp_gt_u64_e32 vcc, 64, v[8:9]
+; GFX942-NEXT:    v_cmp_eq_u64_e64 s[0:1], 0, v[10:11]
+; GFX942-NEXT:    v_subrev_u32_e32 v16, 64, v8
+; GFX942-NEXT:    v_or_b32_e32 v19, v19, v17
+; GFX942-NEXT:    v_lshlrev_b64 v[16:17], v16, v[0:1]
+; GFX942-NEXT:    s_and_b64 vcc, s[0:1], vcc
+; GFX942-NEXT:    v_or_b32_e32 v11, v9, v11
+; GFX942-NEXT:    v_or_b32_e32 v10, v8, v10
+; GFX942-NEXT:    v_cmp_eq_u64_e64 s[0:1], 0, v[10:11]
+; GFX942-NEXT:    v_cndmask_b32_e32 v9, v17, v19, vcc
+; GFX942-NEXT:    v_cndmask_b32_e32 v16, v16, v18, vcc
+; GFX942-NEXT:    v_cndmask_b32_e64 v3, v9, v3, s[0:1]
+; GFX942-NEXT:    v_sub_u32_e32 v9, 64, v12
+; GFX942-NEXT:    v_cndmask_b32_e64 v2, v16, v2, s[0:1]
+; GFX942-NEXT:    v_lshrrev_b64 v[10:11], v9, v[4:5]
+; GFX942-NEXT:    v_lshlrev_b64 v[16:17], v12, v[6:7]
+; GFX942-NEXT:    v_or_b32_e32 v16, v16, v10
+; GFX942-NEXT:    v_cmp_gt_u64_e64 s[0:1], 64, v[12:13]
+; GFX942-NEXT:    v_cmp_eq_u64_e64 s[2:3], 0, v[14:15]
+; GFX942-NEXT:    v_subrev_u32_e32 v10, 64, v12
+; GFX942-NEXT:    v_or_b32_e32 v9, v17, v11
+; GFX942-NEXT:    v_lshlrev_b64 v[10:11], v10, v[4:5]
+; GFX942-NEXT:    s_and_b64 s[0:1], s[2:3], s[0:1]
+; GFX942-NEXT:    v_or_b32_e32 v15, v13, v15
+; GFX942-NEXT:    v_or_b32_e32 v14, v12, v14
+; GFX942-NEXT:    v_cndmask_b32_e64 v10, v10, v16, s[0:1]
+; GFX942-NEXT:    v_cmp_eq_u64_e64 s[2:3], 0, v[14:15]
+; GFX942-NEXT:    v_cndmask_b32_e64 v9, v11, v9, s[0:1]
+; GFX942-NEXT:    v_lshlrev_b64 v[0:1], v8, v[0:1]
+; GFX942-NEXT:    v_lshlrev_b64 v[4:5], v12, v[4:5]
+; GFX942-NEXT:    v_cndmask_b32_e64 v6, v10, v6, s[2:3]
+; GFX942-NEXT:    v_cndmask_b32_e64 v7, v9, v7, s[2:3]
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc
+; GFX942-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX942-NEXT:    v_cndmask_b32_e64 v4, 0, v4, s[0:1]
+; GFX942-NEXT:    v_cndmask_b32_e64 v5, 0, v5, s[0:1]
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %shl = shl <2 x i128> %lhs, %rhs
   ret <2 x i128> %shl
 }
@@ -382,6 +655,51 @@ define <2 x i128> @v_lshr_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GCN-NEXT:    v_cndmask_b32_e64 v6, 0, v6, s[4:5]
 ; GCN-NEXT:    v_cndmask_b32_e64 v7, 0, v7, s[4:5]
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: v_lshr_v2i128_vv:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_sub_u32_e32 v16, 64, v8
+; GFX942-NEXT:    v_lshlrev_b64 v[16:17], v16, v[2:3]
+; GFX942-NEXT:    v_lshrrev_b64 v[18:19], v8, v[0:1]
+; GFX942-NEXT:    v_or_b32_e32 v18, v18, v16
+; GFX942-NEXT:    v_cmp_gt_u64_e32 vcc, 64, v[8:9]
+; GFX942-NEXT:    v_cmp_eq_u64_e64 s[0:1], 0, v[10:11]
+; GFX942-NEXT:    v_subrev_u32_e32 v16, 64, v8
+; GFX942-NEXT:    v_or_b32_e32 v19, v19, v17
+; GFX942-NEXT:    v_lshrrev_b64 v[16:17], v16, v[2:3]
+; GFX942-NEXT:    s_and_b64 vcc, s[0:1], vcc
+; GFX942-NEXT:    v_or_b32_e32 v11, v9, v11
+; GFX942-NEXT:    v_or_b32_e32 v10, v8, v10
+; GFX942-NEXT:    v_cmp_eq_u64_e64 s[0:1], 0, v[10:11]
+; GFX942-NEXT:    v_cndmask_b32_e32 v9, v17, v19, vcc
+; GFX942-NEXT:    v_cndmask_b32_e32 v16, v16, v18, vcc
+; GFX942-NEXT:    v_cndmask_b32_e64 v1, v9, v1, s[0:1]
+; GFX942-NEXT:    v_sub_u32_e32 v9, 64, v12
+; GFX942-NEXT:    v_cndmask_b32_e64 v0, v16, v0, s[0:1]
+; GFX942-NEXT:    v_lshlrev_b64 v[10:11], v9, v[6:7]
+; GFX942-NEXT:    v_lshrrev_b64 v[16:17], v12, v[4:5]
+; GFX942-NEXT:    v_or_b32_e32 v16, v16, v10
+; GFX942-NEXT:    v_cmp_gt_u64_e64 s[0:1], 64, v[12:13]
+; GFX942-NEXT:    v_cmp_eq_u64_e64 s[2:3], 0, v[14:15]
+; GFX942-NEXT:    v_subrev_u32_e32 v10, 64, v12
+; GFX942-NEXT:    v_or_b32_e32 v9, v17, v11
+; GFX942-NEXT:    v_lshrrev_b64 v[10:11], v10, v[6:7]
+; GFX942-NEXT:    s_and_b64 s[0:1], s[2:3], s[0:1]
+; GFX942-NEXT:    v_or_b32_e32 v15, v13, v15
+; GFX942-NEXT:    v_or_b32_e32 v14, v12, v14
+; GFX942-NEXT:    v_cndmask_b32_e64 v10, v10, v16, s[0:1]
+; GFX942-NEXT:    v_cmp_eq_u64_e64 s[2:3], 0, v[14:15]
+; GFX942-NEXT:    v_cndmask_b32_e64 v9, v11, v9, s[0:1]
+; GFX942-NEXT:    v_lshrrev_b64 v[2:3], v8, v[2:3]
+; GFX942-NEXT:    v_lshrrev_b64 v[6:7], v12, v[6:7]
+; GFX942-NEXT:    v_cndmask_b32_e64 v4, v10, v4, s[2:3]
+; GFX942-NEXT:    v_cndmask_b32_e64 v5, v9, v5, s[2:3]
+; GFX942-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
+; GFX942-NEXT:    v_cndmask_b32_e32 v3, 0, v3, vcc
+; GFX942-NEXT:    v_cndmask_b32_e64 v6, 0, v6, s[0:1]
+; GFX942-NEXT:    v_cndmask_b32_e64 v7, 0, v7, s[0:1]
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %shl = lshr <2 x i128> %lhs, %rhs
   ret <2 x i128> %shl
 }
@@ -433,6 +751,53 @@ define <2 x i128> @v_ashr_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GCN-NEXT:    v_cndmask_b32_e64 v6, v7, v8, s[4:5]
 ; GCN-NEXT:    v_cndmask_b32_e64 v7, v7, v9, s[4:5]
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: v_ashr_v2i128_vv:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_sub_u32_e32 v16, 64, v8
+; GFX942-NEXT:    v_lshlrev_b64 v[16:17], v16, v[2:3]
+; GFX942-NEXT:    v_lshrrev_b64 v[18:19], v8, v[0:1]
+; GFX942-NEXT:    v_or_b32_e32 v18, v18, v16
+; GFX942-NEXT:    v_cmp_gt_u64_e32 vcc, 64, v[8:9]
+; GFX942-NEXT:    v_cmp_eq_u64_e64 s[0:1], 0, v[10:11]
+; GFX942-NEXT:    v_subrev_u32_e32 v16, 64, v8
+; GFX942-NEXT:    v_or_b32_e32 v19, v19, v17
+; GFX942-NEXT:    v_ashrrev_i64 v[16:17], v16, v[2:3]
+; GFX942-NEXT:    s_and_b64 vcc, s[0:1], vcc
+; GFX942-NEXT:    v_or_b32_e32 v11, v9, v11
+; GFX942-NEXT:    v_or_b32_e32 v10, v8, v10
+; GFX942-NEXT:    v_cmp_eq_u64_e64 s[0:1], 0, v[10:11]
+; GFX942-NEXT:    v_cndmask_b32_e32 v9, v17, v19, vcc
+; GFX942-NEXT:    v_cndmask_b32_e32 v16, v16, v18, vcc
+; GFX942-NEXT:    v_cndmask_b32_e64 v1, v9, v1, s[0:1]
+; GFX942-NEXT:    v_sub_u32_e32 v9, 64, v12
+; GFX942-NEXT:    v_cndmask_b32_e64 v0, v16, v0, s[0:1]
+; GFX942-NEXT:    v_lshlrev_b64 v[10:11], v9, v[6:7]
+; GFX942-NEXT:    v_lshrrev_b64 v[16:17], v12, v[4:5]
+; GFX942-NEXT:    v_or_b32_e32 v16, v16, v10
+; GFX942-NEXT:    v_cmp_gt_u64_e64 s[0:1], 64, v[12:13]
+; GFX942-NEXT:    v_cmp_eq_u64_e64 s[2:3], 0, v[14:15]
+; GFX942-NEXT:    v_subrev_u32_e32 v10, 64, v12
+; GFX942-NEXT:    v_or_b32_e32 v9, v17, v11
+; GFX942-NEXT:    v_ashrrev_i64 v[10:11], v10, v[6:7]
+; GFX942-NEXT:    s_and_b64 s[0:1], s[2:3], s[0:1]
+; GFX942-NEXT:    v_or_b32_e32 v15, v13, v15
+; GFX942-NEXT:    v_or_b32_e32 v14, v12, v14
+; GFX942-NEXT:    v_cmp_eq_u64_e64 s[2:3], 0, v[14:15]
+; GFX942-NEXT:    v_cndmask_b32_e64 v9, v11, v9, s[0:1]
+; GFX942-NEXT:    v_cndmask_b32_e64 v10, v10, v16, s[0:1]
+; GFX942-NEXT:    v_cndmask_b32_e64 v5, v9, v5, s[2:3]
+; GFX942-NEXT:    v_ashrrev_i32_e32 v9, 31, v3
+; GFX942-NEXT:    v_ashrrev_i64 v[2:3], v8, v[2:3]
+; GFX942-NEXT:    v_ashrrev_i32_e32 v8, 31, v7
+; GFX942-NEXT:    v_ashrrev_i64 v[6:7], v12, v[6:7]
+; GFX942-NEXT:    v_cndmask_b32_e64 v4, v10, v4, s[2:3]
+; GFX942-NEXT:    v_cndmask_b32_e32 v2, v9, v2, vcc
+; GFX942-NEXT:    v_cndmask_b32_e32 v3, v9, v3, vcc
+; GFX942-NEXT:    v_cndmask_b32_e64 v6, v8, v6, s[0:1]
+; GFX942-NEXT:    v_cndmask_b32_e64 v7, v8, v7, s[0:1]
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %shl = ashr <2 x i128> %lhs, %rhs
   ret <2 x i128> %shl
 }
@@ -507,6 +872,67 @@ define amdgpu_kernel void @s_shl_v2i128ss(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GCN-NEXT:    v_mov_b32_e32 v3, s9
 ; GCN-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GCN-NEXT:    s_endpgm
+;
+; GFX942-LABEL: s_shl_v2i128ss:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x0
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], 16
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_cmp_eq_u64 s[18:19], 0
+; GFX942-NEXT:    v_cmp_lt_u64_e64 s[0:1], s[16:17], 64
+; GFX942-NEXT:    s_cselect_b64 s[2:3], -1, 0
+; GFX942-NEXT:    s_sub_i32 s4, s16, 64
+; GFX942-NEXT:    s_sub_i32 s5, 64, s16
+; GFX942-NEXT:    s_and_b64 s[0:1], s[2:3], s[0:1]
+; GFX942-NEXT:    s_lshl_b64 s[2:3], s[8:9], s4
+; GFX942-NEXT:    s_lshr_b64 s[4:5], s[8:9], s5
+; GFX942-NEXT:    s_lshl_b64 s[6:7], s[10:11], s16
+; GFX942-NEXT:    s_or_b64 s[4:5], s[6:7], s[4:5]
+; GFX942-NEXT:    s_and_b64 s[6:7], s[0:1], exec
+; GFX942-NEXT:    s_cselect_b32 s4, s4, s2
+; GFX942-NEXT:    s_cselect_b32 s5, s5, s3
+; GFX942-NEXT:    s_or_b64 s[2:3], s[16:17], s[18:19]
+; GFX942-NEXT:    s_cmp_eq_u64 s[2:3], 0
+; GFX942-NEXT:    s_cselect_b32 s17, s11, s5
+; GFX942-NEXT:    s_cselect_b32 s18, s10, s4
+; GFX942-NEXT:    s_cmp_eq_u64 s[22:23], 0
+; GFX942-NEXT:    v_cmp_lt_u64_e64 s[2:3], s[20:21], 64
+; GFX942-NEXT:    s_cselect_b64 s[4:5], -1, 0
+; GFX942-NEXT:    s_sub_i32 s6, 64, s20
+; GFX942-NEXT:    s_and_b64 s[2:3], s[4:5], s[2:3]
+; GFX942-NEXT:    s_sub_i32 s4, s20, 64
+; GFX942-NEXT:    s_lshr_b64 s[6:7], s[12:13], s6
+; GFX942-NEXT:    s_lshl_b64 s[10:11], s[14:15], s20
+; GFX942-NEXT:    s_lshl_b64 s[4:5], s[12:13], s4
+; GFX942-NEXT:    s_or_b64 s[6:7], s[10:11], s[6:7]
+; GFX942-NEXT:    s_and_b64 s[10:11], s[2:3], exec
+; GFX942-NEXT:    s_cselect_b32 s6, s6, s4
+; GFX942-NEXT:    s_cselect_b32 s7, s7, s5
+; GFX942-NEXT:    s_or_b64 s[4:5], s[20:21], s[22:23]
+; GFX942-NEXT:    s_cmp_eq_u64 s[4:5], 0
+; GFX942-NEXT:    s_cselect_b32 s7, s15, s7
+; GFX942-NEXT:    s_cselect_b32 s6, s14, s6
+; GFX942-NEXT:    s_lshl_b64 s[4:5], s[8:9], s16
+; GFX942-NEXT:    s_and_b64 s[0:1], s[0:1], exec
+; GFX942-NEXT:    s_cselect_b32 s5, s5, 0
+; GFX942-NEXT:    s_cselect_b32 s4, s4, 0
+; GFX942-NEXT:    s_lshl_b64 s[0:1], s[12:13], s20
+; GFX942-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GFX942-NEXT:    s_cselect_b32 s1, s1, 0
+; GFX942-NEXT:    s_cselect_b32 s0, s0, 0
+; GFX942-NEXT:    v_mov_b32_e32 v2, s0
+; GFX942-NEXT:    v_mov_b32_e32 v3, s1
+; GFX942-NEXT:    v_mov_b32_e32 v4, s6
+; GFX942-NEXT:    v_mov_b32_e32 v5, s7
+; GFX942-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], 0
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_mov_b32_e32 v2, s4
+; GFX942-NEXT:    v_mov_b32_e32 v3, s5
+; GFX942-NEXT:    v_mov_b32_e32 v4, s18
+; GFX942-NEXT:    v_mov_b32_e32 v5, s17
+; GFX942-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off
+; GFX942-NEXT:    s_endpgm
   %shift = shl <2 x i128> %lhs, %rhs
   store <2 x i128> %shift, ptr addrspace(1) null
   ret void
@@ -582,6 +1008,67 @@ define amdgpu_kernel void @s_lshr_v2i128_ss(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GCN-NEXT:    v_mov_b32_e32 v3, s4
 ; GCN-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GCN-NEXT:    s_endpgm
+;
+; GFX942-LABEL: s_lshr_v2i128_ss:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x0
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], 16
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_cmp_eq_u64 s[18:19], 0
+; GFX942-NEXT:    v_cmp_lt_u64_e64 s[0:1], s[16:17], 64
+; GFX942-NEXT:    s_cselect_b64 s[2:3], -1, 0
+; GFX942-NEXT:    s_sub_i32 s4, s16, 64
+; GFX942-NEXT:    s_sub_i32 s5, 64, s16
+; GFX942-NEXT:    s_and_b64 s[0:1], s[2:3], s[0:1]
+; GFX942-NEXT:    s_lshr_b64 s[2:3], s[10:11], s4
+; GFX942-NEXT:    s_lshl_b64 s[4:5], s[10:11], s5
+; GFX942-NEXT:    s_lshr_b64 s[6:7], s[8:9], s16
+; GFX942-NEXT:    s_or_b64 s[4:5], s[6:7], s[4:5]
+; GFX942-NEXT:    s_and_b64 s[6:7], s[0:1], exec
+; GFX942-NEXT:    s_cselect_b32 s4, s4, s2
+; GFX942-NEXT:    s_cselect_b32 s5, s5, s3
+; GFX942-NEXT:    s_or_b64 s[2:3], s[16:17], s[18:19]
+; GFX942-NEXT:    s_cmp_eq_u64 s[2:3], 0
+; GFX942-NEXT:    s_cselect_b32 s17, s9, s5
+; GFX942-NEXT:    s_cselect_b32 s18, s8, s4
+; GFX942-NEXT:    s_cmp_eq_u64 s[22:23], 0
+; GFX942-NEXT:    v_cmp_lt_u64_e64 s[2:3], s[20:21], 64
+; GFX942-NEXT:    s_cselect_b64 s[4:5], -1, 0
+; GFX942-NEXT:    s_sub_i32 s6, 64, s20
+; GFX942-NEXT:    s_and_b64 s[2:3], s[4:5], s[2:3]
+; GFX942-NEXT:    s_sub_i32 s4, s20, 64
+; GFX942-NEXT:    s_lshl_b64 s[6:7], s[14:15], s6
+; GFX942-NEXT:    s_lshr_b64 s[8:9], s[12:13], s20
+; GFX942-NEXT:    s_lshr_b64 s[4:5], s[14:15], s4
+; GFX942-NEXT:    s_or_b64 s[6:7], s[8:9], s[6:7]
+; GFX942-NEXT:    s_and_b64 s[8:9], s[2:3], exec
+; GFX942-NEXT:    s_cselect_b32 s6, s6, s4
+; GFX942-NEXT:    s_cselect_b32 s7, s7, s5
+; GFX942-NEXT:    s_or_b64 s[4:5], s[20:21], s[22:23]
+; GFX942-NEXT:    s_cmp_eq_u64 s[4:5], 0
+; GFX942-NEXT:    s_cselect_b32 s7, s13, s7
+; GFX942-NEXT:    s_cselect_b32 s6, s12, s6
+; GFX942-NEXT:    s_lshr_b64 s[4:5], s[10:11], s16
+; GFX942-NEXT:    s_and_b64 s[0:1], s[0:1], exec
+; GFX942-NEXT:    s_cselect_b32 s5, s5, 0
+; GFX942-NEXT:    s_cselect_b32 s4, s4, 0
+; GFX942-NEXT:    s_lshr_b64 s[0:1], s[14:15], s20
+; GFX942-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GFX942-NEXT:    s_cselect_b32 s1, s1, 0
+; GFX942-NEXT:    s_cselect_b32 s0, s0, 0
+; GFX942-NEXT:    v_mov_b32_e32 v2, s6
+; GFX942-NEXT:    v_mov_b32_e32 v3, s7
+; GFX942-NEXT:    v_mov_b32_e32 v4, s0
+; GFX942-NEXT:    v_mov_b32_e32 v5, s1
+; GFX942-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], 0
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_mov_b32_e32 v2, s18
+; GFX942-NEXT:    v_mov_b32_e32 v3, s17
+; GFX942-NEXT:    v_mov_b32_e32 v4, s4
+; GFX942-NEXT:    v_mov_b32_e32 v5, s5
+; GFX942-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off
+; GFX942-NEXT:    s_endpgm
   %shift = lshr <2 x i128> %lhs, %rhs
   store <2 x i128> %shift, ptr addrspace(1) null
   ret void
@@ -659,6 +1146,69 @@ define amdgpu_kernel void @s_ashr_v2i128_ss(<2 x i128> %lhs, <2 x i128> %rhs) {
 ; GCN-NEXT:    v_mov_b32_e32 v3, s4
 ; GCN-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GCN-NEXT:    s_endpgm
+;
+; GFX942-LABEL: s_ashr_v2i128_ss:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0x0
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], 16
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_cmp_eq_u64 s[18:19], 0
+; GFX942-NEXT:    v_cmp_lt_u64_e64 s[0:1], s[16:17], 64
+; GFX942-NEXT:    s_cselect_b64 s[2:3], -1, 0
+; GFX942-NEXT:    s_sub_i32 s4, s16, 64
+; GFX942-NEXT:    s_sub_i32 s5, 64, s16
+; GFX942-NEXT:    s_and_b64 s[0:1], s[2:3], s[0:1]
+; GFX942-NEXT:    s_ashr_i64 s[2:3], s[10:11], s4
+; GFX942-NEXT:    s_lshl_b64 s[4:5], s[10:11], s5
+; GFX942-NEXT:    s_lshr_b64 s[6:7], s[8:9], s16
+; GFX942-NEXT:    s_or_b64 s[4:5], s[6:7], s[4:5]
+; GFX942-NEXT:    s_and_b64 s[6:7], s[0:1], exec
+; GFX942-NEXT:    s_cselect_b32 s4, s4, s2
+; GFX942-NEXT:    s_cselect_b32 s5, s5, s3
+; GFX942-NEXT:    s_or_b64 s[2:3], s[16:17], s[18:19]
+; GFX942-NEXT:    s_cmp_eq_u64 s[2:3], 0
+; GFX942-NEXT:    s_cselect_b32 s17, s9, s5
+; GFX942-NEXT:    s_cselect_b32 s18, s8, s4
+; GFX942-NEXT:    s_cmp_eq_u64 s[22:23], 0
+; GFX942-NEXT:    v_cmp_lt_u64_e64 s[2:3], s[20:21], 64
+; GFX942-NEXT:    s_cselect_b64 s[4:5], -1, 0
+; GFX942-NEXT:    s_sub_i32 s6, 64, s20
+; GFX942-NEXT:    s_and_b64 s[2:3], s[4:5], s[2:3]
+; GFX942-NEXT:    s_sub_i32 s4, s20, 64
+; GFX942-NEXT:    s_lshl_b64 s[6:7], s[14:15], s6
+; GFX942-NEXT:    s_lshr_b64 s[8:9], s[12:13], s20
+; GFX942-NEXT:    s_ashr_i64 s[4:5], s[14:15], s4
+; GFX942-NEXT:    s_or_b64 s[6:7], s[8:9], s[6:7]
+; GFX942-NEXT:    s_and_b64 s[8:9], s[2:3], exec
+; GFX942-NEXT:    s_cselect_b32 s6, s6, s4
+; GFX942-NEXT:    s_cselect_b32 s7, s7, s5
+; GFX942-NEXT:    s_or_b64 s[4:5], s[20:21], s[22:23]
+; GFX942-NEXT:    s_cmp_eq_u64 s[4:5], 0
+; GFX942-NEXT:    s_cselect_b32 s7, s13, s7
+; GFX942-NEXT:    s_cselect_b32 s6, s12, s6
+; GFX942-NEXT:    s_ashr_i32 s8, s11, 31
+; GFX942-NEXT:    s_ashr_i64 s[4:5], s[10:11], s16
+; GFX942-NEXT:    s_and_b64 s[0:1], s[0:1], exec
+; GFX942-NEXT:    s_cselect_b32 s5, s5, s8
+; GFX942-NEXT:    s_cselect_b32 s4, s4, s8
+; GFX942-NEXT:    s_ashr_i32 s8, s15, 31
+; GFX942-NEXT:    s_ashr_i64 s[0:1], s[14:15], s20
+; GFX942-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GFX942-NEXT:    s_cselect_b32 s1, s1, s8
+; GFX942-NEXT:    s_cselect_b32 s0, s0, s8
+; GFX942-NEXT:    v_mov_b32_e32 v2, s6
+; GFX942-NEXT:    v_mov_b32_e32 v3, s7
+; GFX942-NEXT:    v_mov_b32_e32 v4, s0
+; GFX942-NEXT:    v_mov_b32_e32 v5, s1
+; GFX942-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], 0
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_mov_b32_e32 v2, s18
+; GFX942-NEXT:    v_mov_b32_e32 v3, s17
+; GFX942-NEXT:    v_mov_b32_e32 v4, s4
+; GFX942-NEXT:    v_mov_b32_e32 v5, s5
+; GFX942-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off
+; GFX942-NEXT:    s_endpgm
   %shift = ashr <2 x i128> %lhs, %rhs
   store <2 x i128> %shift, ptr addrspace(1) null
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/si-optimize-vgpr-live-range-dbg-instr.ll b/llvm/test/CodeGen/AMDGPU/si-optimize-vgpr-live-range-dbg-instr.ll
index 761ff7786b98e..14a4a29f7f516 100644
--- a/llvm/test/CodeGen/AMDGPU/si-optimize-vgpr-live-range-dbg-instr.ll
+++ b/llvm/test/CodeGen/AMDGPU/si-optimize-vgpr-live-range-dbg-instr.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 < %s | FileCheck -check-prefix=GFX942 %s
 
 declare void @llvm.dbg.value(metadata, metadata, metadata) #0
 
@@ -41,6 +42,43 @@ define void @__omp_offloading_35_36570d3__ZN6openmc31process_advance_particle_ev
 ; GCN-NEXT:    s_or_b64 exec, exec, s[4:5]
 ; GCN-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: __omp_offloading_35_36570d3__ZN6openmc31process_advance_particle_eventsEv_l252_debug___omp_outlined_debug___omp_outlined:
+; GFX942:       .Lfunc_begin0:
+; GFX942-NEXT:    .cfi_sections .debug_frame
+; GFX942-NEXT:    .cfi_startproc
+; GFX942-NEXT:  ; %bb.0: ; %bb
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], 0
+; GFX942-NEXT:    global_load_dwordx2 v[2:3], v[2:3], off
+; GFX942-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX942-NEXT:    v_cmp_ne_u32_e32 vcc, 1, v0
+; GFX942-NEXT:    s_and_saveexec_b64 s[0:1], vcc
+; GFX942-NEXT:    s_xor_b64 s[0:1], exec, s[0:1]
+; GFX942-NEXT:    s_cbranch_execnz .LBB0_3
+; GFX942-NEXT:  ; %bb.1: ; %Flow
+; GFX942-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX942-NEXT:    s_cbranch_execnz .LBB0_4
+; GFX942-NEXT:  .LBB0_2: ; %bb3
+; GFX942-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX942-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+; GFX942-NEXT:  .LBB0_3: ; %bb2
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    v_mov_b32_e32 v1, v0
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX942-NEXT:    ; implicit-def: $vgpr2_vgpr3
+; GFX942-NEXT:    s_andn2_saveexec_b64 s[0:1], s[0:1]
+; GFX942-NEXT:    s_cbranch_execz .LBB0_2
+; GFX942-NEXT:  .LBB0_4: ; %bb1
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    v_mov_b32_e32 v1, v0
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
+; GFX942-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX942-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
 bb:
   %i = load ptr, ptr addrspace(1) null, align 8
   br i1 %arg, label %bb1, label %bb2
diff --git a/llvm/test/CodeGen/AMDGPU/sibling-call.ll b/llvm/test/CodeGen/AMDGPU/sibling-call.ll
index 308d87ba79052..71677f32eafd3 100644
--- a/llvm/test/CodeGen/AMDGPU/sibling-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/sibling-call.ll
@@ -2,6 +2,7 @@
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -mattr=-flat-for-global -enable-ipra=0 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,FIJI %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -enable-ipra=0 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,HAWAII %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-flat-for-global -enable-ipra=0 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX9 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -mattr=-flat-for-global -enable-ipra=0 < %s | FileCheck -enable-var-scope -check-prefixes=GFX942 %s
 
 ; FIXME: Why is this commuted only sometimes?
 define fastcc i32 @i32_fastcc_i32_i32(i32 %arg0, i32 %arg1) #1 {
@@ -22,6 +23,12 @@ define fastcc i32 @i32_fastcc_i32_i32(i32 %arg0, i32 %arg1) #1 {
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX9-NEXT:    v_add_u32_e32 v0, v0, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: i32_fastcc_i32_i32:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_add_u32_e32 v0, v0, v1
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %add0 = add i32 %arg0, %arg1
   ret i32 %add0
 }
@@ -53,6 +60,15 @@ define fastcc i32 @i32_fastcc_i32_i32_stack_object(i32 %arg0, i32 %arg1) #1 {
 ; GFX9-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:20
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: i32_fastcc_i32_i32_stack_object:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b32_e32 v2, 9
+; GFX942-NEXT:    v_add_u32_e32 v0, v0, v1
+; GFX942-NEXT:    scratch_store_dword off, v2, s32 offset:20 sc0 sc1
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %alloca = alloca [16 x i32], align 4, addrspace(5)
   %gep = getelementptr inbounds [16 x i32], ptr addrspace(5) %alloca, i32 0, i32 5
   store volatile i32 9, ptr addrspace(5) %gep
@@ -70,6 +86,16 @@ define hidden fastcc i32 @sibling_call_i32_fastcc_i32_i32(i32 %a, i32 %b, i32 %c
 ; GCN-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[4:5]
+;
+; GFX942-LABEL: sibling_call_i32_fastcc_i32_i32:
+; GFX942:       ; %bb.0: ; %entry
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    s_getpc_b64 s[0:1]
+; GFX942-NEXT:    s_add_u32 s0, s0, i32_fastcc_i32_i32@gotpcrel32@lo+4
+; GFX942-NEXT:    s_addc_u32 s1, s1, i32_fastcc_i32_i32@gotpcrel32@hi+12
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_setpc_b64 s[0:1]
 entry:
   %ret = tail call fastcc i32 @i32_fastcc_i32_i32(i32 %a, i32 %b)
   ret i32 %ret
@@ -87,6 +113,18 @@ define fastcc i32 @sibling_call_i32_fastcc_i32_i32_stack_object(i32 %a, i32 %b,
 ; GCN-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:20
 ; GCN-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[4:5]
+;
+; GFX942-LABEL: sibling_call_i32_fastcc_i32_i32_stack_object:
+; GFX942:       ; %bb.0: ; %entry
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    s_getpc_b64 s[0:1]
+; GFX942-NEXT:    s_add_u32 s0, s0, i32_fastcc_i32_i32@gotpcrel32@lo+4
+; GFX942-NEXT:    s_addc_u32 s1, s1, i32_fastcc_i32_i32@gotpcrel32@hi+12
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX942-NEXT:    v_mov_b32_e32 v2, 9
+; GFX942-NEXT:    scratch_store_dword off, v2, s32 offset:20 sc0 sc1
+; GFX942-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    s_setpc_b64 s[0:1]
 entry:
   %alloca = alloca [16 x i32], align 4, addrspace(5)
   %gep = getelementptr inbounds [16 x i32], ptr addrspace(5) %alloca, i32 0, i32 5
@@ -107,6 +145,18 @@ define fastcc i32 @sibling_call_i32_fastcc_i32_i32_callee_stack_object(i32 %a, i
 ; GCN-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:20
 ; GCN-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[4:5]
+;
+; GFX942-LABEL: sibling_call_i32_fastcc_i32_i32_callee_stack_object:
+; GFX942:       ; %bb.0: ; %entry
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    s_getpc_b64 s[0:1]
+; GFX942-NEXT:    s_add_u32 s0, s0, i32_fastcc_i32_i32_stack_object@gotpcrel32@lo+4
+; GFX942-NEXT:    s_addc_u32 s1, s1, i32_fastcc_i32_i32_stack_object@gotpcrel32@hi+12
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX942-NEXT:    v_mov_b32_e32 v2, 9
+; GFX942-NEXT:    scratch_store_dword off, v2, s32 offset:20 sc0 sc1
+; GFX942-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    s_setpc_b64 s[0:1]
 entry:
   %alloca = alloca [16 x i32], align 4, addrspace(5)
   %gep = getelementptr inbounds [16 x i32], ptr addrspace(5) %alloca, i32 0, i32 5
@@ -125,6 +175,16 @@ define fastcc void @sibling_call_i32_fastcc_i32_i32_unused_result(i32 %a, i32 %b
 ; GCN-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[4:5]
+;
+; GFX942-LABEL: sibling_call_i32_fastcc_i32_i32_unused_result:
+; GFX942:       ; %bb.0: ; %entry
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    s_getpc_b64 s[0:1]
+; GFX942-NEXT:    s_add_u32 s0, s0, i32_fastcc_i32_i32@gotpcrel32@lo+4
+; GFX942-NEXT:    s_addc_u32 s1, s1, i32_fastcc_i32_i32@gotpcrel32@hi+12
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_setpc_b64 s[0:1]
 entry:
   %ret = tail call fastcc i32 @i32_fastcc_i32_i32(i32 %a, i32 %b)
   ret void
@@ -187,6 +247,20 @@ define amdgpu_kernel void @kernel_call_i32_fastcc_i32_i32_unused_result(i32 %a,
 ; GFX9-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX9-NEXT:    s_swappc_b64 s[30:31], s[6:7]
 ; GFX9-NEXT:    s_endpgm
+;
+; GFX942-LABEL: kernel_call_i32_fastcc_i32_i32_unused_result:
+; GFX942:       ; %bb.0: ; %entry
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX942-NEXT:    s_getpc_b64 s[2:3]
+; GFX942-NEXT:    s_add_u32 s2, s2, i32_fastcc_i32_i32@gotpcrel32@lo+4
+; GFX942-NEXT:    s_addc_u32 s3, s3, i32_fastcc_i32_i32@gotpcrel32@hi+12
+; GFX942-NEXT:    s_load_dwordx2 s[2:3], s[2:3], 0x0
+; GFX942-NEXT:    s_mov_b32 s32, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-NEXT:    v_mov_b32_e32 v1, s1
+; GFX942-NEXT:    s_swappc_b64 s[30:31], s[2:3]
+; GFX942-NEXT:    s_endpgm
 entry:
   %ret = tail call fastcc i32 @i32_fastcc_i32_i32(i32 %a, i32 %b)
   ret void
@@ -216,6 +290,14 @@ define hidden fastcc i32 @i32_fastcc_i32_byval_i32(i32 %arg0, ptr addrspace(5) b
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_add_u32_e32 v0, v0, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: i32_fastcc_i32_byval_i32:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    scratch_load_dword v1, off, s32
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    v_add_u32_e32 v0, v0, v1
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %arg1.load = load i32, ptr addrspace(5) %arg1, align 4
   %add0 = add i32 %arg0, %arg1.load
   ret i32 %add0
@@ -252,6 +334,36 @@ define fastcc i32 @sibling_call_i32_fastcc_i32_byval_i32_byval_parent(i32 %a, pt
 ; GCN-NEXT:    s_mov_b32 s33, s4
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: sibling_call_i32_fastcc_i32_byval_i32_byval_parent:
+; GFX942:       ; %bb.0: ; %entry
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    s_mov_b32 s0, s33
+; GFX942-NEXT:    s_mov_b32 s33, s32
+; GFX942-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX942-NEXT:    scratch_store_dword off, v40, s33 offset:4 ; 4-byte Folded Spill
+; GFX942-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX942-NEXT:    scratch_load_dword v1, off, s33
+; GFX942-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX942-NEXT:    s_add_i32 s32, s32, 16
+; GFX942-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX942-NEXT:    s_getpc_b64 s[0:1]
+; GFX942-NEXT:    s_add_u32 s0, s0, i32_fastcc_i32_byval_i32@rel32@lo+4
+; GFX942-NEXT:    s_addc_u32 s1, s1, i32_fastcc_i32_byval_i32@rel32@hi+12
+; GFX942-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    scratch_store_dword off, v1, s32
+; GFX942-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX942-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX942-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX942-NEXT:    s_mov_b32 s32, s33
+; GFX942-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX942-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX942-NEXT:    scratch_load_dword v40, off, s33 offset:4 ; 4-byte Folded Reload
+; GFX942-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX942-NEXT:    s_mov_b32 s33, s0
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %ret = tail call fastcc i32 @i32_fastcc_i32_byval_i32(i32 %a, ptr addrspace(5) byval(i32) %b.byval)
   ret i32 %ret
@@ -271,6 +383,18 @@ define fastcc i32 @sibling_call_i32_fastcc_i32_byval_i32(i32 %a, [32 x i32] %lar
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    buffer_store_dword v1, off, s[0:3], s32
 ; GCN-NEXT:    s_setpc_b64 s[4:5]
+;
+; GFX942-LABEL: sibling_call_i32_fastcc_i32_byval_i32:
+; GFX942:       ; %bb.0: ; %entry
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    s_mov_b32 s0, 16
+; GFX942-NEXT:    scratch_load_dword v1, off, s0
+; GFX942-NEXT:    s_getpc_b64 s[0:1]
+; GFX942-NEXT:    s_add_u32 s0, s0, i32_fastcc_i32_byval_i32@rel32@lo+4
+; GFX942-NEXT:    s_addc_u32 s1, s1, i32_fastcc_i32_byval_i32@rel32@hi+12
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    scratch_store_dword off, v1, s32
+; GFX942-NEXT:    s_setpc_b64 s[0:1]
 entry:
   %ret = tail call fastcc i32 @i32_fastcc_i32_byval_i32(i32 %a, ptr addrspace(5) byval(i32) inttoptr (i32 16 to ptr addrspace(5)))
   ret i32 %ret
@@ -310,6 +434,16 @@ define fastcc i32 @i32_fastcc_i32_i32_a32i32(i32 %arg0, i32 %arg1, [32 x i32] %l
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_add3_u32 v0, v0, v3, v2
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: i32_fastcc_i32_i32_a32i32:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    scratch_load_dword v2, off, s32 offset:8
+; GFX942-NEXT:    scratch_load_dword v3, off, s32 offset:4
+; GFX942-NEXT:    v_add_u32_e32 v0, v0, v1
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    v_add3_u32 v0, v0, v3, v2
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %val_firststack = extractvalue [32 x i32] %large, 30
   %val_laststack = extractvalue [32 x i32] %large, 31
   %add0 = add i32 %arg0, %arg1
@@ -338,6 +472,19 @@ define fastcc i32 @sibling_call_i32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x
 ; GCN-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:8
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[4:5]
+;
+; GFX942-LABEL: sibling_call_i32_fastcc_i32_i32_a32i32:
+; GFX942:       ; %bb.0: ; %entry
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    scratch_load_dwordx3 v[32:34], off, s32
+; GFX942-NEXT:    s_getpc_b64 s[0:1]
+; GFX942-NEXT:    s_add_u32 s0, s0, i32_fastcc_i32_i32_a32i32@gotpcrel32@lo+4
+; GFX942-NEXT:    s_addc_u32 s1, s1, i32_fastcc_i32_i32_a32i32@gotpcrel32@hi+12
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    scratch_store_dwordx3 off, v[32:34], s32
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_setpc_b64 s[0:1]
 entry:
   %ret = tail call fastcc i32 @i32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x i32] %c)
   ret i32 %ret
@@ -362,6 +509,21 @@ define fastcc i32 @sibling_call_i32_fastcc_i32_i32_a32i32_stack_object(i32 %a, i
 ; GCN-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:8
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[4:5]
+;
+; GFX942-LABEL: sibling_call_i32_fastcc_i32_i32_a32i32_stack_object:
+; GFX942:       ; %bb.0: ; %entry
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    scratch_load_dwordx3 v[32:34], off, s32
+; GFX942-NEXT:    s_getpc_b64 s[0:1]
+; GFX942-NEXT:    s_add_u32 s0, s0, i32_fastcc_i32_i32_a32i32@gotpcrel32@lo+4
+; GFX942-NEXT:    s_addc_u32 s1, s1, i32_fastcc_i32_i32_a32i32@gotpcrel32@hi+12
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX942-NEXT:    v_mov_b32_e32 v31, 9
+; GFX942-NEXT:    scratch_store_dword off, v31, s32 offset:32 sc0 sc1
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    scratch_store_dwordx3 off, v[32:34], s32
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_setpc_b64 s[0:1]
 entry:
   %alloca = alloca [16 x i32], align 4, addrspace(5)
   %gep = getelementptr inbounds [16 x i32], ptr addrspace(5) %alloca, i32 0, i32 5
@@ -435,6 +597,68 @@ define fastcc i32 @no_sibling_call_callee_more_stack_space(i32 %a, i32 %b) #1 {
 ; GCN-NEXT:    s_mov_b32 s33, s4
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: no_sibling_call_callee_more_stack_space:
+; GFX942:       ; %bb.0: ; %entry
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    s_mov_b32 s0, s33
+; GFX942-NEXT:    s_mov_b32 s33, s32
+; GFX942-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX942-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
+; GFX942-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX942-NEXT:    s_add_i32 s32, s32, 16
+; GFX942-NEXT:    v_writelane_b32 v40, s0, 2
+; GFX942-NEXT:    s_getpc_b64 s[0:1]
+; GFX942-NEXT:    s_add_u32 s0, s0, i32_fastcc_i32_i32_a32i32@gotpcrel32@lo+4
+; GFX942-NEXT:    s_addc_u32 s1, s1, i32_fastcc_i32_i32_a32i32@gotpcrel32@hi+12
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-NEXT:    v_mov_b32_e32 v3, v2
+; GFX942-NEXT:    v_mov_b32_e32 v4, v2
+; GFX942-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX942-NEXT:    scratch_store_dwordx3 off, v[2:4], s32
+; GFX942-NEXT:    v_mov_b32_e32 v5, 0
+; GFX942-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-NEXT:    v_mov_b32_e32 v3, 0
+; GFX942-NEXT:    v_mov_b32_e32 v4, 0
+; GFX942-NEXT:    v_mov_b32_e32 v7, 0
+; GFX942-NEXT:    v_mov_b32_e32 v8, 0
+; GFX942-NEXT:    v_mov_b32_e32 v9, 0
+; GFX942-NEXT:    v_mov_b32_e32 v10, 0
+; GFX942-NEXT:    v_mov_b32_e32 v11, 0
+; GFX942-NEXT:    v_mov_b32_e32 v12, 0
+; GFX942-NEXT:    v_mov_b32_e32 v13, 0
+; GFX942-NEXT:    v_mov_b32_e32 v14, 0
+; GFX942-NEXT:    v_mov_b32_e32 v15, 0
+; GFX942-NEXT:    v_mov_b32_e32 v16, 0
+; GFX942-NEXT:    v_mov_b32_e32 v17, 0
+; GFX942-NEXT:    v_mov_b32_e32 v18, 0
+; GFX942-NEXT:    v_mov_b32_e32 v19, 0
+; GFX942-NEXT:    v_mov_b32_e32 v20, 0
+; GFX942-NEXT:    v_mov_b32_e32 v21, 0
+; GFX942-NEXT:    v_mov_b32_e32 v22, 0
+; GFX942-NEXT:    v_mov_b32_e32 v23, 0
+; GFX942-NEXT:    v_mov_b32_e32 v24, 0
+; GFX942-NEXT:    v_mov_b32_e32 v25, 0
+; GFX942-NEXT:    v_mov_b32_e32 v26, 0
+; GFX942-NEXT:    v_mov_b32_e32 v27, 0
+; GFX942-NEXT:    v_mov_b32_e32 v28, 0
+; GFX942-NEXT:    v_mov_b32_e32 v29, 0
+; GFX942-NEXT:    v_mov_b32_e32 v30, 0
+; GFX942-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX942-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX942-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX942-NEXT:    s_mov_b32 s32, s33
+; GFX942-NEXT:    v_readlane_b32 s0, v40, 2
+; GFX942-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX942-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
+; GFX942-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX942-NEXT:    s_mov_b32 s33, s0
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %ret = tail call fastcc i32 @i32_fastcc_i32_i32_a32i32(i32 %a, i32 %b, [32 x i32] zeroinitializer)
   ret i32 %ret
@@ -481,6 +705,46 @@ define fastcc i32 @sibling_call_i32_fastcc_i32_i32_other_call(i32 %a, i32 %b, i3
 ; GCN-NEXT:    s_mov_b64 exec, s[8:9]
 ; GCN-NEXT:    s_mov_b32 s33, s6
 ; GCN-NEXT:    s_setpc_b64 s[4:5]
+;
+; GFX942-LABEL: sibling_call_i32_fastcc_i32_i32_other_call:
+; GFX942:       ; %bb.0: ; %entry
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    s_mov_b32 s0, s33
+; GFX942-NEXT:    s_mov_b32 s33, s32
+; GFX942-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX942-NEXT:    scratch_store_dword off, v42, s33 offset:8 ; 4-byte Folded Spill
+; GFX942-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX942-NEXT:    s_add_i32 s32, s32, 16
+; GFX942-NEXT:    v_writelane_b32 v42, s0, 2
+; GFX942-NEXT:    s_getpc_b64 s[0:1]
+; GFX942-NEXT:    s_add_u32 s0, s0, i32_fastcc_i32_i32@gotpcrel32@lo+4
+; GFX942-NEXT:    s_addc_u32 s1, s1, i32_fastcc_i32_i32@gotpcrel32@hi+12
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX942-NEXT:    v_writelane_b32 v42, s30, 0
+; GFX942-NEXT:    scratch_store_dword off, v40, s33 offset:4 ; 4-byte Folded Spill
+; GFX942-NEXT:    scratch_store_dword off, v41, s33 ; 4-byte Folded Spill
+; GFX942-NEXT:    v_writelane_b32 v42, s31, 1
+; GFX942-NEXT:    v_mov_b32_e32 v40, v1
+; GFX942-NEXT:    v_mov_b32_e32 v41, v0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX942-NEXT:    v_mov_b32_e32 v2, v0
+; GFX942-NEXT:    v_mov_b32_e32 v0, v41
+; GFX942-NEXT:    v_mov_b32_e32 v1, v40
+; GFX942-NEXT:    scratch_load_dword v41, off, s33 ; 4-byte Folded Reload
+; GFX942-NEXT:    scratch_load_dword v40, off, s33 offset:4 ; 4-byte Folded Reload
+; GFX942-NEXT:    s_getpc_b64 s[0:1]
+; GFX942-NEXT:    s_add_u32 s0, s0, sibling_call_i32_fastcc_i32_i32@rel32@lo+4
+; GFX942-NEXT:    s_addc_u32 s1, s1, sibling_call_i32_fastcc_i32_i32@rel32@hi+12
+; GFX942-NEXT:    v_readlane_b32 s31, v42, 1
+; GFX942-NEXT:    v_readlane_b32 s30, v42, 0
+; GFX942-NEXT:    s_mov_b32 s32, s33
+; GFX942-NEXT:    v_readlane_b32 s2, v42, 2
+; GFX942-NEXT:    s_or_saveexec_b64 s[4:5], -1
+; GFX942-NEXT:    scratch_load_dword v42, off, s33 offset:8 ; 4-byte Folded Reload
+; GFX942-NEXT:    s_mov_b64 exec, s[4:5]
+; GFX942-NEXT:    s_mov_b32 s33, s2
+; GFX942-NEXT:    s_setpc_b64 s[0:1]
 entry:
   %other.call = tail call fastcc i32 @i32_fastcc_i32_i32(i32 %a, i32 %b)
   %ret = tail call fastcc i32 @sibling_call_i32_fastcc_i32_i32(i32 %a, i32 %b, i32 %other.call)
@@ -508,6 +772,21 @@ define fastcc i32 @sibling_call_stack_objecti32_fastcc_i32_i32_a32i32(i32 %a, i3
 ; GCN-NEXT:    buffer_store_dword v33, off, s[0:3], s32 offset:8
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[4:5]
+;
+; GFX942-LABEL: sibling_call_stack_objecti32_fastcc_i32_i32_a32i32:
+; GFX942:       ; %bb.0: ; %entry
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    scratch_load_dwordx3 v[32:34], off, s32
+; GFX942-NEXT:    s_getpc_b64 s[0:1]
+; GFX942-NEXT:    s_add_u32 s0, s0, i32_fastcc_i32_i32_a32i32@gotpcrel32@lo+4
+; GFX942-NEXT:    s_addc_u32 s1, s1, i32_fastcc_i32_i32_a32i32@gotpcrel32@hi+12
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX942-NEXT:    v_mov_b32_e32 v31, 9
+; GFX942-NEXT:    scratch_store_dword off, v31, s32 offset:32 sc0 sc1
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    scratch_store_dwordx3 off, v[32:34], s32
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_setpc_b64 s[0:1]
 entry:
   %alloca = alloca [16 x i32], align 4, addrspace(5)
   %gep = getelementptr inbounds [16 x i32], ptr addrspace(5) %alloca, i32 0, i32 5
@@ -562,6 +841,52 @@ define fastcc i32 @sibling_call_stack_objecti32_fastcc_i32_i32_a32i32_larger_arg
 ; GCN-NEXT:    v_mov_b32_e32 v30, 0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[4:5]
+;
+; GFX942-LABEL: sibling_call_stack_objecti32_fastcc_i32_i32_a32i32_larger_arg_area:
+; GFX942:       ; %bb.0: ; %entry
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    s_getpc_b64 s[0:1]
+; GFX942-NEXT:    s_add_u32 s0, s0, i32_fastcc_i32_i32_a32i32@gotpcrel32@lo+4
+; GFX942-NEXT:    s_addc_u32 s1, s1, i32_fastcc_i32_i32_a32i32@gotpcrel32@hi+12
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX942-NEXT:    v_mov_b32_e32 v2, 9
+; GFX942-NEXT:    scratch_store_dword off, v2, s32 offset:48 sc0 sc1
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-NEXT:    v_mov_b32_e32 v3, v2
+; GFX942-NEXT:    v_mov_b32_e32 v4, v2
+; GFX942-NEXT:    scratch_store_dwordx3 off, v[2:4], s32
+; GFX942-NEXT:    v_mov_b32_e32 v5, 0
+; GFX942-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-NEXT:    v_mov_b32_e32 v3, 0
+; GFX942-NEXT:    v_mov_b32_e32 v4, 0
+; GFX942-NEXT:    v_mov_b32_e32 v7, 0
+; GFX942-NEXT:    v_mov_b32_e32 v8, 0
+; GFX942-NEXT:    v_mov_b32_e32 v9, 0
+; GFX942-NEXT:    v_mov_b32_e32 v10, 0
+; GFX942-NEXT:    v_mov_b32_e32 v11, 0
+; GFX942-NEXT:    v_mov_b32_e32 v12, 0
+; GFX942-NEXT:    v_mov_b32_e32 v13, 0
+; GFX942-NEXT:    v_mov_b32_e32 v14, 0
+; GFX942-NEXT:    v_mov_b32_e32 v15, 0
+; GFX942-NEXT:    v_mov_b32_e32 v16, 0
+; GFX942-NEXT:    v_mov_b32_e32 v17, 0
+; GFX942-NEXT:    v_mov_b32_e32 v18, 0
+; GFX942-NEXT:    v_mov_b32_e32 v19, 0
+; GFX942-NEXT:    v_mov_b32_e32 v20, 0
+; GFX942-NEXT:    v_mov_b32_e32 v21, 0
+; GFX942-NEXT:    v_mov_b32_e32 v22, 0
+; GFX942-NEXT:    v_mov_b32_e32 v23, 0
+; GFX942-NEXT:    v_mov_b32_e32 v24, 0
+; GFX942-NEXT:    v_mov_b32_e32 v25, 0
+; GFX942-NEXT:    v_mov_b32_e32 v26, 0
+; GFX942-NEXT:    v_mov_b32_e32 v27, 0
+; GFX942-NEXT:    v_mov_b32_e32 v28, 0
+; GFX942-NEXT:    v_mov_b32_e32 v29, 0
+; GFX942-NEXT:    v_mov_b32_e32 v30, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_setpc_b64 s[0:1]
 entry:
   %alloca = alloca [16 x i32], align 4, addrspace(5)
   %gep = getelementptr inbounds [16 x i32], ptr addrspace(5) %alloca, i32 0, i32 5
@@ -585,6 +910,18 @@ define hidden fastcc i32 @indirect_uniform_sibling_call_i32_fastcc_i32_i32(i32 %
 ; GCN-NEXT:    s_load_dwordx2 s[16:17], s[16:17], 0x0
 ; GCN-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[16:17]
+;
+; GFX942-LABEL: indirect_uniform_sibling_call_i32_fastcc_i32_i32:
+; GFX942:       ; %bb.0: ; %entry
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    s_getpc_b64 s[0:1]
+; GFX942-NEXT:    s_add_u32 s0, s0, func_ptr_gv@gotpcrel32@lo+4
+; GFX942-NEXT:    s_addc_u32 s1, s1, func_ptr_gv@gotpcrel32@hi+12
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[0:1], 0x0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_setpc_b64 s[0:1]
 entry:
   %func.ptr.load = load ptr, ptr addrspace(4) @func_ptr_gv
   %ret = tail call fastcc i32 %func.ptr.load(i32 %a, i32 %b)
@@ -866,6 +1203,99 @@ define hidden fastcc i32 @indirect_divergent_sibling_call_i32_fastcc_i32_i32(ptr
 ; GFX9-NEXT:    s_mov_b32 s33, s4
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: indirect_divergent_sibling_call_i32_fastcc_i32_i32:
+; GFX942:       ; %bb.0: ; %entry
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    s_mov_b32 s0, s33
+; GFX942-NEXT:    s_mov_b32 s33, s32
+; GFX942-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX942-NEXT:    scratch_store_dword off, v40, s33 ; 4-byte Folded Spill
+; GFX942-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX942-NEXT:    v_writelane_b32 v40, s0, 18
+; GFX942-NEXT:    v_writelane_b32 v40, s30, 0
+; GFX942-NEXT:    v_writelane_b32 v40, s31, 1
+; GFX942-NEXT:    v_writelane_b32 v40, s34, 2
+; GFX942-NEXT:    v_writelane_b32 v40, s35, 3
+; GFX942-NEXT:    v_writelane_b32 v40, s36, 4
+; GFX942-NEXT:    v_writelane_b32 v40, s37, 5
+; GFX942-NEXT:    v_writelane_b32 v40, s38, 6
+; GFX942-NEXT:    v_writelane_b32 v40, s39, 7
+; GFX942-NEXT:    v_writelane_b32 v40, s48, 8
+; GFX942-NEXT:    v_writelane_b32 v40, s49, 9
+; GFX942-NEXT:    v_writelane_b32 v40, s50, 10
+; GFX942-NEXT:    v_writelane_b32 v40, s51, 11
+; GFX942-NEXT:    v_writelane_b32 v40, s52, 12
+; GFX942-NEXT:    v_writelane_b32 v40, s53, 13
+; GFX942-NEXT:    v_writelane_b32 v40, s54, 14
+; GFX942-NEXT:    v_writelane_b32 v40, s55, 15
+; GFX942-NEXT:    v_writelane_b32 v40, s64, 16
+; GFX942-NEXT:    s_mov_b32 s50, s15
+; GFX942-NEXT:    s_mov_b32 s51, s14
+; GFX942-NEXT:    s_mov_b32 s52, s13
+; GFX942-NEXT:    s_mov_b32 s53, s12
+; GFX942-NEXT:    s_mov_b64 s[34:35], s[10:11]
+; GFX942-NEXT:    s_mov_b64 s[36:37], s[8:9]
+; GFX942-NEXT:    s_mov_b64 s[38:39], s[6:7]
+; GFX942-NEXT:    s_mov_b64 s[48:49], s[4:5]
+; GFX942-NEXT:    v_mov_b32_e32 v5, v2
+; GFX942-NEXT:    v_add_u32_e32 v2, v3, v4
+; GFX942-NEXT:    s_mov_b64 s[54:55], exec
+; GFX942-NEXT:    s_add_i32 s32, s32, 16
+; GFX942-NEXT:    v_writelane_b32 v40, s65, 17
+; GFX942-NEXT:  .LBB18_1: ; =>This Inner Loop Header: Depth=1
+; GFX942-NEXT:    v_readfirstlane_b32 s0, v0
+; GFX942-NEXT:    v_readfirstlane_b32 s1, v1
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cmp_eq_u64_e32 vcc, s[0:1], v[0:1]
+; GFX942-NEXT:    s_and_saveexec_b64 s[64:65], vcc
+; GFX942-NEXT:    s_mov_b64 s[4:5], s[48:49]
+; GFX942-NEXT:    s_mov_b64 s[6:7], s[38:39]
+; GFX942-NEXT:    s_mov_b64 s[8:9], s[36:37]
+; GFX942-NEXT:    s_mov_b64 s[10:11], s[34:35]
+; GFX942-NEXT:    s_mov_b32 s12, s53
+; GFX942-NEXT:    s_mov_b32 s13, s52
+; GFX942-NEXT:    s_mov_b32 s14, s51
+; GFX942-NEXT:    s_mov_b32 s15, s50
+; GFX942-NEXT:    v_mov_b32_e32 v0, v5
+; GFX942-NEXT:    v_mov_b32_e32 v1, v2
+; GFX942-NEXT:    s_swappc_b64 s[30:31], s[0:1]
+; GFX942-NEXT:    v_mov_b32_e32 v3, v0
+; GFX942-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; GFX942-NEXT:    ; implicit-def: $vgpr31
+; GFX942-NEXT:    ; implicit-def: $vgpr5
+; GFX942-NEXT:    ; implicit-def: $vgpr2
+; GFX942-NEXT:    s_xor_b64 exec, exec, s[64:65]
+; GFX942-NEXT:    s_cbranch_execnz .LBB18_1
+; GFX942-NEXT:  ; %bb.2:
+; GFX942-NEXT:    s_mov_b64 exec, s[54:55]
+; GFX942-NEXT:    v_mov_b32_e32 v0, v3
+; GFX942-NEXT:    v_readlane_b32 s65, v40, 17
+; GFX942-NEXT:    v_readlane_b32 s64, v40, 16
+; GFX942-NEXT:    v_readlane_b32 s55, v40, 15
+; GFX942-NEXT:    v_readlane_b32 s54, v40, 14
+; GFX942-NEXT:    v_readlane_b32 s53, v40, 13
+; GFX942-NEXT:    v_readlane_b32 s52, v40, 12
+; GFX942-NEXT:    v_readlane_b32 s51, v40, 11
+; GFX942-NEXT:    v_readlane_b32 s50, v40, 10
+; GFX942-NEXT:    v_readlane_b32 s49, v40, 9
+; GFX942-NEXT:    v_readlane_b32 s48, v40, 8
+; GFX942-NEXT:    v_readlane_b32 s39, v40, 7
+; GFX942-NEXT:    v_readlane_b32 s38, v40, 6
+; GFX942-NEXT:    v_readlane_b32 s37, v40, 5
+; GFX942-NEXT:    v_readlane_b32 s36, v40, 4
+; GFX942-NEXT:    v_readlane_b32 s35, v40, 3
+; GFX942-NEXT:    v_readlane_b32 s34, v40, 2
+; GFX942-NEXT:    v_readlane_b32 s31, v40, 1
+; GFX942-NEXT:    v_readlane_b32 s30, v40, 0
+; GFX942-NEXT:    s_mov_b32 s32, s33
+; GFX942-NEXT:    v_readlane_b32 s0, v40, 18
+; GFX942-NEXT:    s_or_saveexec_b64 s[2:3], -1
+; GFX942-NEXT:    scratch_load_dword v40, off, s33 ; 4-byte Folded Reload
+; GFX942-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX942-NEXT:    s_mov_b32 s33, s0
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
 entry:
   %add = add i32 %b, %c
   %ret = tail call fastcc i32 %func.ptr(i32 %a, i32 %add)
@@ -898,6 +1328,30 @@ define fastcc void @sibling_call_fastcc_multi_byval(i32 %a, [64 x i32]) #1 {
 ; GCN-NEXT:    buffer_store_dword v2, off, s[0:3], s32 offset:16
 ; GCN-NEXT:    buffer_store_dword v1, off, s[0:3], s32 offset:8
 ; GCN-NEXT:    s_setpc_b64 s[16:17]
+;
+; GFX942-LABEL: sibling_call_fastcc_multi_byval:
+; GFX942:       ; %bb.0: ; %entry
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    s_mov_b32 s0, 0
+; GFX942-NEXT:    s_mov_b32 s2, s0
+; GFX942-NEXT:    s_mov_b32 s3, s0
+; GFX942-NEXT:    v_mov_b32_e32 v6, 9
+; GFX942-NEXT:    s_mov_b32 s1, s0
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], s[2:3]
+; GFX942-NEXT:    v_mov_b32_e32 v7, v6
+; GFX942-NEXT:    v_mov_b32_e32 v8, v6
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], s[0:1]
+; GFX942-NEXT:    scratch_store_dwordx3 off, v[6:8], s32 offset:144
+; GFX942-NEXT:    scratch_store_dwordx4 off, v[2:5], s32 offset:160
+; GFX942-NEXT:    scratch_store_dword off, v6, s32 offset:8
+; GFX942-NEXT:    scratch_store_dwordx4 off, v[2:5], s32 offset:16
+; GFX942-NEXT:    scratch_load_dwordx2 v[2:3], off, s32 offset:144
+; GFX942-NEXT:    s_getpc_b64 s[0:1]
+; GFX942-NEXT:    s_add_u32 s0, s0, void_fastcc_multi_byval@rel32@lo+4
+; GFX942-NEXT:    s_addc_u32 s1, s1, void_fastcc_multi_byval@rel32@hi+12
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    scratch_store_dwordx2 off, v[2:3], s32
+; GFX942-NEXT:    s_setpc_b64 s[0:1]
 entry:
   %alloca0 = alloca [3 x i32], align 16, addrspace(5)
   %alloca1 = alloca [2 x i64], align 8, addrspace(5)
@@ -959,6 +1413,55 @@ define fastcc void @sibling_call_byval_and_stack_passed(i32 %stack.out.arg, [64
 ; GCN-NEXT:    v_mov_b32_e32 v29, 0
 ; GCN-NEXT:    v_mov_b32_e32 v30, 0
 ; GCN-NEXT:    s_setpc_b64 s[16:17]
+;
+; GFX942-LABEL: sibling_call_byval_and_stack_passed:
+; GFX942:       ; %bb.0: ; %entry
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b32_e32 v2, 9
+; GFX942-NEXT:    v_mov_b32_e32 v3, v2
+; GFX942-NEXT:    v_mov_b32_e32 v4, v2
+; GFX942-NEXT:    scratch_store_dwordx3 off, v[2:4], s32 offset:144
+; GFX942-NEXT:    scratch_store_dword off, v0, s32 offset:16
+; GFX942-NEXT:    scratch_load_dwordx2 v[0:1], off, s32 offset:144
+; GFX942-NEXT:    v_mov_b32_e32 v3, 0
+; GFX942-NEXT:    scratch_store_dwordx2 off, v[2:3], s32 offset:8
+; GFX942-NEXT:    s_getpc_b64 s[0:1]
+; GFX942-NEXT:    s_add_u32 s0, s0, void_fastcc_byval_and_stack_passed@rel32@lo+4
+; GFX942-NEXT:    s_addc_u32 s1, s1, void_fastcc_byval_and_stack_passed@rel32@hi+12
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-NEXT:    v_mov_b32_e32 v3, 0
+; GFX942-NEXT:    v_mov_b32_e32 v4, 0
+; GFX942-NEXT:    v_mov_b32_e32 v5, 0
+; GFX942-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942-NEXT:    v_mov_b32_e32 v7, 0
+; GFX942-NEXT:    v_mov_b32_e32 v8, 0
+; GFX942-NEXT:    v_mov_b32_e32 v9, 0
+; GFX942-NEXT:    v_mov_b32_e32 v10, 0
+; GFX942-NEXT:    v_mov_b32_e32 v11, 0
+; GFX942-NEXT:    v_mov_b32_e32 v12, 0
+; GFX942-NEXT:    v_mov_b32_e32 v13, 0
+; GFX942-NEXT:    v_mov_b32_e32 v14, 0
+; GFX942-NEXT:    v_mov_b32_e32 v15, 0
+; GFX942-NEXT:    v_mov_b32_e32 v16, 0
+; GFX942-NEXT:    v_mov_b32_e32 v17, 0
+; GFX942-NEXT:    v_mov_b32_e32 v18, 0
+; GFX942-NEXT:    v_mov_b32_e32 v19, 0
+; GFX942-NEXT:    v_mov_b32_e32 v20, 0
+; GFX942-NEXT:    v_mov_b32_e32 v21, 0
+; GFX942-NEXT:    v_mov_b32_e32 v22, 0
+; GFX942-NEXT:    v_mov_b32_e32 v23, 0
+; GFX942-NEXT:    v_mov_b32_e32 v24, 0
+; GFX942-NEXT:    v_mov_b32_e32 v25, 0
+; GFX942-NEXT:    v_mov_b32_e32 v26, 0
+; GFX942-NEXT:    v_mov_b32_e32 v27, 0
+; GFX942-NEXT:    v_mov_b32_e32 v28, 0
+; GFX942-NEXT:    v_mov_b32_e32 v29, 0
+; GFX942-NEXT:    v_mov_b32_e32 v30, 0
+; GFX942-NEXT:    s_waitcnt vmcnt(1)
+; GFX942-NEXT:    scratch_store_dwordx2 off, v[0:1], s32
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-NEXT:    s_setpc_b64 s[0:1]
 entry:
   %alloca = alloca [3 x i32], align 16, addrspace(5)
   store [3 x i32] [i32 9, i32 9, i32 9], ptr addrspace(5) %alloca
@@ -976,6 +1479,14 @@ define hidden fastcc i64 @sibling_call_i64_fastcc_i64(i64 %a) #1 {
 ; GCN-NEXT:    s_add_u32 s16, s16, i64_fastcc_i64@rel32@lo+4
 ; GCN-NEXT:    s_addc_u32 s17, s17, i64_fastcc_i64@rel32@hi+12
 ; GCN-NEXT:    s_setpc_b64 s[16:17]
+;
+; GFX942-LABEL: sibling_call_i64_fastcc_i64:
+; GFX942:       ; %bb.0: ; %entry
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    s_getpc_b64 s[0:1]
+; GFX942-NEXT:    s_add_u32 s0, s0, i64_fastcc_i64@rel32@lo+4
+; GFX942-NEXT:    s_addc_u32 s1, s1, i64_fastcc_i64@rel32@hi+12
+; GFX942-NEXT:    s_setpc_b64 s[0:1]
 entry:
   %ret = tail call fastcc i64 @i64_fastcc_i64(i64 %a)
   ret i64 %ret
@@ -991,6 +1502,14 @@ define hidden fastcc ptr addrspace(1) @sibling_call_p1i8_fastcc_p1i8(ptr addrspa
 ; GCN-NEXT:    s_add_u32 s16, s16, p1i8_fastcc_p1i8@rel32@lo+4
 ; GCN-NEXT:    s_addc_u32 s17, s17, p1i8_fastcc_p1i8@rel32@hi+12
 ; GCN-NEXT:    s_setpc_b64 s[16:17]
+;
+; GFX942-LABEL: sibling_call_p1i8_fastcc_p1i8:
+; GFX942:       ; %bb.0: ; %entry
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    s_getpc_b64 s[0:1]
+; GFX942-NEXT:    s_add_u32 s0, s0, p1i8_fastcc_p1i8@rel32@lo+4
+; GFX942-NEXT:    s_addc_u32 s1, s1, p1i8_fastcc_p1i8@rel32@hi+12
+; GFX942-NEXT:    s_setpc_b64 s[0:1]
 entry:
   %ret = tail call fastcc ptr addrspace(1) @p1i8_fastcc_p1i8(ptr addrspace(1) %a)
   ret ptr addrspace(1) %ret
@@ -1006,6 +1525,14 @@ define hidden fastcc i16 @sibling_call_i16_fastcc_i16(i16 %a) #1 {
 ; GCN-NEXT:    s_add_u32 s16, s16, i16_fastcc_i16@rel32@lo+4
 ; GCN-NEXT:    s_addc_u32 s17, s17, i16_fastcc_i16@rel32@hi+12
 ; GCN-NEXT:    s_setpc_b64 s[16:17]
+;
+; GFX942-LABEL: sibling_call_i16_fastcc_i16:
+; GFX942:       ; %bb.0: ; %entry
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    s_getpc_b64 s[0:1]
+; GFX942-NEXT:    s_add_u32 s0, s0, i16_fastcc_i16@rel32@lo+4
+; GFX942-NEXT:    s_addc_u32 s1, s1, i16_fastcc_i16@rel32@hi+12
+; GFX942-NEXT:    s_setpc_b64 s[0:1]
 entry:
   %ret = tail call fastcc i16 @i16_fastcc_i16(i16 %a)
   ret i16 %ret
@@ -1021,6 +1548,14 @@ define hidden fastcc half @sibling_call_f16_fastcc_f16(half %a) #1 {
 ; GCN-NEXT:    s_add_u32 s16, s16, f16_fastcc_f16@rel32@lo+4
 ; GCN-NEXT:    s_addc_u32 s17, s17, f16_fastcc_f16@rel32@hi+12
 ; GCN-NEXT:    s_setpc_b64 s[16:17]
+;
+; GFX942-LABEL: sibling_call_f16_fastcc_f16:
+; GFX942:       ; %bb.0: ; %entry
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    s_getpc_b64 s[0:1]
+; GFX942-NEXT:    s_add_u32 s0, s0, f16_fastcc_f16@rel32@lo+4
+; GFX942-NEXT:    s_addc_u32 s1, s1, f16_fastcc_f16@rel32@hi+12
+; GFX942-NEXT:    s_setpc_b64 s[0:1]
 entry:
   %ret = tail call fastcc half @f16_fastcc_f16(half %a)
   ret half %ret
@@ -1036,6 +1571,14 @@ define hidden fastcc <3 x i16> @sibling_call_v3i16_fastcc_v3i16(<3 x i16> %a) #1
 ; GCN-NEXT:    s_add_u32 s16, s16, v3i16_fastcc_v3i16@rel32@lo+4
 ; GCN-NEXT:    s_addc_u32 s17, s17, v3i16_fastcc_v3i16@rel32@hi+12
 ; GCN-NEXT:    s_setpc_b64 s[16:17]
+;
+; GFX942-LABEL: sibling_call_v3i16_fastcc_v3i16:
+; GFX942:       ; %bb.0: ; %entry
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    s_getpc_b64 s[0:1]
+; GFX942-NEXT:    s_add_u32 s0, s0, v3i16_fastcc_v3i16@rel32@lo+4
+; GFX942-NEXT:    s_addc_u32 s1, s1, v3i16_fastcc_v3i16@rel32@hi+12
+; GFX942-NEXT:    s_setpc_b64 s[0:1]
 entry:
   %ret = tail call fastcc <3 x i16> @v3i16_fastcc_v3i16(<3 x i16> %a)
   ret <3 x i16> %ret
@@ -1051,6 +1594,14 @@ define hidden fastcc <4 x i16> @sibling_call_v4i16_fastcc_v4i16(<4 x i16> %a) #1
 ; GCN-NEXT:    s_add_u32 s16, s16, v4i16_fastcc_v4i16@rel32@lo+4
 ; GCN-NEXT:    s_addc_u32 s17, s17, v4i16_fastcc_v4i16@rel32@hi+12
 ; GCN-NEXT:    s_setpc_b64 s[16:17]
+;
+; GFX942-LABEL: sibling_call_v4i16_fastcc_v4i16:
+; GFX942:       ; %bb.0: ; %entry
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    s_getpc_b64 s[0:1]
+; GFX942-NEXT:    s_add_u32 s0, s0, v4i16_fastcc_v4i16@rel32@lo+4
+; GFX942-NEXT:    s_addc_u32 s1, s1, v4i16_fastcc_v4i16@rel32@hi+12
+; GFX942-NEXT:    s_setpc_b64 s[0:1]
 entry:
   %ret = tail call fastcc <4 x i16> @v4i16_fastcc_v4i16(<4 x i16> %a)
   ret <4 x i16> %ret
@@ -1066,6 +1617,14 @@ define hidden fastcc <2 x i64> @sibling_call_v2i64_fastcc_v2i64(<2 x i64> %a) #1
 ; GCN-NEXT:    s_add_u32 s16, s16, v2i64_fastcc_v2i64@rel32@lo+4
 ; GCN-NEXT:    s_addc_u32 s17, s17, v2i64_fastcc_v2i64@rel32@hi+12
 ; GCN-NEXT:    s_setpc_b64 s[16:17]
+;
+; GFX942-LABEL: sibling_call_v2i64_fastcc_v2i64:
+; GFX942:       ; %bb.0: ; %entry
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    s_getpc_b64 s[0:1]
+; GFX942-NEXT:    s_add_u32 s0, s0, v2i64_fastcc_v2i64@rel32@lo+4
+; GFX942-NEXT:    s_addc_u32 s1, s1, v2i64_fastcc_v2i64@rel32@hi+12
+; GFX942-NEXT:    s_setpc_b64 s[0:1]
 entry:
   %ret = tail call fastcc <2 x i64> @v2i64_fastcc_v2i64(<2 x i64> %a)
   ret <2 x i64> %ret
diff --git a/llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll b/llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll
index 6a45b961a61c8..101787abf8ea7 100644
--- a/llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/sint_to_fp.f64.ll
@@ -1,6 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii < %s | FileCheck -enable-var-scope -check-prefixes=GCN,CI %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 < %s | FileCheck -enable-var-scope -check-prefixes=GFX942 %s
 
 declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
 
@@ -32,6 +33,16 @@ define amdgpu_kernel void @sint_to_fp_i32_to_f64(ptr addrspace(1) %out, i32 %in)
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; VI-NEXT:    s_endpgm
+;
+; GFX942-LABEL: sint_to_fp_i32_to_f64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    v_cvt_f64_i32_e32 v[0:1], s2
+; GFX942-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX942-NEXT:    s_endpgm
   %result = sitofp i32 %in to double
   store double %result, ptr addrspace(1) %out
   ret void
@@ -73,6 +84,18 @@ define amdgpu_kernel void @sint_to_fp_i1_f64(ptr addrspace(1) %out, i32 %in) {
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; VI-NEXT:    s_endpgm
+;
+; GFX942-LABEL: sint_to_fp_i1_f64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_cmp_eq_u32 s2, 0
+; GFX942-NEXT:    s_cselect_b32 s2, 0xbff00000, 0
+; GFX942-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-NEXT:    global_store_dwordx2 v0, v[0:1], s[0:1]
+; GFX942-NEXT:    s_endpgm
   %cmp = icmp eq i32 %in, 0
   %fp = sitofp i1 %cmp to double
   store double %fp, ptr addrspace(1) %out, align 4
@@ -113,6 +136,19 @@ define amdgpu_kernel void @sint_to_fp_i1_f64_load(ptr addrspace(1) %out, i1 %in)
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; VI-NEXT:    s_endpgm
+;
+; GFX942-LABEL: sint_to_fp_i1_f64_load:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_bitcmp1_b32 s2, 0
+; GFX942-NEXT:    s_cselect_b64 s[2:3], -1, 0
+; GFX942-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[2:3]
+; GFX942-NEXT:    v_cvt_f64_i32_e32 v[0:1], v0
+; GFX942-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX942-NEXT:    s_endpgm
   %fp = sitofp i1 %in to double
   store double %fp, ptr addrspace(1) %out, align 8
   ret void
@@ -150,6 +186,18 @@ define amdgpu_kernel void @s_sint_to_fp_i64_to_f64(ptr addrspace(1) %out, i64 %i
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
 ; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; VI-NEXT:    s_endpgm
+;
+; GFX942-LABEL: s_sint_to_fp_i64_to_f64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX942-NEXT:    v_mov_b32_e32 v4, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    v_cvt_f64_i32_e32 v[0:1], s3
+; GFX942-NEXT:    v_ldexp_f64 v[0:1], v[0:1], 32
+; GFX942-NEXT:    v_cvt_f64_u32_e32 v[2:3], s2
+; GFX942-NEXT:    v_add_f64 v[0:1], v[0:1], v[2:3]
+; GFX942-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1]
+; GFX942-NEXT:    s_endpgm
   %result = sitofp i64 %in to double
   store double %result, ptr addrspace(1) %out
   ret void
@@ -199,6 +247,22 @@ define amdgpu_kernel void @v_sint_to_fp_i64_to_f64(ptr addrspace(1) %out, ptr ad
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
 ; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; VI-NEXT:    s_endpgm
+;
+; GFX942-LABEL: v_sint_to_fp_i64_to_f64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX942-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX942-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
+; GFX942-NEXT:    v_mov_b32_e32 v4, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3]
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    v_cvt_f64_i32_e32 v[2:3], v1
+; GFX942-NEXT:    v_ldexp_f64 v[2:3], v[2:3], 32
+; GFX942-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
+; GFX942-NEXT:    v_add_f64 v[0:1], v[2:3], v[0:1]
+; GFX942-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1]
+; GFX942-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %gep = getelementptr i64, ptr addrspace(1) %in, i32 %tid
   %val = load i64, ptr addrspace(1) %gep, align 8
@@ -238,6 +302,17 @@ define amdgpu_kernel void @s_sint_to_fp_i8_to_f64(ptr addrspace(1) %out, i8 %in)
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; VI-NEXT:    s_endpgm
+;
+; GFX942-LABEL: s_sint_to_fp_i8_to_f64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_sext_i32_i8 s2, s2
+; GFX942-NEXT:    v_cvt_f64_i32_e32 v[0:1], s2
+; GFX942-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX942-NEXT:    s_endpgm
   %fp = sitofp i8 %in to double
   store double %fp, ptr addrspace(1) %out
   ret void
@@ -258,6 +333,14 @@ define double @v_sint_to_fp_i8_to_f64(i8 %in) {
 ; VI-NEXT:    v_bfe_i32 v0, v0, 0, 16
 ; VI-NEXT:    v_cvt_f64_i32_e32 v[0:1], v0
 ; VI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: v_sint_to_fp_i8_to_f64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_bfe_i32 v0, v0, 0, 8
+; GFX942-NEXT:    v_bfe_i32 v0, v0, 0, 16
+; GFX942-NEXT:    v_cvt_f64_i32_e32 v[0:1], v0
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %fp = sitofp i8 %in to double
   ret double %fp
   }
@@ -296,6 +379,18 @@ define amdgpu_kernel void @s_select_sint_to_fp_i1_vals_f64(ptr addrspace(1) %out
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; VI-NEXT:    s_endpgm
+;
+; GFX942-LABEL: s_select_sint_to_fp_i1_vals_f64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_cmp_eq_u32 s2, 0
+; GFX942-NEXT:    s_cselect_b32 s2, 0xbff00000, 0
+; GFX942-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-NEXT:    global_store_dwordx2 v0, v[0:1], s[0:1]
+; GFX942-NEXT:    s_endpgm
   %cmp = icmp eq i32 %in, 0
   %select = select i1 %cmp, double -1.0, double 0.0
   store double %select, ptr addrspace(1) %out, align 8
@@ -313,6 +408,18 @@ define void @v_select_sint_to_fp_i1_vals_f64(ptr addrspace(1) %out, i32 %in) {
 ; GCN-NEXT:    flat_store_dwordx2 v[0:1], v[3:4]
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: v_select_sint_to_fp_i1_vals_f64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b32_e32 v3, 0xbff00000
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX942-NEXT:    v_mov_b32_e32 v4, 0
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_cndmask_b32_e32 v5, 0, v3, vcc
+; GFX942-NEXT:    global_store_dwordx2 v[0:1], v[4:5], off
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %in, 0
   %select = select i1 %cmp, double -1.0, double 0.0
   store double %select, ptr addrspace(1) %out, align 8
@@ -353,6 +460,18 @@ define amdgpu_kernel void @s_select_sint_to_fp_i1_vals_i64(ptr addrspace(1) %out
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; VI-NEXT:    s_endpgm
+;
+; GFX942-LABEL: s_select_sint_to_fp_i1_vals_i64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_cmp_eq_u32 s2, 0
+; GFX942-NEXT:    s_cselect_b32 s2, 0xbff00000, 0
+; GFX942-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-NEXT:    global_store_dwordx2 v0, v[0:1], s[0:1]
+; GFX942-NEXT:    s_endpgm
   %cmp = icmp eq i32 %in, 0
   %select = select i1 %cmp, i64 u0xbff0000000000000, i64 0
   store i64 %select, ptr addrspace(1) %out, align 8
@@ -370,6 +489,18 @@ define void @v_select_sint_to_fp_i1_vals_i64(ptr addrspace(1) %out, i32 %in) {
 ; GCN-NEXT:    flat_store_dwordx2 v[0:1], v[3:4]
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: v_select_sint_to_fp_i1_vals_i64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b32_e32 v3, 0xbff00000
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX942-NEXT:    v_mov_b32_e32 v4, 0
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_cndmask_b32_e32 v5, 0, v3, vcc
+; GFX942-NEXT:    global_store_dwordx2 v[0:1], v[4:5], off
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %in, 0
   %select = select i1 %cmp, i64 u0xbff0000000000000, i64 0
   store i64 %select, ptr addrspace(1) %out, align 8
@@ -388,6 +519,18 @@ define void @v_swap_select_sint_to_fp_i1_vals_f64(ptr addrspace(1) %out, i32 %in
 ; GCN-NEXT:    flat_store_dwordx2 v[0:1], v[3:4]
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: v_swap_select_sint_to_fp_i1_vals_f64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b32_e32 v3, 0xbff00000
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX942-NEXT:    v_mov_b32_e32 v4, 0
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_cndmask_b32_e64 v5, v3, 0, vcc
+; GFX942-NEXT:    global_store_dwordx2 v[0:1], v[4:5], off
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %in, 0
   %select = select i1 %cmp, double 0.0, double -1.0
   store double %select, ptr addrspace(1) %out, align 8
@@ -429,6 +572,18 @@ define amdgpu_kernel void @s_swap_select_sint_to_fp_i1_vals_f64(ptr addrspace(1)
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; VI-NEXT:    s_endpgm
+;
+; GFX942-LABEL: s_swap_select_sint_to_fp_i1_vals_f64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_cmp_eq_u32 s2, 0
+; GFX942-NEXT:    s_cselect_b32 s2, 0, 0xbff00000
+; GFX942-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-NEXT:    global_store_dwordx2 v0, v[0:1], s[0:1]
+; GFX942-NEXT:    s_endpgm
   %cmp = icmp eq i32 %in, 0
   %select = select i1 %cmp, double 0.0, double -1.0
   store double %select, ptr addrspace(1) %out, align 8
diff --git a/llvm/test/CodeGen/AMDGPU/srem.ll b/llvm/test/CodeGen/AMDGPU/srem.ll
index f614f58d8e1dc..d9507161fab76 100644
--- a/llvm/test/CodeGen/AMDGPU/srem.ll
+++ b/llvm/test/CodeGen/AMDGPU/srem.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck %s --check-prefixes=GCN
+; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck %s --check-prefixes=GFX942
 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti < %s | FileCheck %s --check-prefixes=TAHITI
 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga < %s | FileCheck %s --check-prefixes=TONGA
 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=r600 -mcpu=redwood < %s | FileCheck %s --check-prefixes=EG
@@ -23,6 +24,24 @@ define amdgpu_kernel void @srem_i16_7(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GCN-NEXT:    global_store_short v0, v1, s[0:1]
 ; GCN-NEXT:    s_endpgm
 ;
+; GFX942-LABEL: srem_i16_7:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    global_load_ushort v1, v0, s[2:3]
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    v_readfirstlane_b32 s2, v1
+; GFX942-NEXT:    s_sext_i32_i16 s2, s2
+; GFX942-NEXT:    s_mulk_i32 s2, 0x4925
+; GFX942-NEXT:    s_lshr_b32 s3, s2, 31
+; GFX942-NEXT:    s_ashr_i32 s2, s2, 17
+; GFX942-NEXT:    s_add_i32 s2, s2, s3
+; GFX942-NEXT:    s_mul_i32 s2, s2, 7
+; GFX942-NEXT:    v_subrev_u32_e32 v1, s2, v1
+; GFX942-NEXT:    global_store_short v0, v1, s[0:1]
+; GFX942-NEXT:    s_endpgm
+;
 ; TAHITI-LABEL: srem_i16_7:
 ; TAHITI:       ; %bb.0:
 ; TAHITI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
@@ -147,6 +166,44 @@ define amdgpu_kernel void @srem_i32(ptr addrspace(1) %out, ptr addrspace(1) %in)
 ; GCN-NEXT:    global_store_dword v2, v0, s[0:1]
 ; GCN-NEXT:    s_endpgm
 ;
+; GFX942-LABEL: srem_i32:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    v_readfirstlane_b32 s2, v1
+; GFX942-NEXT:    s_abs_i32 s2, s2
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v1, s2
+; GFX942-NEXT:    v_readfirstlane_b32 s3, v0
+; GFX942-NEXT:    s_sub_i32 s5, 0, s2
+; GFX942-NEXT:    s_ashr_i32 s4, s3, 31
+; GFX942-NEXT:    v_rcp_iflag_f32_e32 v1, v1
+; GFX942-NEXT:    s_abs_i32 s3, s3
+; GFX942-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v1
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_readfirstlane_b32 s6, v0
+; GFX942-NEXT:    s_mul_i32 s5, s5, s6
+; GFX942-NEXT:    s_mul_hi_u32 s5, s6, s5
+; GFX942-NEXT:    s_add_i32 s6, s6, s5
+; GFX942-NEXT:    s_mul_hi_u32 s5, s3, s6
+; GFX942-NEXT:    s_mul_i32 s5, s5, s2
+; GFX942-NEXT:    s_sub_i32 s3, s3, s5
+; GFX942-NEXT:    s_sub_i32 s5, s3, s2
+; GFX942-NEXT:    s_cmp_ge_u32 s3, s2
+; GFX942-NEXT:    s_cselect_b32 s3, s5, s3
+; GFX942-NEXT:    s_sub_i32 s5, s3, s2
+; GFX942-NEXT:    s_cmp_ge_u32 s3, s2
+; GFX942-NEXT:    s_cselect_b32 s2, s5, s3
+; GFX942-NEXT:    s_xor_b32 s2, s2, s4
+; GFX942-NEXT:    s_sub_i32 s2, s2, s4
+; GFX942-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-NEXT:    global_store_dword v2, v0, s[0:1]
+; GFX942-NEXT:    s_endpgm
+;
 ; TAHITI-LABEL: srem_i32:
 ; TAHITI:       ; %bb.0:
 ; TAHITI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
@@ -290,6 +347,21 @@ define amdgpu_kernel void @srem_i32_4(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GCN-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GCN-NEXT:    s_endpgm
 ;
+; GFX942-LABEL: srem_i32_4:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
+; GFX942-NEXT:    v_lshrrev_b32_e32 v2, 30, v2
+; GFX942-NEXT:    v_add_u32_e32 v2, v1, v2
+; GFX942-NEXT:    v_and_b32_e32 v2, -4, v2
+; GFX942-NEXT:    v_sub_u32_e32 v1, v1, v2
+; GFX942-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX942-NEXT:    s_endpgm
+;
 ; TAHITI-LABEL: srem_i32_4:
 ; TAHITI:       ; %bb.0:
 ; TAHITI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
@@ -379,6 +451,24 @@ define amdgpu_kernel void @srem_i32_7(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GCN-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GCN-NEXT:    s_endpgm
 ;
+; GFX942-LABEL: srem_i32_7:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    global_load_dword v1, v0, s[2:3]
+; GFX942-NEXT:    s_mov_b32 s2, 0x92492493
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    v_mul_hi_i32 v2, v1, s2
+; GFX942-NEXT:    v_add_u32_e32 v2, v2, v1
+; GFX942-NEXT:    v_lshrrev_b32_e32 v3, 31, v2
+; GFX942-NEXT:    v_ashrrev_i32_e32 v2, 2, v2
+; GFX942-NEXT:    v_add_u32_e32 v2, v2, v3
+; GFX942-NEXT:    v_mul_lo_u32 v2, v2, 7
+; GFX942-NEXT:    v_sub_u32_e32 v1, v1, v2
+; GFX942-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX942-NEXT:    s_endpgm
+;
 ; TAHITI-LABEL: srem_i32_7:
 ; TAHITI:       ; %bb.0:
 ; TAHITI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
@@ -519,6 +609,71 @@ define amdgpu_kernel void @srem_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GCN-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1]
 ; GCN-NEXT:    s_endpgm
 ;
+; GFX942-LABEL: srem_v2i32:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v4, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    global_load_dwordx4 v[0:3], v4, s[2:3]
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    v_readfirstlane_b32 s2, v2
+; GFX942-NEXT:    s_abs_i32 s2, s2
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v2, s2
+; GFX942-NEXT:    v_readfirstlane_b32 s3, v0
+; GFX942-NEXT:    s_sub_i32 s6, 0, s2
+; GFX942-NEXT:    s_ashr_i32 s5, s3, 31
+; GFX942-NEXT:    v_rcp_iflag_f32_e32 v2, v2
+; GFX942-NEXT:    s_abs_i32 s3, s3
+; GFX942-NEXT:    v_readfirstlane_b32 s4, v3
+; GFX942-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v2
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_readfirstlane_b32 s7, v0
+; GFX942-NEXT:    s_mul_i32 s6, s6, s7
+; GFX942-NEXT:    s_mul_hi_u32 s6, s7, s6
+; GFX942-NEXT:    s_add_i32 s7, s7, s6
+; GFX942-NEXT:    s_mul_hi_u32 s6, s3, s7
+; GFX942-NEXT:    s_mul_i32 s6, s6, s2
+; GFX942-NEXT:    s_sub_i32 s3, s3, s6
+; GFX942-NEXT:    s_sub_i32 s6, s3, s2
+; GFX942-NEXT:    s_cmp_ge_u32 s3, s2
+; GFX942-NEXT:    s_cselect_b32 s3, s6, s3
+; GFX942-NEXT:    s_sub_i32 s6, s3, s2
+; GFX942-NEXT:    s_cmp_ge_u32 s3, s2
+; GFX942-NEXT:    s_cselect_b32 s2, s6, s3
+; GFX942-NEXT:    s_abs_i32 s3, s4
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v0, s3
+; GFX942-NEXT:    s_xor_b32 s2, s2, s5
+; GFX942-NEXT:    s_sub_i32 s7, 0, s3
+; GFX942-NEXT:    s_sub_i32 s2, s2, s5
+; GFX942-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; GFX942-NEXT:    v_readfirstlane_b32 s4, v1
+; GFX942-NEXT:    s_ashr_i32 s6, s4, 31
+; GFX942-NEXT:    s_abs_i32 s4, s4
+; GFX942-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_readfirstlane_b32 s5, v0
+; GFX942-NEXT:    s_mul_i32 s7, s7, s5
+; GFX942-NEXT:    s_mul_hi_u32 s7, s5, s7
+; GFX942-NEXT:    s_add_i32 s5, s5, s7
+; GFX942-NEXT:    s_mul_hi_u32 s5, s4, s5
+; GFX942-NEXT:    s_mul_i32 s5, s5, s3
+; GFX942-NEXT:    s_sub_i32 s4, s4, s5
+; GFX942-NEXT:    s_sub_i32 s5, s4, s3
+; GFX942-NEXT:    s_cmp_ge_u32 s4, s3
+; GFX942-NEXT:    s_cselect_b32 s4, s5, s4
+; GFX942-NEXT:    s_sub_i32 s5, s4, s3
+; GFX942-NEXT:    s_cmp_ge_u32 s4, s3
+; GFX942-NEXT:    s_cselect_b32 s3, s5, s4
+; GFX942-NEXT:    s_xor_b32 s3, s3, s6
+; GFX942-NEXT:    s_sub_i32 s3, s3, s6
+; GFX942-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-NEXT:    v_mov_b32_e32 v1, s3
+; GFX942-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1]
+; GFX942-NEXT:    s_endpgm
+;
 ; TAHITI-LABEL: srem_v2i32:
 ; TAHITI:       ; %bb.0:
 ; TAHITI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x9
@@ -745,6 +900,30 @@ define amdgpu_kernel void @srem_v2i32_4(ptr addrspace(1) %out, ptr addrspace(1)
 ; GCN-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GCN-NEXT:    s_endpgm
 ;
+; GFX942-LABEL: srem_v2i32_4:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    global_load_dwordx2 v[0:1], v2, s[2:3]
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX942-NEXT:    v_readfirstlane_b32 s3, v1
+; GFX942-NEXT:    s_ashr_i32 s4, s2, 31
+; GFX942-NEXT:    s_ashr_i32 s5, s3, 31
+; GFX942-NEXT:    s_lshr_b32 s4, s4, 30
+; GFX942-NEXT:    s_lshr_b32 s5, s5, 30
+; GFX942-NEXT:    s_add_i32 s4, s2, s4
+; GFX942-NEXT:    s_add_i32 s5, s3, s5
+; GFX942-NEXT:    s_and_b32 s4, s4, -4
+; GFX942-NEXT:    s_and_b32 s5, s5, -4
+; GFX942-NEXT:    s_sub_i32 s2, s2, s4
+; GFX942-NEXT:    s_sub_i32 s3, s3, s5
+; GFX942-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-NEXT:    v_mov_b32_e32 v1, s3
+; GFX942-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX942-NEXT:    s_endpgm
+;
 ; TAHITI-LABEL: srem_v2i32_4:
 ; TAHITI:       ; %bb.0:
 ; TAHITI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
@@ -956,6 +1135,124 @@ define amdgpu_kernel void @srem_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GCN-NEXT:    global_store_dwordx4 v0, v[1:4], s[0:1]
 ; GCN-NEXT:    s_endpgm
 ;
+; GFX942-LABEL: srem_v4i32:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v8, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    global_load_dwordx4 v[4:7], v8, s[2:3] offset:16
+; GFX942-NEXT:    global_load_dwordx4 v[0:3], v8, s[2:3]
+; GFX942-NEXT:    s_waitcnt vmcnt(1)
+; GFX942-NEXT:    v_readfirstlane_b32 s2, v4
+; GFX942-NEXT:    s_abs_i32 s2, s2
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v4, s2
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX942-NEXT:    s_sub_i32 s6, 0, s2
+; GFX942-NEXT:    s_ashr_i32 s5, s4, 31
+; GFX942-NEXT:    v_rcp_iflag_f32_e32 v4, v4
+; GFX942-NEXT:    s_abs_i32 s4, s4
+; GFX942-NEXT:    v_readfirstlane_b32 s3, v5
+; GFX942-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v4
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_readfirstlane_b32 s7, v0
+; GFX942-NEXT:    s_mul_i32 s6, s6, s7
+; GFX942-NEXT:    s_mul_hi_u32 s6, s7, s6
+; GFX942-NEXT:    s_add_i32 s7, s7, s6
+; GFX942-NEXT:    s_mul_hi_u32 s6, s4, s7
+; GFX942-NEXT:    s_mul_i32 s6, s6, s2
+; GFX942-NEXT:    s_sub_i32 s4, s4, s6
+; GFX942-NEXT:    s_sub_i32 s6, s4, s2
+; GFX942-NEXT:    s_cmp_ge_u32 s4, s2
+; GFX942-NEXT:    s_cselect_b32 s4, s6, s4
+; GFX942-NEXT:    s_sub_i32 s6, s4, s2
+; GFX942-NEXT:    s_cmp_ge_u32 s4, s2
+; GFX942-NEXT:    s_cselect_b32 s2, s6, s4
+; GFX942-NEXT:    s_abs_i32 s3, s3
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v0, s3
+; GFX942-NEXT:    s_xor_b32 s2, s2, s5
+; GFX942-NEXT:    s_sub_i32 s8, 0, s3
+; GFX942-NEXT:    s_sub_i32 s2, s2, s5
+; GFX942-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; GFX942-NEXT:    v_readfirstlane_b32 s6, v1
+; GFX942-NEXT:    s_ashr_i32 s7, s6, 31
+; GFX942-NEXT:    s_abs_i32 s6, s6
+; GFX942-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX942-NEXT:    v_readfirstlane_b32 s4, v6
+; GFX942-NEXT:    v_readfirstlane_b32 s5, v0
+; GFX942-NEXT:    s_mul_i32 s8, s8, s5
+; GFX942-NEXT:    s_mul_hi_u32 s8, s5, s8
+; GFX942-NEXT:    s_add_i32 s5, s5, s8
+; GFX942-NEXT:    s_mul_hi_u32 s5, s6, s5
+; GFX942-NEXT:    s_mul_i32 s5, s5, s3
+; GFX942-NEXT:    s_sub_i32 s5, s6, s5
+; GFX942-NEXT:    s_sub_i32 s6, s5, s3
+; GFX942-NEXT:    s_cmp_ge_u32 s5, s3
+; GFX942-NEXT:    s_cselect_b32 s5, s6, s5
+; GFX942-NEXT:    s_sub_i32 s6, s5, s3
+; GFX942-NEXT:    s_cmp_ge_u32 s5, s3
+; GFX942-NEXT:    s_cselect_b32 s3, s6, s5
+; GFX942-NEXT:    s_abs_i32 s4, s4
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v0, s4
+; GFX942-NEXT:    s_xor_b32 s3, s3, s7
+; GFX942-NEXT:    s_sub_i32 s9, 0, s4
+; GFX942-NEXT:    s_sub_i32 s3, s3, s7
+; GFX942-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; GFX942-NEXT:    v_readfirstlane_b32 s6, v2
+; GFX942-NEXT:    s_ashr_i32 s8, s6, 31
+; GFX942-NEXT:    s_abs_i32 s6, s6
+; GFX942-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX942-NEXT:    v_readfirstlane_b32 s5, v7
+; GFX942-NEXT:    v_mov_b32_e32 v2, s2
+; GFX942-NEXT:    v_readfirstlane_b32 s7, v0
+; GFX942-NEXT:    s_mul_i32 s9, s9, s7
+; GFX942-NEXT:    s_mul_hi_u32 s9, s7, s9
+; GFX942-NEXT:    s_add_i32 s7, s7, s9
+; GFX942-NEXT:    s_mul_hi_u32 s7, s6, s7
+; GFX942-NEXT:    s_mul_i32 s7, s7, s4
+; GFX942-NEXT:    s_sub_i32 s6, s6, s7
+; GFX942-NEXT:    s_sub_i32 s7, s6, s4
+; GFX942-NEXT:    s_cmp_ge_u32 s6, s4
+; GFX942-NEXT:    s_cselect_b32 s6, s7, s6
+; GFX942-NEXT:    s_sub_i32 s7, s6, s4
+; GFX942-NEXT:    s_cmp_ge_u32 s6, s4
+; GFX942-NEXT:    s_cselect_b32 s4, s7, s6
+; GFX942-NEXT:    s_abs_i32 s5, s5
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v0, s5
+; GFX942-NEXT:    v_readfirstlane_b32 s6, v3
+; GFX942-NEXT:    v_mov_b32_e32 v3, s3
+; GFX942-NEXT:    s_ashr_i32 s2, s6, 31
+; GFX942-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; GFX942-NEXT:    s_abs_i32 s3, s6
+; GFX942-NEXT:    s_sub_i32 s6, 0, s5
+; GFX942-NEXT:    s_xor_b32 s4, s4, s8
+; GFX942-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX942-NEXT:    s_sub_i32 s4, s4, s8
+; GFX942-NEXT:    v_mov_b32_e32 v4, s4
+; GFX942-NEXT:    v_readfirstlane_b32 s7, v0
+; GFX942-NEXT:    s_mul_i32 s6, s6, s7
+; GFX942-NEXT:    s_mul_hi_u32 s6, s7, s6
+; GFX942-NEXT:    s_add_i32 s7, s7, s6
+; GFX942-NEXT:    s_mul_hi_u32 s6, s3, s7
+; GFX942-NEXT:    s_mul_i32 s6, s6, s5
+; GFX942-NEXT:    s_sub_i32 s3, s3, s6
+; GFX942-NEXT:    s_sub_i32 s6, s3, s5
+; GFX942-NEXT:    s_cmp_ge_u32 s3, s5
+; GFX942-NEXT:    s_cselect_b32 s3, s6, s3
+; GFX942-NEXT:    s_sub_i32 s6, s3, s5
+; GFX942-NEXT:    s_cmp_ge_u32 s3, s5
+; GFX942-NEXT:    s_cselect_b32 s3, s6, s3
+; GFX942-NEXT:    s_xor_b32 s3, s3, s2
+; GFX942-NEXT:    s_sub_i32 s2, s3, s2
+; GFX942-NEXT:    v_mov_b32_e32 v5, s2
+; GFX942-NEXT:    global_store_dwordx4 v8, v[2:5], s[0:1]
+; GFX942-NEXT:    s_endpgm
+;
 ; TAHITI-LABEL: srem_v4i32:
 ; TAHITI:       ; %bb.0:
 ; TAHITI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x9
@@ -1353,6 +1650,44 @@ define amdgpu_kernel void @srem_v4i32_4(ptr addrspace(1) %out, ptr addrspace(1)
 ; GCN-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
 ; GCN-NEXT:    s_endpgm
 ;
+; GFX942-LABEL: srem_v4i32_4:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v4, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    global_load_dwordx4 v[0:3], v4, s[2:3]
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX942-NEXT:    v_readfirstlane_b32 s3, v1
+; GFX942-NEXT:    v_readfirstlane_b32 s4, v2
+; GFX942-NEXT:    v_readfirstlane_b32 s5, v3
+; GFX942-NEXT:    s_ashr_i32 s6, s2, 31
+; GFX942-NEXT:    s_ashr_i32 s7, s3, 31
+; GFX942-NEXT:    s_ashr_i32 s8, s4, 31
+; GFX942-NEXT:    s_ashr_i32 s9, s5, 31
+; GFX942-NEXT:    s_lshr_b32 s6, s6, 30
+; GFX942-NEXT:    s_lshr_b32 s7, s7, 30
+; GFX942-NEXT:    s_lshr_b32 s8, s8, 30
+; GFX942-NEXT:    s_lshr_b32 s9, s9, 30
+; GFX942-NEXT:    s_add_i32 s6, s2, s6
+; GFX942-NEXT:    s_add_i32 s7, s3, s7
+; GFX942-NEXT:    s_add_i32 s8, s4, s8
+; GFX942-NEXT:    s_add_i32 s9, s5, s9
+; GFX942-NEXT:    s_and_b32 s6, s6, -4
+; GFX942-NEXT:    s_and_b32 s7, s7, -4
+; GFX942-NEXT:    s_and_b32 s8, s8, -4
+; GFX942-NEXT:    s_and_b32 s9, s9, -4
+; GFX942-NEXT:    s_sub_i32 s2, s2, s6
+; GFX942-NEXT:    s_sub_i32 s3, s3, s7
+; GFX942-NEXT:    s_sub_i32 s4, s4, s8
+; GFX942-NEXT:    s_sub_i32 s5, s5, s9
+; GFX942-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-NEXT:    v_mov_b32_e32 v1, s3
+; GFX942-NEXT:    v_mov_b32_e32 v2, s4
+; GFX942-NEXT:    v_mov_b32_e32 v3, s5
+; GFX942-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX942-NEXT:    s_endpgm
+;
 ; TAHITI-LABEL: srem_v4i32_4:
 ; TAHITI:       ; %bb.0:
 ; TAHITI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x9
@@ -1673,6 +2008,194 @@ define amdgpu_kernel void @srem_i64(ptr addrspace(1) %out, ptr addrspace(1) %in)
 ; GCN-NEXT:    ; implicit-def: $vgpr0_vgpr1
 ; GCN-NEXT:    s_branch .LBB8_2
 ;
+; GFX942-LABEL: srem_i64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    global_load_dwordx4 v[0:3], v0, s[10:11]
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    v_readfirstlane_b32 s7, v1
+; GFX942-NEXT:    v_readfirstlane_b32 s6, v0
+; GFX942-NEXT:    v_readfirstlane_b32 s5, v3
+; GFX942-NEXT:    v_readfirstlane_b32 s4, v2
+; GFX942-NEXT:    s_or_b64 s[0:1], s[6:7], s[4:5]
+; GFX942-NEXT:    s_mov_b32 s0, 0
+; GFX942-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX942-NEXT:    s_cbranch_scc0 .LBB8_4
+; GFX942-NEXT:  ; %bb.1:
+; GFX942-NEXT:    s_ashr_i32 s0, s5, 31
+; GFX942-NEXT:    s_add_u32 s2, s4, s0
+; GFX942-NEXT:    s_mov_b32 s1, s0
+; GFX942-NEXT:    s_addc_u32 s3, s5, s0
+; GFX942-NEXT:    s_xor_b64 s[12:13], s[2:3], s[0:1]
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v0, s12
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v1, s13
+; GFX942-NEXT:    s_sub_u32 s0, 0, s12
+; GFX942-NEXT:    s_subb_u32 s1, 0, s13
+; GFX942-NEXT:    v_fmamk_f32 v0, v1, 0x4f800000, v0
+; GFX942-NEXT:    v_rcp_f32_e32 v0, v0
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
+; GFX942-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
+; GFX942-NEXT:    v_trunc_f32_e32 v1, v1
+; GFX942-NEXT:    v_fmamk_f32 v0, v1, 0xcf800000, v0
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX942-NEXT:    v_readfirstlane_b32 s2, v1
+; GFX942-NEXT:    v_readfirstlane_b32 s3, v0
+; GFX942-NEXT:    s_mul_i32 s5, s0, s2
+; GFX942-NEXT:    s_mul_hi_u32 s15, s0, s3
+; GFX942-NEXT:    s_mul_i32 s14, s1, s3
+; GFX942-NEXT:    s_add_i32 s5, s15, s5
+; GFX942-NEXT:    s_add_i32 s5, s5, s14
+; GFX942-NEXT:    s_mul_i32 s16, s0, s3
+; GFX942-NEXT:    s_mul_hi_u32 s14, s3, s5
+; GFX942-NEXT:    s_mul_i32 s15, s3, s5
+; GFX942-NEXT:    s_mul_hi_u32 s3, s3, s16
+; GFX942-NEXT:    s_add_u32 s3, s3, s15
+; GFX942-NEXT:    s_addc_u32 s14, 0, s14
+; GFX942-NEXT:    s_mul_hi_u32 s17, s2, s16
+; GFX942-NEXT:    s_mul_i32 s16, s2, s16
+; GFX942-NEXT:    s_add_u32 s3, s3, s16
+; GFX942-NEXT:    s_mul_hi_u32 s15, s2, s5
+; GFX942-NEXT:    s_addc_u32 s3, s14, s17
+; GFX942-NEXT:    s_addc_u32 s14, s15, 0
+; GFX942-NEXT:    s_mul_i32 s5, s2, s5
+; GFX942-NEXT:    s_add_u32 s3, s3, s5
+; GFX942-NEXT:    s_addc_u32 s5, 0, s14
+; GFX942-NEXT:    v_add_co_u32_e32 v0, vcc, s3, v0
+; GFX942-NEXT:    s_cmp_lg_u64 vcc, 0
+; GFX942-NEXT:    s_addc_u32 s2, s2, s5
+; GFX942-NEXT:    v_readfirstlane_b32 s5, v0
+; GFX942-NEXT:    s_mul_i32 s3, s0, s2
+; GFX942-NEXT:    s_mul_hi_u32 s14, s0, s5
+; GFX942-NEXT:    s_add_i32 s3, s14, s3
+; GFX942-NEXT:    s_mul_i32 s1, s1, s5
+; GFX942-NEXT:    s_add_i32 s3, s3, s1
+; GFX942-NEXT:    s_mul_i32 s0, s0, s5
+; GFX942-NEXT:    s_mul_hi_u32 s14, s2, s0
+; GFX942-NEXT:    s_mul_i32 s15, s2, s0
+; GFX942-NEXT:    s_mul_i32 s17, s5, s3
+; GFX942-NEXT:    s_mul_hi_u32 s0, s5, s0
+; GFX942-NEXT:    s_mul_hi_u32 s16, s5, s3
+; GFX942-NEXT:    s_add_u32 s0, s0, s17
+; GFX942-NEXT:    s_addc_u32 s5, 0, s16
+; GFX942-NEXT:    s_add_u32 s0, s0, s15
+; GFX942-NEXT:    s_mul_hi_u32 s1, s2, s3
+; GFX942-NEXT:    s_addc_u32 s0, s5, s14
+; GFX942-NEXT:    s_addc_u32 s1, s1, 0
+; GFX942-NEXT:    s_mul_i32 s3, s2, s3
+; GFX942-NEXT:    s_add_u32 s0, s0, s3
+; GFX942-NEXT:    s_addc_u32 s1, 0, s1
+; GFX942-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v0
+; GFX942-NEXT:    s_cmp_lg_u64 vcc, 0
+; GFX942-NEXT:    s_addc_u32 s2, s2, s1
+; GFX942-NEXT:    s_ashr_i32 s14, s7, 31
+; GFX942-NEXT:    s_add_u32 s0, s6, s14
+; GFX942-NEXT:    s_mov_b32 s15, s14
+; GFX942-NEXT:    s_addc_u32 s1, s7, s14
+; GFX942-NEXT:    s_xor_b64 s[16:17], s[0:1], s[14:15]
+; GFX942-NEXT:    v_readfirstlane_b32 s3, v0
+; GFX942-NEXT:    s_mul_i32 s1, s16, s2
+; GFX942-NEXT:    s_mul_hi_u32 s5, s16, s3
+; GFX942-NEXT:    s_mul_hi_u32 s0, s16, s2
+; GFX942-NEXT:    s_add_u32 s1, s5, s1
+; GFX942-NEXT:    s_addc_u32 s0, 0, s0
+; GFX942-NEXT:    s_mul_hi_u32 s7, s17, s3
+; GFX942-NEXT:    s_mul_i32 s3, s17, s3
+; GFX942-NEXT:    s_add_u32 s1, s1, s3
+; GFX942-NEXT:    s_mul_hi_u32 s5, s17, s2
+; GFX942-NEXT:    s_addc_u32 s0, s0, s7
+; GFX942-NEXT:    s_addc_u32 s1, s5, 0
+; GFX942-NEXT:    s_mul_i32 s2, s17, s2
+; GFX942-NEXT:    s_add_u32 s0, s0, s2
+; GFX942-NEXT:    s_addc_u32 s1, 0, s1
+; GFX942-NEXT:    s_mul_i32 s1, s12, s1
+; GFX942-NEXT:    s_mul_hi_u32 s2, s12, s0
+; GFX942-NEXT:    s_add_i32 s1, s2, s1
+; GFX942-NEXT:    s_mul_i32 s2, s13, s0
+; GFX942-NEXT:    s_mul_i32 s0, s12, s0
+; GFX942-NEXT:    s_add_i32 s5, s1, s2
+; GFX942-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-NEXT:    s_sub_i32 s1, s17, s5
+; GFX942-NEXT:    v_sub_co_u32_e32 v0, vcc, s16, v0
+; GFX942-NEXT:    s_cmp_lg_u64 vcc, 0
+; GFX942-NEXT:    s_subb_u32 s7, s1, s13
+; GFX942-NEXT:    v_subrev_co_u32_e64 v1, s[0:1], s12, v0
+; GFX942-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX942-NEXT:    s_subb_u32 s15, s7, 0
+; GFX942-NEXT:    s_cmp_ge_u32 s15, s13
+; GFX942-NEXT:    s_cselect_b32 s16, -1, 0
+; GFX942-NEXT:    v_cmp_le_u32_e64 s[2:3], s12, v1
+; GFX942-NEXT:    s_cmp_eq_u32 s15, s13
+; GFX942-NEXT:    v_mov_b32_e32 v3, s16
+; GFX942-NEXT:    v_cndmask_b32_e64 v2, 0, -1, s[2:3]
+; GFX942-NEXT:    s_cselect_b64 s[2:3], -1, 0
+; GFX942-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX942-NEXT:    v_cndmask_b32_e64 v2, v3, v2, s[2:3]
+; GFX942-NEXT:    s_subb_u32 s2, s7, s13
+; GFX942-NEXT:    v_subrev_co_u32_e64 v3, s[0:1], s12, v1
+; GFX942-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX942-NEXT:    s_subb_u32 s2, s2, 0
+; GFX942-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v2
+; GFX942-NEXT:    v_mov_b32_e32 v2, s15
+; GFX942-NEXT:    s_cmp_lg_u64 vcc, 0
+; GFX942-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
+; GFX942-NEXT:    v_mov_b32_e32 v3, s2
+; GFX942-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[0:1]
+; GFX942-NEXT:    s_subb_u32 s0, s17, s5
+; GFX942-NEXT:    s_cmp_ge_u32 s0, s13
+; GFX942-NEXT:    s_cselect_b32 s1, -1, 0
+; GFX942-NEXT:    v_cmp_le_u32_e32 vcc, s12, v0
+; GFX942-NEXT:    s_cmp_eq_u32 s0, s13
+; GFX942-NEXT:    v_mov_b32_e32 v4, s1
+; GFX942-NEXT:    v_cndmask_b32_e64 v3, 0, -1, vcc
+; GFX942-NEXT:    s_cselect_b64 vcc, -1, 0
+; GFX942-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
+; GFX942-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
+; GFX942-NEXT:    v_mov_b32_e32 v4, s0
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX942-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
+; GFX942-NEXT:    v_xor_b32_e32 v0, s14, v0
+; GFX942-NEXT:    v_xor_b32_e32 v1, s14, v2
+; GFX942-NEXT:    v_mov_b32_e32 v2, s14
+; GFX942-NEXT:    v_subrev_co_u32_e32 v0, vcc, s14, v0
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v2, vcc
+; GFX942-NEXT:    s_cbranch_execnz .LBB8_3
+; GFX942-NEXT:  .LBB8_2:
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v0, s4
+; GFX942-NEXT:    s_sub_i32 s0, 0, s4
+; GFX942-NEXT:    s_mov_b32 s1, 0
+; GFX942-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX942-NEXT:    s_mul_i32 s0, s0, s2
+; GFX942-NEXT:    s_mul_hi_u32 s0, s2, s0
+; GFX942-NEXT:    s_add_i32 s2, s2, s0
+; GFX942-NEXT:    s_mul_hi_u32 s0, s6, s2
+; GFX942-NEXT:    s_mul_i32 s0, s0, s4
+; GFX942-NEXT:    s_sub_i32 s0, s6, s0
+; GFX942-NEXT:    s_sub_i32 s2, s0, s4
+; GFX942-NEXT:    s_cmp_ge_u32 s0, s4
+; GFX942-NEXT:    s_cselect_b32 s0, s2, s0
+; GFX942-NEXT:    s_sub_i32 s2, s0, s4
+; GFX942-NEXT:    s_cmp_ge_u32 s0, s4
+; GFX942-NEXT:    s_cselect_b32 s0, s2, s0
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-NEXT:  .LBB8_3:
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-NEXT:    global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX942-NEXT:    s_endpgm
+; GFX942-NEXT:  .LBB8_4:
+; GFX942-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; GFX942-NEXT:    s_branch .LBB8_2
+;
 ; TAHITI-LABEL: srem_i64:
 ; TAHITI:       ; %bb.0:
 ; TAHITI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x9
@@ -2604,6 +3127,23 @@ define amdgpu_kernel void @srem_i64_4(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GCN-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GCN-NEXT:    s_endpgm
 ;
+; GFX942-LABEL: srem_i64_4:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    global_load_dwordx2 v[2:3], v1, s[2:3]
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    v_ashrrev_i32_e32 v0, 31, v3
+; GFX942-NEXT:    v_lshrrev_b32_e32 v0, 30, v0
+; GFX942-NEXT:    v_lshl_add_u64 v[4:5], v[2:3], 0, v[0:1]
+; GFX942-NEXT:    v_and_b32_e32 v0, -4, v4
+; GFX942-NEXT:    v_sub_co_u32_e32 v2, vcc, v2, v0
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_subb_co_u32_e32 v3, vcc, v3, v5, vcc
+; GFX942-NEXT:    global_store_dwordx2 v1, v[2:3], s[0:1]
+; GFX942-NEXT:    s_endpgm
+;
 ; TAHITI-LABEL: srem_i64_4:
 ; TAHITI:       ; %bb.0:
 ; TAHITI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
@@ -3037,6 +3577,371 @@ define amdgpu_kernel void @srem_v2i64(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GCN-NEXT:  .LBB10_8:
 ; GCN-NEXT:    s_branch .LBB10_5
 ;
+; GFX942-LABEL: srem_v2i64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v8, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    global_load_dwordx4 v[0:3], v8, s[10:11] offset:16
+; GFX942-NEXT:    global_load_dwordx4 v[4:7], v8, s[10:11]
+; GFX942-NEXT:    s_waitcnt vmcnt(1)
+; GFX942-NEXT:    v_readfirstlane_b32 s11, v1
+; GFX942-NEXT:    v_readfirstlane_b32 s10, v0
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    v_readfirstlane_b32 s13, v5
+; GFX942-NEXT:    v_readfirstlane_b32 s12, v4
+; GFX942-NEXT:    s_or_b64 s[0:1], s[12:13], s[10:11]
+; GFX942-NEXT:    s_mov_b32 s0, 0
+; GFX942-NEXT:    v_readfirstlane_b32 s5, v3
+; GFX942-NEXT:    v_readfirstlane_b32 s4, v2
+; GFX942-NEXT:    v_readfirstlane_b32 s7, v7
+; GFX942-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX942-NEXT:    v_readfirstlane_b32 s6, v6
+; GFX942-NEXT:    s_cbranch_scc0 .LBB10_7
+; GFX942-NEXT:  ; %bb.1:
+; GFX942-NEXT:    s_ashr_i32 s0, s11, 31
+; GFX942-NEXT:    s_add_u32 s2, s10, s0
+; GFX942-NEXT:    s_mov_b32 s1, s0
+; GFX942-NEXT:    s_addc_u32 s3, s11, s0
+; GFX942-NEXT:    s_xor_b64 s[16:17], s[2:3], s[0:1]
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v0, s16
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v1, s17
+; GFX942-NEXT:    s_sub_u32 s0, 0, s16
+; GFX942-NEXT:    s_subb_u32 s1, 0, s17
+; GFX942-NEXT:    v_fmamk_f32 v0, v1, 0x4f800000, v0
+; GFX942-NEXT:    v_rcp_f32_e32 v0, v0
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
+; GFX942-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
+; GFX942-NEXT:    v_trunc_f32_e32 v1, v1
+; GFX942-NEXT:    v_fmamk_f32 v0, v1, 0xcf800000, v0
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX942-NEXT:    v_readfirstlane_b32 s2, v1
+; GFX942-NEXT:    v_readfirstlane_b32 s3, v0
+; GFX942-NEXT:    s_mul_i32 s11, s0, s2
+; GFX942-NEXT:    s_mul_hi_u32 s19, s0, s3
+; GFX942-NEXT:    s_mul_i32 s18, s1, s3
+; GFX942-NEXT:    s_add_i32 s11, s19, s11
+; GFX942-NEXT:    s_add_i32 s11, s11, s18
+; GFX942-NEXT:    s_mul_i32 s20, s0, s3
+; GFX942-NEXT:    s_mul_hi_u32 s18, s3, s11
+; GFX942-NEXT:    s_mul_i32 s19, s3, s11
+; GFX942-NEXT:    s_mul_hi_u32 s3, s3, s20
+; GFX942-NEXT:    s_add_u32 s3, s3, s19
+; GFX942-NEXT:    s_addc_u32 s18, 0, s18
+; GFX942-NEXT:    s_mul_hi_u32 s21, s2, s20
+; GFX942-NEXT:    s_mul_i32 s20, s2, s20
+; GFX942-NEXT:    s_add_u32 s3, s3, s20
+; GFX942-NEXT:    s_mul_hi_u32 s19, s2, s11
+; GFX942-NEXT:    s_addc_u32 s3, s18, s21
+; GFX942-NEXT:    s_addc_u32 s18, s19, 0
+; GFX942-NEXT:    s_mul_i32 s11, s2, s11
+; GFX942-NEXT:    s_add_u32 s3, s3, s11
+; GFX942-NEXT:    s_addc_u32 s11, 0, s18
+; GFX942-NEXT:    v_add_co_u32_e32 v0, vcc, s3, v0
+; GFX942-NEXT:    s_cmp_lg_u64 vcc, 0
+; GFX942-NEXT:    s_addc_u32 s2, s2, s11
+; GFX942-NEXT:    v_readfirstlane_b32 s11, v0
+; GFX942-NEXT:    s_mul_i32 s3, s0, s2
+; GFX942-NEXT:    s_mul_hi_u32 s18, s0, s11
+; GFX942-NEXT:    s_add_i32 s3, s18, s3
+; GFX942-NEXT:    s_mul_i32 s1, s1, s11
+; GFX942-NEXT:    s_add_i32 s3, s3, s1
+; GFX942-NEXT:    s_mul_i32 s0, s0, s11
+; GFX942-NEXT:    s_mul_hi_u32 s18, s2, s0
+; GFX942-NEXT:    s_mul_i32 s19, s2, s0
+; GFX942-NEXT:    s_mul_i32 s21, s11, s3
+; GFX942-NEXT:    s_mul_hi_u32 s0, s11, s0
+; GFX942-NEXT:    s_mul_hi_u32 s20, s11, s3
+; GFX942-NEXT:    s_add_u32 s0, s0, s21
+; GFX942-NEXT:    s_addc_u32 s11, 0, s20
+; GFX942-NEXT:    s_add_u32 s0, s0, s19
+; GFX942-NEXT:    s_mul_hi_u32 s1, s2, s3
+; GFX942-NEXT:    s_addc_u32 s0, s11, s18
+; GFX942-NEXT:    s_addc_u32 s1, s1, 0
+; GFX942-NEXT:    s_mul_i32 s3, s2, s3
+; GFX942-NEXT:    s_add_u32 s0, s0, s3
+; GFX942-NEXT:    s_addc_u32 s1, 0, s1
+; GFX942-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v0
+; GFX942-NEXT:    s_cmp_lg_u64 vcc, 0
+; GFX942-NEXT:    s_addc_u32 s2, s2, s1
+; GFX942-NEXT:    s_ashr_i32 s18, s13, 31
+; GFX942-NEXT:    s_add_u32 s0, s12, s18
+; GFX942-NEXT:    s_mov_b32 s19, s18
+; GFX942-NEXT:    s_addc_u32 s1, s13, s18
+; GFX942-NEXT:    s_xor_b64 s[20:21], s[0:1], s[18:19]
+; GFX942-NEXT:    v_readfirstlane_b32 s3, v0
+; GFX942-NEXT:    s_mul_i32 s1, s20, s2
+; GFX942-NEXT:    s_mul_hi_u32 s11, s20, s3
+; GFX942-NEXT:    s_mul_hi_u32 s0, s20, s2
+; GFX942-NEXT:    s_add_u32 s1, s11, s1
+; GFX942-NEXT:    s_addc_u32 s0, 0, s0
+; GFX942-NEXT:    s_mul_hi_u32 s13, s21, s3
+; GFX942-NEXT:    s_mul_i32 s3, s21, s3
+; GFX942-NEXT:    s_add_u32 s1, s1, s3
+; GFX942-NEXT:    s_mul_hi_u32 s11, s21, s2
+; GFX942-NEXT:    s_addc_u32 s0, s0, s13
+; GFX942-NEXT:    s_addc_u32 s1, s11, 0
+; GFX942-NEXT:    s_mul_i32 s2, s21, s2
+; GFX942-NEXT:    s_add_u32 s0, s0, s2
+; GFX942-NEXT:    s_addc_u32 s1, 0, s1
+; GFX942-NEXT:    s_mul_i32 s1, s16, s1
+; GFX942-NEXT:    s_mul_hi_u32 s2, s16, s0
+; GFX942-NEXT:    s_add_i32 s1, s2, s1
+; GFX942-NEXT:    s_mul_i32 s2, s17, s0
+; GFX942-NEXT:    s_mul_i32 s0, s16, s0
+; GFX942-NEXT:    s_add_i32 s11, s1, s2
+; GFX942-NEXT:    v_mov_b32_e32 v0, s0
+; GFX942-NEXT:    s_sub_i32 s1, s21, s11
+; GFX942-NEXT:    v_sub_co_u32_e32 v0, vcc, s20, v0
+; GFX942-NEXT:    s_cmp_lg_u64 vcc, 0
+; GFX942-NEXT:    s_subb_u32 s13, s1, s17
+; GFX942-NEXT:    v_subrev_co_u32_e64 v1, s[0:1], s16, v0
+; GFX942-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX942-NEXT:    s_subb_u32 s19, s13, 0
+; GFX942-NEXT:    s_cmp_ge_u32 s19, s17
+; GFX942-NEXT:    s_cselect_b32 s20, -1, 0
+; GFX942-NEXT:    v_cmp_le_u32_e64 s[2:3], s16, v1
+; GFX942-NEXT:    s_cmp_eq_u32 s19, s17
+; GFX942-NEXT:    v_mov_b32_e32 v3, s20
+; GFX942-NEXT:    v_cndmask_b32_e64 v2, 0, -1, s[2:3]
+; GFX942-NEXT:    s_cselect_b64 s[2:3], -1, 0
+; GFX942-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX942-NEXT:    v_cndmask_b32_e64 v2, v3, v2, s[2:3]
+; GFX942-NEXT:    s_subb_u32 s2, s13, s17
+; GFX942-NEXT:    v_subrev_co_u32_e64 v3, s[0:1], s16, v1
+; GFX942-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX942-NEXT:    s_subb_u32 s2, s2, 0
+; GFX942-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v2
+; GFX942-NEXT:    v_mov_b32_e32 v2, s19
+; GFX942-NEXT:    s_cmp_lg_u64 vcc, 0
+; GFX942-NEXT:    v_cndmask_b32_e64 v1, v1, v3, s[0:1]
+; GFX942-NEXT:    v_mov_b32_e32 v3, s2
+; GFX942-NEXT:    v_cndmask_b32_e64 v2, v2, v3, s[0:1]
+; GFX942-NEXT:    s_subb_u32 s0, s21, s11
+; GFX942-NEXT:    s_cmp_ge_u32 s0, s17
+; GFX942-NEXT:    s_cselect_b32 s1, -1, 0
+; GFX942-NEXT:    v_cmp_le_u32_e32 vcc, s16, v0
+; GFX942-NEXT:    s_cmp_eq_u32 s0, s17
+; GFX942-NEXT:    v_mov_b32_e32 v4, s1
+; GFX942-NEXT:    v_cndmask_b32_e64 v3, 0, -1, vcc
+; GFX942-NEXT:    s_cselect_b64 vcc, -1, 0
+; GFX942-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
+; GFX942-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
+; GFX942-NEXT:    v_mov_b32_e32 v4, s0
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX942-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc
+; GFX942-NEXT:    v_xor_b32_e32 v0, s18, v0
+; GFX942-NEXT:    v_xor_b32_e32 v1, s18, v2
+; GFX942-NEXT:    v_mov_b32_e32 v2, s18
+; GFX942-NEXT:    v_subrev_co_u32_e32 v0, vcc, s18, v0
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v2, vcc
+; GFX942-NEXT:    s_cbranch_execnz .LBB10_3
+; GFX942-NEXT:  .LBB10_2:
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v0, s10
+; GFX942-NEXT:    s_sub_i32 s0, 0, s10
+; GFX942-NEXT:    s_mov_b32 s1, 0
+; GFX942-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX942-NEXT:    s_mul_i32 s0, s0, s2
+; GFX942-NEXT:    s_mul_hi_u32 s0, s2, s0
+; GFX942-NEXT:    s_add_i32 s2, s2, s0
+; GFX942-NEXT:    s_mul_hi_u32 s0, s12, s2
+; GFX942-NEXT:    s_mul_i32 s0, s0, s10
+; GFX942-NEXT:    s_sub_i32 s0, s12, s0
+; GFX942-NEXT:    s_sub_i32 s2, s0, s10
+; GFX942-NEXT:    s_cmp_ge_u32 s0, s10
+; GFX942-NEXT:    s_cselect_b32 s0, s2, s0
+; GFX942-NEXT:    s_sub_i32 s2, s0, s10
+; GFX942-NEXT:    s_cmp_ge_u32 s0, s10
+; GFX942-NEXT:    s_cselect_b32 s0, s2, s0
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
+; GFX942-NEXT:  .LBB10_3:
+; GFX942-NEXT:    s_or_b64 s[0:1], s[6:7], s[4:5]
+; GFX942-NEXT:    s_mov_b32 s0, 0
+; GFX942-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX942-NEXT:    s_cbranch_scc0 .LBB10_8
+; GFX942-NEXT:  ; %bb.4:
+; GFX942-NEXT:    s_ashr_i32 s0, s5, 31
+; GFX942-NEXT:    s_add_u32 s2, s4, s0
+; GFX942-NEXT:    s_mov_b32 s1, s0
+; GFX942-NEXT:    s_addc_u32 s3, s5, s0
+; GFX942-NEXT:    s_xor_b64 s[12:13], s[2:3], s[0:1]
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v2, s12
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v3, s13
+; GFX942-NEXT:    s_sub_u32 s0, 0, s12
+; GFX942-NEXT:    s_subb_u32 s1, 0, s13
+; GFX942-NEXT:    v_fmamk_f32 v2, v3, 0x4f800000, v2
+; GFX942-NEXT:    v_rcp_f32_e32 v2, v2
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_mul_f32_e32 v2, 0x5f7ffffc, v2
+; GFX942-NEXT:    v_mul_f32_e32 v3, 0x2f800000, v2
+; GFX942-NEXT:    v_trunc_f32_e32 v3, v3
+; GFX942-NEXT:    v_fmamk_f32 v2, v3, 0xcf800000, v2
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v3, v3
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; GFX942-NEXT:    v_readfirstlane_b32 s2, v3
+; GFX942-NEXT:    v_readfirstlane_b32 s3, v2
+; GFX942-NEXT:    s_mul_i32 s5, s0, s2
+; GFX942-NEXT:    s_mul_hi_u32 s15, s0, s3
+; GFX942-NEXT:    s_mul_i32 s14, s1, s3
+; GFX942-NEXT:    s_add_i32 s5, s15, s5
+; GFX942-NEXT:    s_add_i32 s5, s5, s14
+; GFX942-NEXT:    s_mul_i32 s16, s0, s3
+; GFX942-NEXT:    s_mul_hi_u32 s14, s3, s5
+; GFX942-NEXT:    s_mul_i32 s15, s3, s5
+; GFX942-NEXT:    s_mul_hi_u32 s3, s3, s16
+; GFX942-NEXT:    s_add_u32 s3, s3, s15
+; GFX942-NEXT:    s_addc_u32 s14, 0, s14
+; GFX942-NEXT:    s_mul_hi_u32 s17, s2, s16
+; GFX942-NEXT:    s_mul_i32 s16, s2, s16
+; GFX942-NEXT:    s_add_u32 s3, s3, s16
+; GFX942-NEXT:    s_mul_hi_u32 s15, s2, s5
+; GFX942-NEXT:    s_addc_u32 s3, s14, s17
+; GFX942-NEXT:    s_addc_u32 s14, s15, 0
+; GFX942-NEXT:    s_mul_i32 s5, s2, s5
+; GFX942-NEXT:    s_add_u32 s3, s3, s5
+; GFX942-NEXT:    s_addc_u32 s5, 0, s14
+; GFX942-NEXT:    v_add_co_u32_e32 v2, vcc, s3, v2
+; GFX942-NEXT:    s_cmp_lg_u64 vcc, 0
+; GFX942-NEXT:    s_addc_u32 s2, s2, s5
+; GFX942-NEXT:    v_readfirstlane_b32 s5, v2
+; GFX942-NEXT:    s_mul_i32 s3, s0, s2
+; GFX942-NEXT:    s_mul_hi_u32 s14, s0, s5
+; GFX942-NEXT:    s_add_i32 s3, s14, s3
+; GFX942-NEXT:    s_mul_i32 s1, s1, s5
+; GFX942-NEXT:    s_add_i32 s3, s3, s1
+; GFX942-NEXT:    s_mul_i32 s0, s0, s5
+; GFX942-NEXT:    s_mul_hi_u32 s14, s2, s0
+; GFX942-NEXT:    s_mul_i32 s15, s2, s0
+; GFX942-NEXT:    s_mul_i32 s17, s5, s3
+; GFX942-NEXT:    s_mul_hi_u32 s0, s5, s0
+; GFX942-NEXT:    s_mul_hi_u32 s16, s5, s3
+; GFX942-NEXT:    s_add_u32 s0, s0, s17
+; GFX942-NEXT:    s_addc_u32 s5, 0, s16
+; GFX942-NEXT:    s_add_u32 s0, s0, s15
+; GFX942-NEXT:    s_mul_hi_u32 s1, s2, s3
+; GFX942-NEXT:    s_addc_u32 s0, s5, s14
+; GFX942-NEXT:    s_addc_u32 s1, s1, 0
+; GFX942-NEXT:    s_mul_i32 s3, s2, s3
+; GFX942-NEXT:    s_add_u32 s0, s0, s3
+; GFX942-NEXT:    s_addc_u32 s1, 0, s1
+; GFX942-NEXT:    v_add_co_u32_e32 v2, vcc, s0, v2
+; GFX942-NEXT:    s_cmp_lg_u64 vcc, 0
+; GFX942-NEXT:    s_addc_u32 s2, s2, s1
+; GFX942-NEXT:    s_ashr_i32 s14, s7, 31
+; GFX942-NEXT:    s_add_u32 s0, s6, s14
+; GFX942-NEXT:    s_mov_b32 s15, s14
+; GFX942-NEXT:    s_addc_u32 s1, s7, s14
+; GFX942-NEXT:    s_xor_b64 s[16:17], s[0:1], s[14:15]
+; GFX942-NEXT:    v_readfirstlane_b32 s3, v2
+; GFX942-NEXT:    s_mul_i32 s1, s16, s2
+; GFX942-NEXT:    s_mul_hi_u32 s5, s16, s3
+; GFX942-NEXT:    s_mul_hi_u32 s0, s16, s2
+; GFX942-NEXT:    s_add_u32 s1, s5, s1
+; GFX942-NEXT:    s_addc_u32 s0, 0, s0
+; GFX942-NEXT:    s_mul_hi_u32 s7, s17, s3
+; GFX942-NEXT:    s_mul_i32 s3, s17, s3
+; GFX942-NEXT:    s_add_u32 s1, s1, s3
+; GFX942-NEXT:    s_mul_hi_u32 s5, s17, s2
+; GFX942-NEXT:    s_addc_u32 s0, s0, s7
+; GFX942-NEXT:    s_addc_u32 s1, s5, 0
+; GFX942-NEXT:    s_mul_i32 s2, s17, s2
+; GFX942-NEXT:    s_add_u32 s0, s0, s2
+; GFX942-NEXT:    s_addc_u32 s1, 0, s1
+; GFX942-NEXT:    s_mul_i32 s1, s12, s1
+; GFX942-NEXT:    s_mul_hi_u32 s2, s12, s0
+; GFX942-NEXT:    s_add_i32 s1, s2, s1
+; GFX942-NEXT:    s_mul_i32 s2, s13, s0
+; GFX942-NEXT:    s_mul_i32 s0, s12, s0
+; GFX942-NEXT:    s_add_i32 s5, s1, s2
+; GFX942-NEXT:    v_mov_b32_e32 v2, s0
+; GFX942-NEXT:    s_sub_i32 s1, s17, s5
+; GFX942-NEXT:    v_sub_co_u32_e32 v2, vcc, s16, v2
+; GFX942-NEXT:    s_cmp_lg_u64 vcc, 0
+; GFX942-NEXT:    s_subb_u32 s7, s1, s13
+; GFX942-NEXT:    v_subrev_co_u32_e64 v3, s[0:1], s12, v2
+; GFX942-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX942-NEXT:    s_subb_u32 s15, s7, 0
+; GFX942-NEXT:    s_cmp_ge_u32 s15, s13
+; GFX942-NEXT:    s_cselect_b32 s16, -1, 0
+; GFX942-NEXT:    v_cmp_le_u32_e64 s[2:3], s12, v3
+; GFX942-NEXT:    s_cmp_eq_u32 s15, s13
+; GFX942-NEXT:    v_mov_b32_e32 v5, s16
+; GFX942-NEXT:    v_cndmask_b32_e64 v4, 0, -1, s[2:3]
+; GFX942-NEXT:    s_cselect_b64 s[2:3], -1, 0
+; GFX942-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX942-NEXT:    v_cndmask_b32_e64 v4, v5, v4, s[2:3]
+; GFX942-NEXT:    s_subb_u32 s2, s7, s13
+; GFX942-NEXT:    v_subrev_co_u32_e64 v5, s[0:1], s12, v3
+; GFX942-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX942-NEXT:    s_subb_u32 s2, s2, 0
+; GFX942-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v4
+; GFX942-NEXT:    v_mov_b32_e32 v4, s15
+; GFX942-NEXT:    s_cmp_lg_u64 vcc, 0
+; GFX942-NEXT:    v_cndmask_b32_e64 v3, v3, v5, s[0:1]
+; GFX942-NEXT:    v_mov_b32_e32 v5, s2
+; GFX942-NEXT:    v_cndmask_b32_e64 v4, v4, v5, s[0:1]
+; GFX942-NEXT:    s_subb_u32 s0, s17, s5
+; GFX942-NEXT:    s_cmp_ge_u32 s0, s13
+; GFX942-NEXT:    s_cselect_b32 s1, -1, 0
+; GFX942-NEXT:    v_cmp_le_u32_e32 vcc, s12, v2
+; GFX942-NEXT:    s_cmp_eq_u32 s0, s13
+; GFX942-NEXT:    v_mov_b32_e32 v6, s1
+; GFX942-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
+; GFX942-NEXT:    s_cselect_b64 vcc, -1, 0
+; GFX942-NEXT:    v_cndmask_b32_e32 v5, v6, v5, vcc
+; GFX942-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
+; GFX942-NEXT:    v_mov_b32_e32 v6, s0
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
+; GFX942-NEXT:    v_cndmask_b32_e32 v4, v6, v4, vcc
+; GFX942-NEXT:    v_xor_b32_e32 v2, s14, v2
+; GFX942-NEXT:    v_xor_b32_e32 v3, s14, v4
+; GFX942-NEXT:    v_mov_b32_e32 v4, s14
+; GFX942-NEXT:    v_subrev_co_u32_e32 v2, vcc, s14, v2
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_subb_co_u32_e32 v3, vcc, v3, v4, vcc
+; GFX942-NEXT:    s_cbranch_execnz .LBB10_6
+; GFX942-NEXT:  .LBB10_5:
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v2, s4
+; GFX942-NEXT:    s_sub_i32 s0, 0, s4
+; GFX942-NEXT:    v_rcp_iflag_f32_e32 v2, v2
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v2
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; GFX942-NEXT:    v_mul_lo_u32 v3, s0, v2
+; GFX942-NEXT:    v_mul_hi_u32 v3, v2, v3
+; GFX942-NEXT:    v_add_u32_e32 v2, v2, v3
+; GFX942-NEXT:    v_mul_hi_u32 v2, s6, v2
+; GFX942-NEXT:    v_mul_lo_u32 v2, v2, s4
+; GFX942-NEXT:    v_sub_u32_e32 v2, s6, v2
+; GFX942-NEXT:    v_subrev_u32_e32 v3, s4, v2
+; GFX942-NEXT:    v_cmp_le_u32_e32 vcc, s4, v2
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
+; GFX942-NEXT:    v_subrev_u32_e32 v3, s4, v2
+; GFX942-NEXT:    v_cmp_le_u32_e32 vcc, s4, v2
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc
+; GFX942-NEXT:    v_mov_b32_e32 v3, 0
+; GFX942-NEXT:  .LBB10_6:
+; GFX942-NEXT:    v_mov_b32_e32 v4, 0
+; GFX942-NEXT:    global_store_dwordx4 v4, v[0:3], s[8:9]
+; GFX942-NEXT:    s_endpgm
+; GFX942-NEXT:  .LBB10_7:
+; GFX942-NEXT:    ; implicit-def: $vgpr0_vgpr1
+; GFX942-NEXT:    s_branch .LBB10_2
+; GFX942-NEXT:  .LBB10_8:
+; GFX942-NEXT:    s_branch .LBB10_5
+;
 ; TAHITI-LABEL: srem_v2i64:
 ; TAHITI:       ; %bb.0:
 ; TAHITI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x9
@@ -4755,6 +5660,30 @@ define amdgpu_kernel void @srem_v2i64_4(ptr addrspace(1) %out, ptr addrspace(1)
 ; GCN-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
 ; GCN-NEXT:    s_endpgm
 ;
+; GFX942-LABEL: srem_v2i64_4:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v5, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    global_load_dwordx4 v[0:3], v5, s[2:3]
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    v_ashrrev_i32_e32 v4, 31, v1
+; GFX942-NEXT:    v_lshrrev_b32_e32 v4, 30, v4
+; GFX942-NEXT:    v_ashrrev_i32_e32 v8, 31, v3
+; GFX942-NEXT:    v_lshl_add_u64 v[6:7], v[0:1], 0, v[4:5]
+; GFX942-NEXT:    v_lshrrev_b32_e32 v4, 30, v8
+; GFX942-NEXT:    v_and_b32_e32 v6, -4, v6
+; GFX942-NEXT:    v_lshl_add_u64 v[8:9], v[2:3], 0, v[4:5]
+; GFX942-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v6
+; GFX942-NEXT:    v_and_b32_e32 v4, -4, v8
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v7, vcc
+; GFX942-NEXT:    v_sub_co_u32_e32 v2, vcc, v2, v4
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_subb_co_u32_e32 v3, vcc, v3, v9, vcc
+; GFX942-NEXT:    global_store_dwordx4 v5, v[0:3], s[0:1]
+; GFX942-NEXT:    s_endpgm
+;
 ; TAHITI-LABEL: srem_v2i64_4:
 ; TAHITI:       ; %bb.0:
 ; TAHITI-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x9
@@ -5484,6 +6413,693 @@ define amdgpu_kernel void @srem_v4i64(ptr addrspace(1) %out, ptr addrspace(1) %i
 ; GCN-NEXT:  .LBB12_16:
 ; GCN-NEXT:    s_branch .LBB12_11
 ;
+; GFX942-LABEL: srem_v4i64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v8, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    global_load_dwordx4 v[10:13], v8, s[10:11] offset:32
+; GFX942-NEXT:    global_load_dwordx4 v[14:17], v8, s[10:11]
+; GFX942-NEXT:    global_load_dwordx4 v[0:3], v8, s[10:11] offset:48
+; GFX942-NEXT:    global_load_dwordx4 v[4:7], v8, s[10:11] offset:16
+; GFX942-NEXT:    s_waitcnt vmcnt(3)
+; GFX942-NEXT:    v_readfirstlane_b32 s5, v11
+; GFX942-NEXT:    v_readfirstlane_b32 s4, v10
+; GFX942-NEXT:    s_waitcnt vmcnt(2)
+; GFX942-NEXT:    v_readfirstlane_b32 s7, v15
+; GFX942-NEXT:    v_readfirstlane_b32 s6, v14
+; GFX942-NEXT:    s_or_b64 s[0:1], s[6:7], s[4:5]
+; GFX942-NEXT:    s_mov_b32 s0, 0
+; GFX942-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX942-NEXT:    s_cbranch_scc0 .LBB12_13
+; GFX942-NEXT:  ; %bb.1:
+; GFX942-NEXT:    s_ashr_i32 s0, s5, 31
+; GFX942-NEXT:    s_add_u32 s2, s4, s0
+; GFX942-NEXT:    s_mov_b32 s1, s0
+; GFX942-NEXT:    s_addc_u32 s3, s5, s0
+; GFX942-NEXT:    s_xor_b64 s[12:13], s[2:3], s[0:1]
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v8, s12
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v9, s13
+; GFX942-NEXT:    s_sub_u32 s0, 0, s12
+; GFX942-NEXT:    s_subb_u32 s1, 0, s13
+; GFX942-NEXT:    v_fmamk_f32 v8, v9, 0x4f800000, v8
+; GFX942-NEXT:    v_rcp_f32_e32 v8, v8
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_mul_f32_e32 v8, 0x5f7ffffc, v8
+; GFX942-NEXT:    v_mul_f32_e32 v9, 0x2f800000, v8
+; GFX942-NEXT:    v_trunc_f32_e32 v9, v9
+; GFX942-NEXT:    v_fmamk_f32 v8, v9, 0xcf800000, v8
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v9, v9
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v8, v8
+; GFX942-NEXT:    v_readfirstlane_b32 s2, v9
+; GFX942-NEXT:    v_readfirstlane_b32 s3, v8
+; GFX942-NEXT:    s_mul_i32 s5, s0, s2
+; GFX942-NEXT:    s_mul_hi_u32 s15, s0, s3
+; GFX942-NEXT:    s_mul_i32 s14, s1, s3
+; GFX942-NEXT:    s_add_i32 s5, s15, s5
+; GFX942-NEXT:    s_add_i32 s5, s5, s14
+; GFX942-NEXT:    s_mul_i32 s16, s0, s3
+; GFX942-NEXT:    s_mul_hi_u32 s14, s3, s5
+; GFX942-NEXT:    s_mul_i32 s15, s3, s5
+; GFX942-NEXT:    s_mul_hi_u32 s3, s3, s16
+; GFX942-NEXT:    s_add_u32 s3, s3, s15
+; GFX942-NEXT:    s_addc_u32 s14, 0, s14
+; GFX942-NEXT:    s_mul_hi_u32 s17, s2, s16
+; GFX942-NEXT:    s_mul_i32 s16, s2, s16
+; GFX942-NEXT:    s_add_u32 s3, s3, s16
+; GFX942-NEXT:    s_mul_hi_u32 s15, s2, s5
+; GFX942-NEXT:    s_addc_u32 s3, s14, s17
+; GFX942-NEXT:    s_addc_u32 s14, s15, 0
+; GFX942-NEXT:    s_mul_i32 s5, s2, s5
+; GFX942-NEXT:    s_add_u32 s3, s3, s5
+; GFX942-NEXT:    s_addc_u32 s5, 0, s14
+; GFX942-NEXT:    v_add_co_u32_e32 v8, vcc, s3, v8
+; GFX942-NEXT:    s_cmp_lg_u64 vcc, 0
+; GFX942-NEXT:    s_addc_u32 s2, s2, s5
+; GFX942-NEXT:    v_readfirstlane_b32 s5, v8
+; GFX942-NEXT:    s_mul_i32 s3, s0, s2
+; GFX942-NEXT:    s_mul_hi_u32 s14, s0, s5
+; GFX942-NEXT:    s_add_i32 s3, s14, s3
+; GFX942-NEXT:    s_mul_i32 s1, s1, s5
+; GFX942-NEXT:    s_add_i32 s3, s3, s1
+; GFX942-NEXT:    s_mul_i32 s0, s0, s5
+; GFX942-NEXT:    s_mul_hi_u32 s14, s2, s0
+; GFX942-NEXT:    s_mul_i32 s15, s2, s0
+; GFX942-NEXT:    s_mul_i32 s17, s5, s3
+; GFX942-NEXT:    s_mul_hi_u32 s0, s5, s0
+; GFX942-NEXT:    s_mul_hi_u32 s16, s5, s3
+; GFX942-NEXT:    s_add_u32 s0, s0, s17
+; GFX942-NEXT:    s_addc_u32 s5, 0, s16
+; GFX942-NEXT:    s_add_u32 s0, s0, s15
+; GFX942-NEXT:    s_mul_hi_u32 s1, s2, s3
+; GFX942-NEXT:    s_addc_u32 s0, s5, s14
+; GFX942-NEXT:    s_addc_u32 s1, s1, 0
+; GFX942-NEXT:    s_mul_i32 s3, s2, s3
+; GFX942-NEXT:    s_add_u32 s0, s0, s3
+; GFX942-NEXT:    s_addc_u32 s1, 0, s1
+; GFX942-NEXT:    v_add_co_u32_e32 v8, vcc, s0, v8
+; GFX942-NEXT:    s_cmp_lg_u64 vcc, 0
+; GFX942-NEXT:    s_addc_u32 s2, s2, s1
+; GFX942-NEXT:    s_ashr_i32 s14, s7, 31
+; GFX942-NEXT:    s_add_u32 s0, s6, s14
+; GFX942-NEXT:    s_mov_b32 s15, s14
+; GFX942-NEXT:    s_addc_u32 s1, s7, s14
+; GFX942-NEXT:    s_xor_b64 s[16:17], s[0:1], s[14:15]
+; GFX942-NEXT:    v_readfirstlane_b32 s3, v8
+; GFX942-NEXT:    s_mul_i32 s1, s16, s2
+; GFX942-NEXT:    s_mul_hi_u32 s5, s16, s3
+; GFX942-NEXT:    s_mul_hi_u32 s0, s16, s2
+; GFX942-NEXT:    s_add_u32 s1, s5, s1
+; GFX942-NEXT:    s_addc_u32 s0, 0, s0
+; GFX942-NEXT:    s_mul_hi_u32 s7, s17, s3
+; GFX942-NEXT:    s_mul_i32 s3, s17, s3
+; GFX942-NEXT:    s_add_u32 s1, s1, s3
+; GFX942-NEXT:    s_mul_hi_u32 s5, s17, s2
+; GFX942-NEXT:    s_addc_u32 s0, s0, s7
+; GFX942-NEXT:    s_addc_u32 s1, s5, 0
+; GFX942-NEXT:    s_mul_i32 s2, s17, s2
+; GFX942-NEXT:    s_add_u32 s0, s0, s2
+; GFX942-NEXT:    s_addc_u32 s1, 0, s1
+; GFX942-NEXT:    s_mul_i32 s1, s12, s1
+; GFX942-NEXT:    s_mul_hi_u32 s2, s12, s0
+; GFX942-NEXT:    s_add_i32 s1, s2, s1
+; GFX942-NEXT:    s_mul_i32 s2, s13, s0
+; GFX942-NEXT:    s_mul_i32 s0, s12, s0
+; GFX942-NEXT:    s_add_i32 s5, s1, s2
+; GFX942-NEXT:    v_mov_b32_e32 v8, s0
+; GFX942-NEXT:    s_sub_i32 s1, s17, s5
+; GFX942-NEXT:    v_sub_co_u32_e32 v8, vcc, s16, v8
+; GFX942-NEXT:    s_cmp_lg_u64 vcc, 0
+; GFX942-NEXT:    s_subb_u32 s7, s1, s13
+; GFX942-NEXT:    v_subrev_co_u32_e64 v9, s[0:1], s12, v8
+; GFX942-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX942-NEXT:    s_subb_u32 s15, s7, 0
+; GFX942-NEXT:    s_cmp_ge_u32 s15, s13
+; GFX942-NEXT:    s_cselect_b32 s16, -1, 0
+; GFX942-NEXT:    v_cmp_le_u32_e64 s[2:3], s12, v9
+; GFX942-NEXT:    s_cmp_eq_u32 s15, s13
+; GFX942-NEXT:    v_mov_b32_e32 v11, s16
+; GFX942-NEXT:    v_cndmask_b32_e64 v10, 0, -1, s[2:3]
+; GFX942-NEXT:    s_cselect_b64 s[2:3], -1, 0
+; GFX942-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX942-NEXT:    v_cndmask_b32_e64 v10, v11, v10, s[2:3]
+; GFX942-NEXT:    s_subb_u32 s2, s7, s13
+; GFX942-NEXT:    v_subrev_co_u32_e64 v11, s[0:1], s12, v9
+; GFX942-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX942-NEXT:    s_subb_u32 s2, s2, 0
+; GFX942-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v10
+; GFX942-NEXT:    v_mov_b32_e32 v10, s15
+; GFX942-NEXT:    s_cmp_lg_u64 vcc, 0
+; GFX942-NEXT:    v_cndmask_b32_e64 v9, v9, v11, s[0:1]
+; GFX942-NEXT:    v_mov_b32_e32 v11, s2
+; GFX942-NEXT:    v_cndmask_b32_e64 v10, v10, v11, s[0:1]
+; GFX942-NEXT:    s_subb_u32 s0, s17, s5
+; GFX942-NEXT:    s_cmp_ge_u32 s0, s13
+; GFX942-NEXT:    s_cselect_b32 s1, -1, 0
+; GFX942-NEXT:    v_cmp_le_u32_e32 vcc, s12, v8
+; GFX942-NEXT:    s_cmp_eq_u32 s0, s13
+; GFX942-NEXT:    v_mov_b32_e32 v14, s1
+; GFX942-NEXT:    v_cndmask_b32_e64 v11, 0, -1, vcc
+; GFX942-NEXT:    s_cselect_b64 vcc, -1, 0
+; GFX942-NEXT:    v_cndmask_b32_e32 v11, v14, v11, vcc
+; GFX942-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v11
+; GFX942-NEXT:    v_mov_b32_e32 v14, s0
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_cndmask_b32_e32 v8, v8, v9, vcc
+; GFX942-NEXT:    v_cndmask_b32_e32 v10, v14, v10, vcc
+; GFX942-NEXT:    v_xor_b32_e32 v8, s14, v8
+; GFX942-NEXT:    v_xor_b32_e32 v9, s14, v10
+; GFX942-NEXT:    v_mov_b32_e32 v10, s14
+; GFX942-NEXT:    v_subrev_co_u32_e32 v8, vcc, s14, v8
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_subb_co_u32_e32 v9, vcc, v9, v10, vcc
+; GFX942-NEXT:    s_cbranch_execnz .LBB12_3
+; GFX942-NEXT:  .LBB12_2:
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v8, s4
+; GFX942-NEXT:    s_sub_i32 s0, 0, s4
+; GFX942-NEXT:    s_mov_b32 s1, 0
+; GFX942-NEXT:    v_rcp_iflag_f32_e32 v8, v8
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_mul_f32_e32 v8, 0x4f7ffffe, v8
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v8, v8
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_readfirstlane_b32 s2, v8
+; GFX942-NEXT:    s_mul_i32 s0, s0, s2
+; GFX942-NEXT:    s_mul_hi_u32 s0, s2, s0
+; GFX942-NEXT:    s_add_i32 s2, s2, s0
+; GFX942-NEXT:    s_mul_hi_u32 s0, s6, s2
+; GFX942-NEXT:    s_mul_i32 s0, s0, s4
+; GFX942-NEXT:    s_sub_i32 s0, s6, s0
+; GFX942-NEXT:    s_sub_i32 s2, s0, s4
+; GFX942-NEXT:    s_cmp_ge_u32 s0, s4
+; GFX942-NEXT:    s_cselect_b32 s0, s2, s0
+; GFX942-NEXT:    s_sub_i32 s2, s0, s4
+; GFX942-NEXT:    s_cmp_ge_u32 s0, s4
+; GFX942-NEXT:    s_cselect_b32 s0, s2, s0
+; GFX942-NEXT:    v_mov_b64_e32 v[8:9], s[0:1]
+; GFX942-NEXT:  .LBB12_3:
+; GFX942-NEXT:    v_or_b32_e32 v11, v17, v13
+; GFX942-NEXT:    v_mov_b32_e32 v10, 0
+; GFX942-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[10:11]
+; GFX942-NEXT:    s_cbranch_vccz .LBB12_14
+; GFX942-NEXT:  ; %bb.4:
+; GFX942-NEXT:    v_ashrrev_i32_e32 v14, 31, v13
+; GFX942-NEXT:    v_mov_b32_e32 v15, v14
+; GFX942-NEXT:    v_lshl_add_u64 v[18:19], v[12:13], 0, v[14:15]
+; GFX942-NEXT:    v_xor_b32_e32 v13, v18, v14
+; GFX942-NEXT:    v_xor_b32_e32 v11, v19, v14
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v14, v13
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v15, v11
+; GFX942-NEXT:    v_sub_co_u32_e32 v22, vcc, 0, v13
+; GFX942-NEXT:    v_mov_b32_e32 v19, v10
+; GFX942-NEXT:    v_fmamk_f32 v14, v15, 0x4f800000, v14
+; GFX942-NEXT:    v_rcp_f32_e32 v14, v14
+; GFX942-NEXT:    v_subb_co_u32_e32 v23, vcc, 0, v11, vcc
+; GFX942-NEXT:    v_mul_f32_e32 v14, 0x5f7ffffc, v14
+; GFX942-NEXT:    v_mul_f32_e32 v15, 0x2f800000, v14
+; GFX942-NEXT:    v_trunc_f32_e32 v15, v15
+; GFX942-NEXT:    v_fmamk_f32 v14, v15, 0xcf800000, v14
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v20, v15
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v21, v14
+; GFX942-NEXT:    v_mul_lo_u32 v18, v22, v20
+; GFX942-NEXT:    v_mul_hi_u32 v15, v22, v21
+; GFX942-NEXT:    v_mul_lo_u32 v14, v23, v21
+; GFX942-NEXT:    v_add_u32_e32 v15, v15, v18
+; GFX942-NEXT:    v_add_u32_e32 v24, v15, v14
+; GFX942-NEXT:    v_mul_lo_u32 v25, v22, v21
+; GFX942-NEXT:    v_mul_hi_u32 v15, v21, v24
+; GFX942-NEXT:    v_mul_lo_u32 v14, v21, v24
+; GFX942-NEXT:    v_mul_hi_u32 v18, v21, v25
+; GFX942-NEXT:    v_lshl_add_u64 v[14:15], v[18:19], 0, v[14:15]
+; GFX942-NEXT:    v_mul_hi_u32 v19, v20, v25
+; GFX942-NEXT:    v_mul_lo_u32 v25, v20, v25
+; GFX942-NEXT:    v_add_co_u32_e32 v14, vcc, v14, v25
+; GFX942-NEXT:    v_mul_hi_u32 v18, v20, v24
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_addc_co_u32_e32 v14, vcc, v15, v19, vcc
+; GFX942-NEXT:    v_mov_b32_e32 v15, v10
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_addc_co_u32_e32 v19, vcc, 0, v18, vcc
+; GFX942-NEXT:    v_mul_lo_u32 v18, v20, v24
+; GFX942-NEXT:    v_lshl_add_u64 v[14:15], v[14:15], 0, v[18:19]
+; GFX942-NEXT:    v_add_co_u32_e32 v21, vcc, v21, v14
+; GFX942-NEXT:    v_mul_lo_u32 v18, v22, v21
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_addc_co_u32_e32 v20, vcc, v20, v15, vcc
+; GFX942-NEXT:    v_mul_lo_u32 v14, v22, v20
+; GFX942-NEXT:    v_mul_hi_u32 v15, v22, v21
+; GFX942-NEXT:    v_add_u32_e32 v14, v15, v14
+; GFX942-NEXT:    v_mul_lo_u32 v15, v23, v21
+; GFX942-NEXT:    v_add_u32_e32 v23, v14, v15
+; GFX942-NEXT:    v_mul_hi_u32 v22, v20, v18
+; GFX942-NEXT:    v_mul_lo_u32 v25, v20, v18
+; GFX942-NEXT:    v_mul_hi_u32 v15, v21, v23
+; GFX942-NEXT:    v_mul_lo_u32 v14, v21, v23
+; GFX942-NEXT:    v_mul_hi_u32 v18, v21, v18
+; GFX942-NEXT:    v_mov_b32_e32 v19, v10
+; GFX942-NEXT:    v_lshl_add_u64 v[14:15], v[18:19], 0, v[14:15]
+; GFX942-NEXT:    v_add_co_u32_e32 v14, vcc, v14, v25
+; GFX942-NEXT:    v_mul_hi_u32 v24, v20, v23
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_addc_co_u32_e32 v14, vcc, v15, v22, vcc
+; GFX942-NEXT:    v_mul_lo_u32 v18, v20, v23
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_addc_co_u32_e32 v19, vcc, 0, v24, vcc
+; GFX942-NEXT:    v_mov_b32_e32 v15, v10
+; GFX942-NEXT:    v_lshl_add_u64 v[14:15], v[14:15], 0, v[18:19]
+; GFX942-NEXT:    v_add_co_u32_e32 v22, vcc, v21, v14
+; GFX942-NEXT:    v_ashrrev_i32_e32 v14, 31, v17
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_addc_co_u32_e32 v23, vcc, v20, v15, vcc
+; GFX942-NEXT:    v_mov_b32_e32 v15, v14
+; GFX942-NEXT:    v_lshl_add_u64 v[18:19], v[16:17], 0, v[14:15]
+; GFX942-NEXT:    v_xor_b32_e32 v15, v18, v14
+; GFX942-NEXT:    v_xor_b32_e32 v17, v19, v14
+; GFX942-NEXT:    v_mul_hi_u32 v19, v15, v23
+; GFX942-NEXT:    v_mul_lo_u32 v18, v15, v23
+; GFX942-NEXT:    v_mul_hi_u32 v20, v15, v22
+; GFX942-NEXT:    v_mov_b32_e32 v21, v10
+; GFX942-NEXT:    v_lshl_add_u64 v[18:19], v[20:21], 0, v[18:19]
+; GFX942-NEXT:    v_mul_hi_u32 v21, v17, v22
+; GFX942-NEXT:    v_mul_lo_u32 v22, v17, v22
+; GFX942-NEXT:    v_add_co_u32_e32 v18, vcc, v18, v22
+; GFX942-NEXT:    v_mul_hi_u32 v20, v17, v23
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_addc_co_u32_e32 v18, vcc, v19, v21, vcc
+; GFX942-NEXT:    v_mov_b32_e32 v19, v10
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_addc_co_u32_e32 v21, vcc, 0, v20, vcc
+; GFX942-NEXT:    v_mul_lo_u32 v20, v17, v23
+; GFX942-NEXT:    v_lshl_add_u64 v[18:19], v[18:19], 0, v[20:21]
+; GFX942-NEXT:    v_mul_lo_u32 v10, v13, v19
+; GFX942-NEXT:    v_mul_hi_u32 v19, v13, v18
+; GFX942-NEXT:    v_add_u32_e32 v10, v19, v10
+; GFX942-NEXT:    v_mul_lo_u32 v19, v11, v18
+; GFX942-NEXT:    v_add_u32_e32 v10, v10, v19
+; GFX942-NEXT:    v_mul_lo_u32 v18, v13, v18
+; GFX942-NEXT:    v_sub_u32_e32 v19, v17, v10
+; GFX942-NEXT:    v_sub_co_u32_e32 v15, vcc, v15, v18
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_subb_co_u32_e64 v18, s[0:1], v19, v11, vcc
+; GFX942-NEXT:    v_sub_co_u32_e64 v19, s[0:1], v15, v13
+; GFX942-NEXT:    v_subb_co_u32_e32 v10, vcc, v17, v10, vcc
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_subbrev_co_u32_e64 v20, s[2:3], 0, v18, s[0:1]
+; GFX942-NEXT:    v_cmp_ge_u32_e64 s[2:3], v20, v11
+; GFX942-NEXT:    v_subb_co_u32_e64 v18, s[0:1], v18, v11, s[0:1]
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_cndmask_b32_e64 v21, 0, -1, s[2:3]
+; GFX942-NEXT:    v_cmp_ge_u32_e64 s[2:3], v19, v13
+; GFX942-NEXT:    v_cmp_ge_u32_e32 vcc, v10, v11
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_cndmask_b32_e64 v22, 0, -1, s[2:3]
+; GFX942-NEXT:    v_cmp_eq_u32_e64 s[2:3], v20, v11
+; GFX942-NEXT:    v_cndmask_b32_e64 v17, 0, -1, vcc
+; GFX942-NEXT:    v_cmp_ge_u32_e32 vcc, v15, v13
+; GFX942-NEXT:    v_cndmask_b32_e64 v21, v21, v22, s[2:3]
+; GFX942-NEXT:    v_sub_co_u32_e64 v22, s[0:1], v19, v13
+; GFX942-NEXT:    v_cndmask_b32_e64 v13, 0, -1, vcc
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_subbrev_co_u32_e64 v18, s[0:1], 0, v18, s[0:1]
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, v10, v11
+; GFX942-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v21
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_cndmask_b32_e32 v11, v17, v13, vcc
+; GFX942-NEXT:    v_cndmask_b32_e64 v19, v19, v22, s[0:1]
+; GFX942-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v11
+; GFX942-NEXT:    v_cndmask_b32_e64 v18, v20, v18, s[0:1]
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_cndmask_b32_e32 v11, v15, v19, vcc
+; GFX942-NEXT:    v_cndmask_b32_e32 v10, v10, v18, vcc
+; GFX942-NEXT:    v_xor_b32_e32 v11, v11, v14
+; GFX942-NEXT:    v_xor_b32_e32 v13, v10, v14
+; GFX942-NEXT:    v_sub_co_u32_e32 v10, vcc, v11, v14
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_subb_co_u32_e32 v11, vcc, v13, v14, vcc
+; GFX942-NEXT:    s_cbranch_execnz .LBB12_6
+; GFX942-NEXT:  .LBB12_5:
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v10, v12
+; GFX942-NEXT:    v_sub_u32_e32 v11, 0, v12
+; GFX942-NEXT:    v_rcp_iflag_f32_e32 v10, v10
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_mul_f32_e32 v10, 0x4f7ffffe, v10
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v10, v10
+; GFX942-NEXT:    v_mul_lo_u32 v11, v11, v10
+; GFX942-NEXT:    v_mul_hi_u32 v11, v10, v11
+; GFX942-NEXT:    v_add_u32_e32 v10, v10, v11
+; GFX942-NEXT:    v_mul_hi_u32 v10, v16, v10
+; GFX942-NEXT:    v_mul_lo_u32 v10, v10, v12
+; GFX942-NEXT:    v_sub_u32_e32 v10, v16, v10
+; GFX942-NEXT:    v_sub_u32_e32 v11, v10, v12
+; GFX942-NEXT:    v_cmp_ge_u32_e32 vcc, v10, v12
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v10, v10, v11, vcc
+; GFX942-NEXT:    v_sub_u32_e32 v11, v10, v12
+; GFX942-NEXT:    v_cmp_ge_u32_e32 vcc, v10, v12
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v10, v10, v11, vcc
+; GFX942-NEXT:    v_mov_b32_e32 v11, 0
+; GFX942-NEXT:  .LBB12_6:
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    v_or_b32_e32 v13, v5, v1
+; GFX942-NEXT:    v_mov_b32_e32 v12, 0
+; GFX942-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[12:13]
+; GFX942-NEXT:    s_cbranch_vccz .LBB12_15
+; GFX942-NEXT:  ; %bb.7:
+; GFX942-NEXT:    v_ashrrev_i32_e32 v14, 31, v1
+; GFX942-NEXT:    v_mov_b32_e32 v15, v14
+; GFX942-NEXT:    v_lshl_add_u64 v[16:17], v[0:1], 0, v[14:15]
+; GFX942-NEXT:    v_xor_b32_e32 v13, v16, v14
+; GFX942-NEXT:    v_xor_b32_e32 v1, v17, v14
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v14, v13
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v15, v1
+; GFX942-NEXT:    v_sub_co_u32_e32 v20, vcc, 0, v13
+; GFX942-NEXT:    v_mov_b32_e32 v17, v12
+; GFX942-NEXT:    v_fmamk_f32 v14, v15, 0x4f800000, v14
+; GFX942-NEXT:    v_rcp_f32_e32 v14, v14
+; GFX942-NEXT:    v_subb_co_u32_e32 v21, vcc, 0, v1, vcc
+; GFX942-NEXT:    v_mul_f32_e32 v14, 0x5f7ffffc, v14
+; GFX942-NEXT:    v_mul_f32_e32 v15, 0x2f800000, v14
+; GFX942-NEXT:    v_trunc_f32_e32 v15, v15
+; GFX942-NEXT:    v_fmamk_f32 v14, v15, 0xcf800000, v14
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v18, v15
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v19, v14
+; GFX942-NEXT:    v_mul_lo_u32 v16, v20, v18
+; GFX942-NEXT:    v_mul_hi_u32 v15, v20, v19
+; GFX942-NEXT:    v_mul_lo_u32 v14, v21, v19
+; GFX942-NEXT:    v_add_u32_e32 v15, v15, v16
+; GFX942-NEXT:    v_add_u32_e32 v22, v15, v14
+; GFX942-NEXT:    v_mul_lo_u32 v23, v20, v19
+; GFX942-NEXT:    v_mul_hi_u32 v15, v19, v22
+; GFX942-NEXT:    v_mul_lo_u32 v14, v19, v22
+; GFX942-NEXT:    v_mul_hi_u32 v16, v19, v23
+; GFX942-NEXT:    v_lshl_add_u64 v[14:15], v[16:17], 0, v[14:15]
+; GFX942-NEXT:    v_mul_hi_u32 v17, v18, v23
+; GFX942-NEXT:    v_mul_lo_u32 v23, v18, v23
+; GFX942-NEXT:    v_add_co_u32_e32 v14, vcc, v14, v23
+; GFX942-NEXT:    v_mul_hi_u32 v16, v18, v22
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_addc_co_u32_e32 v14, vcc, v15, v17, vcc
+; GFX942-NEXT:    v_mov_b32_e32 v15, v12
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_addc_co_u32_e32 v17, vcc, 0, v16, vcc
+; GFX942-NEXT:    v_mul_lo_u32 v16, v18, v22
+; GFX942-NEXT:    v_lshl_add_u64 v[14:15], v[14:15], 0, v[16:17]
+; GFX942-NEXT:    v_add_co_u32_e32 v19, vcc, v19, v14
+; GFX942-NEXT:    v_mul_lo_u32 v16, v20, v19
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_addc_co_u32_e32 v18, vcc, v18, v15, vcc
+; GFX942-NEXT:    v_mul_lo_u32 v14, v20, v18
+; GFX942-NEXT:    v_mul_hi_u32 v15, v20, v19
+; GFX942-NEXT:    v_add_u32_e32 v14, v15, v14
+; GFX942-NEXT:    v_mul_lo_u32 v15, v21, v19
+; GFX942-NEXT:    v_add_u32_e32 v21, v14, v15
+; GFX942-NEXT:    v_mul_hi_u32 v20, v18, v16
+; GFX942-NEXT:    v_mul_lo_u32 v23, v18, v16
+; GFX942-NEXT:    v_mul_hi_u32 v15, v19, v21
+; GFX942-NEXT:    v_mul_lo_u32 v14, v19, v21
+; GFX942-NEXT:    v_mul_hi_u32 v16, v19, v16
+; GFX942-NEXT:    v_mov_b32_e32 v17, v12
+; GFX942-NEXT:    v_lshl_add_u64 v[14:15], v[16:17], 0, v[14:15]
+; GFX942-NEXT:    v_add_co_u32_e32 v14, vcc, v14, v23
+; GFX942-NEXT:    v_mul_hi_u32 v22, v18, v21
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_addc_co_u32_e32 v14, vcc, v15, v20, vcc
+; GFX942-NEXT:    v_mul_lo_u32 v16, v18, v21
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_addc_co_u32_e32 v17, vcc, 0, v22, vcc
+; GFX942-NEXT:    v_mov_b32_e32 v15, v12
+; GFX942-NEXT:    v_lshl_add_u64 v[14:15], v[14:15], 0, v[16:17]
+; GFX942-NEXT:    v_add_co_u32_e32 v20, vcc, v19, v14
+; GFX942-NEXT:    v_ashrrev_i32_e32 v14, 31, v5
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_addc_co_u32_e32 v21, vcc, v18, v15, vcc
+; GFX942-NEXT:    v_mov_b32_e32 v15, v14
+; GFX942-NEXT:    v_lshl_add_u64 v[16:17], v[4:5], 0, v[14:15]
+; GFX942-NEXT:    v_xor_b32_e32 v5, v16, v14
+; GFX942-NEXT:    v_xor_b32_e32 v15, v17, v14
+; GFX942-NEXT:    v_mul_hi_u32 v17, v5, v21
+; GFX942-NEXT:    v_mul_lo_u32 v16, v5, v21
+; GFX942-NEXT:    v_mul_hi_u32 v18, v5, v20
+; GFX942-NEXT:    v_mov_b32_e32 v19, v12
+; GFX942-NEXT:    v_lshl_add_u64 v[16:17], v[18:19], 0, v[16:17]
+; GFX942-NEXT:    v_mul_hi_u32 v19, v15, v20
+; GFX942-NEXT:    v_mul_lo_u32 v20, v15, v20
+; GFX942-NEXT:    v_add_co_u32_e32 v16, vcc, v16, v20
+; GFX942-NEXT:    v_mul_hi_u32 v18, v15, v21
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_addc_co_u32_e32 v16, vcc, v17, v19, vcc
+; GFX942-NEXT:    v_mov_b32_e32 v17, v12
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_addc_co_u32_e32 v19, vcc, 0, v18, vcc
+; GFX942-NEXT:    v_mul_lo_u32 v18, v15, v21
+; GFX942-NEXT:    v_lshl_add_u64 v[16:17], v[16:17], 0, v[18:19]
+; GFX942-NEXT:    v_mul_lo_u32 v12, v13, v17
+; GFX942-NEXT:    v_mul_hi_u32 v17, v13, v16
+; GFX942-NEXT:    v_add_u32_e32 v12, v17, v12
+; GFX942-NEXT:    v_mul_lo_u32 v17, v1, v16
+; GFX942-NEXT:    v_add_u32_e32 v12, v12, v17
+; GFX942-NEXT:    v_mul_lo_u32 v16, v13, v16
+; GFX942-NEXT:    v_sub_u32_e32 v17, v15, v12
+; GFX942-NEXT:    v_sub_co_u32_e32 v5, vcc, v5, v16
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_subb_co_u32_e64 v16, s[0:1], v17, v1, vcc
+; GFX942-NEXT:    v_sub_co_u32_e64 v17, s[0:1], v5, v13
+; GFX942-NEXT:    v_subb_co_u32_e32 v12, vcc, v15, v12, vcc
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_subbrev_co_u32_e64 v18, s[2:3], 0, v16, s[0:1]
+; GFX942-NEXT:    v_cmp_ge_u32_e64 s[2:3], v18, v1
+; GFX942-NEXT:    v_subb_co_u32_e64 v16, s[0:1], v16, v1, s[0:1]
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_cndmask_b32_e64 v19, 0, -1, s[2:3]
+; GFX942-NEXT:    v_cmp_ge_u32_e64 s[2:3], v17, v13
+; GFX942-NEXT:    v_cmp_ge_u32_e32 vcc, v12, v1
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_cndmask_b32_e64 v20, 0, -1, s[2:3]
+; GFX942-NEXT:    v_cmp_eq_u32_e64 s[2:3], v18, v1
+; GFX942-NEXT:    v_cndmask_b32_e64 v15, 0, -1, vcc
+; GFX942-NEXT:    v_cmp_ge_u32_e32 vcc, v5, v13
+; GFX942-NEXT:    v_cndmask_b32_e64 v19, v19, v20, s[2:3]
+; GFX942-NEXT:    v_sub_co_u32_e64 v20, s[0:1], v17, v13
+; GFX942-NEXT:    v_cndmask_b32_e64 v13, 0, -1, vcc
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_subbrev_co_u32_e64 v16, s[0:1], 0, v16, s[0:1]
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, v12, v1
+; GFX942-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v19
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_cndmask_b32_e32 v1, v15, v13, vcc
+; GFX942-NEXT:    v_cndmask_b32_e64 v17, v17, v20, s[0:1]
+; GFX942-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
+; GFX942-NEXT:    v_cndmask_b32_e64 v16, v18, v16, s[0:1]
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_cndmask_b32_e32 v5, v5, v17, vcc
+; GFX942-NEXT:    v_cndmask_b32_e32 v1, v12, v16, vcc
+; GFX942-NEXT:    v_xor_b32_e32 v5, v5, v14
+; GFX942-NEXT:    v_xor_b32_e32 v1, v1, v14
+; GFX942-NEXT:    v_sub_co_u32_e32 v12, vcc, v5, v14
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_subb_co_u32_e32 v13, vcc, v1, v14, vcc
+; GFX942-NEXT:    s_cbranch_execnz .LBB12_9
+; GFX942-NEXT:  .LBB12_8:
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v1, v0
+; GFX942-NEXT:    v_sub_u32_e32 v5, 0, v0
+; GFX942-NEXT:    v_mov_b32_e32 v13, 0
+; GFX942-NEXT:    v_rcp_iflag_f32_e32 v1, v1
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX942-NEXT:    v_mul_lo_u32 v5, v5, v1
+; GFX942-NEXT:    v_mul_hi_u32 v5, v1, v5
+; GFX942-NEXT:    v_add_u32_e32 v1, v1, v5
+; GFX942-NEXT:    v_mul_hi_u32 v1, v4, v1
+; GFX942-NEXT:    v_mul_lo_u32 v1, v1, v0
+; GFX942-NEXT:    v_sub_u32_e32 v1, v4, v1
+; GFX942-NEXT:    v_sub_u32_e32 v4, v1, v0
+; GFX942-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v0
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
+; GFX942-NEXT:    v_sub_u32_e32 v4, v1, v0
+; GFX942-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v0
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v12, v1, v4, vcc
+; GFX942-NEXT:  .LBB12_9:
+; GFX942-NEXT:    v_or_b32_e32 v1, v7, v3
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GFX942-NEXT:    s_cbranch_vccz .LBB12_16
+; GFX942-NEXT:  ; %bb.10:
+; GFX942-NEXT:    v_ashrrev_i32_e32 v4, 31, v3
+; GFX942-NEXT:    v_mov_b32_e32 v5, v4
+; GFX942-NEXT:    v_lshl_add_u64 v[14:15], v[2:3], 0, v[4:5]
+; GFX942-NEXT:    v_xor_b32_e32 v3, v14, v4
+; GFX942-NEXT:    v_xor_b32_e32 v1, v15, v4
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v4, v3
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v5, v1
+; GFX942-NEXT:    v_sub_co_u32_e32 v18, vcc, 0, v3
+; GFX942-NEXT:    v_mov_b32_e32 v15, v0
+; GFX942-NEXT:    v_fmamk_f32 v4, v5, 0x4f800000, v4
+; GFX942-NEXT:    v_rcp_f32_e32 v4, v4
+; GFX942-NEXT:    v_subb_co_u32_e32 v19, vcc, 0, v1, vcc
+; GFX942-NEXT:    v_mul_f32_e32 v4, 0x5f7ffffc, v4
+; GFX942-NEXT:    v_mul_f32_e32 v5, 0x2f800000, v4
+; GFX942-NEXT:    v_trunc_f32_e32 v5, v5
+; GFX942-NEXT:    v_fmamk_f32 v4, v5, 0xcf800000, v4
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v16, v5
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v17, v4
+; GFX942-NEXT:    v_mul_lo_u32 v14, v18, v16
+; GFX942-NEXT:    v_mul_hi_u32 v5, v18, v17
+; GFX942-NEXT:    v_mul_lo_u32 v4, v19, v17
+; GFX942-NEXT:    v_add_u32_e32 v5, v5, v14
+; GFX942-NEXT:    v_add_u32_e32 v20, v5, v4
+; GFX942-NEXT:    v_mul_lo_u32 v21, v18, v17
+; GFX942-NEXT:    v_mul_hi_u32 v5, v17, v20
+; GFX942-NEXT:    v_mul_lo_u32 v4, v17, v20
+; GFX942-NEXT:    v_mul_hi_u32 v14, v17, v21
+; GFX942-NEXT:    v_lshl_add_u64 v[4:5], v[14:15], 0, v[4:5]
+; GFX942-NEXT:    v_mul_hi_u32 v15, v16, v21
+; GFX942-NEXT:    v_mul_lo_u32 v21, v16, v21
+; GFX942-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v21
+; GFX942-NEXT:    v_mul_hi_u32 v14, v16, v20
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_addc_co_u32_e32 v4, vcc, v5, v15, vcc
+; GFX942-NEXT:    v_mov_b32_e32 v5, v0
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_addc_co_u32_e32 v15, vcc, 0, v14, vcc
+; GFX942-NEXT:    v_mul_lo_u32 v14, v16, v20
+; GFX942-NEXT:    v_lshl_add_u64 v[4:5], v[4:5], 0, v[14:15]
+; GFX942-NEXT:    v_add_co_u32_e32 v17, vcc, v17, v4
+; GFX942-NEXT:    v_mul_lo_u32 v14, v18, v17
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_addc_co_u32_e32 v16, vcc, v16, v5, vcc
+; GFX942-NEXT:    v_mul_lo_u32 v4, v18, v16
+; GFX942-NEXT:    v_mul_hi_u32 v5, v18, v17
+; GFX942-NEXT:    v_add_u32_e32 v4, v5, v4
+; GFX942-NEXT:    v_mul_lo_u32 v5, v19, v17
+; GFX942-NEXT:    v_add_u32_e32 v19, v4, v5
+; GFX942-NEXT:    v_mul_hi_u32 v18, v16, v14
+; GFX942-NEXT:    v_mul_lo_u32 v21, v16, v14
+; GFX942-NEXT:    v_mul_hi_u32 v5, v17, v19
+; GFX942-NEXT:    v_mul_lo_u32 v4, v17, v19
+; GFX942-NEXT:    v_mul_hi_u32 v14, v17, v14
+; GFX942-NEXT:    v_mov_b32_e32 v15, v0
+; GFX942-NEXT:    v_lshl_add_u64 v[4:5], v[14:15], 0, v[4:5]
+; GFX942-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v21
+; GFX942-NEXT:    v_mul_hi_u32 v20, v16, v19
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_addc_co_u32_e32 v4, vcc, v5, v18, vcc
+; GFX942-NEXT:    v_mul_lo_u32 v14, v16, v19
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_addc_co_u32_e32 v15, vcc, 0, v20, vcc
+; GFX942-NEXT:    v_mov_b32_e32 v5, v0
+; GFX942-NEXT:    v_lshl_add_u64 v[4:5], v[4:5], 0, v[14:15]
+; GFX942-NEXT:    v_add_co_u32_e32 v18, vcc, v17, v4
+; GFX942-NEXT:    v_ashrrev_i32_e32 v4, 31, v7
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_addc_co_u32_e32 v19, vcc, v16, v5, vcc
+; GFX942-NEXT:    v_mov_b32_e32 v5, v4
+; GFX942-NEXT:    v_lshl_add_u64 v[14:15], v[6:7], 0, v[4:5]
+; GFX942-NEXT:    v_xor_b32_e32 v5, v14, v4
+; GFX942-NEXT:    v_xor_b32_e32 v7, v15, v4
+; GFX942-NEXT:    v_mul_hi_u32 v15, v5, v19
+; GFX942-NEXT:    v_mul_lo_u32 v14, v5, v19
+; GFX942-NEXT:    v_mul_hi_u32 v16, v5, v18
+; GFX942-NEXT:    v_mov_b32_e32 v17, v0
+; GFX942-NEXT:    v_lshl_add_u64 v[14:15], v[16:17], 0, v[14:15]
+; GFX942-NEXT:    v_mul_hi_u32 v17, v7, v18
+; GFX942-NEXT:    v_mul_lo_u32 v18, v7, v18
+; GFX942-NEXT:    v_add_co_u32_e32 v14, vcc, v14, v18
+; GFX942-NEXT:    v_mul_hi_u32 v16, v7, v19
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_addc_co_u32_e32 v14, vcc, v15, v17, vcc
+; GFX942-NEXT:    v_mov_b32_e32 v15, v0
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_addc_co_u32_e32 v17, vcc, 0, v16, vcc
+; GFX942-NEXT:    v_mul_lo_u32 v16, v7, v19
+; GFX942-NEXT:    v_lshl_add_u64 v[14:15], v[14:15], 0, v[16:17]
+; GFX942-NEXT:    v_mul_lo_u32 v0, v3, v15
+; GFX942-NEXT:    v_mul_hi_u32 v15, v3, v14
+; GFX942-NEXT:    v_add_u32_e32 v0, v15, v0
+; GFX942-NEXT:    v_mul_lo_u32 v15, v1, v14
+; GFX942-NEXT:    v_add_u32_e32 v0, v0, v15
+; GFX942-NEXT:    v_mul_lo_u32 v14, v3, v14
+; GFX942-NEXT:    v_sub_u32_e32 v15, v7, v0
+; GFX942-NEXT:    v_sub_co_u32_e32 v5, vcc, v5, v14
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_subb_co_u32_e64 v14, s[0:1], v15, v1, vcc
+; GFX942-NEXT:    v_sub_co_u32_e64 v15, s[0:1], v5, v3
+; GFX942-NEXT:    v_subb_co_u32_e32 v0, vcc, v7, v0, vcc
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_subbrev_co_u32_e64 v16, s[2:3], 0, v14, s[0:1]
+; GFX942-NEXT:    v_cmp_ge_u32_e64 s[2:3], v16, v1
+; GFX942-NEXT:    v_subb_co_u32_e64 v14, s[0:1], v14, v1, s[0:1]
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_cndmask_b32_e64 v17, 0, -1, s[2:3]
+; GFX942-NEXT:    v_cmp_ge_u32_e64 s[2:3], v15, v3
+; GFX942-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v1
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_cndmask_b32_e64 v18, 0, -1, s[2:3]
+; GFX942-NEXT:    v_cmp_eq_u32_e64 s[2:3], v16, v1
+; GFX942-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
+; GFX942-NEXT:    v_cmp_ge_u32_e32 vcc, v5, v3
+; GFX942-NEXT:    v_cndmask_b32_e64 v17, v17, v18, s[2:3]
+; GFX942-NEXT:    v_sub_co_u32_e64 v18, s[0:1], v15, v3
+; GFX942-NEXT:    v_cndmask_b32_e64 v3, 0, -1, vcc
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_subbrev_co_u32_e64 v14, s[0:1], 0, v14, s[0:1]
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, v0, v1
+; GFX942-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v17
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_cndmask_b32_e32 v1, v7, v3, vcc
+; GFX942-NEXT:    v_cndmask_b32_e64 v15, v15, v18, s[0:1]
+; GFX942-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
+; GFX942-NEXT:    v_cndmask_b32_e64 v14, v16, v14, s[0:1]
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_cndmask_b32_e32 v1, v5, v15, vcc
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, v0, v14, vcc
+; GFX942-NEXT:    v_xor_b32_e32 v1, v1, v4
+; GFX942-NEXT:    v_xor_b32_e32 v0, v0, v4
+; GFX942-NEXT:    v_sub_co_u32_e32 v14, vcc, v1, v4
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_subb_co_u32_e32 v15, vcc, v0, v4, vcc
+; GFX942-NEXT:    s_cbranch_execnz .LBB12_12
+; GFX942-NEXT:  .LBB12_11:
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v0, v2
+; GFX942-NEXT:    v_sub_u32_e32 v1, 0, v2
+; GFX942-NEXT:    v_mov_b32_e32 v15, 0
+; GFX942-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX942-NEXT:    v_mul_lo_u32 v1, v1, v0
+; GFX942-NEXT:    v_mul_hi_u32 v1, v0, v1
+; GFX942-NEXT:    v_add_u32_e32 v0, v0, v1
+; GFX942-NEXT:    v_mul_hi_u32 v0, v6, v0
+; GFX942-NEXT:    v_mul_lo_u32 v0, v0, v2
+; GFX942-NEXT:    v_sub_u32_e32 v0, v6, v0
+; GFX942-NEXT:    v_sub_u32_e32 v1, v0, v2
+; GFX942-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, v0, v1, vcc
+; GFX942-NEXT:    v_sub_u32_e32 v1, v0, v2
+; GFX942-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v14, v0, v1, vcc
+; GFX942-NEXT:  .LBB12_12:
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    global_store_dwordx4 v0, v[12:15], s[8:9] offset:16
+; GFX942-NEXT:    global_store_dwordx4 v0, v[8:11], s[8:9]
+; GFX942-NEXT:    s_endpgm
+; GFX942-NEXT:  .LBB12_13:
+; GFX942-NEXT:    ; implicit-def: $vgpr8_vgpr9
+; GFX942-NEXT:    s_branch .LBB12_2
+; GFX942-NEXT:  .LBB12_14:
+; GFX942-NEXT:    s_branch .LBB12_5
+; GFX942-NEXT:  .LBB12_15:
+; GFX942-NEXT:    ; implicit-def: $vgpr12_vgpr13
+; GFX942-NEXT:    s_branch .LBB12_8
+; GFX942-NEXT:  .LBB12_16:
+; GFX942-NEXT:    s_branch .LBB12_11
+;
 ; TAHITI-LABEL: srem_v4i64:
 ; TAHITI:       ; %bb.0:
 ; TAHITI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x9
@@ -8922,6 +10538,47 @@ define amdgpu_kernel void @srem_v4i64_4(ptr addrspace(1) %out, ptr addrspace(1)
 ; GCN-NEXT:    global_store_dwordx4 v8, v[0:3], s[0:1]
 ; GCN-NEXT:    s_endpgm
 ;
+; GFX942-LABEL: srem_v4i64_4:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    global_load_dwordx4 v[2:5], v1, s[2:3]
+; GFX942-NEXT:    global_load_dwordx4 v[6:9], v1, s[2:3] offset:16
+; GFX942-NEXT:    s_waitcnt vmcnt(1)
+; GFX942-NEXT:    v_ashrrev_i32_e32 v0, 31, v3
+; GFX942-NEXT:    v_lshrrev_b32_e32 v0, 30, v0
+; GFX942-NEXT:    v_ashrrev_i32_e32 v12, 31, v5
+; GFX942-NEXT:    v_lshl_add_u64 v[10:11], v[2:3], 0, v[0:1]
+; GFX942-NEXT:    v_lshrrev_b32_e32 v0, 30, v12
+; GFX942-NEXT:    v_and_b32_e32 v10, -4, v10
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    v_ashrrev_i32_e32 v14, 31, v7
+; GFX942-NEXT:    v_lshl_add_u64 v[12:13], v[4:5], 0, v[0:1]
+; GFX942-NEXT:    v_sub_co_u32_e32 v2, vcc, v2, v10
+; GFX942-NEXT:    v_lshrrev_b32_e32 v0, 30, v14
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_subb_co_u32_e32 v3, vcc, v3, v11, vcc
+; GFX942-NEXT:    v_and_b32_e32 v12, -4, v12
+; GFX942-NEXT:    v_ashrrev_i32_e32 v15, 31, v9
+; GFX942-NEXT:    v_lshl_add_u64 v[10:11], v[6:7], 0, v[0:1]
+; GFX942-NEXT:    v_sub_co_u32_e32 v4, vcc, v4, v12
+; GFX942-NEXT:    v_lshrrev_b32_e32 v0, 30, v15
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_subb_co_u32_e32 v5, vcc, v5, v13, vcc
+; GFX942-NEXT:    v_and_b32_e32 v10, -4, v10
+; GFX942-NEXT:    v_lshl_add_u64 v[12:13], v[8:9], 0, v[0:1]
+; GFX942-NEXT:    v_sub_co_u32_e32 v6, vcc, v6, v10
+; GFX942-NEXT:    v_and_b32_e32 v0, -4, v12
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_subb_co_u32_e32 v7, vcc, v7, v11, vcc
+; GFX942-NEXT:    v_sub_co_u32_e32 v8, vcc, v8, v0
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_subb_co_u32_e32 v9, vcc, v9, v13, vcc
+; GFX942-NEXT:    global_store_dwordx4 v1, v[6:9], s[0:1] offset:16
+; GFX942-NEXT:    global_store_dwordx4 v1, v[2:5], s[0:1]
+; GFX942-NEXT:    s_endpgm
+;
 ; TAHITI-LABEL: srem_v4i64_4:
 ; TAHITI:       ; %bb.0:
 ; TAHITI-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x9
diff --git a/llvm/test/CodeGen/AMDGPU/srem64.ll b/llvm/test/CodeGen/AMDGPU/srem64.ll
index e64e3def98c26..2bc71e062f709 100644
--- a/llvm/test/CodeGen/AMDGPU/srem64.ll
+++ b/llvm/test/CodeGen/AMDGPU/srem64.ll
@@ -1,6 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx600 -amdgpu-bypass-slow-div=0 < %s | FileCheck -check-prefix=GCN %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx600 -amdgpu-bypass-slow-div=0 -amdgpu-codegenprepare-expand-div64 < %s | FileCheck -check-prefix=GCN-IR %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -amdgpu-bypass-slow-div=0 < %s | FileCheck -check-prefix=GFX942 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -amdgpu-bypass-slow-div=0 -amdgpu-codegenprepare-expand-div64 < %s | FileCheck -check-prefix=GFX942-IR %s
 
 define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) {
 ; GCN-LABEL: s_test_srem:
@@ -202,6 +204,219 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) {
 ; GCN-IR-NEXT:    v_subb_u32_e32 v1, vcc, v2, v1, vcc
 ; GCN-IR-NEXT:    buffer_store_dwordx2 v[0:1], off, s[12:15], 0
 ; GCN-IR-NEXT:    s_endpgm
+;
+; GFX942-LABEL: s_test_srem:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX942-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x24
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v0, s6
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v1, s7
+; GFX942-NEXT:    s_sub_u32 s0, 0, s6
+; GFX942-NEXT:    s_subb_u32 s1, 0, s7
+; GFX942-NEXT:    v_fmamk_f32 v0, v1, 0x4f800000, v0
+; GFX942-NEXT:    v_rcp_f32_e32 v1, v0
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    v_mul_f32_e32 v1, 0x5f7ffffc, v1
+; GFX942-NEXT:    v_mul_f32_e32 v2, 0x2f800000, v1
+; GFX942-NEXT:    v_trunc_f32_e32 v2, v2
+; GFX942-NEXT:    v_fmamk_f32 v1, v2, 0xcf800000, v1
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX942-NEXT:    v_readfirstlane_b32 s2, v2
+; GFX942-NEXT:    v_readfirstlane_b32 s3, v1
+; GFX942-NEXT:    s_mul_i32 s4, s0, s2
+; GFX942-NEXT:    s_mul_hi_u32 s12, s0, s3
+; GFX942-NEXT:    s_mul_i32 s5, s1, s3
+; GFX942-NEXT:    s_add_i32 s4, s12, s4
+; GFX942-NEXT:    s_mul_i32 s13, s0, s3
+; GFX942-NEXT:    s_add_i32 s4, s4, s5
+; GFX942-NEXT:    s_mul_hi_u32 s12, s3, s13
+; GFX942-NEXT:    s_mul_hi_u32 s5, s3, s4
+; GFX942-NEXT:    s_mul_i32 s3, s3, s4
+; GFX942-NEXT:    s_add_u32 s3, s12, s3
+; GFX942-NEXT:    s_addc_u32 s5, 0, s5
+; GFX942-NEXT:    s_mul_hi_u32 s12, s2, s13
+; GFX942-NEXT:    s_mul_i32 s13, s2, s13
+; GFX942-NEXT:    s_add_u32 s3, s3, s13
+; GFX942-NEXT:    s_mul_hi_u32 s14, s2, s4
+; GFX942-NEXT:    s_addc_u32 s3, s5, s12
+; GFX942-NEXT:    s_addc_u32 s5, s14, 0
+; GFX942-NEXT:    s_mul_i32 s4, s2, s4
+; GFX942-NEXT:    s_add_u32 s3, s3, s4
+; GFX942-NEXT:    s_addc_u32 s4, 0, s5
+; GFX942-NEXT:    v_add_co_u32_e32 v1, vcc, s3, v1
+; GFX942-NEXT:    s_cmp_lg_u64 vcc, 0
+; GFX942-NEXT:    s_addc_u32 s2, s2, s4
+; GFX942-NEXT:    v_readfirstlane_b32 s4, v1
+; GFX942-NEXT:    s_mul_i32 s3, s0, s2
+; GFX942-NEXT:    s_mul_hi_u32 s5, s0, s4
+; GFX942-NEXT:    s_add_i32 s3, s5, s3
+; GFX942-NEXT:    s_mul_i32 s1, s1, s4
+; GFX942-NEXT:    s_add_i32 s3, s3, s1
+; GFX942-NEXT:    s_mul_i32 s0, s0, s4
+; GFX942-NEXT:    s_mul_hi_u32 s5, s2, s0
+; GFX942-NEXT:    s_mul_i32 s12, s2, s0
+; GFX942-NEXT:    s_mul_i32 s14, s4, s3
+; GFX942-NEXT:    s_mul_hi_u32 s0, s4, s0
+; GFX942-NEXT:    s_mul_hi_u32 s13, s4, s3
+; GFX942-NEXT:    s_add_u32 s0, s0, s14
+; GFX942-NEXT:    s_addc_u32 s4, 0, s13
+; GFX942-NEXT:    s_add_u32 s0, s0, s12
+; GFX942-NEXT:    s_mul_hi_u32 s1, s2, s3
+; GFX942-NEXT:    s_addc_u32 s0, s4, s5
+; GFX942-NEXT:    s_addc_u32 s1, s1, 0
+; GFX942-NEXT:    s_mul_i32 s3, s2, s3
+; GFX942-NEXT:    s_add_u32 s0, s0, s3
+; GFX942-NEXT:    s_addc_u32 s1, 0, s1
+; GFX942-NEXT:    v_add_co_u32_e32 v1, vcc, s0, v1
+; GFX942-NEXT:    s_cmp_lg_u64 vcc, 0
+; GFX942-NEXT:    s_addc_u32 s0, s2, s1
+; GFX942-NEXT:    v_readfirstlane_b32 s3, v1
+; GFX942-NEXT:    s_mul_i32 s2, s10, s0
+; GFX942-NEXT:    s_mul_hi_u32 s4, s10, s3
+; GFX942-NEXT:    s_mul_hi_u32 s1, s10, s0
+; GFX942-NEXT:    s_add_u32 s2, s4, s2
+; GFX942-NEXT:    s_addc_u32 s1, 0, s1
+; GFX942-NEXT:    s_mul_hi_u32 s5, s11, s3
+; GFX942-NEXT:    s_mul_i32 s3, s11, s3
+; GFX942-NEXT:    s_add_u32 s2, s2, s3
+; GFX942-NEXT:    s_mul_hi_u32 s4, s11, s0
+; GFX942-NEXT:    s_addc_u32 s1, s1, s5
+; GFX942-NEXT:    s_addc_u32 s2, s4, 0
+; GFX942-NEXT:    s_mul_i32 s0, s11, s0
+; GFX942-NEXT:    s_add_u32 s0, s1, s0
+; GFX942-NEXT:    s_addc_u32 s1, 0, s2
+; GFX942-NEXT:    s_mul_i32 s1, s6, s1
+; GFX942-NEXT:    s_mul_hi_u32 s2, s6, s0
+; GFX942-NEXT:    s_add_i32 s1, s2, s1
+; GFX942-NEXT:    s_mul_i32 s2, s7, s0
+; GFX942-NEXT:    s_mul_i32 s0, s6, s0
+; GFX942-NEXT:    s_add_i32 s4, s1, s2
+; GFX942-NEXT:    v_mov_b32_e32 v1, s0
+; GFX942-NEXT:    s_sub_i32 s1, s11, s4
+; GFX942-NEXT:    v_sub_co_u32_e32 v1, vcc, s10, v1
+; GFX942-NEXT:    s_cmp_lg_u64 vcc, 0
+; GFX942-NEXT:    s_subb_u32 s5, s1, s7
+; GFX942-NEXT:    v_subrev_co_u32_e64 v2, s[0:1], s6, v1
+; GFX942-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX942-NEXT:    s_subb_u32 s10, s5, 0
+; GFX942-NEXT:    s_cmp_ge_u32 s10, s7
+; GFX942-NEXT:    s_cselect_b32 s12, -1, 0
+; GFX942-NEXT:    v_cmp_le_u32_e64 s[2:3], s6, v2
+; GFX942-NEXT:    s_cmp_eq_u32 s10, s7
+; GFX942-NEXT:    v_mov_b32_e32 v4, s12
+; GFX942-NEXT:    v_cndmask_b32_e64 v3, 0, -1, s[2:3]
+; GFX942-NEXT:    s_cselect_b64 s[2:3], -1, 0
+; GFX942-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX942-NEXT:    v_cndmask_b32_e64 v3, v4, v3, s[2:3]
+; GFX942-NEXT:    s_subb_u32 s2, s5, s7
+; GFX942-NEXT:    v_subrev_co_u32_e64 v4, s[0:1], s6, v2
+; GFX942-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX942-NEXT:    s_subb_u32 s2, s2, 0
+; GFX942-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v3
+; GFX942-NEXT:    v_mov_b32_e32 v3, s10
+; GFX942-NEXT:    s_cmp_lg_u64 vcc, 0
+; GFX942-NEXT:    v_cndmask_b32_e64 v2, v2, v4, s[0:1]
+; GFX942-NEXT:    v_mov_b32_e32 v4, s2
+; GFX942-NEXT:    v_cndmask_b32_e64 v3, v3, v4, s[0:1]
+; GFX942-NEXT:    s_subb_u32 s0, s11, s4
+; GFX942-NEXT:    s_cmp_ge_u32 s0, s7
+; GFX942-NEXT:    s_cselect_b32 s1, -1, 0
+; GFX942-NEXT:    v_cmp_le_u32_e32 vcc, s6, v1
+; GFX942-NEXT:    s_cmp_eq_u32 s0, s7
+; GFX942-NEXT:    v_mov_b32_e32 v5, s1
+; GFX942-NEXT:    v_cndmask_b32_e64 v4, 0, -1, vcc
+; GFX942-NEXT:    s_cselect_b64 vcc, -1, 0
+; GFX942-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
+; GFX942-NEXT:    v_mov_b32_e32 v5, s0
+; GFX942-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX942-NEXT:    v_cndmask_b32_e32 v2, v1, v2, vcc
+; GFX942-NEXT:    global_store_dwordx2 v0, v[2:3], s[8:9]
+; GFX942-NEXT:    s_endpgm
+;
+; GFX942-IR-LABEL: s_test_srem:
+; GFX942-IR:       ; %bb.0: ; %_udiv-special-cases
+; GFX942-IR-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX942-IR-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-IR-NEXT:    s_mov_b64 s[4:5], 0
+; GFX942-IR-NEXT:    s_mov_b32 s11, 0
+; GFX942-IR-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-IR-NEXT:    s_cmp_eq_u64 s[6:7], 0
+; GFX942-IR-NEXT:    s_cselect_b64 s[8:9], -1, 0
+; GFX942-IR-NEXT:    s_cmp_eq_u64 s[2:3], 0
+; GFX942-IR-NEXT:    s_cselect_b64 s[12:13], -1, 0
+; GFX942-IR-NEXT:    s_flbit_i32_b64 s10, s[6:7]
+; GFX942-IR-NEXT:    s_or_b64 s[8:9], s[8:9], s[12:13]
+; GFX942-IR-NEXT:    s_flbit_i32_b64 s18, s[2:3]
+; GFX942-IR-NEXT:    s_sub_u32 s12, s10, s18
+; GFX942-IR-NEXT:    s_subb_u32 s13, 0, 0
+; GFX942-IR-NEXT:    v_cmp_gt_u64_e64 s[14:15], s[12:13], 63
+; GFX942-IR-NEXT:    s_or_b64 s[14:15], s[8:9], s[14:15]
+; GFX942-IR-NEXT:    s_cmp_eq_u64 s[12:13], 63
+; GFX942-IR-NEXT:    s_cselect_b64 s[16:17], -1, 0
+; GFX942-IR-NEXT:    s_and_b64 s[8:9], s[14:15], exec
+; GFX942-IR-NEXT:    s_cselect_b32 s9, 0, s3
+; GFX942-IR-NEXT:    s_cselect_b32 s8, 0, s2
+; GFX942-IR-NEXT:    s_or_b64 s[14:15], s[14:15], s[16:17]
+; GFX942-IR-NEXT:    s_andn2_b64 vcc, exec, s[14:15]
+; GFX942-IR-NEXT:    s_cbranch_vccz .LBB0_5
+; GFX942-IR-NEXT:  ; %bb.1: ; %udiv-bb1
+; GFX942-IR-NEXT:    s_add_u32 s14, s12, 1
+; GFX942-IR-NEXT:    s_addc_u32 s15, s13, 0
+; GFX942-IR-NEXT:    s_cmp_eq_u64 s[14:15], 0
+; GFX942-IR-NEXT:    s_cselect_b64 s[8:9], -1, 0
+; GFX942-IR-NEXT:    s_sub_i32 s12, 63, s12
+; GFX942-IR-NEXT:    s_andn2_b64 vcc, exec, s[8:9]
+; GFX942-IR-NEXT:    s_lshl_b64 s[8:9], s[2:3], s12
+; GFX942-IR-NEXT:    s_cbranch_vccz .LBB0_4
+; GFX942-IR-NEXT:  ; %bb.2: ; %udiv-preheader
+; GFX942-IR-NEXT:    s_lshr_b64 s[12:13], s[2:3], s14
+; GFX942-IR-NEXT:    s_add_u32 s16, s6, -1
+; GFX942-IR-NEXT:    s_addc_u32 s17, s7, -1
+; GFX942-IR-NEXT:    s_not_b64 s[4:5], s[10:11]
+; GFX942-IR-NEXT:    s_add_u32 s10, s4, s18
+; GFX942-IR-NEXT:    s_addc_u32 s11, s5, 0
+; GFX942-IR-NEXT:    s_mov_b64 s[14:15], 0
+; GFX942-IR-NEXT:    s_mov_b32 s5, 0
+; GFX942-IR-NEXT:  .LBB0_3: ; %udiv-do-while
+; GFX942-IR-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-IR-NEXT:    s_lshl_b64 s[12:13], s[12:13], 1
+; GFX942-IR-NEXT:    s_lshr_b32 s4, s9, 31
+; GFX942-IR-NEXT:    s_lshl_b64 s[8:9], s[8:9], 1
+; GFX942-IR-NEXT:    s_or_b64 s[12:13], s[12:13], s[4:5]
+; GFX942-IR-NEXT:    s_or_b64 s[8:9], s[14:15], s[8:9]
+; GFX942-IR-NEXT:    s_sub_u32 s4, s16, s12
+; GFX942-IR-NEXT:    s_subb_u32 s4, s17, s13
+; GFX942-IR-NEXT:    s_ashr_i32 s14, s4, 31
+; GFX942-IR-NEXT:    s_mov_b32 s15, s14
+; GFX942-IR-NEXT:    s_and_b32 s4, s14, 1
+; GFX942-IR-NEXT:    s_and_b64 s[14:15], s[14:15], s[6:7]
+; GFX942-IR-NEXT:    s_sub_u32 s12, s12, s14
+; GFX942-IR-NEXT:    s_subb_u32 s13, s13, s15
+; GFX942-IR-NEXT:    s_add_u32 s10, s10, 1
+; GFX942-IR-NEXT:    s_addc_u32 s11, s11, 0
+; GFX942-IR-NEXT:    s_cmp_eq_u64 s[10:11], 0
+; GFX942-IR-NEXT:    s_mov_b64 s[14:15], s[4:5]
+; GFX942-IR-NEXT:    s_cbranch_scc0 .LBB0_3
+; GFX942-IR-NEXT:  .LBB0_4: ; %Flow7
+; GFX942-IR-NEXT:    s_lshl_b64 s[8:9], s[8:9], 1
+; GFX942-IR-NEXT:    s_or_b64 s[8:9], s[4:5], s[8:9]
+; GFX942-IR-NEXT:  .LBB0_5: ; %udiv-end
+; GFX942-IR-NEXT:    s_mul_i32 s4, s6, s9
+; GFX942-IR-NEXT:    s_mul_hi_u32 s5, s6, s8
+; GFX942-IR-NEXT:    s_add_i32 s4, s5, s4
+; GFX942-IR-NEXT:    s_mul_i32 s5, s7, s8
+; GFX942-IR-NEXT:    s_add_i32 s4, s4, s5
+; GFX942-IR-NEXT:    s_mul_i32 s5, s6, s8
+; GFX942-IR-NEXT:    s_sub_u32 s2, s2, s5
+; GFX942-IR-NEXT:    s_subb_u32 s3, s3, s4
+; GFX942-IR-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-IR-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX942-IR-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX942-IR-NEXT:    s_endpgm
   %result = urem i64 %x, %y
   store i64 %result, ptr addrspace(1) %out
   ret void
@@ -434,6 +649,230 @@ define i64 @v_test_srem(i64 %x, i64 %y) {
 ; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, v0, v14
 ; GCN-IR-NEXT:    v_subb_u32_e32 v1, vcc, v1, v15, vcc
 ; GCN-IR-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: v_test_srem:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_ashrrev_i32_e32 v4, 31, v3
+; GFX942-NEXT:    v_mov_b32_e32 v5, v4
+; GFX942-NEXT:    v_lshl_add_u64 v[2:3], v[2:3], 0, v[4:5]
+; GFX942-NEXT:    v_xor_b32_e32 v12, v3, v4
+; GFX942-NEXT:    v_xor_b32_e32 v13, v2, v4
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v2, v13
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v3, v12
+; GFX942-NEXT:    v_sub_co_u32_e32 v11, vcc, 0, v13
+; GFX942-NEXT:    v_mov_b32_e32 v7, 0
+; GFX942-NEXT:    v_fmamk_f32 v2, v3, 0x4f800000, v2
+; GFX942-NEXT:    v_rcp_f32_e32 v2, v2
+; GFX942-NEXT:    v_subb_co_u32_e32 v14, vcc, 0, v12, vcc
+; GFX942-NEXT:    v_mul_f32_e32 v2, 0x5f7ffffc, v2
+; GFX942-NEXT:    v_mul_f32_e32 v3, 0x2f800000, v2
+; GFX942-NEXT:    v_trunc_f32_e32 v3, v3
+; GFX942-NEXT:    v_fmamk_f32 v2, v3, 0xcf800000, v2
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v10, v2
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v15, v3
+; GFX942-NEXT:    v_mul_lo_u32 v4, v14, v10
+; GFX942-NEXT:    v_mad_u64_u32 v[2:3], s[0:1], v11, v10, 0
+; GFX942-NEXT:    v_mul_lo_u32 v5, v11, v15
+; GFX942-NEXT:    v_add3_u32 v3, v3, v5, v4
+; GFX942-NEXT:    v_mad_u64_u32 v[4:5], s[0:1], v10, v3, 0
+; GFX942-NEXT:    v_mul_hi_u32 v6, v10, v2
+; GFX942-NEXT:    v_lshl_add_u64 v[4:5], v[6:7], 0, v[4:5]
+; GFX942-NEXT:    v_mad_u64_u32 v[8:9], s[0:1], v15, v3, 0
+; GFX942-NEXT:    v_mad_u64_u32 v[2:3], s[0:1], v15, v2, 0
+; GFX942-NEXT:    v_add_co_u32_e32 v2, vcc, v4, v2
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_addc_co_u32_e32 v6, vcc, v5, v3, vcc
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_addc_co_u32_e32 v9, vcc, 0, v9, vcc
+; GFX942-NEXT:    v_lshl_add_u64 v[2:3], v[6:7], 0, v[8:9]
+; GFX942-NEXT:    v_add_co_u32_e32 v16, vcc, v10, v2
+; GFX942-NEXT:    v_mul_lo_u32 v5, v14, v16
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_addc_co_u32_e32 v15, vcc, v15, v3, vcc
+; GFX942-NEXT:    v_mul_lo_u32 v4, v11, v15
+; GFX942-NEXT:    v_mad_u64_u32 v[2:3], s[0:1], v11, v16, 0
+; GFX942-NEXT:    v_add3_u32 v3, v3, v4, v5
+; GFX942-NEXT:    v_mad_u64_u32 v[10:11], s[0:1], v16, v3, 0
+; GFX942-NEXT:    v_mul_hi_u32 v6, v16, v2
+; GFX942-NEXT:    v_mad_u64_u32 v[4:5], s[0:1], v15, v3, 0
+; GFX942-NEXT:    v_mad_u64_u32 v[8:9], s[0:1], v15, v2, 0
+; GFX942-NEXT:    v_lshl_add_u64 v[2:3], v[6:7], 0, v[10:11]
+; GFX942-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v8
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_addc_co_u32_e32 v6, vcc, v3, v9, vcc
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
+; GFX942-NEXT:    v_lshl_add_u64 v[2:3], v[6:7], 0, v[4:5]
+; GFX942-NEXT:    v_add_co_u32_e32 v8, vcc, v16, v2
+; GFX942-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_addc_co_u32_e32 v4, vcc, v15, v3, vcc
+; GFX942-NEXT:    v_mov_b32_e32 v3, v2
+; GFX942-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
+; GFX942-NEXT:    v_xor_b32_e32 v10, v0, v2
+; GFX942-NEXT:    v_xor_b32_e32 v3, v1, v2
+; GFX942-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], v10, v4, 0
+; GFX942-NEXT:    v_mul_hi_u32 v6, v10, v8
+; GFX942-NEXT:    v_lshl_add_u64 v[0:1], v[6:7], 0, v[0:1]
+; GFX942-NEXT:    v_mad_u64_u32 v[8:9], s[0:1], v3, v8, 0
+; GFX942-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v8
+; GFX942-NEXT:    v_mad_u64_u32 v[4:5], s[0:1], v3, v4, 0
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_addc_co_u32_e32 v6, vcc, v1, v9, vcc
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
+; GFX942-NEXT:    v_lshl_add_u64 v[0:1], v[6:7], 0, v[4:5]
+; GFX942-NEXT:    v_mul_lo_u32 v4, v12, v0
+; GFX942-NEXT:    v_mul_lo_u32 v5, v13, v1
+; GFX942-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], v13, v0, 0
+; GFX942-NEXT:    v_add3_u32 v1, v1, v5, v4
+; GFX942-NEXT:    v_sub_u32_e32 v4, v3, v1
+; GFX942-NEXT:    v_sub_co_u32_e32 v0, vcc, v10, v0
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_subb_co_u32_e64 v4, s[0:1], v4, v12, vcc
+; GFX942-NEXT:    v_sub_co_u32_e64 v5, s[0:1], v0, v13
+; GFX942-NEXT:    v_subb_co_u32_e32 v1, vcc, v3, v1, vcc
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_subbrev_co_u32_e64 v6, s[2:3], 0, v4, s[0:1]
+; GFX942-NEXT:    v_cmp_ge_u32_e64 s[2:3], v6, v12
+; GFX942-NEXT:    v_subb_co_u32_e64 v4, s[0:1], v4, v12, s[0:1]
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[2:3]
+; GFX942-NEXT:    v_cmp_ge_u32_e64 s[2:3], v5, v13
+; GFX942-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v12
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[2:3]
+; GFX942-NEXT:    v_cmp_eq_u32_e64 s[2:3], v6, v12
+; GFX942-NEXT:    v_cndmask_b32_e64 v3, 0, -1, vcc
+; GFX942-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v13
+; GFX942-NEXT:    v_cndmask_b32_e64 v7, v7, v8, s[2:3]
+; GFX942-NEXT:    v_sub_co_u32_e64 v8, s[0:1], v5, v13
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_subbrev_co_u32_e64 v4, s[0:1], 0, v4, s[0:1]
+; GFX942-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v7
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e64 v4, v6, v4, s[0:1]
+; GFX942-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v12
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v3, v3, v6, vcc
+; GFX942-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
+; GFX942-NEXT:    v_cndmask_b32_e64 v3, v5, v8, s[0:1]
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
+; GFX942-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
+; GFX942-NEXT:    v_xor_b32_e32 v0, v0, v2
+; GFX942-NEXT:    v_xor_b32_e32 v1, v1, v2
+; GFX942-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v2
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v2, vcc
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-IR-LABEL: v_test_srem:
+; GFX942-IR:       ; %bb.0: ; %_udiv-special-cases
+; GFX942-IR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-IR-NEXT:    v_ashrrev_i32_e32 v4, 31, v1
+; GFX942-IR-NEXT:    v_xor_b32_e32 v0, v0, v4
+; GFX942-IR-NEXT:    v_xor_b32_e32 v1, v1, v4
+; GFX942-IR-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v4
+; GFX942-IR-NEXT:    v_ashrrev_i32_e32 v6, 31, v3
+; GFX942-IR-NEXT:    s_nop 0
+; GFX942-IR-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v4, vcc
+; GFX942-IR-NEXT:    v_xor_b32_e32 v2, v2, v6
+; GFX942-IR-NEXT:    v_xor_b32_e32 v3, v3, v6
+; GFX942-IR-NEXT:    v_sub_co_u32_e32 v2, vcc, v2, v6
+; GFX942-IR-NEXT:    v_cmp_eq_u64_e64 s[0:1], 0, v[0:1]
+; GFX942-IR-NEXT:    s_nop 0
+; GFX942-IR-NEXT:    v_subb_co_u32_e32 v3, vcc, v3, v6, vcc
+; GFX942-IR-NEXT:    v_ffbh_u32_e32 v6, v2
+; GFX942-IR-NEXT:    v_add_u32_e32 v6, 32, v6
+; GFX942-IR-NEXT:    v_ffbh_u32_e32 v7, v3
+; GFX942-IR-NEXT:    v_min_u32_e32 v14, v6, v7
+; GFX942-IR-NEXT:    v_ffbh_u32_e32 v6, v0
+; GFX942-IR-NEXT:    v_add_u32_e32 v6, 32, v6
+; GFX942-IR-NEXT:    v_ffbh_u32_e32 v7, v1
+; GFX942-IR-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[2:3]
+; GFX942-IR-NEXT:    v_min_u32_e32 v12, v6, v7
+; GFX942-IR-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-IR-NEXT:    v_sub_co_u32_e32 v10, vcc, v14, v12
+; GFX942-IR-NEXT:    v_mov_b32_e32 v5, v4
+; GFX942-IR-NEXT:    s_nop 0
+; GFX942-IR-NEXT:    v_subb_co_u32_e64 v11, s[2:3], 0, 0, vcc
+; GFX942-IR-NEXT:    v_cmp_lt_u64_e32 vcc, 63, v[10:11]
+; GFX942-IR-NEXT:    s_or_b64 s[0:1], s[0:1], vcc
+; GFX942-IR-NEXT:    s_xor_b64 s[2:3], s[0:1], -1
+; GFX942-IR-NEXT:    v_cmp_ne_u64_e32 vcc, 63, v[10:11]
+; GFX942-IR-NEXT:    v_mov_b32_e32 v13, 0
+; GFX942-IR-NEXT:    v_cndmask_b32_e64 v7, v1, 0, s[0:1]
+; GFX942-IR-NEXT:    v_cndmask_b32_e64 v6, v0, 0, s[0:1]
+; GFX942-IR-NEXT:    s_and_b64 s[2:3], s[2:3], vcc
+; GFX942-IR-NEXT:    s_and_saveexec_b64 s[0:1], s[2:3]
+; GFX942-IR-NEXT:    s_cbranch_execz .LBB1_6
+; GFX942-IR-NEXT:  ; %bb.1: ; %udiv-bb1
+; GFX942-IR-NEXT:    v_lshl_add_u64 v[16:17], v[10:11], 0, 1
+; GFX942-IR-NEXT:    v_sub_u32_e32 v6, 63, v10
+; GFX942-IR-NEXT:    v_mov_b64_e32 v[8:9], 0
+; GFX942-IR-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[16:17]
+; GFX942-IR-NEXT:    v_lshlrev_b64 v[6:7], v6, v[0:1]
+; GFX942-IR-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX942-IR-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
+; GFX942-IR-NEXT:    s_cbranch_execz .LBB1_5
+; GFX942-IR-NEXT:  ; %bb.2: ; %udiv-preheader
+; GFX942-IR-NEXT:    v_mov_b32_e32 v9, -1
+; GFX942-IR-NEXT:    v_not_b32_e32 v8, v14
+; GFX942-IR-NEXT:    v_lshrrev_b64 v[16:17], v16, v[0:1]
+; GFX942-IR-NEXT:    v_lshl_add_u64 v[10:11], v[2:3], 0, -1
+; GFX942-IR-NEXT:    v_lshl_add_u64 v[12:13], v[8:9], 0, v[12:13]
+; GFX942-IR-NEXT:    s_mov_b64 s[4:5], 0
+; GFX942-IR-NEXT:    v_mov_b64_e32 v[14:15], 0
+; GFX942-IR-NEXT:    v_mov_b32_e32 v9, 0
+; GFX942-IR-NEXT:  .LBB1_3: ; %udiv-do-while
+; GFX942-IR-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-IR-NEXT:    v_lshlrev_b64 v[16:17], 1, v[16:17]
+; GFX942-IR-NEXT:    v_lshrrev_b32_e32 v8, 31, v7
+; GFX942-IR-NEXT:    v_or_b32_e32 v16, v16, v8
+; GFX942-IR-NEXT:    v_sub_co_u32_e32 v8, vcc, v10, v16
+; GFX942-IR-NEXT:    v_lshlrev_b64 v[6:7], 1, v[6:7]
+; GFX942-IR-NEXT:    s_nop 0
+; GFX942-IR-NEXT:    v_subb_co_u32_e32 v8, vcc, v11, v17, vcc
+; GFX942-IR-NEXT:    v_or_b32_e32 v6, v14, v6
+; GFX942-IR-NEXT:    v_ashrrev_i32_e32 v14, 31, v8
+; GFX942-IR-NEXT:    v_or_b32_e32 v7, v15, v7
+; GFX942-IR-NEXT:    v_and_b32_e32 v8, 1, v14
+; GFX942-IR-NEXT:    v_and_b32_e32 v15, v14, v3
+; GFX942-IR-NEXT:    v_and_b32_e32 v14, v14, v2
+; GFX942-IR-NEXT:    v_sub_co_u32_e32 v16, vcc, v16, v14
+; GFX942-IR-NEXT:    v_lshl_add_u64 v[12:13], v[12:13], 0, 1
+; GFX942-IR-NEXT:    s_nop 0
+; GFX942-IR-NEXT:    v_subb_co_u32_e32 v17, vcc, v17, v15, vcc
+; GFX942-IR-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[12:13]
+; GFX942-IR-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX942-IR-NEXT:    v_mov_b64_e32 v[14:15], v[8:9]
+; GFX942-IR-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX942-IR-NEXT:    s_cbranch_execnz .LBB1_3
+; GFX942-IR-NEXT:  ; %bb.4: ; %Flow
+; GFX942-IR-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX942-IR-NEXT:  .LBB1_5: ; %Flow4
+; GFX942-IR-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX942-IR-NEXT:    v_lshlrev_b64 v[6:7], 1, v[6:7]
+; GFX942-IR-NEXT:    v_or_b32_e32 v7, v9, v7
+; GFX942-IR-NEXT:    v_or_b32_e32 v6, v8, v6
+; GFX942-IR-NEXT:  .LBB1_6: ; %Flow5
+; GFX942-IR-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX942-IR-NEXT:    v_mul_lo_u32 v8, v3, v6
+; GFX942-IR-NEXT:    v_mul_lo_u32 v7, v2, v7
+; GFX942-IR-NEXT:    v_mad_u64_u32 v[2:3], s[0:1], v2, v6, 0
+; GFX942-IR-NEXT:    v_add3_u32 v3, v3, v7, v8
+; GFX942-IR-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v2
+; GFX942-IR-NEXT:    v_xor_b32_e32 v0, v0, v4
+; GFX942-IR-NEXT:    s_nop 0
+; GFX942-IR-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX942-IR-NEXT:    v_xor_b32_e32 v1, v1, v5
+; GFX942-IR-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v4
+; GFX942-IR-NEXT:    s_nop 1
+; GFX942-IR-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v5, vcc
+; GFX942-IR-NEXT:    s_setpc_b64 s[30:31]
   %result = srem i64 %x, %y
   ret i64 %result
 }
@@ -510,6 +949,70 @@ define amdgpu_kernel void @s_test_srem23_64(ptr addrspace(1) %out, i64 %x, i64 %
 ; GCN-IR-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN-IR-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GCN-IR-NEXT:    s_endpgm
+;
+; GFX942-LABEL: s_test_srem23_64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_ashr_i32 s4, s7, 9
+; GFX942-NEXT:    v_cvt_f32_i32_e32 v0, s4
+; GFX942-NEXT:    s_ashr_i32 s5, s3, 9
+; GFX942-NEXT:    v_cvt_f32_i32_e32 v2, s5
+; GFX942-NEXT:    s_xor_b32 s2, s5, s4
+; GFX942-NEXT:    v_rcp_iflag_f32_e32 v3, v0
+; GFX942-NEXT:    s_ashr_i32 s2, s2, 30
+; GFX942-NEXT:    s_or_b32 s6, s2, 1
+; GFX942-NEXT:    v_mul_f32_e32 v3, v2, v3
+; GFX942-NEXT:    v_trunc_f32_e32 v3, v3
+; GFX942-NEXT:    v_fma_f32 v2, -v3, v0, v2
+; GFX942-NEXT:    v_cvt_i32_f32_e32 v3, v3
+; GFX942-NEXT:    v_cmp_ge_f32_e64 s[2:3], |v2|, |v0|
+; GFX942-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GFX942-NEXT:    s_cselect_b32 s2, s6, 0
+; GFX942-NEXT:    v_readfirstlane_b32 s3, v3
+; GFX942-NEXT:    s_add_i32 s2, s3, s2
+; GFX942-NEXT:    s_mul_i32 s2, s2, s4
+; GFX942-NEXT:    s_sub_i32 s2, s5, s2
+; GFX942-NEXT:    s_bfe_i32 s2, s2, 0x170000
+; GFX942-NEXT:    s_ashr_i32 s3, s2, 31
+; GFX942-NEXT:    v_mov_b32_e32 v2, s2
+; GFX942-NEXT:    v_mov_b32_e32 v3, s3
+; GFX942-NEXT:    global_store_dwordx2 v1, v[2:3], s[0:1]
+; GFX942-NEXT:    s_endpgm
+;
+; GFX942-IR-LABEL: s_test_srem23_64:
+; GFX942-IR:       ; %bb.0:
+; GFX942-IR-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX942-IR-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-IR-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-IR-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-IR-NEXT:    s_ashr_i32 s4, s7, 9
+; GFX942-IR-NEXT:    v_cvt_f32_i32_e32 v0, s4
+; GFX942-IR-NEXT:    s_ashr_i32 s5, s3, 9
+; GFX942-IR-NEXT:    v_cvt_f32_i32_e32 v2, s5
+; GFX942-IR-NEXT:    s_xor_b32 s2, s5, s4
+; GFX942-IR-NEXT:    v_rcp_iflag_f32_e32 v3, v0
+; GFX942-IR-NEXT:    s_ashr_i32 s2, s2, 30
+; GFX942-IR-NEXT:    s_or_b32 s6, s2, 1
+; GFX942-IR-NEXT:    v_mul_f32_e32 v3, v2, v3
+; GFX942-IR-NEXT:    v_trunc_f32_e32 v3, v3
+; GFX942-IR-NEXT:    v_fma_f32 v2, -v3, v0, v2
+; GFX942-IR-NEXT:    v_cvt_i32_f32_e32 v3, v3
+; GFX942-IR-NEXT:    v_cmp_ge_f32_e64 s[2:3], |v2|, |v0|
+; GFX942-IR-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GFX942-IR-NEXT:    s_cselect_b32 s2, s6, 0
+; GFX942-IR-NEXT:    v_readfirstlane_b32 s3, v3
+; GFX942-IR-NEXT:    s_add_i32 s2, s3, s2
+; GFX942-IR-NEXT:    s_mul_i32 s2, s2, s4
+; GFX942-IR-NEXT:    s_sub_i32 s2, s5, s2
+; GFX942-IR-NEXT:    s_bfe_i32 s2, s2, 0x170000
+; GFX942-IR-NEXT:    s_ashr_i32 s3, s2, 31
+; GFX942-IR-NEXT:    v_mov_b32_e32 v2, s2
+; GFX942-IR-NEXT:    v_mov_b32_e32 v3, s3
+; GFX942-IR-NEXT:    global_store_dwordx2 v1, v[2:3], s[0:1]
+; GFX942-IR-NEXT:    s_endpgm
   %1 = ashr i64 %x, 41
   %2 = ashr i64 %y, 41
   %result = srem i64 %1, %2
@@ -589,6 +1092,70 @@ define amdgpu_kernel void @s_test_srem24_64(ptr addrspace(1) %out, i64 %x, i64 %
 ; GCN-IR-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN-IR-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GCN-IR-NEXT:    s_endpgm
+;
+; GFX942-LABEL: s_test_srem24_64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_ashr_i32 s4, s7, 8
+; GFX942-NEXT:    v_cvt_f32_i32_e32 v0, s4
+; GFX942-NEXT:    s_ashr_i32 s5, s3, 8
+; GFX942-NEXT:    v_cvt_f32_i32_e32 v2, s5
+; GFX942-NEXT:    s_xor_b32 s2, s5, s4
+; GFX942-NEXT:    v_rcp_iflag_f32_e32 v3, v0
+; GFX942-NEXT:    s_ashr_i32 s2, s2, 30
+; GFX942-NEXT:    s_or_b32 s6, s2, 1
+; GFX942-NEXT:    v_mul_f32_e32 v3, v2, v3
+; GFX942-NEXT:    v_trunc_f32_e32 v3, v3
+; GFX942-NEXT:    v_fma_f32 v2, -v3, v0, v2
+; GFX942-NEXT:    v_cvt_i32_f32_e32 v3, v3
+; GFX942-NEXT:    v_cmp_ge_f32_e64 s[2:3], |v2|, |v0|
+; GFX942-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GFX942-NEXT:    s_cselect_b32 s2, s6, 0
+; GFX942-NEXT:    v_readfirstlane_b32 s3, v3
+; GFX942-NEXT:    s_add_i32 s2, s3, s2
+; GFX942-NEXT:    s_mul_i32 s2, s2, s4
+; GFX942-NEXT:    s_sub_i32 s2, s5, s2
+; GFX942-NEXT:    s_bfe_i32 s2, s2, 0x180000
+; GFX942-NEXT:    s_ashr_i32 s3, s2, 31
+; GFX942-NEXT:    v_mov_b32_e32 v2, s2
+; GFX942-NEXT:    v_mov_b32_e32 v3, s3
+; GFX942-NEXT:    global_store_dwordx2 v1, v[2:3], s[0:1]
+; GFX942-NEXT:    s_endpgm
+;
+; GFX942-IR-LABEL: s_test_srem24_64:
+; GFX942-IR:       ; %bb.0:
+; GFX942-IR-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX942-IR-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-IR-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-IR-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-IR-NEXT:    s_ashr_i32 s4, s7, 8
+; GFX942-IR-NEXT:    v_cvt_f32_i32_e32 v0, s4
+; GFX942-IR-NEXT:    s_ashr_i32 s5, s3, 8
+; GFX942-IR-NEXT:    v_cvt_f32_i32_e32 v2, s5
+; GFX942-IR-NEXT:    s_xor_b32 s2, s5, s4
+; GFX942-IR-NEXT:    v_rcp_iflag_f32_e32 v3, v0
+; GFX942-IR-NEXT:    s_ashr_i32 s2, s2, 30
+; GFX942-IR-NEXT:    s_or_b32 s6, s2, 1
+; GFX942-IR-NEXT:    v_mul_f32_e32 v3, v2, v3
+; GFX942-IR-NEXT:    v_trunc_f32_e32 v3, v3
+; GFX942-IR-NEXT:    v_fma_f32 v2, -v3, v0, v2
+; GFX942-IR-NEXT:    v_cvt_i32_f32_e32 v3, v3
+; GFX942-IR-NEXT:    v_cmp_ge_f32_e64 s[2:3], |v2|, |v0|
+; GFX942-IR-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GFX942-IR-NEXT:    s_cselect_b32 s2, s6, 0
+; GFX942-IR-NEXT:    v_readfirstlane_b32 s3, v3
+; GFX942-IR-NEXT:    s_add_i32 s2, s3, s2
+; GFX942-IR-NEXT:    s_mul_i32 s2, s2, s4
+; GFX942-IR-NEXT:    s_sub_i32 s2, s5, s2
+; GFX942-IR-NEXT:    s_bfe_i32 s2, s2, 0x180000
+; GFX942-IR-NEXT:    s_ashr_i32 s3, s2, 31
+; GFX942-IR-NEXT:    v_mov_b32_e32 v2, s2
+; GFX942-IR-NEXT:    v_mov_b32_e32 v3, s3
+; GFX942-IR-NEXT:    global_store_dwordx2 v1, v[2:3], s[0:1]
+; GFX942-IR-NEXT:    s_endpgm
   %1 = ashr i64 %x, 40
   %2 = ashr i64 %y, 40
   %result = srem i64 %1, %2
@@ -644,6 +1211,56 @@ define i64 @v_test_srem24_64(i64 %x, i64 %y) {
 ; GCN-IR-NEXT:    v_bfe_i32 v0, v0, 0, 24
 ; GCN-IR-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; GCN-IR-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: v_test_srem24_64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_ashrrev_i32_e32 v0, 8, v3
+; GFX942-NEXT:    v_cvt_f32_i32_e32 v2, v0
+; GFX942-NEXT:    v_ashrrev_i32_e32 v1, 8, v1
+; GFX942-NEXT:    v_cvt_f32_i32_e32 v3, v1
+; GFX942-NEXT:    v_xor_b32_e32 v5, v1, v0
+; GFX942-NEXT:    v_rcp_iflag_f32_e32 v4, v2
+; GFX942-NEXT:    v_ashrrev_i32_e32 v5, 30, v5
+; GFX942-NEXT:    v_or_b32_e32 v5, 1, v5
+; GFX942-NEXT:    v_mul_f32_e32 v4, v3, v4
+; GFX942-NEXT:    v_trunc_f32_e32 v4, v4
+; GFX942-NEXT:    v_cvt_i32_f32_e32 v6, v4
+; GFX942-NEXT:    v_fma_f32 v3, -v4, v2, v3
+; GFX942-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, |v2|
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v2, 0, v5, vcc
+; GFX942-NEXT:    v_add_u32_e32 v2, v6, v2
+; GFX942-NEXT:    v_mul_lo_u32 v0, v2, v0
+; GFX942-NEXT:    v_sub_u32_e32 v0, v1, v0
+; GFX942-NEXT:    v_bfe_i32 v0, v0, 0, 24
+; GFX942-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-IR-LABEL: v_test_srem24_64:
+; GFX942-IR:       ; %bb.0:
+; GFX942-IR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-IR-NEXT:    v_ashrrev_i32_e32 v0, 8, v3
+; GFX942-IR-NEXT:    v_cvt_f32_i32_e32 v2, v0
+; GFX942-IR-NEXT:    v_ashrrev_i32_e32 v1, 8, v1
+; GFX942-IR-NEXT:    v_cvt_f32_i32_e32 v3, v1
+; GFX942-IR-NEXT:    v_xor_b32_e32 v5, v1, v0
+; GFX942-IR-NEXT:    v_rcp_iflag_f32_e32 v4, v2
+; GFX942-IR-NEXT:    v_ashrrev_i32_e32 v5, 30, v5
+; GFX942-IR-NEXT:    v_or_b32_e32 v5, 1, v5
+; GFX942-IR-NEXT:    v_mul_f32_e32 v4, v3, v4
+; GFX942-IR-NEXT:    v_trunc_f32_e32 v4, v4
+; GFX942-IR-NEXT:    v_cvt_i32_f32_e32 v6, v4
+; GFX942-IR-NEXT:    v_fma_f32 v3, -v4, v2, v3
+; GFX942-IR-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, |v2|
+; GFX942-IR-NEXT:    s_nop 1
+; GFX942-IR-NEXT:    v_cndmask_b32_e32 v2, 0, v5, vcc
+; GFX942-IR-NEXT:    v_add_u32_e32 v2, v6, v2
+; GFX942-IR-NEXT:    v_mul_lo_u32 v0, v2, v0
+; GFX942-IR-NEXT:    v_sub_u32_e32 v0, v1, v0
+; GFX942-IR-NEXT:    v_bfe_i32 v0, v0, 0, 24
+; GFX942-IR-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; GFX942-IR-NEXT:    s_setpc_b64 s[30:31]
   %1 = ashr i64 %x, 40
   %2 = ashr i64 %y, 40
   %result = srem i64 %1, %2
@@ -732,6 +1349,82 @@ define amdgpu_kernel void @s_test_srem25_64(ptr addrspace(1) %out, i64 %x, i64 %
 ; GCN-IR-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN-IR-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GCN-IR-NEXT:    s_endpgm
+;
+; GFX942-LABEL: s_test_srem25_64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
+; GFX942-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_ashr_i32 s0, s1, 7
+; GFX942-NEXT:    s_abs_i32 s6, s0
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v0, s6
+; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-NEXT:    s_sub_i32 s4, 0, s6
+; GFX942-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_ashr_i32 s2, s3, 7
+; GFX942-NEXT:    s_abs_i32 s2, s2
+; GFX942-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX942-NEXT:    s_ashr_i32 s3, s3, 31
+; GFX942-NEXT:    v_readfirstlane_b32 s5, v0
+; GFX942-NEXT:    s_mul_i32 s4, s4, s5
+; GFX942-NEXT:    s_mul_hi_u32 s4, s5, s4
+; GFX942-NEXT:    s_add_i32 s5, s5, s4
+; GFX942-NEXT:    s_mul_hi_u32 s4, s2, s5
+; GFX942-NEXT:    s_mul_i32 s4, s4, s6
+; GFX942-NEXT:    s_sub_i32 s2, s2, s4
+; GFX942-NEXT:    s_sub_i32 s4, s2, s6
+; GFX942-NEXT:    s_cmp_ge_u32 s2, s6
+; GFX942-NEXT:    s_cselect_b32 s2, s4, s2
+; GFX942-NEXT:    s_sub_i32 s4, s2, s6
+; GFX942-NEXT:    s_cmp_ge_u32 s2, s6
+; GFX942-NEXT:    s_cselect_b32 s2, s4, s2
+; GFX942-NEXT:    s_xor_b32 s2, s2, s3
+; GFX942-NEXT:    s_sub_i32 s2, s2, s3
+; GFX942-NEXT:    s_ashr_i32 s3, s2, 31
+; GFX942-NEXT:    v_mov_b32_e32 v2, s2
+; GFX942-NEXT:    v_mov_b32_e32 v3, s3
+; GFX942-NEXT:    global_store_dwordx2 v1, v[2:3], s[0:1]
+; GFX942-NEXT:    s_endpgm
+;
+; GFX942-IR-LABEL: s_test_srem25_64:
+; GFX942-IR:       ; %bb.0:
+; GFX942-IR-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
+; GFX942-IR-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-IR-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-IR-NEXT:    s_ashr_i32 s0, s1, 7
+; GFX942-IR-NEXT:    s_abs_i32 s6, s0
+; GFX942-IR-NEXT:    v_cvt_f32_u32_e32 v0, s6
+; GFX942-IR-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-IR-NEXT:    s_sub_i32 s4, 0, s6
+; GFX942-IR-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; GFX942-IR-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-IR-NEXT:    s_ashr_i32 s2, s3, 7
+; GFX942-IR-NEXT:    s_abs_i32 s2, s2
+; GFX942-IR-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX942-IR-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX942-IR-NEXT:    s_ashr_i32 s3, s3, 31
+; GFX942-IR-NEXT:    v_readfirstlane_b32 s5, v0
+; GFX942-IR-NEXT:    s_mul_i32 s4, s4, s5
+; GFX942-IR-NEXT:    s_mul_hi_u32 s4, s5, s4
+; GFX942-IR-NEXT:    s_add_i32 s5, s5, s4
+; GFX942-IR-NEXT:    s_mul_hi_u32 s4, s2, s5
+; GFX942-IR-NEXT:    s_mul_i32 s4, s4, s6
+; GFX942-IR-NEXT:    s_sub_i32 s2, s2, s4
+; GFX942-IR-NEXT:    s_sub_i32 s4, s2, s6
+; GFX942-IR-NEXT:    s_cmp_ge_u32 s2, s6
+; GFX942-IR-NEXT:    s_cselect_b32 s2, s4, s2
+; GFX942-IR-NEXT:    s_sub_i32 s4, s2, s6
+; GFX942-IR-NEXT:    s_cmp_ge_u32 s2, s6
+; GFX942-IR-NEXT:    s_cselect_b32 s2, s4, s2
+; GFX942-IR-NEXT:    s_xor_b32 s2, s2, s3
+; GFX942-IR-NEXT:    s_sub_i32 s2, s2, s3
+; GFX942-IR-NEXT:    s_ashr_i32 s3, s2, 31
+; GFX942-IR-NEXT:    v_mov_b32_e32 v2, s2
+; GFX942-IR-NEXT:    v_mov_b32_e32 v3, s3
+; GFX942-IR-NEXT:    global_store_dwordx2 v1, v[2:3], s[0:1]
+; GFX942-IR-NEXT:    s_endpgm
   %1 = ashr i64 %x, 39
   %2 = ashr i64 %y, 39
   %result = srem i64 %1, %2
@@ -821,6 +1514,82 @@ define amdgpu_kernel void @s_test_srem31_64(ptr addrspace(1) %out, i64 %x, i64 %
 ; GCN-IR-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN-IR-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GCN-IR-NEXT:    s_endpgm
+;
+; GFX942-LABEL: s_test_srem31_64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
+; GFX942-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_ashr_i32 s0, s1, 1
+; GFX942-NEXT:    s_abs_i32 s6, s0
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v0, s6
+; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-NEXT:    s_sub_i32 s4, 0, s6
+; GFX942-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_ashr_i32 s2, s3, 1
+; GFX942-NEXT:    s_abs_i32 s2, s2
+; GFX942-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX942-NEXT:    s_ashr_i32 s3, s3, 31
+; GFX942-NEXT:    v_readfirstlane_b32 s5, v0
+; GFX942-NEXT:    s_mul_i32 s4, s4, s5
+; GFX942-NEXT:    s_mul_hi_u32 s4, s5, s4
+; GFX942-NEXT:    s_add_i32 s5, s5, s4
+; GFX942-NEXT:    s_mul_hi_u32 s4, s2, s5
+; GFX942-NEXT:    s_mul_i32 s4, s4, s6
+; GFX942-NEXT:    s_sub_i32 s2, s2, s4
+; GFX942-NEXT:    s_sub_i32 s4, s2, s6
+; GFX942-NEXT:    s_cmp_ge_u32 s2, s6
+; GFX942-NEXT:    s_cselect_b32 s2, s4, s2
+; GFX942-NEXT:    s_sub_i32 s4, s2, s6
+; GFX942-NEXT:    s_cmp_ge_u32 s2, s6
+; GFX942-NEXT:    s_cselect_b32 s2, s4, s2
+; GFX942-NEXT:    s_xor_b32 s2, s2, s3
+; GFX942-NEXT:    s_sub_i32 s2, s2, s3
+; GFX942-NEXT:    s_ashr_i32 s3, s2, 31
+; GFX942-NEXT:    v_mov_b32_e32 v2, s2
+; GFX942-NEXT:    v_mov_b32_e32 v3, s3
+; GFX942-NEXT:    global_store_dwordx2 v1, v[2:3], s[0:1]
+; GFX942-NEXT:    s_endpgm
+;
+; GFX942-IR-LABEL: s_test_srem31_64:
+; GFX942-IR:       ; %bb.0:
+; GFX942-IR-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
+; GFX942-IR-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-IR-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-IR-NEXT:    s_ashr_i32 s0, s1, 1
+; GFX942-IR-NEXT:    s_abs_i32 s6, s0
+; GFX942-IR-NEXT:    v_cvt_f32_u32_e32 v0, s6
+; GFX942-IR-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-IR-NEXT:    s_sub_i32 s4, 0, s6
+; GFX942-IR-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; GFX942-IR-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-IR-NEXT:    s_ashr_i32 s2, s3, 1
+; GFX942-IR-NEXT:    s_abs_i32 s2, s2
+; GFX942-IR-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX942-IR-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX942-IR-NEXT:    s_ashr_i32 s3, s3, 31
+; GFX942-IR-NEXT:    v_readfirstlane_b32 s5, v0
+; GFX942-IR-NEXT:    s_mul_i32 s4, s4, s5
+; GFX942-IR-NEXT:    s_mul_hi_u32 s4, s5, s4
+; GFX942-IR-NEXT:    s_add_i32 s5, s5, s4
+; GFX942-IR-NEXT:    s_mul_hi_u32 s4, s2, s5
+; GFX942-IR-NEXT:    s_mul_i32 s4, s4, s6
+; GFX942-IR-NEXT:    s_sub_i32 s2, s2, s4
+; GFX942-IR-NEXT:    s_sub_i32 s4, s2, s6
+; GFX942-IR-NEXT:    s_cmp_ge_u32 s2, s6
+; GFX942-IR-NEXT:    s_cselect_b32 s2, s4, s2
+; GFX942-IR-NEXT:    s_sub_i32 s4, s2, s6
+; GFX942-IR-NEXT:    s_cmp_ge_u32 s2, s6
+; GFX942-IR-NEXT:    s_cselect_b32 s2, s4, s2
+; GFX942-IR-NEXT:    s_xor_b32 s2, s2, s3
+; GFX942-IR-NEXT:    s_sub_i32 s2, s2, s3
+; GFX942-IR-NEXT:    s_ashr_i32 s3, s2, 31
+; GFX942-IR-NEXT:    v_mov_b32_e32 v2, s2
+; GFX942-IR-NEXT:    v_mov_b32_e32 v3, s3
+; GFX942-IR-NEXT:    global_store_dwordx2 v1, v[2:3], s[0:1]
+; GFX942-IR-NEXT:    s_endpgm
   %1 = ashr i64 %x, 33
   %2 = ashr i64 %y, 33
   %result = srem i64 %1, %2
@@ -907,6 +1676,80 @@ define amdgpu_kernel void @s_test_srem32_64(ptr addrspace(1) %out, i64 %x, i64 %
 ; GCN-IR-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN-IR-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GCN-IR-NEXT:    s_endpgm
+;
+; GFX942-LABEL: s_test_srem32_64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dword s0, s[4:5], 0x38
+; GFX942-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_abs_i32 s6, s0
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v0, s6
+; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-NEXT:    s_sub_i32 s4, 0, s6
+; GFX942-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_ashr_i32 s2, s3, 31
+; GFX942-NEXT:    s_abs_i32 s3, s3
+; GFX942-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_readfirstlane_b32 s5, v0
+; GFX942-NEXT:    s_mul_i32 s4, s4, s5
+; GFX942-NEXT:    s_mul_hi_u32 s4, s5, s4
+; GFX942-NEXT:    s_add_i32 s5, s5, s4
+; GFX942-NEXT:    s_mul_hi_u32 s4, s3, s5
+; GFX942-NEXT:    s_mul_i32 s4, s4, s6
+; GFX942-NEXT:    s_sub_i32 s3, s3, s4
+; GFX942-NEXT:    s_sub_i32 s4, s3, s6
+; GFX942-NEXT:    s_cmp_ge_u32 s3, s6
+; GFX942-NEXT:    s_cselect_b32 s3, s4, s3
+; GFX942-NEXT:    s_sub_i32 s4, s3, s6
+; GFX942-NEXT:    s_cmp_ge_u32 s3, s6
+; GFX942-NEXT:    s_cselect_b32 s3, s4, s3
+; GFX942-NEXT:    s_xor_b32 s3, s3, s2
+; GFX942-NEXT:    s_sub_i32 s2, s3, s2
+; GFX942-NEXT:    s_ashr_i32 s3, s2, 31
+; GFX942-NEXT:    v_mov_b32_e32 v2, s2
+; GFX942-NEXT:    v_mov_b32_e32 v3, s3
+; GFX942-NEXT:    global_store_dwordx2 v1, v[2:3], s[0:1]
+; GFX942-NEXT:    s_endpgm
+;
+; GFX942-IR-LABEL: s_test_srem32_64:
+; GFX942-IR:       ; %bb.0:
+; GFX942-IR-NEXT:    s_load_dword s0, s[4:5], 0x38
+; GFX942-IR-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-IR-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-IR-NEXT:    s_abs_i32 s6, s0
+; GFX942-IR-NEXT:    v_cvt_f32_u32_e32 v0, s6
+; GFX942-IR-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-IR-NEXT:    s_sub_i32 s4, 0, s6
+; GFX942-IR-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; GFX942-IR-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-IR-NEXT:    s_ashr_i32 s2, s3, 31
+; GFX942-IR-NEXT:    s_abs_i32 s3, s3
+; GFX942-IR-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX942-IR-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX942-IR-NEXT:    s_nop 0
+; GFX942-IR-NEXT:    v_readfirstlane_b32 s5, v0
+; GFX942-IR-NEXT:    s_mul_i32 s4, s4, s5
+; GFX942-IR-NEXT:    s_mul_hi_u32 s4, s5, s4
+; GFX942-IR-NEXT:    s_add_i32 s5, s5, s4
+; GFX942-IR-NEXT:    s_mul_hi_u32 s4, s3, s5
+; GFX942-IR-NEXT:    s_mul_i32 s4, s4, s6
+; GFX942-IR-NEXT:    s_sub_i32 s3, s3, s4
+; GFX942-IR-NEXT:    s_sub_i32 s4, s3, s6
+; GFX942-IR-NEXT:    s_cmp_ge_u32 s3, s6
+; GFX942-IR-NEXT:    s_cselect_b32 s3, s4, s3
+; GFX942-IR-NEXT:    s_sub_i32 s4, s3, s6
+; GFX942-IR-NEXT:    s_cmp_ge_u32 s3, s6
+; GFX942-IR-NEXT:    s_cselect_b32 s3, s4, s3
+; GFX942-IR-NEXT:    s_xor_b32 s3, s3, s2
+; GFX942-IR-NEXT:    s_sub_i32 s2, s3, s2
+; GFX942-IR-NEXT:    s_ashr_i32 s3, s2, 31
+; GFX942-IR-NEXT:    v_mov_b32_e32 v2, s2
+; GFX942-IR-NEXT:    v_mov_b32_e32 v3, s3
+; GFX942-IR-NEXT:    global_store_dwordx2 v1, v[2:3], s[0:1]
+; GFX942-IR-NEXT:    s_endpgm
   %1 = ashr i64 %x, 32
   %2 = ashr i64 %y, 32
   %result = srem i64 %1, %2
@@ -1146,6 +1989,251 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 %
 ; GCN-IR-NEXT:    v_mov_b32_e32 v1, s5
 ; GCN-IR-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GCN-IR-NEXT:    s_endpgm
+;
+; GFX942-LABEL: s_test_srem33_64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x24
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x34
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_ashr_i64 s[2:3], s[10:11], 31
+; GFX942-NEXT:    s_ashr_i64 s[0:1], s[0:1], 31
+; GFX942-NEXT:    s_ashr_i32 s4, s1, 31
+; GFX942-NEXT:    s_add_u32 s0, s0, s4
+; GFX942-NEXT:    s_mov_b32 s5, s4
+; GFX942-NEXT:    s_addc_u32 s1, s1, s4
+; GFX942-NEXT:    s_xor_b64 s[4:5], s[0:1], s[4:5]
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v0, s4
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v1, s5
+; GFX942-NEXT:    s_sub_u32 s0, 0, s4
+; GFX942-NEXT:    s_subb_u32 s1, 0, s5
+; GFX942-NEXT:    v_fmamk_f32 v0, v1, 0x4f800000, v0
+; GFX942-NEXT:    v_rcp_f32_e32 v1, v0
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    v_mul_f32_e32 v1, 0x5f7ffffc, v1
+; GFX942-NEXT:    v_mul_f32_e32 v2, 0x2f800000, v1
+; GFX942-NEXT:    v_trunc_f32_e32 v2, v2
+; GFX942-NEXT:    v_fmamk_f32 v1, v2, 0xcf800000, v1
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX942-NEXT:    v_readfirstlane_b32 s6, v2
+; GFX942-NEXT:    v_readfirstlane_b32 s7, v1
+; GFX942-NEXT:    s_mul_i32 s10, s0, s6
+; GFX942-NEXT:    s_mul_hi_u32 s12, s0, s7
+; GFX942-NEXT:    s_mul_i32 s11, s1, s7
+; GFX942-NEXT:    s_add_i32 s10, s12, s10
+; GFX942-NEXT:    s_mul_i32 s13, s0, s7
+; GFX942-NEXT:    s_add_i32 s10, s10, s11
+; GFX942-NEXT:    s_mul_hi_u32 s11, s7, s10
+; GFX942-NEXT:    s_mul_i32 s12, s7, s10
+; GFX942-NEXT:    s_mul_hi_u32 s7, s7, s13
+; GFX942-NEXT:    s_add_u32 s7, s7, s12
+; GFX942-NEXT:    s_addc_u32 s11, 0, s11
+; GFX942-NEXT:    s_mul_hi_u32 s14, s6, s13
+; GFX942-NEXT:    s_mul_i32 s13, s6, s13
+; GFX942-NEXT:    s_add_u32 s7, s7, s13
+; GFX942-NEXT:    s_mul_hi_u32 s12, s6, s10
+; GFX942-NEXT:    s_addc_u32 s7, s11, s14
+; GFX942-NEXT:    s_addc_u32 s11, s12, 0
+; GFX942-NEXT:    s_mul_i32 s10, s6, s10
+; GFX942-NEXT:    s_add_u32 s7, s7, s10
+; GFX942-NEXT:    s_addc_u32 s10, 0, s11
+; GFX942-NEXT:    v_add_co_u32_e32 v1, vcc, s7, v1
+; GFX942-NEXT:    s_cmp_lg_u64 vcc, 0
+; GFX942-NEXT:    s_addc_u32 s6, s6, s10
+; GFX942-NEXT:    v_readfirstlane_b32 s10, v1
+; GFX942-NEXT:    s_mul_i32 s7, s0, s6
+; GFX942-NEXT:    s_mul_hi_u32 s11, s0, s10
+; GFX942-NEXT:    s_add_i32 s7, s11, s7
+; GFX942-NEXT:    s_mul_i32 s1, s1, s10
+; GFX942-NEXT:    s_add_i32 s7, s7, s1
+; GFX942-NEXT:    s_mul_i32 s0, s0, s10
+; GFX942-NEXT:    s_mul_hi_u32 s11, s6, s0
+; GFX942-NEXT:    s_mul_i32 s12, s6, s0
+; GFX942-NEXT:    s_mul_i32 s14, s10, s7
+; GFX942-NEXT:    s_mul_hi_u32 s0, s10, s0
+; GFX942-NEXT:    s_mul_hi_u32 s13, s10, s7
+; GFX942-NEXT:    s_add_u32 s0, s0, s14
+; GFX942-NEXT:    s_addc_u32 s10, 0, s13
+; GFX942-NEXT:    s_add_u32 s0, s0, s12
+; GFX942-NEXT:    s_mul_hi_u32 s1, s6, s7
+; GFX942-NEXT:    s_addc_u32 s0, s10, s11
+; GFX942-NEXT:    s_addc_u32 s1, s1, 0
+; GFX942-NEXT:    s_mul_i32 s7, s6, s7
+; GFX942-NEXT:    s_add_u32 s0, s0, s7
+; GFX942-NEXT:    s_addc_u32 s1, 0, s1
+; GFX942-NEXT:    v_add_co_u32_e32 v1, vcc, s0, v1
+; GFX942-NEXT:    s_cmp_lg_u64 vcc, 0
+; GFX942-NEXT:    s_addc_u32 s12, s6, s1
+; GFX942-NEXT:    s_ashr_i32 s6, s3, 31
+; GFX942-NEXT:    s_add_u32 s0, s2, s6
+; GFX942-NEXT:    s_mov_b32 s7, s6
+; GFX942-NEXT:    s_addc_u32 s1, s3, s6
+; GFX942-NEXT:    s_xor_b64 s[10:11], s[0:1], s[6:7]
+; GFX942-NEXT:    v_readfirstlane_b32 s2, v1
+; GFX942-NEXT:    s_mul_i32 s1, s10, s12
+; GFX942-NEXT:    s_mul_hi_u32 s3, s10, s2
+; GFX942-NEXT:    s_mul_hi_u32 s0, s10, s12
+; GFX942-NEXT:    s_add_u32 s1, s3, s1
+; GFX942-NEXT:    s_addc_u32 s0, 0, s0
+; GFX942-NEXT:    s_mul_hi_u32 s7, s11, s2
+; GFX942-NEXT:    s_mul_i32 s2, s11, s2
+; GFX942-NEXT:    s_add_u32 s1, s1, s2
+; GFX942-NEXT:    s_mul_hi_u32 s3, s11, s12
+; GFX942-NEXT:    s_addc_u32 s0, s0, s7
+; GFX942-NEXT:    s_addc_u32 s1, s3, 0
+; GFX942-NEXT:    s_mul_i32 s2, s11, s12
+; GFX942-NEXT:    s_add_u32 s0, s0, s2
+; GFX942-NEXT:    s_addc_u32 s1, 0, s1
+; GFX942-NEXT:    s_mul_i32 s1, s4, s1
+; GFX942-NEXT:    s_mul_hi_u32 s2, s4, s0
+; GFX942-NEXT:    s_add_i32 s1, s2, s1
+; GFX942-NEXT:    s_mul_i32 s2, s5, s0
+; GFX942-NEXT:    s_mul_i32 s0, s4, s0
+; GFX942-NEXT:    s_add_i32 s7, s1, s2
+; GFX942-NEXT:    v_mov_b32_e32 v1, s0
+; GFX942-NEXT:    s_sub_i32 s1, s11, s7
+; GFX942-NEXT:    v_sub_co_u32_e32 v1, vcc, s10, v1
+; GFX942-NEXT:    s_cmp_lg_u64 vcc, 0
+; GFX942-NEXT:    s_subb_u32 s10, s1, s5
+; GFX942-NEXT:    v_subrev_co_u32_e64 v2, s[0:1], s4, v1
+; GFX942-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX942-NEXT:    s_subb_u32 s12, s10, 0
+; GFX942-NEXT:    s_cmp_ge_u32 s12, s5
+; GFX942-NEXT:    s_cselect_b32 s13, -1, 0
+; GFX942-NEXT:    v_cmp_le_u32_e64 s[2:3], s4, v2
+; GFX942-NEXT:    s_cmp_eq_u32 s12, s5
+; GFX942-NEXT:    v_mov_b32_e32 v4, s13
+; GFX942-NEXT:    v_cndmask_b32_e64 v3, 0, -1, s[2:3]
+; GFX942-NEXT:    s_cselect_b64 s[2:3], -1, 0
+; GFX942-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX942-NEXT:    v_cndmask_b32_e64 v3, v4, v3, s[2:3]
+; GFX942-NEXT:    s_subb_u32 s2, s10, s5
+; GFX942-NEXT:    v_subrev_co_u32_e64 v4, s[0:1], s4, v2
+; GFX942-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX942-NEXT:    s_subb_u32 s2, s2, 0
+; GFX942-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v3
+; GFX942-NEXT:    v_mov_b32_e32 v3, s12
+; GFX942-NEXT:    s_cmp_lg_u64 vcc, 0
+; GFX942-NEXT:    v_cndmask_b32_e64 v2, v2, v4, s[0:1]
+; GFX942-NEXT:    v_mov_b32_e32 v4, s2
+; GFX942-NEXT:    v_cndmask_b32_e64 v3, v3, v4, s[0:1]
+; GFX942-NEXT:    s_subb_u32 s0, s11, s7
+; GFX942-NEXT:    s_cmp_ge_u32 s0, s5
+; GFX942-NEXT:    s_cselect_b32 s1, -1, 0
+; GFX942-NEXT:    v_cmp_le_u32_e32 vcc, s4, v1
+; GFX942-NEXT:    s_cmp_eq_u32 s0, s5
+; GFX942-NEXT:    v_mov_b32_e32 v5, s1
+; GFX942-NEXT:    v_cndmask_b32_e64 v4, 0, -1, vcc
+; GFX942-NEXT:    s_cselect_b64 vcc, -1, 0
+; GFX942-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
+; GFX942-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
+; GFX942-NEXT:    v_mov_b32_e32 v5, s0
+; GFX942-NEXT:    v_mov_b32_e32 v4, s6
+; GFX942-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX942-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX942-NEXT:    v_xor_b32_e32 v1, s6, v1
+; GFX942-NEXT:    v_xor_b32_e32 v3, s6, v3
+; GFX942-NEXT:    v_subrev_co_u32_e32 v2, vcc, s6, v1
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_subb_co_u32_e32 v3, vcc, v3, v4, vcc
+; GFX942-NEXT:    global_store_dwordx2 v0, v[2:3], s[8:9]
+; GFX942-NEXT:    s_endpgm
+;
+; GFX942-IR-LABEL: s_test_srem33_64:
+; GFX942-IR:       ; %bb.0: ; %_udiv-special-cases
+; GFX942-IR-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-IR-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX942-IR-NEXT:    s_mov_b64 s[8:9], 0
+; GFX942-IR-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-IR-NEXT:    s_ashr_i64 s[4:5], s[2:3], 31
+; GFX942-IR-NEXT:    s_ashr_i32 s2, s5, 31
+; GFX942-IR-NEXT:    s_mov_b32 s3, s2
+; GFX942-IR-NEXT:    s_ashr_i64 s[6:7], s[6:7], 31
+; GFX942-IR-NEXT:    s_xor_b64 s[4:5], s[4:5], s[2:3]
+; GFX942-IR-NEXT:    s_sub_u32 s4, s4, s2
+; GFX942-IR-NEXT:    s_subb_u32 s5, s5, s2
+; GFX942-IR-NEXT:    s_ashr_i32 s10, s7, 31
+; GFX942-IR-NEXT:    s_mov_b32 s11, s10
+; GFX942-IR-NEXT:    s_xor_b64 s[6:7], s[6:7], s[10:11]
+; GFX942-IR-NEXT:    s_sub_u32 s6, s6, s10
+; GFX942-IR-NEXT:    s_subb_u32 s7, s7, s10
+; GFX942-IR-NEXT:    s_cmp_eq_u64 s[6:7], 0
+; GFX942-IR-NEXT:    s_cselect_b64 s[10:11], -1, 0
+; GFX942-IR-NEXT:    s_cmp_eq_u64 s[4:5], 0
+; GFX942-IR-NEXT:    s_cselect_b64 s[12:13], -1, 0
+; GFX942-IR-NEXT:    s_or_b64 s[10:11], s[10:11], s[12:13]
+; GFX942-IR-NEXT:    s_flbit_i32_b64 s12, s[6:7]
+; GFX942-IR-NEXT:    s_flbit_i32_b64 s20, s[4:5]
+; GFX942-IR-NEXT:    s_sub_u32 s14, s12, s20
+; GFX942-IR-NEXT:    s_subb_u32 s15, 0, 0
+; GFX942-IR-NEXT:    v_cmp_gt_u64_e64 s[16:17], s[14:15], 63
+; GFX942-IR-NEXT:    s_or_b64 s[16:17], s[10:11], s[16:17]
+; GFX942-IR-NEXT:    s_cmp_eq_u64 s[14:15], 63
+; GFX942-IR-NEXT:    s_cselect_b64 s[18:19], -1, 0
+; GFX942-IR-NEXT:    s_and_b64 s[10:11], s[16:17], exec
+; GFX942-IR-NEXT:    s_cselect_b32 s11, 0, s5
+; GFX942-IR-NEXT:    s_cselect_b32 s10, 0, s4
+; GFX942-IR-NEXT:    s_or_b64 s[16:17], s[16:17], s[18:19]
+; GFX942-IR-NEXT:    s_andn2_b64 vcc, exec, s[16:17]
+; GFX942-IR-NEXT:    s_mov_b32 s13, 0
+; GFX942-IR-NEXT:    s_cbranch_vccz .LBB8_5
+; GFX942-IR-NEXT:  ; %bb.1: ; %udiv-bb1
+; GFX942-IR-NEXT:    s_add_u32 s16, s14, 1
+; GFX942-IR-NEXT:    s_addc_u32 s17, s15, 0
+; GFX942-IR-NEXT:    s_cmp_eq_u64 s[16:17], 0
+; GFX942-IR-NEXT:    s_cselect_b64 s[10:11], -1, 0
+; GFX942-IR-NEXT:    s_sub_i32 s14, 63, s14
+; GFX942-IR-NEXT:    s_andn2_b64 vcc, exec, s[10:11]
+; GFX942-IR-NEXT:    s_lshl_b64 s[10:11], s[4:5], s14
+; GFX942-IR-NEXT:    s_cbranch_vccz .LBB8_4
+; GFX942-IR-NEXT:  ; %bb.2: ; %udiv-preheader
+; GFX942-IR-NEXT:    s_lshr_b64 s[14:15], s[4:5], s16
+; GFX942-IR-NEXT:    s_add_u32 s18, s6, -1
+; GFX942-IR-NEXT:    s_addc_u32 s19, s7, -1
+; GFX942-IR-NEXT:    s_not_b64 s[8:9], s[12:13]
+; GFX942-IR-NEXT:    s_add_u32 s12, s8, s20
+; GFX942-IR-NEXT:    s_addc_u32 s13, s9, 0
+; GFX942-IR-NEXT:    s_mov_b64 s[16:17], 0
+; GFX942-IR-NEXT:    s_mov_b32 s9, 0
+; GFX942-IR-NEXT:  .LBB8_3: ; %udiv-do-while
+; GFX942-IR-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-IR-NEXT:    s_lshl_b64 s[14:15], s[14:15], 1
+; GFX942-IR-NEXT:    s_lshr_b32 s8, s11, 31
+; GFX942-IR-NEXT:    s_lshl_b64 s[10:11], s[10:11], 1
+; GFX942-IR-NEXT:    s_or_b64 s[14:15], s[14:15], s[8:9]
+; GFX942-IR-NEXT:    s_or_b64 s[10:11], s[16:17], s[10:11]
+; GFX942-IR-NEXT:    s_sub_u32 s8, s18, s14
+; GFX942-IR-NEXT:    s_subb_u32 s8, s19, s15
+; GFX942-IR-NEXT:    s_ashr_i32 s16, s8, 31
+; GFX942-IR-NEXT:    s_mov_b32 s17, s16
+; GFX942-IR-NEXT:    s_and_b32 s8, s16, 1
+; GFX942-IR-NEXT:    s_and_b64 s[16:17], s[16:17], s[6:7]
+; GFX942-IR-NEXT:    s_sub_u32 s14, s14, s16
+; GFX942-IR-NEXT:    s_subb_u32 s15, s15, s17
+; GFX942-IR-NEXT:    s_add_u32 s12, s12, 1
+; GFX942-IR-NEXT:    s_addc_u32 s13, s13, 0
+; GFX942-IR-NEXT:    s_cmp_eq_u64 s[12:13], 0
+; GFX942-IR-NEXT:    s_mov_b64 s[16:17], s[8:9]
+; GFX942-IR-NEXT:    s_cbranch_scc0 .LBB8_3
+; GFX942-IR-NEXT:  .LBB8_4: ; %Flow7
+; GFX942-IR-NEXT:    s_lshl_b64 s[10:11], s[10:11], 1
+; GFX942-IR-NEXT:    s_or_b64 s[10:11], s[8:9], s[10:11]
+; GFX942-IR-NEXT:  .LBB8_5: ; %udiv-end
+; GFX942-IR-NEXT:    s_mul_i32 s8, s6, s11
+; GFX942-IR-NEXT:    s_mul_hi_u32 s9, s6, s10
+; GFX942-IR-NEXT:    s_add_i32 s8, s9, s8
+; GFX942-IR-NEXT:    s_mul_i32 s7, s7, s10
+; GFX942-IR-NEXT:    s_add_i32 s8, s8, s7
+; GFX942-IR-NEXT:    s_mul_i32 s6, s6, s10
+; GFX942-IR-NEXT:    s_sub_u32 s4, s4, s6
+; GFX942-IR-NEXT:    s_subb_u32 s5, s5, s8
+; GFX942-IR-NEXT:    s_xor_b64 s[4:5], s[4:5], s[2:3]
+; GFX942-IR-NEXT:    s_sub_u32 s2, s4, s2
+; GFX942-IR-NEXT:    s_subb_u32 s3, s5, s3
+; GFX942-IR-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-IR-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX942-IR-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX942-IR-NEXT:    s_endpgm
   %1 = ashr i64 %x, 31
   %2 = ashr i64 %y, 31
   %result = srem i64 %1, %2
@@ -1225,6 +2313,78 @@ define amdgpu_kernel void @s_test_srem24_48(ptr addrspace(1) %out, i48 %x, i48 %
 ; GCN-IR-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GCN-IR-NEXT:    buffer_store_short v1, off, s[4:7], 0 offset:4
 ; GCN-IR-NEXT:    s_endpgm
+;
+; GFX942-LABEL: s_test_srem24_48:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_mov_b32 s4, s3
+; GFX942-NEXT:    s_mov_b32 s8, s7
+; GFX942-NEXT:    s_bfe_i64 s[8:9], s[8:9], 0x100000
+; GFX942-NEXT:    v_mov_b32_e32 v1, s6
+; GFX942-NEXT:    v_alignbit_b32 v1, s8, v1, 24
+; GFX942-NEXT:    v_cvt_f32_i32_e32 v2, v1
+; GFX942-NEXT:    s_bfe_i64 s[4:5], s[4:5], 0x100000
+; GFX942-NEXT:    v_mov_b32_e32 v3, s2
+; GFX942-NEXT:    v_alignbit_b32 v3, s4, v3, 24
+; GFX942-NEXT:    v_cvt_f32_i32_e32 v4, v3
+; GFX942-NEXT:    v_rcp_iflag_f32_e32 v5, v2
+; GFX942-NEXT:    v_xor_b32_e32 v6, v3, v1
+; GFX942-NEXT:    v_ashrrev_i32_e32 v6, 30, v6
+; GFX942-NEXT:    v_or_b32_e32 v6, 1, v6
+; GFX942-NEXT:    v_mul_f32_e32 v5, v4, v5
+; GFX942-NEXT:    v_trunc_f32_e32 v5, v5
+; GFX942-NEXT:    v_cvt_i32_f32_e32 v7, v5
+; GFX942-NEXT:    v_fma_f32 v4, -v5, v2, v4
+; GFX942-NEXT:    v_cmp_ge_f32_e64 vcc, |v4|, |v2|
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v2, 0, v6, vcc
+; GFX942-NEXT:    v_add_u32_e32 v2, v7, v2
+; GFX942-NEXT:    v_mul_lo_u32 v1, v2, v1
+; GFX942-NEXT:    v_sub_u32_e32 v1, v3, v1
+; GFX942-NEXT:    v_bfe_i32 v1, v1, 0, 24
+; GFX942-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
+; GFX942-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX942-NEXT:    global_store_short v0, v2, s[0:1] offset:4
+; GFX942-NEXT:    s_endpgm
+;
+; GFX942-IR-LABEL: s_test_srem24_48:
+; GFX942-IR:       ; %bb.0:
+; GFX942-IR-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-IR-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX942-IR-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-IR-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-IR-NEXT:    s_mov_b32 s4, s3
+; GFX942-IR-NEXT:    s_mov_b32 s8, s7
+; GFX942-IR-NEXT:    s_bfe_i64 s[8:9], s[8:9], 0x100000
+; GFX942-IR-NEXT:    v_mov_b32_e32 v1, s6
+; GFX942-IR-NEXT:    v_alignbit_b32 v1, s8, v1, 24
+; GFX942-IR-NEXT:    v_cvt_f32_i32_e32 v2, v1
+; GFX942-IR-NEXT:    s_bfe_i64 s[4:5], s[4:5], 0x100000
+; GFX942-IR-NEXT:    v_mov_b32_e32 v3, s2
+; GFX942-IR-NEXT:    v_alignbit_b32 v3, s4, v3, 24
+; GFX942-IR-NEXT:    v_cvt_f32_i32_e32 v4, v3
+; GFX942-IR-NEXT:    v_rcp_iflag_f32_e32 v5, v2
+; GFX942-IR-NEXT:    v_xor_b32_e32 v6, v3, v1
+; GFX942-IR-NEXT:    v_ashrrev_i32_e32 v6, 30, v6
+; GFX942-IR-NEXT:    v_or_b32_e32 v6, 1, v6
+; GFX942-IR-NEXT:    v_mul_f32_e32 v5, v4, v5
+; GFX942-IR-NEXT:    v_trunc_f32_e32 v5, v5
+; GFX942-IR-NEXT:    v_cvt_i32_f32_e32 v7, v5
+; GFX942-IR-NEXT:    v_fma_f32 v4, -v5, v2, v4
+; GFX942-IR-NEXT:    v_cmp_ge_f32_e64 vcc, |v4|, |v2|
+; GFX942-IR-NEXT:    s_nop 1
+; GFX942-IR-NEXT:    v_cndmask_b32_e32 v2, 0, v6, vcc
+; GFX942-IR-NEXT:    v_add_u32_e32 v2, v7, v2
+; GFX942-IR-NEXT:    v_mul_lo_u32 v1, v2, v1
+; GFX942-IR-NEXT:    v_sub_u32_e32 v1, v3, v1
+; GFX942-IR-NEXT:    v_bfe_i32 v1, v1, 0, 24
+; GFX942-IR-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
+; GFX942-IR-NEXT:    global_store_dword v0, v1, s[0:1]
+; GFX942-IR-NEXT:    global_store_short v0, v2, s[0:1] offset:4
+; GFX942-IR-NEXT:    s_endpgm
   %1 = ashr i48 %x, 24
   %2 = ashr i48 %y, 24
   %result = srem i48 %1, %2
@@ -1419,6 +2579,211 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(ptr addrspace(1) %out, i64 %x)
 ; GCN-IR-NEXT:    v_subb_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN-IR-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GCN-IR-NEXT:    s_endpgm
+;
+; GFX942-LABEL: s_test_srem_k_num_i64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_ashr_i32 s0, s11, 31
+; GFX942-NEXT:    s_add_u32 s2, s10, s0
+; GFX942-NEXT:    s_mov_b32 s1, s0
+; GFX942-NEXT:    s_addc_u32 s3, s11, s0
+; GFX942-NEXT:    s_xor_b64 s[2:3], s[2:3], s[0:1]
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v0, s2
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v1, s3
+; GFX942-NEXT:    s_sub_u32 s0, 0, s2
+; GFX942-NEXT:    s_subb_u32 s1, 0, s3
+; GFX942-NEXT:    v_fmamk_f32 v0, v1, 0x4f800000, v0
+; GFX942-NEXT:    v_rcp_f32_e32 v0, v0
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
+; GFX942-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
+; GFX942-NEXT:    v_trunc_f32_e32 v1, v1
+; GFX942-NEXT:    v_fmamk_f32 v0, v1, 0xcf800000, v0
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX942-NEXT:    v_readfirstlane_b32 s4, v1
+; GFX942-NEXT:    v_readfirstlane_b32 s5, v0
+; GFX942-NEXT:    s_mul_i32 s6, s0, s4
+; GFX942-NEXT:    s_mul_hi_u32 s10, s0, s5
+; GFX942-NEXT:    s_mul_i32 s7, s1, s5
+; GFX942-NEXT:    s_add_i32 s6, s10, s6
+; GFX942-NEXT:    s_mul_i32 s11, s0, s5
+; GFX942-NEXT:    s_add_i32 s6, s6, s7
+; GFX942-NEXT:    s_mul_hi_u32 s10, s5, s11
+; GFX942-NEXT:    s_mul_hi_u32 s7, s5, s6
+; GFX942-NEXT:    s_mul_i32 s5, s5, s6
+; GFX942-NEXT:    s_add_u32 s5, s10, s5
+; GFX942-NEXT:    s_addc_u32 s7, 0, s7
+; GFX942-NEXT:    s_mul_hi_u32 s10, s4, s11
+; GFX942-NEXT:    s_mul_i32 s11, s4, s11
+; GFX942-NEXT:    s_add_u32 s5, s5, s11
+; GFX942-NEXT:    s_mul_hi_u32 s12, s4, s6
+; GFX942-NEXT:    s_addc_u32 s5, s7, s10
+; GFX942-NEXT:    s_addc_u32 s7, s12, 0
+; GFX942-NEXT:    s_mul_i32 s6, s4, s6
+; GFX942-NEXT:    s_add_u32 s5, s5, s6
+; GFX942-NEXT:    s_addc_u32 s6, 0, s7
+; GFX942-NEXT:    v_add_co_u32_e32 v0, vcc, s5, v0
+; GFX942-NEXT:    s_cmp_lg_u64 vcc, 0
+; GFX942-NEXT:    s_addc_u32 s4, s4, s6
+; GFX942-NEXT:    v_readfirstlane_b32 s6, v0
+; GFX942-NEXT:    s_mul_i32 s5, s0, s4
+; GFX942-NEXT:    s_mul_hi_u32 s7, s0, s6
+; GFX942-NEXT:    s_add_i32 s5, s7, s5
+; GFX942-NEXT:    s_mul_i32 s1, s1, s6
+; GFX942-NEXT:    s_add_i32 s5, s5, s1
+; GFX942-NEXT:    s_mul_i32 s0, s0, s6
+; GFX942-NEXT:    s_mul_hi_u32 s7, s4, s0
+; GFX942-NEXT:    s_mul_i32 s10, s4, s0
+; GFX942-NEXT:    s_mul_i32 s12, s6, s5
+; GFX942-NEXT:    s_mul_hi_u32 s0, s6, s0
+; GFX942-NEXT:    s_mul_hi_u32 s11, s6, s5
+; GFX942-NEXT:    s_add_u32 s0, s0, s12
+; GFX942-NEXT:    s_addc_u32 s6, 0, s11
+; GFX942-NEXT:    s_add_u32 s0, s0, s10
+; GFX942-NEXT:    s_mul_hi_u32 s1, s4, s5
+; GFX942-NEXT:    s_addc_u32 s0, s6, s7
+; GFX942-NEXT:    s_addc_u32 s1, s1, 0
+; GFX942-NEXT:    s_mul_i32 s5, s4, s5
+; GFX942-NEXT:    s_add_u32 s0, s0, s5
+; GFX942-NEXT:    s_addc_u32 s1, 0, s1
+; GFX942-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v0
+; GFX942-NEXT:    s_cmp_lg_u64 vcc, 0
+; GFX942-NEXT:    s_addc_u32 s0, s4, s1
+; GFX942-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX942-NEXT:    s_mul_hi_u32 s1, s0, 24
+; GFX942-NEXT:    s_mul_i32 s0, s0, 24
+; GFX942-NEXT:    s_mul_hi_u32 s4, s4, 24
+; GFX942-NEXT:    s_add_u32 s0, s4, s0
+; GFX942-NEXT:    s_addc_u32 s0, 0, s1
+; GFX942-NEXT:    s_mul_i32 s1, s3, s0
+; GFX942-NEXT:    s_mul_hi_u32 s4, s2, s0
+; GFX942-NEXT:    s_add_i32 s6, s4, s1
+; GFX942-NEXT:    s_mul_i32 s0, s2, s0
+; GFX942-NEXT:    s_sub_i32 s1, 0, s6
+; GFX942-NEXT:    v_sub_co_u32_e64 v0, s[4:5], 24, s0
+; GFX942-NEXT:    s_cmp_lg_u64 s[4:5], 0
+; GFX942-NEXT:    s_subb_u32 s7, s1, s3
+; GFX942-NEXT:    v_subrev_co_u32_e32 v1, vcc, s2, v0
+; GFX942-NEXT:    s_cmp_lg_u64 vcc, 0
+; GFX942-NEXT:    s_subb_u32 s10, s7, 0
+; GFX942-NEXT:    s_cmp_ge_u32 s10, s3
+; GFX942-NEXT:    s_cselect_b32 s11, -1, 0
+; GFX942-NEXT:    v_cmp_le_u32_e64 s[0:1], s2, v1
+; GFX942-NEXT:    s_cmp_eq_u32 s10, s3
+; GFX942-NEXT:    v_mov_b32_e32 v4, s11
+; GFX942-NEXT:    v_cndmask_b32_e64 v3, 0, -1, s[0:1]
+; GFX942-NEXT:    s_cselect_b64 s[0:1], -1, 0
+; GFX942-NEXT:    s_cmp_lg_u64 vcc, 0
+; GFX942-NEXT:    v_cndmask_b32_e64 v3, v4, v3, s[0:1]
+; GFX942-NEXT:    s_subb_u32 s0, s7, s3
+; GFX942-NEXT:    v_subrev_co_u32_e32 v4, vcc, s2, v1
+; GFX942-NEXT:    s_cmp_lg_u64 vcc, 0
+; GFX942-NEXT:    s_subb_u32 s0, s0, 0
+; GFX942-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
+; GFX942-NEXT:    s_cmp_lg_u64 s[4:5], 0
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_cndmask_b32_e32 v3, v1, v4, vcc
+; GFX942-NEXT:    v_mov_b32_e32 v4, s0
+; GFX942-NEXT:    s_subb_u32 s0, 0, s6
+; GFX942-NEXT:    v_mov_b32_e32 v1, s10
+; GFX942-NEXT:    s_cmp_ge_u32 s0, s3
+; GFX942-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
+; GFX942-NEXT:    s_cselect_b32 s1, -1, 0
+; GFX942-NEXT:    v_cmp_le_u32_e32 vcc, s2, v0
+; GFX942-NEXT:    s_cmp_eq_u32 s0, s3
+; GFX942-NEXT:    v_mov_b32_e32 v5, s1
+; GFX942-NEXT:    v_cndmask_b32_e64 v4, 0, -1, vcc
+; GFX942-NEXT:    s_cselect_b64 vcc, -1, 0
+; GFX942-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
+; GFX942-NEXT:    v_mov_b32_e32 v5, s0
+; GFX942-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
+; GFX942-NEXT:    global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX942-NEXT:    s_endpgm
+;
+; GFX942-IR-LABEL: s_test_srem_k_num_i64:
+; GFX942-IR:       ; %bb.0: ; %_udiv-special-cases
+; GFX942-IR-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-IR-NEXT:    s_mov_b64 s[4:5], 0
+; GFX942-IR-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-IR-NEXT:    s_ashr_i32 s6, s3, 31
+; GFX942-IR-NEXT:    s_mov_b32 s7, s6
+; GFX942-IR-NEXT:    s_xor_b64 s[2:3], s[2:3], s[6:7]
+; GFX942-IR-NEXT:    s_sub_u32 s2, s2, s6
+; GFX942-IR-NEXT:    s_subb_u32 s3, s3, s6
+; GFX942-IR-NEXT:    s_cmp_eq_u64 s[2:3], 0
+; GFX942-IR-NEXT:    s_flbit_i32_b64 s12, s[2:3]
+; GFX942-IR-NEXT:    s_cselect_b64 s[6:7], -1, 0
+; GFX942-IR-NEXT:    s_add_u32 s8, s12, 0xffffffc5
+; GFX942-IR-NEXT:    s_addc_u32 s9, 0, -1
+; GFX942-IR-NEXT:    v_cmp_gt_u64_e64 s[10:11], s[8:9], 63
+; GFX942-IR-NEXT:    s_or_b64 s[10:11], s[6:7], s[10:11]
+; GFX942-IR-NEXT:    s_cmp_eq_u64 s[8:9], 63
+; GFX942-IR-NEXT:    s_cselect_b64 s[14:15], -1, 0
+; GFX942-IR-NEXT:    s_and_b64 s[6:7], s[10:11], exec
+; GFX942-IR-NEXT:    s_cselect_b32 s6, 0, 24
+; GFX942-IR-NEXT:    s_or_b64 s[10:11], s[10:11], s[14:15]
+; GFX942-IR-NEXT:    s_andn2_b64 vcc, exec, s[10:11]
+; GFX942-IR-NEXT:    s_mov_b32 s7, 0
+; GFX942-IR-NEXT:    s_cbranch_vccz .LBB10_5
+; GFX942-IR-NEXT:  ; %bb.1: ; %udiv-bb1
+; GFX942-IR-NEXT:    s_add_u32 s10, s8, 1
+; GFX942-IR-NEXT:    s_addc_u32 s11, s9, 0
+; GFX942-IR-NEXT:    s_cmp_eq_u64 s[10:11], 0
+; GFX942-IR-NEXT:    s_cselect_b64 s[6:7], -1, 0
+; GFX942-IR-NEXT:    s_sub_i32 s8, 63, s8
+; GFX942-IR-NEXT:    s_andn2_b64 vcc, exec, s[6:7]
+; GFX942-IR-NEXT:    s_lshl_b64 s[6:7], 24, s8
+; GFX942-IR-NEXT:    s_cbranch_vccz .LBB10_4
+; GFX942-IR-NEXT:  ; %bb.2: ; %udiv-preheader
+; GFX942-IR-NEXT:    s_lshr_b64 s[10:11], 24, s10
+; GFX942-IR-NEXT:    s_add_u32 s14, s2, -1
+; GFX942-IR-NEXT:    s_addc_u32 s15, s3, -1
+; GFX942-IR-NEXT:    s_sub_u32 s8, 58, s12
+; GFX942-IR-NEXT:    s_subb_u32 s9, 0, 0
+; GFX942-IR-NEXT:    s_mov_b64 s[12:13], 0
+; GFX942-IR-NEXT:    s_mov_b32 s5, 0
+; GFX942-IR-NEXT:  .LBB10_3: ; %udiv-do-while
+; GFX942-IR-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-IR-NEXT:    s_lshl_b64 s[10:11], s[10:11], 1
+; GFX942-IR-NEXT:    s_lshr_b32 s4, s7, 31
+; GFX942-IR-NEXT:    s_lshl_b64 s[6:7], s[6:7], 1
+; GFX942-IR-NEXT:    s_or_b64 s[10:11], s[10:11], s[4:5]
+; GFX942-IR-NEXT:    s_or_b64 s[6:7], s[12:13], s[6:7]
+; GFX942-IR-NEXT:    s_sub_u32 s4, s14, s10
+; GFX942-IR-NEXT:    s_subb_u32 s4, s15, s11
+; GFX942-IR-NEXT:    s_ashr_i32 s12, s4, 31
+; GFX942-IR-NEXT:    s_mov_b32 s13, s12
+; GFX942-IR-NEXT:    s_and_b32 s4, s12, 1
+; GFX942-IR-NEXT:    s_and_b64 s[12:13], s[12:13], s[2:3]
+; GFX942-IR-NEXT:    s_sub_u32 s10, s10, s12
+; GFX942-IR-NEXT:    s_subb_u32 s11, s11, s13
+; GFX942-IR-NEXT:    s_add_u32 s8, s8, 1
+; GFX942-IR-NEXT:    s_addc_u32 s9, s9, 0
+; GFX942-IR-NEXT:    s_cmp_eq_u64 s[8:9], 0
+; GFX942-IR-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; GFX942-IR-NEXT:    s_cbranch_scc0 .LBB10_3
+; GFX942-IR-NEXT:  .LBB10_4: ; %Flow6
+; GFX942-IR-NEXT:    s_lshl_b64 s[6:7], s[6:7], 1
+; GFX942-IR-NEXT:    s_or_b64 s[6:7], s[4:5], s[6:7]
+; GFX942-IR-NEXT:  .LBB10_5: ; %udiv-end
+; GFX942-IR-NEXT:    s_mul_i32 s4, s2, s7
+; GFX942-IR-NEXT:    s_mul_hi_u32 s5, s2, s6
+; GFX942-IR-NEXT:    s_add_i32 s4, s5, s4
+; GFX942-IR-NEXT:    s_mul_i32 s3, s3, s6
+; GFX942-IR-NEXT:    s_add_i32 s4, s4, s3
+; GFX942-IR-NEXT:    s_mul_i32 s2, s2, s6
+; GFX942-IR-NEXT:    s_sub_u32 s2, 24, s2
+; GFX942-IR-NEXT:    s_subb_u32 s3, 0, s4
+; GFX942-IR-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-IR-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX942-IR-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX942-IR-NEXT:    s_endpgm
   %result = srem i64 24, %x
   store i64 %result, ptr addrspace(1) %out
   ret void
@@ -1615,6 +2980,193 @@ define i64 @v_test_srem_k_num_i64(i64 %x) {
 ; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, 24, v0
 ; GCN-IR-NEXT:    v_subb_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN-IR-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: v_test_srem_k_num_i64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
+; GFX942-NEXT:    v_mov_b32_e32 v3, v2
+; GFX942-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
+; GFX942-NEXT:    v_xor_b32_e32 v10, v1, v2
+; GFX942-NEXT:    v_xor_b32_e32 v11, v0, v2
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v0, v11
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v1, v10
+; GFX942-NEXT:    v_sub_co_u32_e32 v9, vcc, 0, v11
+; GFX942-NEXT:    v_mov_b32_e32 v5, 0
+; GFX942-NEXT:    v_fmamk_f32 v0, v1, 0x4f800000, v0
+; GFX942-NEXT:    v_rcp_f32_e32 v0, v0
+; GFX942-NEXT:    v_subb_co_u32_e32 v12, vcc, 0, v10, vcc
+; GFX942-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
+; GFX942-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
+; GFX942-NEXT:    v_trunc_f32_e32 v1, v1
+; GFX942-NEXT:    v_fmamk_f32 v0, v1, 0xcf800000, v0
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v8, v0
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v13, v1
+; GFX942-NEXT:    v_mul_lo_u32 v2, v12, v8
+; GFX942-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], v9, v8, 0
+; GFX942-NEXT:    v_mul_lo_u32 v3, v9, v13
+; GFX942-NEXT:    v_add3_u32 v1, v1, v3, v2
+; GFX942-NEXT:    v_mad_u64_u32 v[2:3], s[0:1], v8, v1, 0
+; GFX942-NEXT:    v_mul_hi_u32 v4, v8, v0
+; GFX942-NEXT:    v_lshl_add_u64 v[2:3], v[4:5], 0, v[2:3]
+; GFX942-NEXT:    v_mad_u64_u32 v[6:7], s[0:1], v13, v1, 0
+; GFX942-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], v13, v0, 0
+; GFX942-NEXT:    v_add_co_u32_e32 v0, vcc, v2, v0
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_addc_co_u32_e32 v4, vcc, v3, v1, vcc
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v7, vcc
+; GFX942-NEXT:    v_lshl_add_u64 v[0:1], v[4:5], 0, v[6:7]
+; GFX942-NEXT:    v_add_co_u32_e32 v14, vcc, v8, v0
+; GFX942-NEXT:    v_mul_lo_u32 v3, v12, v14
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_addc_co_u32_e32 v13, vcc, v13, v1, vcc
+; GFX942-NEXT:    v_mul_lo_u32 v2, v9, v13
+; GFX942-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], v9, v14, 0
+; GFX942-NEXT:    v_add3_u32 v1, v1, v2, v3
+; GFX942-NEXT:    v_mad_u64_u32 v[8:9], s[0:1], v14, v1, 0
+; GFX942-NEXT:    v_mul_hi_u32 v4, v14, v0
+; GFX942-NEXT:    v_mad_u64_u32 v[2:3], s[0:1], v13, v1, 0
+; GFX942-NEXT:    v_mad_u64_u32 v[6:7], s[0:1], v13, v0, 0
+; GFX942-NEXT:    v_lshl_add_u64 v[0:1], v[4:5], 0, v[8:9]
+; GFX942-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v6
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_addc_co_u32_e32 v4, vcc, v1, v7, vcc
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX942-NEXT:    v_lshl_add_u64 v[0:1], v[4:5], 0, v[2:3]
+; GFX942-NEXT:    v_add_co_u32_e32 v2, vcc, v14, v0
+; GFX942-NEXT:    v_mul_hi_u32 v4, v2, 24
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_addc_co_u32_e32 v0, vcc, v13, v1, vcc
+; GFX942-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], v0, 24, 0
+; GFX942-NEXT:    v_lshl_add_u64 v[0:1], v[4:5], 0, v[0:1]
+; GFX942-NEXT:    v_mad_u64_u32 v[2:3], s[0:1], v11, v1, 0
+; GFX942-NEXT:    v_mov_b32_e32 v0, v3
+; GFX942-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], v10, v1, v[0:1]
+; GFX942-NEXT:    v_sub_u32_e32 v1, 0, v0
+; GFX942-NEXT:    v_sub_co_u32_e32 v2, vcc, 24, v2
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_subb_co_u32_e64 v1, s[0:1], v1, v10, vcc
+; GFX942-NEXT:    v_sub_co_u32_e64 v3, s[0:1], v2, v11
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_subbrev_co_u32_e64 v4, s[2:3], 0, v1, s[0:1]
+; GFX942-NEXT:    v_cmp_ge_u32_e64 s[2:3], v4, v10
+; GFX942-NEXT:    v_subb_co_u32_e64 v1, s[0:1], v1, v10, s[0:1]
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_cndmask_b32_e64 v5, 0, -1, s[2:3]
+; GFX942-NEXT:    v_cmp_ge_u32_e64 s[2:3], v3, v11
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[2:3]
+; GFX942-NEXT:    v_cmp_eq_u32_e64 s[2:3], v4, v10
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e64 v5, v5, v6, s[2:3]
+; GFX942-NEXT:    v_sub_co_u32_e64 v6, s[0:1], v3, v11
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_subbrev_co_u32_e64 v1, s[0:1], 0, v1, s[0:1]
+; GFX942-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v5
+; GFX942-NEXT:    v_subb_co_u32_e32 v5, vcc, 0, v0, vcc
+; GFX942-NEXT:    v_cmp_ge_u32_e32 vcc, v5, v10
+; GFX942-NEXT:    v_cndmask_b32_e64 v3, v3, v6, s[0:1]
+; GFX942-NEXT:    v_cndmask_b32_e64 v1, v4, v1, s[0:1]
+; GFX942-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
+; GFX942-NEXT:    v_cmp_ge_u32_e32 vcc, v2, v11
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, v5, v10
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc
+; GFX942-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
+; GFX942-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-IR-LABEL: v_test_srem_k_num_i64:
+; GFX942-IR:       ; %bb.0: ; %_udiv-special-cases
+; GFX942-IR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-IR-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
+; GFX942-IR-NEXT:    v_xor_b32_e32 v0, v0, v2
+; GFX942-IR-NEXT:    v_xor_b32_e32 v1, v1, v2
+; GFX942-IR-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v2
+; GFX942-IR-NEXT:    s_movk_i32 s0, 0xffc5
+; GFX942-IR-NEXT:    s_nop 0
+; GFX942-IR-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v2, vcc
+; GFX942-IR-NEXT:    v_ffbh_u32_e32 v2, v0
+; GFX942-IR-NEXT:    v_add_u32_e32 v2, 32, v2
+; GFX942-IR-NEXT:    v_ffbh_u32_e32 v3, v1
+; GFX942-IR-NEXT:    v_min_u32_e32 v8, v2, v3
+; GFX942-IR-NEXT:    v_mov_b32_e32 v9, 0
+; GFX942-IR-NEXT:    s_mov_b32 s1, -1
+; GFX942-IR-NEXT:    v_lshl_add_u64 v[6:7], v[8:9], 0, s[0:1]
+; GFX942-IR-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[0:1]
+; GFX942-IR-NEXT:    v_cmp_lt_u64_e64 s[0:1], 63, v[6:7]
+; GFX942-IR-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-IR-NEXT:    v_cmp_ne_u64_e32 vcc, 63, v[6:7]
+; GFX942-IR-NEXT:    v_cndmask_b32_e64 v2, 24, 0, s[0:1]
+; GFX942-IR-NEXT:    s_xor_b64 s[0:1], s[0:1], -1
+; GFX942-IR-NEXT:    s_and_b64 s[2:3], s[0:1], vcc
+; GFX942-IR-NEXT:    s_and_saveexec_b64 s[0:1], s[2:3]
+; GFX942-IR-NEXT:    s_cbranch_execz .LBB11_6
+; GFX942-IR-NEXT:  ; %bb.1: ; %udiv-bb1
+; GFX942-IR-NEXT:    v_lshl_add_u64 v[10:11], v[6:7], 0, 1
+; GFX942-IR-NEXT:    v_sub_u32_e32 v2, 63, v6
+; GFX942-IR-NEXT:    v_mov_b64_e32 v[4:5], 0
+; GFX942-IR-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[10:11]
+; GFX942-IR-NEXT:    v_lshlrev_b64 v[2:3], v2, 24
+; GFX942-IR-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX942-IR-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
+; GFX942-IR-NEXT:    s_cbranch_execz .LBB11_5
+; GFX942-IR-NEXT:  ; %bb.2: ; %udiv-preheader
+; GFX942-IR-NEXT:    v_sub_co_u32_e32 v8, vcc, 58, v8
+; GFX942-IR-NEXT:    v_lshrrev_b64 v[10:11], v10, 24
+; GFX942-IR-NEXT:    s_nop 0
+; GFX942-IR-NEXT:    v_subb_co_u32_e64 v9, s[4:5], 0, 0, vcc
+; GFX942-IR-NEXT:    v_lshl_add_u64 v[6:7], v[0:1], 0, -1
+; GFX942-IR-NEXT:    s_mov_b64 s[4:5], 0
+; GFX942-IR-NEXT:    v_mov_b64_e32 v[12:13], 0
+; GFX942-IR-NEXT:    v_mov_b32_e32 v5, 0
+; GFX942-IR-NEXT:  .LBB11_3: ; %udiv-do-while
+; GFX942-IR-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-IR-NEXT:    v_lshlrev_b64 v[10:11], 1, v[10:11]
+; GFX942-IR-NEXT:    v_lshrrev_b32_e32 v4, 31, v3
+; GFX942-IR-NEXT:    v_or_b32_e32 v10, v10, v4
+; GFX942-IR-NEXT:    v_sub_co_u32_e32 v4, vcc, v6, v10
+; GFX942-IR-NEXT:    v_lshlrev_b64 v[2:3], 1, v[2:3]
+; GFX942-IR-NEXT:    s_nop 0
+; GFX942-IR-NEXT:    v_subb_co_u32_e32 v4, vcc, v7, v11, vcc
+; GFX942-IR-NEXT:    v_or_b32_e32 v2, v12, v2
+; GFX942-IR-NEXT:    v_ashrrev_i32_e32 v12, 31, v4
+; GFX942-IR-NEXT:    v_or_b32_e32 v3, v13, v3
+; GFX942-IR-NEXT:    v_and_b32_e32 v4, 1, v12
+; GFX942-IR-NEXT:    v_and_b32_e32 v13, v12, v1
+; GFX942-IR-NEXT:    v_and_b32_e32 v12, v12, v0
+; GFX942-IR-NEXT:    v_sub_co_u32_e32 v10, vcc, v10, v12
+; GFX942-IR-NEXT:    v_lshl_add_u64 v[8:9], v[8:9], 0, 1
+; GFX942-IR-NEXT:    s_nop 0
+; GFX942-IR-NEXT:    v_subb_co_u32_e32 v11, vcc, v11, v13, vcc
+; GFX942-IR-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[8:9]
+; GFX942-IR-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX942-IR-NEXT:    v_mov_b64_e32 v[12:13], v[4:5]
+; GFX942-IR-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX942-IR-NEXT:    s_cbranch_execnz .LBB11_3
+; GFX942-IR-NEXT:  ; %bb.4: ; %Flow
+; GFX942-IR-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX942-IR-NEXT:  .LBB11_5: ; %Flow4
+; GFX942-IR-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX942-IR-NEXT:    v_lshlrev_b64 v[2:3], 1, v[2:3]
+; GFX942-IR-NEXT:    v_or_b32_e32 v9, v5, v3
+; GFX942-IR-NEXT:    v_or_b32_e32 v2, v4, v2
+; GFX942-IR-NEXT:  .LBB11_6: ; %Flow5
+; GFX942-IR-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX942-IR-NEXT:    v_mul_lo_u32 v3, v1, v2
+; GFX942-IR-NEXT:    v_mul_lo_u32 v4, v0, v9
+; GFX942-IR-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], v0, v2, 0
+; GFX942-IR-NEXT:    v_add3_u32 v1, v1, v4, v3
+; GFX942-IR-NEXT:    v_sub_co_u32_e32 v0, vcc, 24, v0
+; GFX942-IR-NEXT:    s_nop 1
+; GFX942-IR-NEXT:    v_subb_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX942-IR-NEXT:    s_setpc_b64 s[30:31]
   %result = srem i64 24, %x
   ret i64 %result
 }
@@ -1808,6 +3360,192 @@ define i64 @v_test_srem_pow2_k_num_i64(i64 %x) {
 ; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, 0x8000, v0
 ; GCN-IR-NEXT:    v_subb_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN-IR-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: v_test_srem_pow2_k_num_i64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
+; GFX942-NEXT:    v_mov_b32_e32 v3, v2
+; GFX942-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
+; GFX942-NEXT:    v_xor_b32_e32 v10, v1, v2
+; GFX942-NEXT:    v_xor_b32_e32 v11, v0, v2
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v0, v11
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v1, v10
+; GFX942-NEXT:    v_sub_co_u32_e32 v9, vcc, 0, v11
+; GFX942-NEXT:    v_mov_b32_e32 v5, 0
+; GFX942-NEXT:    v_fmamk_f32 v0, v1, 0x4f800000, v0
+; GFX942-NEXT:    v_rcp_f32_e32 v0, v0
+; GFX942-NEXT:    v_subb_co_u32_e32 v12, vcc, 0, v10, vcc
+; GFX942-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
+; GFX942-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
+; GFX942-NEXT:    v_trunc_f32_e32 v1, v1
+; GFX942-NEXT:    v_fmamk_f32 v0, v1, 0xcf800000, v0
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v8, v0
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v13, v1
+; GFX942-NEXT:    v_mul_lo_u32 v2, v12, v8
+; GFX942-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], v9, v8, 0
+; GFX942-NEXT:    v_mul_lo_u32 v3, v9, v13
+; GFX942-NEXT:    v_add3_u32 v1, v1, v3, v2
+; GFX942-NEXT:    v_mad_u64_u32 v[2:3], s[0:1], v8, v1, 0
+; GFX942-NEXT:    v_mul_hi_u32 v4, v8, v0
+; GFX942-NEXT:    v_lshl_add_u64 v[2:3], v[4:5], 0, v[2:3]
+; GFX942-NEXT:    v_mad_u64_u32 v[6:7], s[0:1], v13, v1, 0
+; GFX942-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], v13, v0, 0
+; GFX942-NEXT:    v_add_co_u32_e32 v0, vcc, v2, v0
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_addc_co_u32_e32 v4, vcc, v3, v1, vcc
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v7, vcc
+; GFX942-NEXT:    v_lshl_add_u64 v[0:1], v[4:5], 0, v[6:7]
+; GFX942-NEXT:    v_add_co_u32_e32 v14, vcc, v8, v0
+; GFX942-NEXT:    v_mul_lo_u32 v3, v12, v14
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_addc_co_u32_e32 v13, vcc, v13, v1, vcc
+; GFX942-NEXT:    v_mul_lo_u32 v2, v9, v13
+; GFX942-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], v9, v14, 0
+; GFX942-NEXT:    v_add3_u32 v1, v1, v2, v3
+; GFX942-NEXT:    v_mad_u64_u32 v[8:9], s[0:1], v14, v1, 0
+; GFX942-NEXT:    v_mul_hi_u32 v4, v14, v0
+; GFX942-NEXT:    v_mad_u64_u32 v[2:3], s[0:1], v13, v1, 0
+; GFX942-NEXT:    v_mad_u64_u32 v[6:7], s[0:1], v13, v0, 0
+; GFX942-NEXT:    v_lshl_add_u64 v[0:1], v[4:5], 0, v[8:9]
+; GFX942-NEXT:    v_add_co_u32_e32 v0, vcc, v0, v6
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_addc_co_u32_e32 v4, vcc, v1, v7, vcc
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GFX942-NEXT:    v_lshl_add_u64 v[0:1], v[4:5], 0, v[2:3]
+; GFX942-NEXT:    v_add_co_u32_e32 v0, vcc, v14, v0
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_addc_co_u32_e32 v0, vcc, v13, v1, vcc
+; GFX942-NEXT:    v_lshrrev_b32_e32 v3, 17, v0
+; GFX942-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], v11, v3, 0
+; GFX942-NEXT:    v_mov_b32_e32 v2, v1
+; GFX942-NEXT:    v_mad_u64_u32 v[2:3], s[0:1], v10, v3, v[2:3]
+; GFX942-NEXT:    v_sub_u32_e32 v1, 0, v2
+; GFX942-NEXT:    v_sub_co_u32_e32 v0, vcc, 0x8000, v0
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_subb_co_u32_e64 v1, s[0:1], v1, v10, vcc
+; GFX942-NEXT:    v_sub_co_u32_e64 v3, s[0:1], v0, v11
+; GFX942-NEXT:    v_subb_co_u32_e32 v2, vcc, 0, v2, vcc
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_subbrev_co_u32_e64 v4, s[2:3], 0, v1, s[0:1]
+; GFX942-NEXT:    v_cmp_ge_u32_e64 s[2:3], v4, v10
+; GFX942-NEXT:    v_subb_co_u32_e64 v1, s[0:1], v1, v10, s[0:1]
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_cndmask_b32_e64 v5, 0, -1, s[2:3]
+; GFX942-NEXT:    v_cmp_ge_u32_e64 s[2:3], v3, v11
+; GFX942-NEXT:    v_cmp_ge_u32_e32 vcc, v2, v10
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_cndmask_b32_e64 v6, 0, -1, s[2:3]
+; GFX942-NEXT:    v_cmp_eq_u32_e64 s[2:3], v4, v10
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e64 v5, v5, v6, s[2:3]
+; GFX942-NEXT:    v_sub_co_u32_e64 v6, s[0:1], v3, v11
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_subbrev_co_u32_e64 v1, s[0:1], 0, v1, s[0:1]
+; GFX942-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v5
+; GFX942-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
+; GFX942-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v11
+; GFX942-NEXT:    v_cndmask_b32_e64 v3, v3, v6, s[0:1]
+; GFX942-NEXT:    v_cndmask_b32_e64 v1, v4, v1, s[0:1]
+; GFX942-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, v2, v10
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v5, v5, v6, vcc
+; GFX942-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v5
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
+; GFX942-NEXT:    v_cndmask_b32_e32 v1, v2, v1, vcc
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-IR-LABEL: v_test_srem_pow2_k_num_i64:
+; GFX942-IR:       ; %bb.0: ; %_udiv-special-cases
+; GFX942-IR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-IR-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
+; GFX942-IR-NEXT:    v_xor_b32_e32 v0, v0, v2
+; GFX942-IR-NEXT:    v_xor_b32_e32 v1, v1, v2
+; GFX942-IR-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v2
+; GFX942-IR-NEXT:    s_movk_i32 s0, 0xffd0
+; GFX942-IR-NEXT:    s_nop 0
+; GFX942-IR-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v2, vcc
+; GFX942-IR-NEXT:    v_ffbh_u32_e32 v2, v0
+; GFX942-IR-NEXT:    v_add_u32_e32 v2, 32, v2
+; GFX942-IR-NEXT:    v_ffbh_u32_e32 v3, v1
+; GFX942-IR-NEXT:    v_min_u32_e32 v8, v2, v3
+; GFX942-IR-NEXT:    v_mov_b32_e32 v9, 0
+; GFX942-IR-NEXT:    s_mov_b32 s1, -1
+; GFX942-IR-NEXT:    v_lshl_add_u64 v[6:7], v[8:9], 0, s[0:1]
+; GFX942-IR-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[0:1]
+; GFX942-IR-NEXT:    v_cmp_lt_u64_e64 s[0:1], 63, v[6:7]
+; GFX942-IR-NEXT:    v_mov_b32_e32 v2, 0x8000
+; GFX942-IR-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-IR-NEXT:    v_cndmask_b32_e64 v2, v2, 0, s[0:1]
+; GFX942-IR-NEXT:    s_xor_b64 s[0:1], s[0:1], -1
+; GFX942-IR-NEXT:    v_cmp_ne_u64_e32 vcc, 63, v[6:7]
+; GFX942-IR-NEXT:    s_and_b64 s[2:3], s[0:1], vcc
+; GFX942-IR-NEXT:    s_and_saveexec_b64 s[0:1], s[2:3]
+; GFX942-IR-NEXT:    s_cbranch_execz .LBB12_6
+; GFX942-IR-NEXT:  ; %bb.1: ; %udiv-bb1
+; GFX942-IR-NEXT:    v_lshl_add_u64 v[10:11], v[6:7], 0, 1
+; GFX942-IR-NEXT:    v_sub_u32_e32 v2, 63, v6
+; GFX942-IR-NEXT:    s_mov_b64 s[4:5], 0x8000
+; GFX942-IR-NEXT:    v_mov_b64_e32 v[4:5], 0
+; GFX942-IR-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[10:11]
+; GFX942-IR-NEXT:    v_lshlrev_b64 v[2:3], v2, s[4:5]
+; GFX942-IR-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX942-IR-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
+; GFX942-IR-NEXT:    s_cbranch_execz .LBB12_5
+; GFX942-IR-NEXT:  ; %bb.2: ; %udiv-preheader
+; GFX942-IR-NEXT:    v_sub_co_u32_e32 v8, vcc, 47, v8
+; GFX942-IR-NEXT:    v_lshrrev_b64 v[10:11], v10, s[4:5]
+; GFX942-IR-NEXT:    s_nop 0
+; GFX942-IR-NEXT:    v_subb_co_u32_e64 v9, s[4:5], 0, 0, vcc
+; GFX942-IR-NEXT:    v_lshl_add_u64 v[6:7], v[0:1], 0, -1
+; GFX942-IR-NEXT:    s_mov_b64 s[4:5], 0
+; GFX942-IR-NEXT:    v_mov_b64_e32 v[12:13], 0
+; GFX942-IR-NEXT:    v_mov_b32_e32 v5, 0
+; GFX942-IR-NEXT:  .LBB12_3: ; %udiv-do-while
+; GFX942-IR-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-IR-NEXT:    v_lshlrev_b64 v[10:11], 1, v[10:11]
+; GFX942-IR-NEXT:    v_lshrrev_b32_e32 v4, 31, v3
+; GFX942-IR-NEXT:    v_or_b32_e32 v10, v10, v4
+; GFX942-IR-NEXT:    v_sub_co_u32_e32 v4, vcc, v6, v10
+; GFX942-IR-NEXT:    v_lshlrev_b64 v[2:3], 1, v[2:3]
+; GFX942-IR-NEXT:    s_nop 0
+; GFX942-IR-NEXT:    v_subb_co_u32_e32 v4, vcc, v7, v11, vcc
+; GFX942-IR-NEXT:    v_or_b32_e32 v2, v12, v2
+; GFX942-IR-NEXT:    v_ashrrev_i32_e32 v12, 31, v4
+; GFX942-IR-NEXT:    v_or_b32_e32 v3, v13, v3
+; GFX942-IR-NEXT:    v_and_b32_e32 v4, 1, v12
+; GFX942-IR-NEXT:    v_and_b32_e32 v13, v12, v1
+; GFX942-IR-NEXT:    v_and_b32_e32 v12, v12, v0
+; GFX942-IR-NEXT:    v_sub_co_u32_e32 v10, vcc, v10, v12
+; GFX942-IR-NEXT:    v_lshl_add_u64 v[8:9], v[8:9], 0, 1
+; GFX942-IR-NEXT:    s_nop 0
+; GFX942-IR-NEXT:    v_subb_co_u32_e32 v11, vcc, v11, v13, vcc
+; GFX942-IR-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[8:9]
+; GFX942-IR-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX942-IR-NEXT:    v_mov_b64_e32 v[12:13], v[4:5]
+; GFX942-IR-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX942-IR-NEXT:    s_cbranch_execnz .LBB12_3
+; GFX942-IR-NEXT:  ; %bb.4: ; %Flow
+; GFX942-IR-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX942-IR-NEXT:  .LBB12_5: ; %Flow4
+; GFX942-IR-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX942-IR-NEXT:    v_lshlrev_b64 v[2:3], 1, v[2:3]
+; GFX942-IR-NEXT:    v_or_b32_e32 v9, v5, v3
+; GFX942-IR-NEXT:    v_or_b32_e32 v2, v4, v2
+; GFX942-IR-NEXT:  .LBB12_6: ; %Flow5
+; GFX942-IR-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX942-IR-NEXT:    v_mul_lo_u32 v3, v1, v2
+; GFX942-IR-NEXT:    v_mul_lo_u32 v4, v0, v9
+; GFX942-IR-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], v0, v2, 0
+; GFX942-IR-NEXT:    v_add3_u32 v1, v1, v4, v3
+; GFX942-IR-NEXT:    v_sub_co_u32_e32 v0, vcc, 0x8000, v0
+; GFX942-IR-NEXT:    s_nop 1
+; GFX942-IR-NEXT:    v_subb_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX942-IR-NEXT:    s_setpc_b64 s[30:31]
   %result = srem i64 32768, %x
   ret i64 %result
 }
@@ -1910,6 +3648,108 @@ define i64 @v_test_srem_pow2_k_den_i64(i64 %x) {
 ; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, v0, v12
 ; GCN-IR-NEXT:    v_subb_u32_e32 v1, vcc, v1, v13, vcc
 ; GCN-IR-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: v_test_srem_pow2_k_den_i64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
+; GFX942-NEXT:    v_lshrrev_b32_e32 v2, 17, v2
+; GFX942-NEXT:    v_mov_b32_e32 v3, 0
+; GFX942-NEXT:    v_lshl_add_u64 v[2:3], v[0:1], 0, v[2:3]
+; GFX942-NEXT:    v_and_b32_e32 v2, 0xffff8000, v2
+; GFX942-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v2
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-IR-LABEL: v_test_srem_pow2_k_den_i64:
+; GFX942-IR:       ; %bb.0: ; %_udiv-special-cases
+; GFX942-IR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-IR-NEXT:    v_ashrrev_i32_e32 v2, 31, v1
+; GFX942-IR-NEXT:    v_xor_b32_e32 v0, v0, v2
+; GFX942-IR-NEXT:    v_xor_b32_e32 v1, v1, v2
+; GFX942-IR-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v2
+; GFX942-IR-NEXT:    v_ffbh_u32_e32 v4, v0
+; GFX942-IR-NEXT:    s_nop 0
+; GFX942-IR-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v2, vcc
+; GFX942-IR-NEXT:    v_add_u32_e32 v4, 32, v4
+; GFX942-IR-NEXT:    v_ffbh_u32_e32 v5, v1
+; GFX942-IR-NEXT:    v_min_u32_e32 v8, v4, v5
+; GFX942-IR-NEXT:    v_sub_co_u32_e64 v4, s[0:1], 48, v8
+; GFX942-IR-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[0:1]
+; GFX942-IR-NEXT:    s_nop 0
+; GFX942-IR-NEXT:    v_subb_co_u32_e64 v5, s[0:1], 0, 0, s[0:1]
+; GFX942-IR-NEXT:    v_cmp_lt_u64_e64 s[0:1], 63, v[4:5]
+; GFX942-IR-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-IR-NEXT:    s_xor_b64 s[2:3], s[0:1], -1
+; GFX942-IR-NEXT:    v_cmp_ne_u64_e32 vcc, 63, v[4:5]
+; GFX942-IR-NEXT:    v_mov_b32_e32 v3, v2
+; GFX942-IR-NEXT:    v_mov_b32_e32 v9, 0
+; GFX942-IR-NEXT:    v_cndmask_b32_e64 v7, v1, 0, s[0:1]
+; GFX942-IR-NEXT:    v_cndmask_b32_e64 v6, v0, 0, s[0:1]
+; GFX942-IR-NEXT:    s_and_b64 s[2:3], s[2:3], vcc
+; GFX942-IR-NEXT:    s_and_saveexec_b64 s[0:1], s[2:3]
+; GFX942-IR-NEXT:    s_cbranch_execz .LBB13_6
+; GFX942-IR-NEXT:  ; %bb.1: ; %udiv-bb1
+; GFX942-IR-NEXT:    v_lshl_add_u64 v[10:11], v[4:5], 0, 1
+; GFX942-IR-NEXT:    v_sub_u32_e32 v4, 63, v4
+; GFX942-IR-NEXT:    v_mov_b64_e32 v[6:7], 0
+; GFX942-IR-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[10:11]
+; GFX942-IR-NEXT:    v_lshlrev_b64 v[4:5], v4, v[0:1]
+; GFX942-IR-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX942-IR-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
+; GFX942-IR-NEXT:    s_cbranch_execz .LBB13_5
+; GFX942-IR-NEXT:  ; %bb.2: ; %udiv-preheader
+; GFX942-IR-NEXT:    s_movk_i32 s4, 0xffcf
+; GFX942-IR-NEXT:    s_mov_b32 s5, -1
+; GFX942-IR-NEXT:    v_lshrrev_b64 v[10:11], v10, v[0:1]
+; GFX942-IR-NEXT:    v_lshl_add_u64 v[8:9], v[8:9], 0, s[4:5]
+; GFX942-IR-NEXT:    s_mov_b64 s[4:5], 0
+; GFX942-IR-NEXT:    v_mov_b64_e32 v[12:13], 0
+; GFX942-IR-NEXT:    s_movk_i32 s6, 0x7fff
+; GFX942-IR-NEXT:    v_mov_b32_e32 v7, 0
+; GFX942-IR-NEXT:  .LBB13_3: ; %udiv-do-while
+; GFX942-IR-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-IR-NEXT:    v_lshlrev_b64 v[10:11], 1, v[10:11]
+; GFX942-IR-NEXT:    v_lshrrev_b32_e32 v6, 31, v5
+; GFX942-IR-NEXT:    v_or_b32_e32 v10, v10, v6
+; GFX942-IR-NEXT:    v_sub_co_u32_e32 v6, vcc, s6, v10
+; GFX942-IR-NEXT:    v_lshlrev_b64 v[4:5], 1, v[4:5]
+; GFX942-IR-NEXT:    s_nop 0
+; GFX942-IR-NEXT:    v_subb_co_u32_e32 v6, vcc, 0, v11, vcc
+; GFX942-IR-NEXT:    v_or_b32_e32 v4, v12, v4
+; GFX942-IR-NEXT:    v_ashrrev_i32_e32 v12, 31, v6
+; GFX942-IR-NEXT:    v_and_b32_e32 v6, 1, v12
+; GFX942-IR-NEXT:    v_and_b32_e32 v12, 0x8000, v12
+; GFX942-IR-NEXT:    v_sub_co_u32_e32 v10, vcc, v10, v12
+; GFX942-IR-NEXT:    v_lshl_add_u64 v[8:9], v[8:9], 0, 1
+; GFX942-IR-NEXT:    s_nop 0
+; GFX942-IR-NEXT:    v_subbrev_co_u32_e32 v11, vcc, 0, v11, vcc
+; GFX942-IR-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[8:9]
+; GFX942-IR-NEXT:    v_or_b32_e32 v5, v13, v5
+; GFX942-IR-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX942-IR-NEXT:    v_mov_b64_e32 v[12:13], v[6:7]
+; GFX942-IR-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX942-IR-NEXT:    s_cbranch_execnz .LBB13_3
+; GFX942-IR-NEXT:  ; %bb.4: ; %Flow
+; GFX942-IR-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX942-IR-NEXT:  .LBB13_5: ; %Flow4
+; GFX942-IR-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX942-IR-NEXT:    v_lshlrev_b64 v[4:5], 1, v[4:5]
+; GFX942-IR-NEXT:    v_or_b32_e32 v7, v7, v5
+; GFX942-IR-NEXT:    v_or_b32_e32 v6, v6, v4
+; GFX942-IR-NEXT:  .LBB13_6: ; %Flow5
+; GFX942-IR-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX942-IR-NEXT:    v_lshlrev_b64 v[4:5], 15, v[6:7]
+; GFX942-IR-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v4
+; GFX942-IR-NEXT:    v_xor_b32_e32 v0, v0, v2
+; GFX942-IR-NEXT:    s_nop 0
+; GFX942-IR-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v5, vcc
+; GFX942-IR-NEXT:    v_xor_b32_e32 v1, v1, v3
+; GFX942-IR-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v2
+; GFX942-IR-NEXT:    s_nop 1
+; GFX942-IR-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX942-IR-NEXT:    s_setpc_b64 s[30:31]
   %result = srem i64 %x, 32768
   ret i64 %result
 }
@@ -1978,6 +3818,66 @@ define amdgpu_kernel void @s_test_srem24_k_num_i64(ptr addrspace(1) %out, i64 %x
 ; GCN-IR-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN-IR-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GCN-IR-NEXT:    s_endpgm
+;
+; GFX942-LABEL: s_test_srem24_k_num_i64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_mov_b32 s2, 0x41c00000
+; GFX942-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-NEXT:    s_ashr_i32 s4, s3, 8
+; GFX942-NEXT:    v_cvt_f32_i32_e32 v0, s4
+; GFX942-NEXT:    s_ashr_i32 s3, s3, 31
+; GFX942-NEXT:    s_or_b32 s5, s3, 1
+; GFX942-NEXT:    v_rcp_iflag_f32_e32 v2, v0
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_mul_f32_e32 v2, 0x41c00000, v2
+; GFX942-NEXT:    v_trunc_f32_e32 v2, v2
+; GFX942-NEXT:    v_fma_f32 v3, -v2, v0, s2
+; GFX942-NEXT:    v_cvt_i32_f32_e32 v2, v2
+; GFX942-NEXT:    v_cmp_ge_f32_e64 s[2:3], |v3|, |v0|
+; GFX942-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GFX942-NEXT:    s_cselect_b32 s2, s5, 0
+; GFX942-NEXT:    v_readfirstlane_b32 s3, v2
+; GFX942-NEXT:    s_add_i32 s2, s3, s2
+; GFX942-NEXT:    s_mul_i32 s2, s2, s4
+; GFX942-NEXT:    s_sub_i32 s2, 24, s2
+; GFX942-NEXT:    s_bfe_i32 s2, s2, 0x180000
+; GFX942-NEXT:    s_ashr_i32 s3, s2, 31
+; GFX942-NEXT:    v_mov_b32_e32 v2, s2
+; GFX942-NEXT:    v_mov_b32_e32 v3, s3
+; GFX942-NEXT:    global_store_dwordx2 v1, v[2:3], s[0:1]
+; GFX942-NEXT:    s_endpgm
+;
+; GFX942-IR-LABEL: s_test_srem24_k_num_i64:
+; GFX942-IR:       ; %bb.0:
+; GFX942-IR-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-IR-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-IR-NEXT:    s_mov_b32 s2, 0x41c00000
+; GFX942-IR-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-IR-NEXT:    s_ashr_i32 s4, s3, 8
+; GFX942-IR-NEXT:    v_cvt_f32_i32_e32 v0, s4
+; GFX942-IR-NEXT:    s_ashr_i32 s3, s3, 31
+; GFX942-IR-NEXT:    s_or_b32 s5, s3, 1
+; GFX942-IR-NEXT:    v_rcp_iflag_f32_e32 v2, v0
+; GFX942-IR-NEXT:    s_nop 0
+; GFX942-IR-NEXT:    v_mul_f32_e32 v2, 0x41c00000, v2
+; GFX942-IR-NEXT:    v_trunc_f32_e32 v2, v2
+; GFX942-IR-NEXT:    v_fma_f32 v3, -v2, v0, s2
+; GFX942-IR-NEXT:    v_cvt_i32_f32_e32 v2, v2
+; GFX942-IR-NEXT:    v_cmp_ge_f32_e64 s[2:3], |v3|, |v0|
+; GFX942-IR-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GFX942-IR-NEXT:    s_cselect_b32 s2, s5, 0
+; GFX942-IR-NEXT:    v_readfirstlane_b32 s3, v2
+; GFX942-IR-NEXT:    s_add_i32 s2, s3, s2
+; GFX942-IR-NEXT:    s_mul_i32 s2, s2, s4
+; GFX942-IR-NEXT:    s_sub_i32 s2, 24, s2
+; GFX942-IR-NEXT:    s_bfe_i32 s2, s2, 0x180000
+; GFX942-IR-NEXT:    s_ashr_i32 s3, s2, 31
+; GFX942-IR-NEXT:    v_mov_b32_e32 v2, s2
+; GFX942-IR-NEXT:    v_mov_b32_e32 v3, s3
+; GFX942-IR-NEXT:    global_store_dwordx2 v1, v[2:3], s[0:1]
+; GFX942-IR-NEXT:    s_endpgm
   %x.shr = ashr i64 %x, 40
   %result = srem i64 24, %x.shr
   store i64 %result, ptr addrspace(1) %out
@@ -2046,6 +3946,62 @@ define amdgpu_kernel void @s_test_srem24_k_den_i64(ptr addrspace(1) %out, i64 %x
 ; GCN-IR-NEXT:    v_mov_b32_e32 v1, s1
 ; GCN-IR-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GCN-IR-NEXT:    s_endpgm
+;
+; GFX942-LABEL: s_test_srem24_k_den_i64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_mov_b32 s2, 0x46b6fe00
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    s_ashr_i32 s4, s3, 8
+; GFX942-NEXT:    v_cvt_f32_i32_e32 v1, s4
+; GFX942-NEXT:    s_ashr_i32 s3, s3, 31
+; GFX942-NEXT:    s_or_b32 s5, s3, 1
+; GFX942-NEXT:    v_mul_f32_e32 v2, 0x38331158, v1
+; GFX942-NEXT:    v_trunc_f32_e32 v2, v2
+; GFX942-NEXT:    v_fmamk_f32 v1, v2, 0xc6b6fe00, v1
+; GFX942-NEXT:    v_cvt_i32_f32_e32 v2, v2
+; GFX942-NEXT:    v_cmp_ge_f32_e64 s[2:3], |v1|, s2
+; GFX942-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GFX942-NEXT:    s_cselect_b32 s2, s5, 0
+; GFX942-NEXT:    v_readfirstlane_b32 s3, v2
+; GFX942-NEXT:    s_add_i32 s2, s3, s2
+; GFX942-NEXT:    s_mulk_i32 s2, 0x5b7f
+; GFX942-NEXT:    s_sub_i32 s2, s4, s2
+; GFX942-NEXT:    s_bfe_i32 s2, s2, 0x180000
+; GFX942-NEXT:    s_ashr_i32 s3, s2, 31
+; GFX942-NEXT:    v_mov_b32_e32 v2, s2
+; GFX942-NEXT:    v_mov_b32_e32 v3, s3
+; GFX942-NEXT:    global_store_dwordx2 v0, v[2:3], s[0:1]
+; GFX942-NEXT:    s_endpgm
+;
+; GFX942-IR-LABEL: s_test_srem24_k_den_i64:
+; GFX942-IR:       ; %bb.0:
+; GFX942-IR-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-IR-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-IR-NEXT:    s_mov_b32 s2, 0x46b6fe00
+; GFX942-IR-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-IR-NEXT:    s_ashr_i32 s4, s3, 8
+; GFX942-IR-NEXT:    v_cvt_f32_i32_e32 v1, s4
+; GFX942-IR-NEXT:    s_ashr_i32 s3, s3, 31
+; GFX942-IR-NEXT:    s_or_b32 s5, s3, 1
+; GFX942-IR-NEXT:    v_mul_f32_e32 v2, 0x38331158, v1
+; GFX942-IR-NEXT:    v_trunc_f32_e32 v2, v2
+; GFX942-IR-NEXT:    v_fmamk_f32 v1, v2, 0xc6b6fe00, v1
+; GFX942-IR-NEXT:    v_cvt_i32_f32_e32 v2, v2
+; GFX942-IR-NEXT:    v_cmp_ge_f32_e64 s[2:3], |v1|, s2
+; GFX942-IR-NEXT:    s_and_b64 s[2:3], s[2:3], exec
+; GFX942-IR-NEXT:    s_cselect_b32 s2, s5, 0
+; GFX942-IR-NEXT:    v_readfirstlane_b32 s3, v2
+; GFX942-IR-NEXT:    s_add_i32 s2, s3, s2
+; GFX942-IR-NEXT:    s_mulk_i32 s2, 0x5b7f
+; GFX942-IR-NEXT:    s_sub_i32 s2, s4, s2
+; GFX942-IR-NEXT:    s_bfe_i32 s2, s2, 0x180000
+; GFX942-IR-NEXT:    s_ashr_i32 s3, s2, 31
+; GFX942-IR-NEXT:    v_mov_b32_e32 v2, s2
+; GFX942-IR-NEXT:    v_mov_b32_e32 v3, s3
+; GFX942-IR-NEXT:    global_store_dwordx2 v0, v[2:3], s[0:1]
+; GFX942-IR-NEXT:    s_endpgm
   %x.shr = ashr i64 %x, 40
   %result = srem i64 %x.shr, 23423
   store i64 %result, ptr addrspace(1) %out
@@ -2096,6 +4052,54 @@ define i64 @v_test_srem24_k_num_i64(i64 %x) {
 ; GCN-IR-NEXT:    v_bfe_i32 v0, v0, 0, 24
 ; GCN-IR-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; GCN-IR-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: v_test_srem24_k_num_i64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_ashrrev_i32_e32 v0, 8, v1
+; GFX942-NEXT:    v_cvt_f32_i32_e32 v2, v0
+; GFX942-NEXT:    s_mov_b32 s0, 0x41c00000
+; GFX942-NEXT:    v_ashrrev_i32_e32 v1, 31, v1
+; GFX942-NEXT:    v_or_b32_e32 v1, 1, v1
+; GFX942-NEXT:    v_rcp_iflag_f32_e32 v3, v2
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_mul_f32_e32 v3, 0x41c00000, v3
+; GFX942-NEXT:    v_trunc_f32_e32 v3, v3
+; GFX942-NEXT:    v_cvt_i32_f32_e32 v4, v3
+; GFX942-NEXT:    v_fma_f32 v3, -v3, v2, s0
+; GFX942-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, |v2|
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX942-NEXT:    v_add_u32_e32 v1, v4, v1
+; GFX942-NEXT:    v_mul_lo_u32 v0, v1, v0
+; GFX942-NEXT:    v_sub_u32_e32 v0, 24, v0
+; GFX942-NEXT:    v_bfe_i32 v0, v0, 0, 24
+; GFX942-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-IR-LABEL: v_test_srem24_k_num_i64:
+; GFX942-IR:       ; %bb.0:
+; GFX942-IR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-IR-NEXT:    v_ashrrev_i32_e32 v0, 8, v1
+; GFX942-IR-NEXT:    v_cvt_f32_i32_e32 v2, v0
+; GFX942-IR-NEXT:    s_mov_b32 s0, 0x41c00000
+; GFX942-IR-NEXT:    v_ashrrev_i32_e32 v1, 31, v1
+; GFX942-IR-NEXT:    v_or_b32_e32 v1, 1, v1
+; GFX942-IR-NEXT:    v_rcp_iflag_f32_e32 v3, v2
+; GFX942-IR-NEXT:    s_nop 0
+; GFX942-IR-NEXT:    v_mul_f32_e32 v3, 0x41c00000, v3
+; GFX942-IR-NEXT:    v_trunc_f32_e32 v3, v3
+; GFX942-IR-NEXT:    v_cvt_i32_f32_e32 v4, v3
+; GFX942-IR-NEXT:    v_fma_f32 v3, -v3, v2, s0
+; GFX942-IR-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, |v2|
+; GFX942-IR-NEXT:    s_nop 1
+; GFX942-IR-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX942-IR-NEXT:    v_add_u32_e32 v1, v4, v1
+; GFX942-IR-NEXT:    v_mul_lo_u32 v0, v1, v0
+; GFX942-IR-NEXT:    v_sub_u32_e32 v0, 24, v0
+; GFX942-IR-NEXT:    v_bfe_i32 v0, v0, 0, 24
+; GFX942-IR-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; GFX942-IR-NEXT:    s_setpc_b64 s[30:31]
   %x.shr = ashr i64 %x, 40
   %result = srem i64 24, %x.shr
   ret i64 %result
@@ -2145,6 +4149,54 @@ define i64 @v_test_srem24_pow2_k_num_i64(i64 %x) {
 ; GCN-IR-NEXT:    v_bfe_i32 v0, v0, 0, 24
 ; GCN-IR-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; GCN-IR-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: v_test_srem24_pow2_k_num_i64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_ashrrev_i32_e32 v0, 8, v1
+; GFX942-NEXT:    v_cvt_f32_i32_e32 v2, v0
+; GFX942-NEXT:    s_mov_b32 s0, 0x47000000
+; GFX942-NEXT:    v_ashrrev_i32_e32 v1, 31, v1
+; GFX942-NEXT:    v_or_b32_e32 v1, 1, v1
+; GFX942-NEXT:    v_rcp_iflag_f32_e32 v3, v2
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_mul_f32_e32 v3, 0x47000000, v3
+; GFX942-NEXT:    v_trunc_f32_e32 v3, v3
+; GFX942-NEXT:    v_cvt_i32_f32_e32 v4, v3
+; GFX942-NEXT:    v_fma_f32 v3, -v3, v2, s0
+; GFX942-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, |v2|
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX942-NEXT:    v_add_u32_e32 v1, v4, v1
+; GFX942-NEXT:    v_mul_lo_u32 v0, v1, v0
+; GFX942-NEXT:    v_sub_u32_e32 v0, 0x8000, v0
+; GFX942-NEXT:    v_bfe_i32 v0, v0, 0, 24
+; GFX942-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-IR-LABEL: v_test_srem24_pow2_k_num_i64:
+; GFX942-IR:       ; %bb.0:
+; GFX942-IR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-IR-NEXT:    v_ashrrev_i32_e32 v0, 8, v1
+; GFX942-IR-NEXT:    v_cvt_f32_i32_e32 v2, v0
+; GFX942-IR-NEXT:    s_mov_b32 s0, 0x47000000
+; GFX942-IR-NEXT:    v_ashrrev_i32_e32 v1, 31, v1
+; GFX942-IR-NEXT:    v_or_b32_e32 v1, 1, v1
+; GFX942-IR-NEXT:    v_rcp_iflag_f32_e32 v3, v2
+; GFX942-IR-NEXT:    s_nop 0
+; GFX942-IR-NEXT:    v_mul_f32_e32 v3, 0x47000000, v3
+; GFX942-IR-NEXT:    v_trunc_f32_e32 v3, v3
+; GFX942-IR-NEXT:    v_cvt_i32_f32_e32 v4, v3
+; GFX942-IR-NEXT:    v_fma_f32 v3, -v3, v2, s0
+; GFX942-IR-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, |v2|
+; GFX942-IR-NEXT:    s_nop 1
+; GFX942-IR-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX942-IR-NEXT:    v_add_u32_e32 v1, v4, v1
+; GFX942-IR-NEXT:    v_mul_lo_u32 v0, v1, v0
+; GFX942-IR-NEXT:    v_sub_u32_e32 v0, 0x8000, v0
+; GFX942-IR-NEXT:    v_bfe_i32 v0, v0, 0, 24
+; GFX942-IR-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; GFX942-IR-NEXT:    s_setpc_b64 s[30:31]
   %x.shr = ashr i64 %x, 40
   %result = srem i64 32768, %x.shr
   ret i64 %result
@@ -2184,6 +4236,41 @@ define i64 @v_test_srem24_pow2_k_den_i64(i64 %x) {
 ; GCN-IR-NEXT:    v_bfe_i32 v0, v0, 0, 24
 ; GCN-IR-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; GCN-IR-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: v_test_srem24_pow2_k_den_i64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_ashrrev_i32_e32 v3, 31, v1
+; GFX942-NEXT:    v_ashrrev_i32_e32 v2, 8, v1
+; GFX942-NEXT:    v_lshrrev_b32_e32 v0, 17, v3
+; GFX942-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-NEXT:    v_lshl_add_u64 v[0:1], v[2:3], 0, v[0:1]
+; GFX942-NEXT:    v_and_b32_e32 v0, 0xffff8000, v0
+; GFX942-NEXT:    v_sub_co_u32_e32 v0, vcc, v2, v0
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_subb_co_u32_e32 v1, vcc, v3, v1, vcc
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-IR-LABEL: v_test_srem24_pow2_k_den_i64:
+; GFX942-IR:       ; %bb.0:
+; GFX942-IR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-IR-NEXT:    v_ashrrev_i32_e32 v0, 8, v1
+; GFX942-IR-NEXT:    v_cvt_f32_i32_e32 v2, v0
+; GFX942-IR-NEXT:    s_mov_b32 s0, 0x47000000
+; GFX942-IR-NEXT:    v_ashrrev_i32_e32 v1, 31, v1
+; GFX942-IR-NEXT:    v_or_b32_e32 v1, 1, v1
+; GFX942-IR-NEXT:    v_mul_f32_e32 v3, 0x38000000, v2
+; GFX942-IR-NEXT:    v_trunc_f32_e32 v3, v3
+; GFX942-IR-NEXT:    v_cvt_i32_f32_e32 v4, v3
+; GFX942-IR-NEXT:    v_fmamk_f32 v2, v3, 0xc7000000, v2
+; GFX942-IR-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, s0
+; GFX942-IR-NEXT:    s_nop 1
+; GFX942-IR-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX942-IR-NEXT:    v_add_lshl_u32 v1, v4, v1, 15
+; GFX942-IR-NEXT:    v_sub_u32_e32 v0, v0, v1
+; GFX942-IR-NEXT:    v_bfe_i32 v0, v0, 0, 24
+; GFX942-IR-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
+; GFX942-IR-NEXT:    s_setpc_b64 s[30:31]
   %x.shr = ashr i64 %x, 40
   %result = srem i64 %x.shr, 32768
   ret i64 %result
diff --git a/llvm/test/CodeGen/AMDGPU/udiv64.ll b/llvm/test/CodeGen/AMDGPU/udiv64.ll
index bc9a3f2389e7e..9632f27d6cf2d 100644
--- a/llvm/test/CodeGen/AMDGPU/udiv64.ll
+++ b/llvm/test/CodeGen/AMDGPU/udiv64.ll
@@ -1,6 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx600 -amdgpu-bypass-slow-div=0 < %s | FileCheck -check-prefix=GCN %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx600 -amdgpu-bypass-slow-div=0 -amdgpu-codegenprepare-expand-div64 < %s | FileCheck -check-prefix=GCN-IR %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -amdgpu-bypass-slow-div=0 < %s | FileCheck -check-prefix=GFX942 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -amdgpu-bypass-slow-div=0 -amdgpu-codegenprepare-expand-div64 < %s | FileCheck -check-prefix=GFX942-IR %s
 
 define amdgpu_kernel void @s_test_udiv_i64(ptr addrspace(1) %out, i64 %x, i64 %y) {
 ; GCN-LABEL: s_test_udiv_i64:
@@ -192,6 +194,213 @@ define amdgpu_kernel void @s_test_udiv_i64(ptr addrspace(1) %out, i64 %x, i64 %y
 ; GCN-IR-NEXT:    v_mov_b32_e32 v1, s9
 ; GCN-IR-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GCN-IR-NEXT:    s_endpgm
+;
+; GFX942-LABEL: s_test_udiv_i64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx2 s[2:3], s[4:5], 0x34
+; GFX942-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x24
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v0, s2
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v1, s3
+; GFX942-NEXT:    s_sub_u32 s0, 0, s2
+; GFX942-NEXT:    s_subb_u32 s1, 0, s3
+; GFX942-NEXT:    v_fmamk_f32 v0, v1, 0x4f800000, v0
+; GFX942-NEXT:    v_rcp_f32_e32 v1, v0
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    v_mul_f32_e32 v1, 0x5f7ffffc, v1
+; GFX942-NEXT:    v_mul_f32_e32 v2, 0x2f800000, v1
+; GFX942-NEXT:    v_trunc_f32_e32 v2, v2
+; GFX942-NEXT:    v_fmamk_f32 v1, v2, 0xcf800000, v1
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX942-NEXT:    v_readfirstlane_b32 s4, v2
+; GFX942-NEXT:    v_readfirstlane_b32 s5, v1
+; GFX942-NEXT:    s_mul_i32 s6, s0, s4
+; GFX942-NEXT:    s_mul_hi_u32 s12, s0, s5
+; GFX942-NEXT:    s_mul_i32 s7, s1, s5
+; GFX942-NEXT:    s_add_i32 s6, s12, s6
+; GFX942-NEXT:    s_mul_i32 s13, s0, s5
+; GFX942-NEXT:    s_add_i32 s6, s6, s7
+; GFX942-NEXT:    s_mul_hi_u32 s12, s5, s13
+; GFX942-NEXT:    s_mul_hi_u32 s7, s5, s6
+; GFX942-NEXT:    s_mul_i32 s5, s5, s6
+; GFX942-NEXT:    s_add_u32 s5, s12, s5
+; GFX942-NEXT:    s_addc_u32 s7, 0, s7
+; GFX942-NEXT:    s_mul_hi_u32 s12, s4, s13
+; GFX942-NEXT:    s_mul_i32 s13, s4, s13
+; GFX942-NEXT:    s_add_u32 s5, s5, s13
+; GFX942-NEXT:    s_mul_hi_u32 s14, s4, s6
+; GFX942-NEXT:    s_addc_u32 s5, s7, s12
+; GFX942-NEXT:    s_addc_u32 s7, s14, 0
+; GFX942-NEXT:    s_mul_i32 s6, s4, s6
+; GFX942-NEXT:    s_add_u32 s5, s5, s6
+; GFX942-NEXT:    s_addc_u32 s6, 0, s7
+; GFX942-NEXT:    v_add_co_u32_e32 v1, vcc, s5, v1
+; GFX942-NEXT:    s_cmp_lg_u64 vcc, 0
+; GFX942-NEXT:    s_addc_u32 s4, s4, s6
+; GFX942-NEXT:    v_readfirstlane_b32 s6, v1
+; GFX942-NEXT:    s_mul_i32 s5, s0, s4
+; GFX942-NEXT:    s_mul_hi_u32 s7, s0, s6
+; GFX942-NEXT:    s_add_i32 s5, s7, s5
+; GFX942-NEXT:    s_mul_i32 s1, s1, s6
+; GFX942-NEXT:    s_add_i32 s5, s5, s1
+; GFX942-NEXT:    s_mul_i32 s0, s0, s6
+; GFX942-NEXT:    s_mul_hi_u32 s7, s4, s0
+; GFX942-NEXT:    s_mul_i32 s12, s4, s0
+; GFX942-NEXT:    s_mul_i32 s14, s6, s5
+; GFX942-NEXT:    s_mul_hi_u32 s0, s6, s0
+; GFX942-NEXT:    s_mul_hi_u32 s13, s6, s5
+; GFX942-NEXT:    s_add_u32 s0, s0, s14
+; GFX942-NEXT:    s_addc_u32 s6, 0, s13
+; GFX942-NEXT:    s_add_u32 s0, s0, s12
+; GFX942-NEXT:    s_mul_hi_u32 s1, s4, s5
+; GFX942-NEXT:    s_addc_u32 s0, s6, s7
+; GFX942-NEXT:    s_addc_u32 s1, s1, 0
+; GFX942-NEXT:    s_mul_i32 s5, s4, s5
+; GFX942-NEXT:    s_add_u32 s0, s0, s5
+; GFX942-NEXT:    s_addc_u32 s1, 0, s1
+; GFX942-NEXT:    v_add_co_u32_e32 v1, vcc, s0, v1
+; GFX942-NEXT:    s_cmp_lg_u64 vcc, 0
+; GFX942-NEXT:    s_addc_u32 s0, s4, s1
+; GFX942-NEXT:    v_readfirstlane_b32 s5, v1
+; GFX942-NEXT:    s_mul_i32 s4, s10, s0
+; GFX942-NEXT:    s_mul_hi_u32 s6, s10, s5
+; GFX942-NEXT:    s_mul_hi_u32 s1, s10, s0
+; GFX942-NEXT:    s_add_u32 s4, s6, s4
+; GFX942-NEXT:    s_addc_u32 s1, 0, s1
+; GFX942-NEXT:    s_mul_hi_u32 s7, s11, s5
+; GFX942-NEXT:    s_mul_i32 s5, s11, s5
+; GFX942-NEXT:    s_add_u32 s4, s4, s5
+; GFX942-NEXT:    s_mul_hi_u32 s6, s11, s0
+; GFX942-NEXT:    s_addc_u32 s1, s1, s7
+; GFX942-NEXT:    s_addc_u32 s4, s6, 0
+; GFX942-NEXT:    s_mul_i32 s0, s11, s0
+; GFX942-NEXT:    s_add_u32 s5, s1, s0
+; GFX942-NEXT:    s_addc_u32 s4, 0, s4
+; GFX942-NEXT:    s_mul_i32 s0, s2, s4
+; GFX942-NEXT:    s_mul_hi_u32 s1, s2, s5
+; GFX942-NEXT:    s_add_i32 s0, s1, s0
+; GFX942-NEXT:    s_mul_i32 s1, s3, s5
+; GFX942-NEXT:    s_add_i32 s6, s0, s1
+; GFX942-NEXT:    s_mul_i32 s1, s2, s5
+; GFX942-NEXT:    v_mov_b32_e32 v1, s1
+; GFX942-NEXT:    s_sub_i32 s0, s11, s6
+; GFX942-NEXT:    v_sub_co_u32_e32 v1, vcc, s10, v1
+; GFX942-NEXT:    s_cmp_lg_u64 vcc, 0
+; GFX942-NEXT:    s_subb_u32 s7, s0, s3
+; GFX942-NEXT:    v_subrev_co_u32_e64 v2, s[0:1], s2, v1
+; GFX942-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX942-NEXT:    s_subb_u32 s7, s7, 0
+; GFX942-NEXT:    s_cmp_ge_u32 s7, s3
+; GFX942-NEXT:    s_cselect_b32 s10, -1, 0
+; GFX942-NEXT:    v_cmp_le_u32_e64 s[0:1], s2, v2
+; GFX942-NEXT:    s_cmp_eq_u32 s7, s3
+; GFX942-NEXT:    v_mov_b32_e32 v3, s10
+; GFX942-NEXT:    v_cndmask_b32_e64 v2, 0, -1, s[0:1]
+; GFX942-NEXT:    s_cselect_b64 s[0:1], -1, 0
+; GFX942-NEXT:    v_cndmask_b32_e64 v2, v3, v2, s[0:1]
+; GFX942-NEXT:    s_add_u32 s0, s5, 1
+; GFX942-NEXT:    s_addc_u32 s7, s4, 0
+; GFX942-NEXT:    s_add_u32 s1, s5, 2
+; GFX942-NEXT:    s_addc_u32 s10, s4, 0
+; GFX942-NEXT:    v_mov_b32_e32 v3, s0
+; GFX942-NEXT:    v_mov_b32_e32 v4, s1
+; GFX942-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v2
+; GFX942-NEXT:    s_cmp_lg_u64 vcc, 0
+; GFX942-NEXT:    v_cmp_le_u32_e32 vcc, s2, v1
+; GFX942-NEXT:    v_cndmask_b32_e64 v2, v3, v4, s[0:1]
+; GFX942-NEXT:    v_mov_b32_e32 v3, s7
+; GFX942-NEXT:    v_mov_b32_e32 v4, s10
+; GFX942-NEXT:    v_cndmask_b32_e64 v3, v3, v4, s[0:1]
+; GFX942-NEXT:    s_subb_u32 s0, s11, s6
+; GFX942-NEXT:    s_cmp_ge_u32 s0, s3
+; GFX942-NEXT:    s_cselect_b32 s1, -1, 0
+; GFX942-NEXT:    s_cmp_eq_u32 s0, s3
+; GFX942-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
+; GFX942-NEXT:    v_mov_b32_e32 v4, s1
+; GFX942-NEXT:    s_cselect_b64 vcc, -1, 0
+; GFX942-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
+; GFX942-NEXT:    v_mov_b32_e32 v4, s4
+; GFX942-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
+; GFX942-NEXT:    v_mov_b32_e32 v1, s5
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
+; GFX942-NEXT:    v_cndmask_b32_e32 v2, v1, v2, vcc
+; GFX942-NEXT:    global_store_dwordx2 v0, v[2:3], s[8:9]
+; GFX942-NEXT:    s_endpgm
+;
+; GFX942-IR-LABEL: s_test_udiv_i64:
+; GFX942-IR:       ; %bb.0: ; %_udiv-special-cases
+; GFX942-IR-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX942-IR-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-IR-NEXT:    s_mov_b64 s[4:5], 0
+; GFX942-IR-NEXT:    s_mov_b32 s11, 0
+; GFX942-IR-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-IR-NEXT:    s_cmp_eq_u64 s[6:7], 0
+; GFX942-IR-NEXT:    s_cselect_b64 s[8:9], -1, 0
+; GFX942-IR-NEXT:    s_cmp_eq_u64 s[2:3], 0
+; GFX942-IR-NEXT:    s_cselect_b64 s[12:13], -1, 0
+; GFX942-IR-NEXT:    s_flbit_i32_b64 s10, s[6:7]
+; GFX942-IR-NEXT:    s_or_b64 s[12:13], s[8:9], s[12:13]
+; GFX942-IR-NEXT:    s_flbit_i32_b64 s16, s[2:3]
+; GFX942-IR-NEXT:    s_sub_u32 s8, s10, s16
+; GFX942-IR-NEXT:    s_subb_u32 s9, 0, 0
+; GFX942-IR-NEXT:    v_cmp_gt_u64_e64 s[14:15], s[8:9], 63
+; GFX942-IR-NEXT:    s_or_b64 s[14:15], s[12:13], s[14:15]
+; GFX942-IR-NEXT:    s_cmp_eq_u64 s[8:9], 63
+; GFX942-IR-NEXT:    s_cselect_b64 s[18:19], -1, 0
+; GFX942-IR-NEXT:    s_and_b64 s[12:13], s[14:15], exec
+; GFX942-IR-NEXT:    s_cselect_b32 s13, 0, s3
+; GFX942-IR-NEXT:    s_cselect_b32 s12, 0, s2
+; GFX942-IR-NEXT:    s_or_b64 s[14:15], s[14:15], s[18:19]
+; GFX942-IR-NEXT:    s_andn2_b64 vcc, exec, s[14:15]
+; GFX942-IR-NEXT:    s_cbranch_vccz .LBB0_5
+; GFX942-IR-NEXT:  ; %bb.1: ; %udiv-bb1
+; GFX942-IR-NEXT:    s_add_u32 s12, s8, 1
+; GFX942-IR-NEXT:    s_addc_u32 s13, s9, 0
+; GFX942-IR-NEXT:    s_cmp_eq_u64 s[12:13], 0
+; GFX942-IR-NEXT:    s_cselect_b64 s[14:15], -1, 0
+; GFX942-IR-NEXT:    s_sub_i32 s8, 63, s8
+; GFX942-IR-NEXT:    s_andn2_b64 vcc, exec, s[14:15]
+; GFX942-IR-NEXT:    s_lshl_b64 s[8:9], s[2:3], s8
+; GFX942-IR-NEXT:    s_cbranch_vccz .LBB0_4
+; GFX942-IR-NEXT:  ; %bb.2: ; %udiv-preheader
+; GFX942-IR-NEXT:    s_lshr_b64 s[12:13], s[2:3], s12
+; GFX942-IR-NEXT:    s_add_u32 s14, s6, -1
+; GFX942-IR-NEXT:    s_addc_u32 s15, s7, -1
+; GFX942-IR-NEXT:    s_not_b64 s[2:3], s[10:11]
+; GFX942-IR-NEXT:    s_add_u32 s2, s2, s16
+; GFX942-IR-NEXT:    s_addc_u32 s3, s3, 0
+; GFX942-IR-NEXT:    s_mov_b64 s[10:11], 0
+; GFX942-IR-NEXT:    s_mov_b32 s5, 0
+; GFX942-IR-NEXT:  .LBB0_3: ; %udiv-do-while
+; GFX942-IR-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-IR-NEXT:    s_lshl_b64 s[12:13], s[12:13], 1
+; GFX942-IR-NEXT:    s_lshr_b32 s4, s9, 31
+; GFX942-IR-NEXT:    s_lshl_b64 s[8:9], s[8:9], 1
+; GFX942-IR-NEXT:    s_or_b64 s[12:13], s[12:13], s[4:5]
+; GFX942-IR-NEXT:    s_or_b64 s[8:9], s[10:11], s[8:9]
+; GFX942-IR-NEXT:    s_sub_u32 s4, s14, s12
+; GFX942-IR-NEXT:    s_subb_u32 s4, s15, s13
+; GFX942-IR-NEXT:    s_ashr_i32 s10, s4, 31
+; GFX942-IR-NEXT:    s_mov_b32 s11, s10
+; GFX942-IR-NEXT:    s_and_b32 s4, s10, 1
+; GFX942-IR-NEXT:    s_and_b64 s[10:11], s[10:11], s[6:7]
+; GFX942-IR-NEXT:    s_sub_u32 s12, s12, s10
+; GFX942-IR-NEXT:    s_subb_u32 s13, s13, s11
+; GFX942-IR-NEXT:    s_add_u32 s2, s2, 1
+; GFX942-IR-NEXT:    s_addc_u32 s3, s3, 0
+; GFX942-IR-NEXT:    s_cmp_eq_u64 s[2:3], 0
+; GFX942-IR-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-IR-NEXT:    s_cbranch_scc0 .LBB0_3
+; GFX942-IR-NEXT:  .LBB0_4: ; %Flow7
+; GFX942-IR-NEXT:    s_lshl_b64 s[2:3], s[8:9], 1
+; GFX942-IR-NEXT:    s_or_b64 s[12:13], s[4:5], s[2:3]
+; GFX942-IR-NEXT:  .LBB0_5: ; %udiv-end
+; GFX942-IR-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-IR-NEXT:    v_mov_b64_e32 v[0:1], s[12:13]
+; GFX942-IR-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX942-IR-NEXT:    s_endpgm
   %result = udiv i64 %x, %y
   store i64 %result, ptr addrspace(1) %out
   ret void
@@ -390,6 +599,192 @@ define i64 @v_test_udiv_i64(i64 %x, i64 %y) {
 ; GCN-IR-NEXT:    v_mov_b32_e32 v0, v5
 ; GCN-IR-NEXT:    v_mov_b32_e32 v1, v4
 ; GCN-IR-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: v_test_udiv_i64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v4, v2
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v5, v3
+; GFX942-NEXT:    v_sub_co_u32_e32 v13, vcc, 0, v2
+; GFX942-NEXT:    v_mov_b32_e32 v9, 0
+; GFX942-NEXT:    v_fmamk_f32 v4, v5, 0x4f800000, v4
+; GFX942-NEXT:    v_rcp_f32_e32 v4, v4
+; GFX942-NEXT:    v_subb_co_u32_e32 v14, vcc, 0, v3, vcc
+; GFX942-NEXT:    v_mul_f32_e32 v4, 0x5f7ffffc, v4
+; GFX942-NEXT:    v_mul_f32_e32 v5, 0x2f800000, v4
+; GFX942-NEXT:    v_trunc_f32_e32 v5, v5
+; GFX942-NEXT:    v_fmamk_f32 v4, v5, 0xcf800000, v4
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v12, v4
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v15, v5
+; GFX942-NEXT:    v_mul_lo_u32 v6, v14, v12
+; GFX942-NEXT:    v_mad_u64_u32 v[4:5], s[0:1], v13, v12, 0
+; GFX942-NEXT:    v_mul_lo_u32 v7, v13, v15
+; GFX942-NEXT:    v_add3_u32 v5, v5, v7, v6
+; GFX942-NEXT:    v_mad_u64_u32 v[6:7], s[0:1], v12, v5, 0
+; GFX942-NEXT:    v_mul_hi_u32 v8, v12, v4
+; GFX942-NEXT:    v_lshl_add_u64 v[6:7], v[8:9], 0, v[6:7]
+; GFX942-NEXT:    v_mad_u64_u32 v[10:11], s[0:1], v15, v5, 0
+; GFX942-NEXT:    v_mad_u64_u32 v[4:5], s[0:1], v15, v4, 0
+; GFX942-NEXT:    v_add_co_u32_e32 v4, vcc, v6, v4
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_addc_co_u32_e32 v8, vcc, v7, v5, vcc
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_addc_co_u32_e32 v11, vcc, 0, v11, vcc
+; GFX942-NEXT:    v_lshl_add_u64 v[4:5], v[8:9], 0, v[10:11]
+; GFX942-NEXT:    v_add_co_u32_e32 v16, vcc, v12, v4
+; GFX942-NEXT:    v_mul_lo_u32 v7, v14, v16
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_addc_co_u32_e32 v15, vcc, v15, v5, vcc
+; GFX942-NEXT:    v_mul_lo_u32 v6, v13, v15
+; GFX942-NEXT:    v_mad_u64_u32 v[4:5], s[0:1], v13, v16, 0
+; GFX942-NEXT:    v_add3_u32 v5, v5, v6, v7
+; GFX942-NEXT:    v_mad_u64_u32 v[12:13], s[0:1], v16, v5, 0
+; GFX942-NEXT:    v_mul_hi_u32 v8, v16, v4
+; GFX942-NEXT:    v_mad_u64_u32 v[6:7], s[0:1], v15, v5, 0
+; GFX942-NEXT:    v_mad_u64_u32 v[10:11], s[0:1], v15, v4, 0
+; GFX942-NEXT:    v_lshl_add_u64 v[4:5], v[8:9], 0, v[12:13]
+; GFX942-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v10
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_addc_co_u32_e32 v8, vcc, v5, v11, vcc
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v7, vcc
+; GFX942-NEXT:    v_lshl_add_u64 v[4:5], v[8:9], 0, v[6:7]
+; GFX942-NEXT:    v_add_co_u32_e32 v10, vcc, v16, v4
+; GFX942-NEXT:    v_mul_hi_u32 v8, v0, v10
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_addc_co_u32_e32 v6, vcc, v15, v5, vcc
+; GFX942-NEXT:    v_mad_u64_u32 v[4:5], s[0:1], v0, v6, 0
+; GFX942-NEXT:    v_lshl_add_u64 v[4:5], v[8:9], 0, v[4:5]
+; GFX942-NEXT:    v_mad_u64_u32 v[10:11], s[0:1], v1, v10, 0
+; GFX942-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v10
+; GFX942-NEXT:    v_mad_u64_u32 v[6:7], s[0:1], v1, v6, 0
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_addc_co_u32_e32 v8, vcc, v5, v11, vcc
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v7, vcc
+; GFX942-NEXT:    v_lshl_add_u64 v[4:5], v[8:9], 0, v[6:7]
+; GFX942-NEXT:    v_mul_lo_u32 v8, v3, v4
+; GFX942-NEXT:    v_mul_lo_u32 v9, v2, v5
+; GFX942-NEXT:    v_mad_u64_u32 v[6:7], s[0:1], v2, v4, 0
+; GFX942-NEXT:    v_add3_u32 v10, v7, v9, v8
+; GFX942-NEXT:    v_sub_u32_e32 v7, v1, v10
+; GFX942-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v6
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_subb_co_u32_e64 v6, s[0:1], v7, v3, vcc
+; GFX942-NEXT:    v_sub_co_u32_e64 v7, s[0:1], v0, v2
+; GFX942-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v10, vcc
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_subbrev_co_u32_e64 v6, s[0:1], 0, v6, s[0:1]
+; GFX942-NEXT:    v_cmp_ge_u32_e64 s[0:1], v6, v3
+; GFX942-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v3
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[0:1]
+; GFX942-NEXT:    v_cmp_ge_u32_e64 s[0:1], v7, v2
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[0:1]
+; GFX942-NEXT:    v_cmp_eq_u32_e64 s[0:1], v6, v3
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e64 v11, v8, v7, s[0:1]
+; GFX942-NEXT:    v_lshl_add_u64 v[6:7], v[4:5], 0, 2
+; GFX942-NEXT:    v_lshl_add_u64 v[8:9], v[4:5], 0, 1
+; GFX942-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v11
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e64 v6, v8, v6, s[0:1]
+; GFX942-NEXT:    v_cndmask_b32_e64 v8, 0, -1, vcc
+; GFX942-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v3
+; GFX942-NEXT:    v_cndmask_b32_e64 v1, v9, v7, s[0:1]
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, v8, v0, vcc
+; GFX942-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, v4, v6, vcc
+; GFX942-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-IR-LABEL: v_test_udiv_i64:
+; GFX942-IR:       ; %bb.0: ; %_udiv-special-cases
+; GFX942-IR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-IR-NEXT:    v_ffbh_u32_e32 v4, v2
+; GFX942-IR-NEXT:    v_add_u32_e32 v4, 32, v4
+; GFX942-IR-NEXT:    v_ffbh_u32_e32 v5, v3
+; GFX942-IR-NEXT:    v_min_u32_e32 v10, v4, v5
+; GFX942-IR-NEXT:    v_ffbh_u32_e32 v4, v0
+; GFX942-IR-NEXT:    v_add_u32_e32 v4, 32, v4
+; GFX942-IR-NEXT:    v_ffbh_u32_e32 v5, v1
+; GFX942-IR-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[2:3]
+; GFX942-IR-NEXT:    v_cmp_eq_u64_e64 s[0:1], 0, v[0:1]
+; GFX942-IR-NEXT:    v_min_u32_e32 v8, v4, v5
+; GFX942-IR-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-IR-NEXT:    v_sub_co_u32_e32 v12, vcc, v10, v8
+; GFX942-IR-NEXT:    v_mov_b32_e32 v9, 0
+; GFX942-IR-NEXT:    s_nop 0
+; GFX942-IR-NEXT:    v_subb_co_u32_e64 v13, s[2:3], 0, 0, vcc
+; GFX942-IR-NEXT:    v_cmp_lt_u64_e32 vcc, 63, v[12:13]
+; GFX942-IR-NEXT:    s_or_b64 s[0:1], s[0:1], vcc
+; GFX942-IR-NEXT:    s_xor_b64 s[2:3], s[0:1], -1
+; GFX942-IR-NEXT:    v_cmp_ne_u64_e32 vcc, 63, v[12:13]
+; GFX942-IR-NEXT:    v_cndmask_b32_e64 v5, v1, 0, s[0:1]
+; GFX942-IR-NEXT:    v_cndmask_b32_e64 v4, v0, 0, s[0:1]
+; GFX942-IR-NEXT:    s_and_b64 s[2:3], s[2:3], vcc
+; GFX942-IR-NEXT:    s_and_saveexec_b64 s[0:1], s[2:3]
+; GFX942-IR-NEXT:    s_cbranch_execz .LBB1_6
+; GFX942-IR-NEXT:  ; %bb.1: ; %udiv-bb1
+; GFX942-IR-NEXT:    v_lshl_add_u64 v[14:15], v[12:13], 0, 1
+; GFX942-IR-NEXT:    v_sub_u32_e32 v4, 63, v12
+; GFX942-IR-NEXT:    v_mov_b64_e32 v[6:7], 0
+; GFX942-IR-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[14:15]
+; GFX942-IR-NEXT:    v_lshlrev_b64 v[4:5], v4, v[0:1]
+; GFX942-IR-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX942-IR-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
+; GFX942-IR-NEXT:    s_cbranch_execz .LBB1_5
+; GFX942-IR-NEXT:  ; %bb.2: ; %udiv-preheader
+; GFX942-IR-NEXT:    v_mov_b32_e32 v7, -1
+; GFX942-IR-NEXT:    v_not_b32_e32 v6, v10
+; GFX942-IR-NEXT:    v_lshrrev_b64 v[12:13], v14, v[0:1]
+; GFX942-IR-NEXT:    v_lshl_add_u64 v[0:1], v[2:3], 0, -1
+; GFX942-IR-NEXT:    v_lshl_add_u64 v[8:9], v[6:7], 0, v[8:9]
+; GFX942-IR-NEXT:    s_mov_b64 s[4:5], 0
+; GFX942-IR-NEXT:    v_mov_b64_e32 v[10:11], 0
+; GFX942-IR-NEXT:    v_mov_b32_e32 v7, 0
+; GFX942-IR-NEXT:  .LBB1_3: ; %udiv-do-while
+; GFX942-IR-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-IR-NEXT:    v_lshlrev_b64 v[12:13], 1, v[12:13]
+; GFX942-IR-NEXT:    v_lshrrev_b32_e32 v6, 31, v5
+; GFX942-IR-NEXT:    v_or_b32_e32 v12, v12, v6
+; GFX942-IR-NEXT:    v_sub_co_u32_e32 v6, vcc, v0, v12
+; GFX942-IR-NEXT:    v_lshlrev_b64 v[4:5], 1, v[4:5]
+; GFX942-IR-NEXT:    s_nop 0
+; GFX942-IR-NEXT:    v_subb_co_u32_e32 v6, vcc, v1, v13, vcc
+; GFX942-IR-NEXT:    v_or_b32_e32 v4, v10, v4
+; GFX942-IR-NEXT:    v_ashrrev_i32_e32 v10, 31, v6
+; GFX942-IR-NEXT:    v_or_b32_e32 v5, v11, v5
+; GFX942-IR-NEXT:    v_and_b32_e32 v6, 1, v10
+; GFX942-IR-NEXT:    v_and_b32_e32 v11, v10, v3
+; GFX942-IR-NEXT:    v_and_b32_e32 v10, v10, v2
+; GFX942-IR-NEXT:    v_sub_co_u32_e32 v12, vcc, v12, v10
+; GFX942-IR-NEXT:    v_lshl_add_u64 v[8:9], v[8:9], 0, 1
+; GFX942-IR-NEXT:    s_nop 0
+; GFX942-IR-NEXT:    v_subb_co_u32_e32 v13, vcc, v13, v11, vcc
+; GFX942-IR-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[8:9]
+; GFX942-IR-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX942-IR-NEXT:    v_mov_b64_e32 v[10:11], v[6:7]
+; GFX942-IR-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX942-IR-NEXT:    s_cbranch_execnz .LBB1_3
+; GFX942-IR-NEXT:  ; %bb.4: ; %Flow
+; GFX942-IR-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX942-IR-NEXT:  .LBB1_5: ; %Flow4
+; GFX942-IR-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX942-IR-NEXT:    v_lshlrev_b64 v[0:1], 1, v[4:5]
+; GFX942-IR-NEXT:    v_or_b32_e32 v5, v7, v1
+; GFX942-IR-NEXT:    v_or_b32_e32 v4, v6, v0
+; GFX942-IR-NEXT:  .LBB1_6: ; %Flow5
+; GFX942-IR-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX942-IR-NEXT:    v_mov_b32_e32 v0, v4
+; GFX942-IR-NEXT:    v_mov_b32_e32 v1, v5
+; GFX942-IR-NEXT:    s_setpc_b64 s[30:31]
   %result = udiv i64 %x, %y
   ret i64 %result
 }
@@ -444,6 +839,52 @@ define amdgpu_kernel void @s_test_udiv24_64(ptr addrspace(1) %out, i64 %x, i64 %
 ; GCN-IR-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
 ; GCN-IR-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GCN-IR-NEXT:    s_endpgm
+;
+; GFX942-LABEL: s_test_udiv24_64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dword s6, s[4:5], 0x38
+; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_lshr_b32 s2, s6, 8
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v0, s2
+; GFX942-NEXT:    s_lshr_b32 s2, s3, 8
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v2, s2
+; GFX942-NEXT:    v_rcp_iflag_f32_e32 v3, v0
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_mul_f32_e32 v3, v2, v3
+; GFX942-NEXT:    v_trunc_f32_e32 v3, v3
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v4, v3
+; GFX942-NEXT:    v_fma_f32 v2, -v3, v0, v2
+; GFX942-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v0
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v4, vcc
+; GFX942-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
+; GFX942-NEXT:    global_store_dwordx2 v1, v[0:1], s[0:1]
+; GFX942-NEXT:    s_endpgm
+;
+; GFX942-IR-LABEL: s_test_udiv24_64:
+; GFX942-IR:       ; %bb.0:
+; GFX942-IR-NEXT:    s_load_dword s6, s[4:5], 0x38
+; GFX942-IR-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-IR-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-IR-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-IR-NEXT:    s_lshr_b32 s2, s6, 8
+; GFX942-IR-NEXT:    v_cvt_f32_u32_e32 v0, s2
+; GFX942-IR-NEXT:    s_lshr_b32 s2, s3, 8
+; GFX942-IR-NEXT:    v_cvt_f32_u32_e32 v2, s2
+; GFX942-IR-NEXT:    v_rcp_iflag_f32_e32 v3, v0
+; GFX942-IR-NEXT:    s_nop 0
+; GFX942-IR-NEXT:    v_mul_f32_e32 v3, v2, v3
+; GFX942-IR-NEXT:    v_trunc_f32_e32 v3, v3
+; GFX942-IR-NEXT:    v_cvt_u32_f32_e32 v4, v3
+; GFX942-IR-NEXT:    v_fma_f32 v2, -v3, v0, v2
+; GFX942-IR-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v0
+; GFX942-IR-NEXT:    s_nop 1
+; GFX942-IR-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v4, vcc
+; GFX942-IR-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
+; GFX942-IR-NEXT:    global_store_dwordx2 v1, v[0:1], s[0:1]
+; GFX942-IR-NEXT:    s_endpgm
   %1 = lshr i64 %x, 40
   %2 = lshr i64 %y, 40
   %result = udiv i64 %1, %2
@@ -487,6 +928,46 @@ define i64 @v_test_udiv24_i64(i64 %x, i64 %y) {
 ; GCN-IR-NEXT:    v_addc_u32_e32 v0, vcc, 0, v3, vcc
 ; GCN-IR-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
 ; GCN-IR-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: v_test_udiv24_i64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_lshrrev_b32_e32 v0, 8, v3
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v0, v0
+; GFX942-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v1, v1
+; GFX942-NEXT:    v_rcp_iflag_f32_e32 v2, v0
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_mul_f32_e32 v2, v1, v2
+; GFX942-NEXT:    v_trunc_f32_e32 v2, v2
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v3, v2
+; GFX942-NEXT:    v_fma_f32 v1, -v2, v0, v1
+; GFX942-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
+; GFX942-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v3, vcc
+; GFX942-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-IR-LABEL: v_test_udiv24_i64:
+; GFX942-IR:       ; %bb.0:
+; GFX942-IR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-IR-NEXT:    v_lshrrev_b32_e32 v0, 8, v3
+; GFX942-IR-NEXT:    v_cvt_f32_u32_e32 v0, v0
+; GFX942-IR-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
+; GFX942-IR-NEXT:    v_cvt_f32_u32_e32 v1, v1
+; GFX942-IR-NEXT:    v_rcp_iflag_f32_e32 v2, v0
+; GFX942-IR-NEXT:    s_nop 0
+; GFX942-IR-NEXT:    v_mul_f32_e32 v2, v1, v2
+; GFX942-IR-NEXT:    v_trunc_f32_e32 v2, v2
+; GFX942-IR-NEXT:    v_cvt_u32_f32_e32 v3, v2
+; GFX942-IR-NEXT:    v_fma_f32 v1, -v2, v0, v1
+; GFX942-IR-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
+; GFX942-IR-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-IR-NEXT:    s_nop 0
+; GFX942-IR-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v3, vcc
+; GFX942-IR-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
+; GFX942-IR-NEXT:    s_setpc_b64 s[30:31]
   %1 = lshr i64 %x, 40
   %2 = lshr i64 %y, 40
   %result = udiv i64 %1, %2
@@ -565,6 +1046,70 @@ define amdgpu_kernel void @s_test_udiv32_i64(ptr addrspace(1) %out, i64 %x, i64
 ; GCN-IR-NEXT:    v_mov_b32_e32 v1, 0
 ; GCN-IR-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GCN-IR-NEXT:    s_endpgm
+;
+; GFX942-LABEL: s_test_udiv32_i64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dword s6, s[4:5], 0x38
+; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v0, s6
+; GFX942-NEXT:    s_sub_i32 s2, 0, s6
+; GFX942-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX942-NEXT:    s_mul_i32 s2, s2, s4
+; GFX942-NEXT:    s_mul_hi_u32 s2, s4, s2
+; GFX942-NEXT:    s_add_i32 s4, s4, s2
+; GFX942-NEXT:    s_mul_hi_u32 s2, s3, s4
+; GFX942-NEXT:    s_mul_i32 s4, s2, s6
+; GFX942-NEXT:    s_sub_i32 s3, s3, s4
+; GFX942-NEXT:    s_add_i32 s5, s2, 1
+; GFX942-NEXT:    s_sub_i32 s4, s3, s6
+; GFX942-NEXT:    s_cmp_ge_u32 s3, s6
+; GFX942-NEXT:    s_cselect_b32 s2, s5, s2
+; GFX942-NEXT:    s_cselect_b32 s3, s4, s3
+; GFX942-NEXT:    s_add_i32 s4, s2, 1
+; GFX942-NEXT:    s_cmp_ge_u32 s3, s6
+; GFX942-NEXT:    s_cselect_b32 s2, s4, s2
+; GFX942-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-NEXT:    global_store_dwordx2 v1, v[0:1], s[0:1]
+; GFX942-NEXT:    s_endpgm
+;
+; GFX942-IR-LABEL: s_test_udiv32_i64:
+; GFX942-IR:       ; %bb.0:
+; GFX942-IR-NEXT:    s_load_dword s6, s[4:5], 0x38
+; GFX942-IR-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-IR-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-IR-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-IR-NEXT:    v_cvt_f32_u32_e32 v0, s6
+; GFX942-IR-NEXT:    s_sub_i32 s2, 0, s6
+; GFX942-IR-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; GFX942-IR-NEXT:    s_nop 0
+; GFX942-IR-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX942-IR-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX942-IR-NEXT:    s_nop 0
+; GFX942-IR-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX942-IR-NEXT:    s_mul_i32 s2, s2, s4
+; GFX942-IR-NEXT:    s_mul_hi_u32 s2, s4, s2
+; GFX942-IR-NEXT:    s_add_i32 s4, s4, s2
+; GFX942-IR-NEXT:    s_mul_hi_u32 s2, s3, s4
+; GFX942-IR-NEXT:    s_mul_i32 s4, s2, s6
+; GFX942-IR-NEXT:    s_sub_i32 s3, s3, s4
+; GFX942-IR-NEXT:    s_add_i32 s5, s2, 1
+; GFX942-IR-NEXT:    s_sub_i32 s4, s3, s6
+; GFX942-IR-NEXT:    s_cmp_ge_u32 s3, s6
+; GFX942-IR-NEXT:    s_cselect_b32 s2, s5, s2
+; GFX942-IR-NEXT:    s_cselect_b32 s3, s4, s3
+; GFX942-IR-NEXT:    s_add_i32 s4, s2, 1
+; GFX942-IR-NEXT:    s_cmp_ge_u32 s3, s6
+; GFX942-IR-NEXT:    s_cselect_b32 s2, s4, s2
+; GFX942-IR-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-IR-NEXT:    global_store_dwordx2 v1, v[0:1], s[0:1]
+; GFX942-IR-NEXT:    s_endpgm
   %1 = lshr i64 %x, 32
   %2 = lshr i64 %y, 32
   %result = udiv i64 %1, %2
@@ -648,6 +1193,74 @@ define amdgpu_kernel void @s_test_udiv31_i64(ptr addrspace(1) %out, i64 %x, i64
 ; GCN-IR-NEXT:    v_mov_b32_e32 v1, 0
 ; GCN-IR-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GCN-IR-NEXT:    s_endpgm
+;
+; GFX942-LABEL: s_test_udiv31_i64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dword s0, s[4:5], 0x38
+; GFX942-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_lshr_b32 s6, s0, 1
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v0, s6
+; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_lshr_b32 s2, s3, 1
+; GFX942-NEXT:    s_sub_i32 s3, 0, s6
+; GFX942-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX942-NEXT:    s_mul_i32 s3, s3, s4
+; GFX942-NEXT:    s_mul_hi_u32 s3, s4, s3
+; GFX942-NEXT:    s_add_i32 s4, s4, s3
+; GFX942-NEXT:    s_mul_hi_u32 s3, s2, s4
+; GFX942-NEXT:    s_mul_i32 s4, s3, s6
+; GFX942-NEXT:    s_sub_i32 s2, s2, s4
+; GFX942-NEXT:    s_add_i32 s5, s3, 1
+; GFX942-NEXT:    s_sub_i32 s4, s2, s6
+; GFX942-NEXT:    s_cmp_ge_u32 s2, s6
+; GFX942-NEXT:    s_cselect_b32 s3, s5, s3
+; GFX942-NEXT:    s_cselect_b32 s2, s4, s2
+; GFX942-NEXT:    s_add_i32 s4, s3, 1
+; GFX942-NEXT:    s_cmp_ge_u32 s2, s6
+; GFX942-NEXT:    s_cselect_b32 s2, s4, s3
+; GFX942-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-NEXT:    global_store_dwordx2 v1, v[0:1], s[0:1]
+; GFX942-NEXT:    s_endpgm
+;
+; GFX942-IR-LABEL: s_test_udiv31_i64:
+; GFX942-IR:       ; %bb.0:
+; GFX942-IR-NEXT:    s_load_dword s0, s[4:5], 0x38
+; GFX942-IR-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-IR-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-IR-NEXT:    s_lshr_b32 s6, s0, 1
+; GFX942-IR-NEXT:    v_cvt_f32_u32_e32 v0, s6
+; GFX942-IR-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-IR-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; GFX942-IR-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-IR-NEXT:    s_lshr_b32 s2, s3, 1
+; GFX942-IR-NEXT:    s_sub_i32 s3, 0, s6
+; GFX942-IR-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX942-IR-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX942-IR-NEXT:    s_nop 0
+; GFX942-IR-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX942-IR-NEXT:    s_mul_i32 s3, s3, s4
+; GFX942-IR-NEXT:    s_mul_hi_u32 s3, s4, s3
+; GFX942-IR-NEXT:    s_add_i32 s4, s4, s3
+; GFX942-IR-NEXT:    s_mul_hi_u32 s3, s2, s4
+; GFX942-IR-NEXT:    s_mul_i32 s4, s3, s6
+; GFX942-IR-NEXT:    s_sub_i32 s2, s2, s4
+; GFX942-IR-NEXT:    s_add_i32 s5, s3, 1
+; GFX942-IR-NEXT:    s_sub_i32 s4, s2, s6
+; GFX942-IR-NEXT:    s_cmp_ge_u32 s2, s6
+; GFX942-IR-NEXT:    s_cselect_b32 s3, s5, s3
+; GFX942-IR-NEXT:    s_cselect_b32 s2, s4, s2
+; GFX942-IR-NEXT:    s_add_i32 s4, s3, 1
+; GFX942-IR-NEXT:    s_cmp_ge_u32 s2, s6
+; GFX942-IR-NEXT:    s_cselect_b32 s2, s4, s3
+; GFX942-IR-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-IR-NEXT:    global_store_dwordx2 v1, v[0:1], s[0:1]
+; GFX942-IR-NEXT:    s_endpgm
   %1 = lshr i64 %x, 33
   %2 = lshr i64 %y, 33
   %result = udiv i64 %1, %2
@@ -705,6 +1318,52 @@ define amdgpu_kernel void @s_test_udiv23_i64(ptr addrspace(1) %out, i64 %x, i64
 ; GCN-IR-NEXT:    v_and_b32_e32 v0, 0x7fffff, v0
 ; GCN-IR-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GCN-IR-NEXT:    s_endpgm
+;
+; GFX942-LABEL: s_test_udiv23_i64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dword s6, s[4:5], 0x38
+; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_lshr_b32 s2, s6, 9
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v0, s2
+; GFX942-NEXT:    s_lshr_b32 s2, s3, 9
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v2, s2
+; GFX942-NEXT:    v_rcp_iflag_f32_e32 v3, v0
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_mul_f32_e32 v3, v2, v3
+; GFX942-NEXT:    v_trunc_f32_e32 v3, v3
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v4, v3
+; GFX942-NEXT:    v_fma_f32 v2, -v3, v0, v2
+; GFX942-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v0
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v4, vcc
+; GFX942-NEXT:    v_and_b32_e32 v0, 0x7fffff, v0
+; GFX942-NEXT:    global_store_dwordx2 v1, v[0:1], s[0:1]
+; GFX942-NEXT:    s_endpgm
+;
+; GFX942-IR-LABEL: s_test_udiv23_i64:
+; GFX942-IR:       ; %bb.0:
+; GFX942-IR-NEXT:    s_load_dword s6, s[4:5], 0x38
+; GFX942-IR-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-IR-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-IR-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-IR-NEXT:    s_lshr_b32 s2, s6, 9
+; GFX942-IR-NEXT:    v_cvt_f32_u32_e32 v0, s2
+; GFX942-IR-NEXT:    s_lshr_b32 s2, s3, 9
+; GFX942-IR-NEXT:    v_cvt_f32_u32_e32 v2, s2
+; GFX942-IR-NEXT:    v_rcp_iflag_f32_e32 v3, v0
+; GFX942-IR-NEXT:    s_nop 0
+; GFX942-IR-NEXT:    v_mul_f32_e32 v3, v2, v3
+; GFX942-IR-NEXT:    v_trunc_f32_e32 v3, v3
+; GFX942-IR-NEXT:    v_cvt_u32_f32_e32 v4, v3
+; GFX942-IR-NEXT:    v_fma_f32 v2, -v3, v0, v2
+; GFX942-IR-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v0
+; GFX942-IR-NEXT:    s_nop 1
+; GFX942-IR-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v4, vcc
+; GFX942-IR-NEXT:    v_and_b32_e32 v0, 0x7fffff, v0
+; GFX942-IR-NEXT:    global_store_dwordx2 v1, v[0:1], s[0:1]
+; GFX942-IR-NEXT:    s_endpgm
   %1 = lshr i64 %x, 41
   %2 = lshr i64 %y, 41
   %result = udiv i64 %1, %2
@@ -776,6 +1435,66 @@ define amdgpu_kernel void @s_test_udiv24_i48(ptr addrspace(1) %out, i48 %x, i48
 ; GCN-IR-NEXT:    buffer_store_short v3, off, s[4:7], 0 offset:4
 ; GCN-IR-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GCN-IR-NEXT:    s_endpgm
+;
+; GFX942-LABEL: s_test_udiv24_i48:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX942-NEXT:    v_mov_b32_e32 v3, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_and_b32 s2, s2, 0xff000000
+; GFX942-NEXT:    s_and_b32 s5, s6, 0xff000000
+; GFX942-NEXT:    s_and_b32 s4, s7, 0xffff
+; GFX942-NEXT:    v_mov_b32_e32 v0, s5
+; GFX942-NEXT:    v_alignbit_b32 v0, s4, v0, 24
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v0, v0
+; GFX942-NEXT:    s_and_b32 s3, s3, 0xffff
+; GFX942-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-NEXT:    v_alignbit_b32 v1, s3, v1, 24
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v1, v1
+; GFX942-NEXT:    v_rcp_iflag_f32_e32 v2, v0
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_mul_f32_e32 v2, v1, v2
+; GFX942-NEXT:    v_trunc_f32_e32 v2, v2
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v4, v2
+; GFX942-NEXT:    v_fma_f32 v1, -v2, v0, v1
+; GFX942-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v4, vcc
+; GFX942-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
+; GFX942-NEXT:    global_store_short v3, v3, s[0:1] offset:4
+; GFX942-NEXT:    global_store_dword v3, v0, s[0:1]
+; GFX942-NEXT:    s_endpgm
+;
+; GFX942-IR-LABEL: s_test_udiv24_i48:
+; GFX942-IR:       ; %bb.0:
+; GFX942-IR-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-IR-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX942-IR-NEXT:    v_mov_b32_e32 v3, 0
+; GFX942-IR-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-IR-NEXT:    s_and_b32 s2, s2, 0xff000000
+; GFX942-IR-NEXT:    s_and_b32 s5, s6, 0xff000000
+; GFX942-IR-NEXT:    s_and_b32 s4, s7, 0xffff
+; GFX942-IR-NEXT:    v_mov_b32_e32 v0, s5
+; GFX942-IR-NEXT:    v_alignbit_b32 v0, s4, v0, 24
+; GFX942-IR-NEXT:    v_cvt_f32_u32_e32 v0, v0
+; GFX942-IR-NEXT:    s_and_b32 s3, s3, 0xffff
+; GFX942-IR-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-IR-NEXT:    v_alignbit_b32 v1, s3, v1, 24
+; GFX942-IR-NEXT:    v_cvt_f32_u32_e32 v1, v1
+; GFX942-IR-NEXT:    v_rcp_iflag_f32_e32 v2, v0
+; GFX942-IR-NEXT:    s_nop 0
+; GFX942-IR-NEXT:    v_mul_f32_e32 v2, v1, v2
+; GFX942-IR-NEXT:    v_trunc_f32_e32 v2, v2
+; GFX942-IR-NEXT:    v_cvt_u32_f32_e32 v4, v2
+; GFX942-IR-NEXT:    v_fma_f32 v1, -v2, v0, v1
+; GFX942-IR-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
+; GFX942-IR-NEXT:    s_nop 1
+; GFX942-IR-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v4, vcc
+; GFX942-IR-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
+; GFX942-IR-NEXT:    global_store_short v3, v3, s[0:1] offset:4
+; GFX942-IR-NEXT:    global_store_dword v3, v0, s[0:1]
+; GFX942-IR-NEXT:    s_endpgm
   %1 = lshr i48 %x, 24
   %2 = lshr i48 %y, 24
   %result = udiv i48 %1, %2
@@ -954,6 +1673,195 @@ define amdgpu_kernel void @s_test_udiv_k_num_i64(ptr addrspace(1) %out, i64 %x)
 ; GCN-IR-NEXT:    v_mov_b32_e32 v1, s7
 ; GCN-IR-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GCN-IR-NEXT:    s_endpgm
+;
+; GFX942-LABEL: s_test_udiv_k_num_i64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-NEXT:    s_mov_b32 s5, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v0, s2
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v1, s3
+; GFX942-NEXT:    s_sub_u32 s4, 0, s2
+; GFX942-NEXT:    s_subb_u32 s6, 0, s3
+; GFX942-NEXT:    v_fmamk_f32 v0, v1, 0x4f800000, v0
+; GFX942-NEXT:    v_rcp_f32_e32 v1, v0
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    v_mul_f32_e32 v1, 0x5f7ffffc, v1
+; GFX942-NEXT:    v_mul_f32_e32 v2, 0x2f800000, v1
+; GFX942-NEXT:    v_trunc_f32_e32 v2, v2
+; GFX942-NEXT:    v_fmamk_f32 v1, v2, 0xcf800000, v1
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX942-NEXT:    v_readfirstlane_b32 s7, v2
+; GFX942-NEXT:    v_readfirstlane_b32 s8, v1
+; GFX942-NEXT:    s_mul_i32 s9, s4, s7
+; GFX942-NEXT:    s_mul_hi_u32 s11, s4, s8
+; GFX942-NEXT:    s_mul_i32 s10, s6, s8
+; GFX942-NEXT:    s_add_i32 s9, s11, s9
+; GFX942-NEXT:    s_mul_i32 s12, s4, s8
+; GFX942-NEXT:    s_add_i32 s9, s9, s10
+; GFX942-NEXT:    s_mul_hi_u32 s11, s8, s12
+; GFX942-NEXT:    s_mul_hi_u32 s10, s8, s9
+; GFX942-NEXT:    s_mul_i32 s8, s8, s9
+; GFX942-NEXT:    s_add_u32 s8, s11, s8
+; GFX942-NEXT:    s_addc_u32 s10, 0, s10
+; GFX942-NEXT:    s_mul_hi_u32 s11, s7, s12
+; GFX942-NEXT:    s_mul_i32 s12, s7, s12
+; GFX942-NEXT:    s_add_u32 s8, s8, s12
+; GFX942-NEXT:    s_mul_hi_u32 s13, s7, s9
+; GFX942-NEXT:    s_addc_u32 s8, s10, s11
+; GFX942-NEXT:    s_addc_u32 s10, s13, 0
+; GFX942-NEXT:    s_mul_i32 s9, s7, s9
+; GFX942-NEXT:    s_add_u32 s8, s8, s9
+; GFX942-NEXT:    s_addc_u32 s9, 0, s10
+; GFX942-NEXT:    v_add_co_u32_e32 v1, vcc, s8, v1
+; GFX942-NEXT:    s_cmp_lg_u64 vcc, 0
+; GFX942-NEXT:    s_addc_u32 s7, s7, s9
+; GFX942-NEXT:    v_readfirstlane_b32 s9, v1
+; GFX942-NEXT:    s_mul_i32 s8, s4, s7
+; GFX942-NEXT:    s_mul_hi_u32 s10, s4, s9
+; GFX942-NEXT:    s_add_i32 s8, s10, s8
+; GFX942-NEXT:    s_mul_i32 s6, s6, s9
+; GFX942-NEXT:    s_add_i32 s8, s8, s6
+; GFX942-NEXT:    s_mul_i32 s4, s4, s9
+; GFX942-NEXT:    s_mul_hi_u32 s10, s7, s4
+; GFX942-NEXT:    s_mul_i32 s11, s7, s4
+; GFX942-NEXT:    s_mul_i32 s13, s9, s8
+; GFX942-NEXT:    s_mul_hi_u32 s4, s9, s4
+; GFX942-NEXT:    s_mul_hi_u32 s12, s9, s8
+; GFX942-NEXT:    s_add_u32 s4, s4, s13
+; GFX942-NEXT:    s_addc_u32 s9, 0, s12
+; GFX942-NEXT:    s_add_u32 s4, s4, s11
+; GFX942-NEXT:    s_mul_hi_u32 s6, s7, s8
+; GFX942-NEXT:    s_addc_u32 s4, s9, s10
+; GFX942-NEXT:    s_addc_u32 s6, s6, 0
+; GFX942-NEXT:    s_mul_i32 s8, s7, s8
+; GFX942-NEXT:    s_add_u32 s4, s4, s8
+; GFX942-NEXT:    s_addc_u32 s6, 0, s6
+; GFX942-NEXT:    v_add_co_u32_e32 v1, vcc, s4, v1
+; GFX942-NEXT:    s_cmp_lg_u64 vcc, 0
+; GFX942-NEXT:    s_addc_u32 s4, s7, s6
+; GFX942-NEXT:    v_readfirstlane_b32 s7, v1
+; GFX942-NEXT:    s_mul_hi_u32 s6, s4, 24
+; GFX942-NEXT:    s_mul_i32 s4, s4, 24
+; GFX942-NEXT:    s_mul_hi_u32 s7, s7, 24
+; GFX942-NEXT:    s_add_u32 s4, s7, s4
+; GFX942-NEXT:    s_addc_u32 s4, 0, s6
+; GFX942-NEXT:    s_mul_i32 s6, s3, s4
+; GFX942-NEXT:    s_mul_hi_u32 s7, s2, s4
+; GFX942-NEXT:    s_add_i32 s10, s7, s6
+; GFX942-NEXT:    s_mul_i32 s6, s2, s4
+; GFX942-NEXT:    s_sub_i32 s8, 0, s10
+; GFX942-NEXT:    v_sub_co_u32_e64 v1, s[6:7], 24, s6
+; GFX942-NEXT:    s_cmp_lg_u64 s[6:7], 0
+; GFX942-NEXT:    s_subb_u32 s8, s8, s3
+; GFX942-NEXT:    v_subrev_co_u32_e32 v2, vcc, s2, v1
+; GFX942-NEXT:    s_cmp_lg_u64 vcc, 0
+; GFX942-NEXT:    s_subb_u32 s8, s8, 0
+; GFX942-NEXT:    s_cmp_ge_u32 s8, s3
+; GFX942-NEXT:    s_cselect_b32 s9, -1, 0
+; GFX942-NEXT:    v_cmp_le_u32_e32 vcc, s2, v2
+; GFX942-NEXT:    s_cmp_eq_u32 s8, s3
+; GFX942-NEXT:    v_mov_b32_e32 v3, s9
+; GFX942-NEXT:    v_cndmask_b32_e64 v2, 0, -1, vcc
+; GFX942-NEXT:    s_cselect_b64 vcc, -1, 0
+; GFX942-NEXT:    s_or_b64 s[8:9], s[4:5], 0
+; GFX942-NEXT:    s_add_u32 s5, s4, 1
+; GFX942-NEXT:    s_addc_u32 s8, s9, 0
+; GFX942-NEXT:    s_add_u32 s11, s4, 2
+; GFX942-NEXT:    s_addc_u32 s12, s9, 0
+; GFX942-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc
+; GFX942-NEXT:    s_cmp_lg_u64 s[6:7], 0
+; GFX942-NEXT:    v_mov_b32_e32 v3, s5
+; GFX942-NEXT:    v_mov_b32_e32 v4, s11
+; GFX942-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
+; GFX942-NEXT:    s_subb_u32 s5, 0, s10
+; GFX942-NEXT:    s_cmp_ge_u32 s5, s3
+; GFX942-NEXT:    v_cndmask_b32_e32 v2, v3, v4, vcc
+; GFX942-NEXT:    v_mov_b32_e32 v3, s8
+; GFX942-NEXT:    v_mov_b32_e32 v4, s12
+; GFX942-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
+; GFX942-NEXT:    s_cselect_b32 s6, -1, 0
+; GFX942-NEXT:    v_cmp_le_u32_e32 vcc, s2, v1
+; GFX942-NEXT:    s_cmp_eq_u32 s5, s3
+; GFX942-NEXT:    v_mov_b32_e32 v4, s6
+; GFX942-NEXT:    v_cndmask_b32_e64 v1, 0, -1, vcc
+; GFX942-NEXT:    s_cselect_b64 vcc, -1, 0
+; GFX942-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
+; GFX942-NEXT:    v_mov_b32_e32 v4, s9
+; GFX942-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v1
+; GFX942-NEXT:    v_mov_b32_e32 v1, s4
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_cndmask_b32_e32 v3, v4, v3, vcc
+; GFX942-NEXT:    v_cndmask_b32_e32 v2, v1, v2, vcc
+; GFX942-NEXT:    global_store_dwordx2 v0, v[2:3], s[0:1]
+; GFX942-NEXT:    s_endpgm
+;
+; GFX942-IR-LABEL: s_test_udiv_k_num_i64:
+; GFX942-IR:       ; %bb.0: ; %_udiv-special-cases
+; GFX942-IR-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-IR-NEXT:    s_mov_b64 s[4:5], 0
+; GFX942-IR-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-IR-NEXT:    s_cmp_eq_u64 s[2:3], 0
+; GFX942-IR-NEXT:    s_flbit_i32_b64 s12, s[2:3]
+; GFX942-IR-NEXT:    s_cselect_b64 s[8:9], -1, 0
+; GFX942-IR-NEXT:    s_add_u32 s6, s12, 0xffffffc5
+; GFX942-IR-NEXT:    s_addc_u32 s7, 0, -1
+; GFX942-IR-NEXT:    v_cmp_gt_u64_e64 s[10:11], s[6:7], 63
+; GFX942-IR-NEXT:    s_or_b64 s[10:11], s[8:9], s[10:11]
+; GFX942-IR-NEXT:    s_cmp_eq_u64 s[6:7], 63
+; GFX942-IR-NEXT:    s_cselect_b64 s[14:15], -1, 0
+; GFX942-IR-NEXT:    s_and_b64 s[8:9], s[10:11], exec
+; GFX942-IR-NEXT:    s_cselect_b32 s8, 0, 24
+; GFX942-IR-NEXT:    s_or_b64 s[10:11], s[10:11], s[14:15]
+; GFX942-IR-NEXT:    s_andn2_b64 vcc, exec, s[10:11]
+; GFX942-IR-NEXT:    s_mov_b32 s9, 0
+; GFX942-IR-NEXT:    s_cbranch_vccz .LBB8_5
+; GFX942-IR-NEXT:  ; %bb.1: ; %udiv-bb1
+; GFX942-IR-NEXT:    s_add_u32 s8, s6, 1
+; GFX942-IR-NEXT:    s_addc_u32 s9, s7, 0
+; GFX942-IR-NEXT:    s_cmp_eq_u64 s[8:9], 0
+; GFX942-IR-NEXT:    s_cselect_b64 s[10:11], -1, 0
+; GFX942-IR-NEXT:    s_sub_i32 s6, 63, s6
+; GFX942-IR-NEXT:    s_andn2_b64 vcc, exec, s[10:11]
+; GFX942-IR-NEXT:    s_lshl_b64 s[6:7], 24, s6
+; GFX942-IR-NEXT:    s_cbranch_vccz .LBB8_4
+; GFX942-IR-NEXT:  ; %bb.2: ; %udiv-preheader
+; GFX942-IR-NEXT:    s_lshr_b64 s[10:11], 24, s8
+; GFX942-IR-NEXT:    s_add_u32 s14, s2, -1
+; GFX942-IR-NEXT:    s_addc_u32 s15, s3, -1
+; GFX942-IR-NEXT:    s_sub_u32 s8, 58, s12
+; GFX942-IR-NEXT:    s_subb_u32 s9, 0, 0
+; GFX942-IR-NEXT:    s_mov_b64 s[12:13], 0
+; GFX942-IR-NEXT:    s_mov_b32 s5, 0
+; GFX942-IR-NEXT:  .LBB8_3: ; %udiv-do-while
+; GFX942-IR-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-IR-NEXT:    s_lshl_b64 s[10:11], s[10:11], 1
+; GFX942-IR-NEXT:    s_lshr_b32 s4, s7, 31
+; GFX942-IR-NEXT:    s_lshl_b64 s[6:7], s[6:7], 1
+; GFX942-IR-NEXT:    s_or_b64 s[10:11], s[10:11], s[4:5]
+; GFX942-IR-NEXT:    s_or_b64 s[6:7], s[12:13], s[6:7]
+; GFX942-IR-NEXT:    s_sub_u32 s4, s14, s10
+; GFX942-IR-NEXT:    s_subb_u32 s4, s15, s11
+; GFX942-IR-NEXT:    s_ashr_i32 s12, s4, 31
+; GFX942-IR-NEXT:    s_mov_b32 s13, s12
+; GFX942-IR-NEXT:    s_and_b32 s4, s12, 1
+; GFX942-IR-NEXT:    s_and_b64 s[12:13], s[12:13], s[2:3]
+; GFX942-IR-NEXT:    s_sub_u32 s10, s10, s12
+; GFX942-IR-NEXT:    s_subb_u32 s11, s11, s13
+; GFX942-IR-NEXT:    s_add_u32 s8, s8, 1
+; GFX942-IR-NEXT:    s_addc_u32 s9, s9, 0
+; GFX942-IR-NEXT:    s_cmp_eq_u64 s[8:9], 0
+; GFX942-IR-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; GFX942-IR-NEXT:    s_cbranch_scc0 .LBB8_3
+; GFX942-IR-NEXT:  .LBB8_4: ; %Flow6
+; GFX942-IR-NEXT:    s_lshl_b64 s[2:3], s[6:7], 1
+; GFX942-IR-NEXT:    s_or_b64 s[8:9], s[4:5], s[2:3]
+; GFX942-IR-NEXT:  .LBB8_5: ; %udiv-end
+; GFX942-IR-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-IR-NEXT:    v_mov_b64_e32 v[0:1], s[8:9]
+; GFX942-IR-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX942-IR-NEXT:    s_endpgm
   %result = udiv i64 24, %x
   store i64 %result, ptr addrspace(1) %out
   ret void
@@ -1137,6 +2045,178 @@ define i64 @v_test_udiv_pow2_k_num_i64(i64 %x) {
 ; GCN-IR-NEXT:    v_mov_b32_e32 v0, v3
 ; GCN-IR-NEXT:    v_mov_b32_e32 v1, v2
 ; GCN-IR-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: v_test_udiv_pow2_k_num_i64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v2, v0
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v3, v1
+; GFX942-NEXT:    v_sub_co_u32_e32 v11, vcc, 0, v0
+; GFX942-NEXT:    v_fmamk_f32 v2, v3, 0x4f800000, v2
+; GFX942-NEXT:    v_rcp_f32_e32 v2, v2
+; GFX942-NEXT:    v_subb_co_u32_e32 v12, vcc, 0, v1, vcc
+; GFX942-NEXT:    v_mul_f32_e32 v2, 0x5f7ffffc, v2
+; GFX942-NEXT:    v_mul_f32_e32 v3, 0x2f800000, v2
+; GFX942-NEXT:    v_trunc_f32_e32 v3, v3
+; GFX942-NEXT:    v_fmamk_f32 v2, v3, 0xcf800000, v2
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v10, v2
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v13, v3
+; GFX942-NEXT:    v_mul_lo_u32 v2, v12, v10
+; GFX942-NEXT:    v_mad_u64_u32 v[4:5], s[0:1], v11, v10, 0
+; GFX942-NEXT:    v_mul_lo_u32 v3, v11, v13
+; GFX942-NEXT:    v_add3_u32 v5, v5, v3, v2
+; GFX942-NEXT:    v_mad_u64_u32 v[6:7], s[0:1], v10, v5, 0
+; GFX942-NEXT:    v_mul_hi_u32 v2, v10, v4
+; GFX942-NEXT:    v_mov_b32_e32 v3, 0
+; GFX942-NEXT:    v_lshl_add_u64 v[6:7], v[2:3], 0, v[6:7]
+; GFX942-NEXT:    v_mad_u64_u32 v[8:9], s[0:1], v13, v5, 0
+; GFX942-NEXT:    v_mad_u64_u32 v[4:5], s[0:1], v13, v4, 0
+; GFX942-NEXT:    v_add_co_u32_e32 v2, vcc, v6, v4
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_addc_co_u32_e32 v2, vcc, v7, v5, vcc
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_addc_co_u32_e32 v9, vcc, 0, v9, vcc
+; GFX942-NEXT:    v_lshl_add_u64 v[4:5], v[2:3], 0, v[8:9]
+; GFX942-NEXT:    v_add_co_u32_e32 v14, vcc, v10, v4
+; GFX942-NEXT:    v_mul_lo_u32 v6, v12, v14
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_addc_co_u32_e32 v13, vcc, v13, v5, vcc
+; GFX942-NEXT:    v_mul_lo_u32 v2, v11, v13
+; GFX942-NEXT:    v_mad_u64_u32 v[4:5], s[0:1], v11, v14, 0
+; GFX942-NEXT:    v_add3_u32 v2, v5, v2, v6
+; GFX942-NEXT:    v_mad_u64_u32 v[6:7], s[0:1], v13, v2, 0
+; GFX942-NEXT:    v_mad_u64_u32 v[10:11], s[0:1], v14, v2, 0
+; GFX942-NEXT:    v_mul_hi_u32 v2, v14, v4
+; GFX942-NEXT:    v_mad_u64_u32 v[8:9], s[0:1], v13, v4, 0
+; GFX942-NEXT:    v_lshl_add_u64 v[4:5], v[2:3], 0, v[10:11]
+; GFX942-NEXT:    v_add_co_u32_e32 v2, vcc, v4, v8
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_addc_co_u32_e32 v2, vcc, v5, v9, vcc
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v7, vcc
+; GFX942-NEXT:    v_lshl_add_u64 v[4:5], v[2:3], 0, v[6:7]
+; GFX942-NEXT:    v_add_co_u32_e32 v2, vcc, v14, v4
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_addc_co_u32_e32 v2, vcc, v13, v5, vcc
+; GFX942-NEXT:    v_lshrrev_b32_e32 v2, 17, v2
+; GFX942-NEXT:    v_mad_u64_u32 v[4:5], s[0:1], v0, v2, 0
+; GFX942-NEXT:    v_mov_b32_e32 v6, v5
+; GFX942-NEXT:    v_mad_u64_u32 v[6:7], s[0:1], v1, v2, v[6:7]
+; GFX942-NEXT:    v_sub_u32_e32 v5, 0, v6
+; GFX942-NEXT:    v_sub_co_u32_e32 v7, vcc, 0x8000, v4
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_subb_co_u32_e64 v4, s[0:1], v5, v1, vcc
+; GFX942-NEXT:    v_sub_co_u32_e64 v5, s[0:1], v7, v0
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_subbrev_co_u32_e64 v4, s[0:1], 0, v4, s[0:1]
+; GFX942-NEXT:    v_cmp_ge_u32_e64 s[0:1], v4, v1
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[0:1]
+; GFX942-NEXT:    v_cmp_ge_u32_e64 s[0:1], v5, v0
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e64 v5, 0, -1, s[0:1]
+; GFX942-NEXT:    v_cmp_eq_u32_e64 s[0:1], v4, v1
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e64 v10, v8, v5, s[0:1]
+; GFX942-NEXT:    v_lshl_add_u64 v[4:5], v[2:3], 0, 2
+; GFX942-NEXT:    v_lshl_add_u64 v[8:9], v[2:3], 0, 1
+; GFX942-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v10
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e64 v3, v8, v4, s[0:1]
+; GFX942-NEXT:    v_subb_co_u32_e32 v4, vcc, 0, v6, vcc
+; GFX942-NEXT:    v_cmp_ge_u32_e32 vcc, v4, v1
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e64 v6, 0, -1, vcc
+; GFX942-NEXT:    v_cmp_ge_u32_e32 vcc, v7, v0
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v1
+; GFX942-NEXT:    v_cndmask_b32_e64 v1, v9, v5, s[0:1]
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, v6, v0, vcc
+; GFX942-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, v2, v3, vcc
+; GFX942-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-IR-LABEL: v_test_udiv_pow2_k_num_i64:
+; GFX942-IR:       ; %bb.0: ; %_udiv-special-cases
+; GFX942-IR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-IR-NEXT:    v_ffbh_u32_e32 v2, v0
+; GFX942-IR-NEXT:    v_add_u32_e32 v2, 32, v2
+; GFX942-IR-NEXT:    v_ffbh_u32_e32 v3, v1
+; GFX942-IR-NEXT:    s_movk_i32 s0, 0xffd0
+; GFX942-IR-NEXT:    v_min_u32_e32 v8, v2, v3
+; GFX942-IR-NEXT:    v_mov_b32_e32 v9, 0
+; GFX942-IR-NEXT:    s_mov_b32 s1, -1
+; GFX942-IR-NEXT:    v_lshl_add_u64 v[6:7], v[8:9], 0, s[0:1]
+; GFX942-IR-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[0:1]
+; GFX942-IR-NEXT:    v_cmp_lt_u64_e64 s[0:1], 63, v[6:7]
+; GFX942-IR-NEXT:    v_mov_b32_e32 v2, 0x8000
+; GFX942-IR-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-IR-NEXT:    v_cndmask_b32_e64 v2, v2, 0, s[0:1]
+; GFX942-IR-NEXT:    s_xor_b64 s[0:1], s[0:1], -1
+; GFX942-IR-NEXT:    v_cmp_ne_u64_e32 vcc, 63, v[6:7]
+; GFX942-IR-NEXT:    v_mov_b32_e32 v3, v9
+; GFX942-IR-NEXT:    s_and_b64 s[2:3], s[0:1], vcc
+; GFX942-IR-NEXT:    s_and_saveexec_b64 s[0:1], s[2:3]
+; GFX942-IR-NEXT:    s_cbranch_execz .LBB9_6
+; GFX942-IR-NEXT:  ; %bb.1: ; %udiv-bb1
+; GFX942-IR-NEXT:    v_lshl_add_u64 v[10:11], v[6:7], 0, 1
+; GFX942-IR-NEXT:    v_sub_u32_e32 v2, 63, v6
+; GFX942-IR-NEXT:    s_mov_b64 s[4:5], 0x8000
+; GFX942-IR-NEXT:    v_mov_b64_e32 v[4:5], 0
+; GFX942-IR-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[10:11]
+; GFX942-IR-NEXT:    v_lshlrev_b64 v[2:3], v2, s[4:5]
+; GFX942-IR-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX942-IR-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
+; GFX942-IR-NEXT:    s_cbranch_execz .LBB9_5
+; GFX942-IR-NEXT:  ; %bb.2: ; %udiv-preheader
+; GFX942-IR-NEXT:    v_sub_co_u32_e32 v8, vcc, 47, v8
+; GFX942-IR-NEXT:    v_lshrrev_b64 v[10:11], v10, s[4:5]
+; GFX942-IR-NEXT:    s_nop 0
+; GFX942-IR-NEXT:    v_subb_co_u32_e64 v9, s[4:5], 0, 0, vcc
+; GFX942-IR-NEXT:    v_lshl_add_u64 v[6:7], v[0:1], 0, -1
+; GFX942-IR-NEXT:    s_mov_b64 s[4:5], 0
+; GFX942-IR-NEXT:    v_mov_b64_e32 v[12:13], 0
+; GFX942-IR-NEXT:    v_mov_b32_e32 v5, 0
+; GFX942-IR-NEXT:  .LBB9_3: ; %udiv-do-while
+; GFX942-IR-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-IR-NEXT:    v_lshlrev_b64 v[10:11], 1, v[10:11]
+; GFX942-IR-NEXT:    v_lshrrev_b32_e32 v4, 31, v3
+; GFX942-IR-NEXT:    v_or_b32_e32 v10, v10, v4
+; GFX942-IR-NEXT:    v_sub_co_u32_e32 v4, vcc, v6, v10
+; GFX942-IR-NEXT:    v_lshlrev_b64 v[2:3], 1, v[2:3]
+; GFX942-IR-NEXT:    s_nop 0
+; GFX942-IR-NEXT:    v_subb_co_u32_e32 v4, vcc, v7, v11, vcc
+; GFX942-IR-NEXT:    v_or_b32_e32 v2, v12, v2
+; GFX942-IR-NEXT:    v_ashrrev_i32_e32 v12, 31, v4
+; GFX942-IR-NEXT:    v_or_b32_e32 v3, v13, v3
+; GFX942-IR-NEXT:    v_and_b32_e32 v4, 1, v12
+; GFX942-IR-NEXT:    v_and_b32_e32 v13, v12, v1
+; GFX942-IR-NEXT:    v_and_b32_e32 v12, v12, v0
+; GFX942-IR-NEXT:    v_sub_co_u32_e32 v10, vcc, v10, v12
+; GFX942-IR-NEXT:    v_lshl_add_u64 v[8:9], v[8:9], 0, 1
+; GFX942-IR-NEXT:    s_nop 0
+; GFX942-IR-NEXT:    v_subb_co_u32_e32 v11, vcc, v11, v13, vcc
+; GFX942-IR-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[8:9]
+; GFX942-IR-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX942-IR-NEXT:    v_mov_b64_e32 v[12:13], v[4:5]
+; GFX942-IR-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX942-IR-NEXT:    s_cbranch_execnz .LBB9_3
+; GFX942-IR-NEXT:  ; %bb.4: ; %Flow
+; GFX942-IR-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX942-IR-NEXT:  .LBB9_5: ; %Flow4
+; GFX942-IR-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX942-IR-NEXT:    v_lshlrev_b64 v[0:1], 1, v[2:3]
+; GFX942-IR-NEXT:    v_or_b32_e32 v3, v5, v1
+; GFX942-IR-NEXT:    v_or_b32_e32 v2, v4, v0
+; GFX942-IR-NEXT:  .LBB9_6: ; %Flow5
+; GFX942-IR-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX942-IR-NEXT:    v_mov_b32_e32 v0, v2
+; GFX942-IR-NEXT:    v_mov_b32_e32 v1, v3
+; GFX942-IR-NEXT:    s_setpc_b64 s[30:31]
   %result = udiv i64 32768, %x
   ret i64 %result
 }
@@ -1223,6 +2303,88 @@ define i64 @v_test_udiv_pow2_k_den_i64(i64 %x) {
 ; GCN-IR-NEXT:    v_mov_b32_e32 v0, v3
 ; GCN-IR-NEXT:    v_mov_b32_e32 v1, v2
 ; GCN-IR-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: v_test_udiv_pow2_k_den_i64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_alignbit_b32 v0, v1, v0, 15
+; GFX942-NEXT:    v_lshrrev_b32_e32 v1, 15, v1
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-IR-LABEL: v_test_udiv_pow2_k_den_i64:
+; GFX942-IR:       ; %bb.0: ; %_udiv-special-cases
+; GFX942-IR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-IR-NEXT:    v_ffbh_u32_e32 v2, v0
+; GFX942-IR-NEXT:    v_add_u32_e32 v2, 32, v2
+; GFX942-IR-NEXT:    v_ffbh_u32_e32 v3, v1
+; GFX942-IR-NEXT:    v_min_u32_e32 v6, v2, v3
+; GFX942-IR-NEXT:    v_sub_co_u32_e64 v8, s[0:1], 48, v6
+; GFX942-IR-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[0:1]
+; GFX942-IR-NEXT:    s_nop 0
+; GFX942-IR-NEXT:    v_subb_co_u32_e64 v9, s[0:1], 0, 0, s[0:1]
+; GFX942-IR-NEXT:    v_cmp_lt_u64_e64 s[0:1], 63, v[8:9]
+; GFX942-IR-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-IR-NEXT:    s_xor_b64 s[2:3], s[0:1], -1
+; GFX942-IR-NEXT:    v_cmp_ne_u64_e32 vcc, 63, v[8:9]
+; GFX942-IR-NEXT:    v_mov_b32_e32 v7, 0
+; GFX942-IR-NEXT:    v_cndmask_b32_e64 v3, v1, 0, s[0:1]
+; GFX942-IR-NEXT:    v_cndmask_b32_e64 v2, v0, 0, s[0:1]
+; GFX942-IR-NEXT:    s_and_b64 s[2:3], s[2:3], vcc
+; GFX942-IR-NEXT:    s_and_saveexec_b64 s[0:1], s[2:3]
+; GFX942-IR-NEXT:    s_cbranch_execz .LBB10_6
+; GFX942-IR-NEXT:  ; %bb.1: ; %udiv-bb1
+; GFX942-IR-NEXT:    v_lshl_add_u64 v[10:11], v[8:9], 0, 1
+; GFX942-IR-NEXT:    v_sub_u32_e32 v2, 63, v8
+; GFX942-IR-NEXT:    v_mov_b64_e32 v[4:5], 0
+; GFX942-IR-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[10:11]
+; GFX942-IR-NEXT:    v_lshlrev_b64 v[2:3], v2, v[0:1]
+; GFX942-IR-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX942-IR-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
+; GFX942-IR-NEXT:    s_cbranch_execz .LBB10_5
+; GFX942-IR-NEXT:  ; %bb.2: ; %udiv-preheader
+; GFX942-IR-NEXT:    s_movk_i32 s4, 0xffcf
+; GFX942-IR-NEXT:    s_mov_b32 s5, -1
+; GFX942-IR-NEXT:    v_lshrrev_b64 v[8:9], v10, v[0:1]
+; GFX942-IR-NEXT:    v_lshl_add_u64 v[0:1], v[6:7], 0, s[4:5]
+; GFX942-IR-NEXT:    s_mov_b64 s[4:5], 0
+; GFX942-IR-NEXT:    v_mov_b64_e32 v[6:7], 0
+; GFX942-IR-NEXT:    s_movk_i32 s6, 0x7fff
+; GFX942-IR-NEXT:    v_mov_b32_e32 v5, 0
+; GFX942-IR-NEXT:  .LBB10_3: ; %udiv-do-while
+; GFX942-IR-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-IR-NEXT:    v_lshlrev_b64 v[8:9], 1, v[8:9]
+; GFX942-IR-NEXT:    v_lshrrev_b32_e32 v4, 31, v3
+; GFX942-IR-NEXT:    v_or_b32_e32 v8, v8, v4
+; GFX942-IR-NEXT:    v_sub_co_u32_e32 v4, vcc, s6, v8
+; GFX942-IR-NEXT:    v_lshlrev_b64 v[2:3], 1, v[2:3]
+; GFX942-IR-NEXT:    s_nop 0
+; GFX942-IR-NEXT:    v_subb_co_u32_e32 v4, vcc, 0, v9, vcc
+; GFX942-IR-NEXT:    v_or_b32_e32 v2, v6, v2
+; GFX942-IR-NEXT:    v_ashrrev_i32_e32 v6, 31, v4
+; GFX942-IR-NEXT:    v_and_b32_e32 v4, 1, v6
+; GFX942-IR-NEXT:    v_and_b32_e32 v6, 0x8000, v6
+; GFX942-IR-NEXT:    v_sub_co_u32_e32 v8, vcc, v8, v6
+; GFX942-IR-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, 1
+; GFX942-IR-NEXT:    s_nop 0
+; GFX942-IR-NEXT:    v_subbrev_co_u32_e32 v9, vcc, 0, v9, vcc
+; GFX942-IR-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[0:1]
+; GFX942-IR-NEXT:    v_or_b32_e32 v3, v7, v3
+; GFX942-IR-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX942-IR-NEXT:    v_mov_b64_e32 v[6:7], v[4:5]
+; GFX942-IR-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX942-IR-NEXT:    s_cbranch_execnz .LBB10_3
+; GFX942-IR-NEXT:  ; %bb.4: ; %Flow
+; GFX942-IR-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX942-IR-NEXT:  .LBB10_5: ; %Flow4
+; GFX942-IR-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX942-IR-NEXT:    v_lshlrev_b64 v[0:1], 1, v[2:3]
+; GFX942-IR-NEXT:    v_or_b32_e32 v3, v5, v1
+; GFX942-IR-NEXT:    v_or_b32_e32 v2, v4, v0
+; GFX942-IR-NEXT:  .LBB10_6: ; %Flow5
+; GFX942-IR-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX942-IR-NEXT:    v_mov_b32_e32 v0, v2
+; GFX942-IR-NEXT:    v_mov_b32_e32 v1, v3
+; GFX942-IR-NEXT:    s_setpc_b64 s[30:31]
   %result = udiv i64 %x, 32768
   ret i64 %result
 }
@@ -1319,6 +2481,94 @@ define amdgpu_kernel void @s_test_udiv_k_den_i64(ptr addrspace(1) %out, i64 %x)
 ; GCN-IR-NEXT:    v_mov_b32_e32 v1, s7
 ; GCN-IR-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GCN-IR-NEXT:    s_endpgm
+;
+; GFX942-LABEL: s_test_udiv_k_den_i64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_mul_hi_u32 s4, s2, 0xaaaaaaaa
+; GFX942-NEXT:    s_mul_i32 s5, s2, 0xaaaaaaaa
+; GFX942-NEXT:    s_mul_i32 s7, s3, 0xaaaaaaab
+; GFX942-NEXT:    s_mul_hi_u32 s2, s2, 0xaaaaaaab
+; GFX942-NEXT:    s_mul_hi_u32 s6, s3, 0xaaaaaaab
+; GFX942-NEXT:    s_add_u32 s2, s7, s2
+; GFX942-NEXT:    s_addc_u32 s6, s6, 0
+; GFX942-NEXT:    s_add_u32 s2, s5, s2
+; GFX942-NEXT:    s_addc_u32 s2, s4, 0
+; GFX942-NEXT:    s_add_u32 s2, s6, s2
+; GFX942-NEXT:    s_addc_u32 s4, 0, 0
+; GFX942-NEXT:    s_mul_hi_u32 s5, s3, 0xaaaaaaaa
+; GFX942-NEXT:    s_mul_i32 s3, s3, 0xaaaaaaaa
+; GFX942-NEXT:    s_add_u32 s2, s3, s2
+; GFX942-NEXT:    s_addc_u32 s3, s5, s4
+; GFX942-NEXT:    s_lshr_b64 s[2:3], s[2:3], 4
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX942-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX942-NEXT:    s_endpgm
+;
+; GFX942-IR-LABEL: s_test_udiv_k_den_i64:
+; GFX942-IR:       ; %bb.0: ; %_udiv-special-cases
+; GFX942-IR-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-IR-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-IR-NEXT:    s_cmp_eq_u64 s[2:3], 0
+; GFX942-IR-NEXT:    s_flbit_i32_b64 s10, s[2:3]
+; GFX942-IR-NEXT:    s_cselect_b64 s[4:5], -1, 0
+; GFX942-IR-NEXT:    s_sub_u32 s6, 59, s10
+; GFX942-IR-NEXT:    s_subb_u32 s7, 0, 0
+; GFX942-IR-NEXT:    v_cmp_gt_u64_e64 s[8:9], s[6:7], 63
+; GFX942-IR-NEXT:    s_or_b64 s[4:5], s[4:5], s[8:9]
+; GFX942-IR-NEXT:    s_cmp_eq_u64 s[6:7], 63
+; GFX942-IR-NEXT:    s_cselect_b64 s[12:13], -1, 0
+; GFX942-IR-NEXT:    s_and_b64 s[8:9], s[4:5], exec
+; GFX942-IR-NEXT:    s_cselect_b32 s9, 0, s3
+; GFX942-IR-NEXT:    s_cselect_b32 s8, 0, s2
+; GFX942-IR-NEXT:    s_or_b64 s[4:5], s[4:5], s[12:13]
+; GFX942-IR-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
+; GFX942-IR-NEXT:    s_mov_b64 s[4:5], 0
+; GFX942-IR-NEXT:    s_cbranch_vccz .LBB11_5
+; GFX942-IR-NEXT:  ; %bb.1: ; %udiv-bb1
+; GFX942-IR-NEXT:    s_add_u32 s8, s6, 1
+; GFX942-IR-NEXT:    s_addc_u32 s9, s7, 0
+; GFX942-IR-NEXT:    s_cmp_eq_u64 s[8:9], 0
+; GFX942-IR-NEXT:    s_cselect_b64 s[12:13], -1, 0
+; GFX942-IR-NEXT:    s_sub_i32 s6, 63, s6
+; GFX942-IR-NEXT:    s_andn2_b64 vcc, exec, s[12:13]
+; GFX942-IR-NEXT:    s_lshl_b64 s[6:7], s[2:3], s6
+; GFX942-IR-NEXT:    s_cbranch_vccz .LBB11_4
+; GFX942-IR-NEXT:  ; %bb.2: ; %udiv-preheader
+; GFX942-IR-NEXT:    s_lshr_b64 s[8:9], s[2:3], s8
+; GFX942-IR-NEXT:    s_add_u32 s2, s10, 0xffffffc4
+; GFX942-IR-NEXT:    s_addc_u32 s3, 0, -1
+; GFX942-IR-NEXT:    s_mov_b64 s[10:11], 0
+; GFX942-IR-NEXT:    s_mov_b32 s5, 0
+; GFX942-IR-NEXT:  .LBB11_3: ; %udiv-do-while
+; GFX942-IR-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-IR-NEXT:    s_lshl_b64 s[8:9], s[8:9], 1
+; GFX942-IR-NEXT:    s_lshr_b32 s4, s7, 31
+; GFX942-IR-NEXT:    s_lshl_b64 s[6:7], s[6:7], 1
+; GFX942-IR-NEXT:    s_or_b64 s[8:9], s[8:9], s[4:5]
+; GFX942-IR-NEXT:    s_or_b64 s[6:7], s[10:11], s[6:7]
+; GFX942-IR-NEXT:    s_sub_u32 s4, 23, s8
+; GFX942-IR-NEXT:    s_subb_u32 s4, 0, s9
+; GFX942-IR-NEXT:    s_ashr_i32 s10, s4, 31
+; GFX942-IR-NEXT:    s_and_b32 s4, s10, 1
+; GFX942-IR-NEXT:    s_and_b32 s10, s10, 24
+; GFX942-IR-NEXT:    s_sub_u32 s8, s8, s10
+; GFX942-IR-NEXT:    s_subb_u32 s9, s9, 0
+; GFX942-IR-NEXT:    s_add_u32 s2, s2, 1
+; GFX942-IR-NEXT:    s_addc_u32 s3, s3, 0
+; GFX942-IR-NEXT:    s_cmp_eq_u64 s[2:3], 0
+; GFX942-IR-NEXT:    s_mov_b64 s[10:11], s[4:5]
+; GFX942-IR-NEXT:    s_cbranch_scc0 .LBB11_3
+; GFX942-IR-NEXT:  .LBB11_4: ; %Flow6
+; GFX942-IR-NEXT:    s_lshl_b64 s[2:3], s[6:7], 1
+; GFX942-IR-NEXT:    s_or_b64 s[8:9], s[4:5], s[2:3]
+; GFX942-IR-NEXT:  .LBB11_5: ; %udiv-end
+; GFX942-IR-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-IR-NEXT:    v_mov_b64_e32 v[0:1], s[8:9]
+; GFX942-IR-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX942-IR-NEXT:    s_endpgm
   %result = udiv i64 %x, 24
   store i64 %result, ptr addrspace(1) %out
   ret void
@@ -1422,6 +2672,99 @@ define i64 @v_test_udiv_k_den_i64(i64 %x) {
 ; GCN-IR-NEXT:    v_mov_b32_e32 v0, v3
 ; GCN-IR-NEXT:    v_mov_b32_e32 v1, v2
 ; GCN-IR-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: v_test_udiv_k_den_i64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    s_mov_b32 s0, 0xaaaaaaab
+; GFX942-NEXT:    v_mul_hi_u32 v2, v0, s0
+; GFX942-NEXT:    v_mov_b32_e32 v3, 0
+; GFX942-NEXT:    v_mad_u64_u32 v[4:5], s[0:1], v1, s0, v[2:3]
+; GFX942-NEXT:    v_mov_b32_e32 v2, v5
+; GFX942-NEXT:    v_mov_b32_e32 v5, v3
+; GFX942-NEXT:    s_mov_b32 s2, 0xaaaaaaaa
+; GFX942-NEXT:    v_mad_u64_u32 v[4:5], s[0:1], v0, s2, v[4:5]
+; GFX942-NEXT:    v_mov_b32_e32 v4, v5
+; GFX942-NEXT:    v_mov_b32_e32 v5, v3
+; GFX942-NEXT:    v_lshl_add_u64 v[2:3], v[2:3], 0, v[4:5]
+; GFX942-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], v1, s2, v[2:3]
+; GFX942-NEXT:    v_alignbit_b32 v0, v1, v0, 4
+; GFX942-NEXT:    v_lshrrev_b32_e32 v1, 4, v1
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-IR-LABEL: v_test_udiv_k_den_i64:
+; GFX942-IR:       ; %bb.0: ; %_udiv-special-cases
+; GFX942-IR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-IR-NEXT:    v_ffbh_u32_e32 v2, v0
+; GFX942-IR-NEXT:    v_add_u32_e32 v2, 32, v2
+; GFX942-IR-NEXT:    v_ffbh_u32_e32 v3, v1
+; GFX942-IR-NEXT:    v_min_u32_e32 v6, v2, v3
+; GFX942-IR-NEXT:    v_sub_co_u32_e64 v8, s[0:1], 59, v6
+; GFX942-IR-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[0:1]
+; GFX942-IR-NEXT:    s_nop 0
+; GFX942-IR-NEXT:    v_subb_co_u32_e64 v9, s[0:1], 0, 0, s[0:1]
+; GFX942-IR-NEXT:    v_cmp_lt_u64_e64 s[0:1], 63, v[8:9]
+; GFX942-IR-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-IR-NEXT:    s_xor_b64 s[2:3], s[0:1], -1
+; GFX942-IR-NEXT:    v_cmp_ne_u64_e32 vcc, 63, v[8:9]
+; GFX942-IR-NEXT:    v_mov_b32_e32 v7, 0
+; GFX942-IR-NEXT:    v_cndmask_b32_e64 v3, v1, 0, s[0:1]
+; GFX942-IR-NEXT:    v_cndmask_b32_e64 v2, v0, 0, s[0:1]
+; GFX942-IR-NEXT:    s_and_b64 s[2:3], s[2:3], vcc
+; GFX942-IR-NEXT:    s_and_saveexec_b64 s[0:1], s[2:3]
+; GFX942-IR-NEXT:    s_cbranch_execz .LBB12_6
+; GFX942-IR-NEXT:  ; %bb.1: ; %udiv-bb1
+; GFX942-IR-NEXT:    v_lshl_add_u64 v[10:11], v[8:9], 0, 1
+; GFX942-IR-NEXT:    v_sub_u32_e32 v2, 63, v8
+; GFX942-IR-NEXT:    v_mov_b64_e32 v[4:5], 0
+; GFX942-IR-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[10:11]
+; GFX942-IR-NEXT:    v_lshlrev_b64 v[2:3], v2, v[0:1]
+; GFX942-IR-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX942-IR-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
+; GFX942-IR-NEXT:    s_cbranch_execz .LBB12_5
+; GFX942-IR-NEXT:  ; %bb.2: ; %udiv-preheader
+; GFX942-IR-NEXT:    s_movk_i32 s4, 0xffc4
+; GFX942-IR-NEXT:    s_mov_b32 s5, -1
+; GFX942-IR-NEXT:    v_lshrrev_b64 v[8:9], v10, v[0:1]
+; GFX942-IR-NEXT:    v_lshl_add_u64 v[0:1], v[6:7], 0, s[4:5]
+; GFX942-IR-NEXT:    s_mov_b64 s[4:5], 0
+; GFX942-IR-NEXT:    v_mov_b64_e32 v[6:7], 0
+; GFX942-IR-NEXT:    v_mov_b32_e32 v5, 0
+; GFX942-IR-NEXT:  .LBB12_3: ; %udiv-do-while
+; GFX942-IR-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-IR-NEXT:    v_lshlrev_b64 v[8:9], 1, v[8:9]
+; GFX942-IR-NEXT:    v_lshrrev_b32_e32 v4, 31, v3
+; GFX942-IR-NEXT:    v_or_b32_e32 v8, v8, v4
+; GFX942-IR-NEXT:    v_sub_co_u32_e32 v4, vcc, 23, v8
+; GFX942-IR-NEXT:    v_lshlrev_b64 v[2:3], 1, v[2:3]
+; GFX942-IR-NEXT:    s_nop 0
+; GFX942-IR-NEXT:    v_subb_co_u32_e32 v4, vcc, 0, v9, vcc
+; GFX942-IR-NEXT:    v_or_b32_e32 v2, v6, v2
+; GFX942-IR-NEXT:    v_ashrrev_i32_e32 v6, 31, v4
+; GFX942-IR-NEXT:    v_and_b32_e32 v4, 1, v6
+; GFX942-IR-NEXT:    v_and_b32_e32 v6, 24, v6
+; GFX942-IR-NEXT:    v_sub_co_u32_e32 v8, vcc, v8, v6
+; GFX942-IR-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 0, 1
+; GFX942-IR-NEXT:    s_nop 0
+; GFX942-IR-NEXT:    v_subbrev_co_u32_e32 v9, vcc, 0, v9, vcc
+; GFX942-IR-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[0:1]
+; GFX942-IR-NEXT:    v_or_b32_e32 v3, v7, v3
+; GFX942-IR-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX942-IR-NEXT:    v_mov_b64_e32 v[6:7], v[4:5]
+; GFX942-IR-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX942-IR-NEXT:    s_cbranch_execnz .LBB12_3
+; GFX942-IR-NEXT:  ; %bb.4: ; %Flow
+; GFX942-IR-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX942-IR-NEXT:  .LBB12_5: ; %Flow4
+; GFX942-IR-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX942-IR-NEXT:    v_lshlrev_b64 v[0:1], 1, v[2:3]
+; GFX942-IR-NEXT:    v_or_b32_e32 v3, v5, v1
+; GFX942-IR-NEXT:    v_or_b32_e32 v2, v4, v0
+; GFX942-IR-NEXT:  .LBB12_6: ; %Flow5
+; GFX942-IR-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX942-IR-NEXT:    v_mov_b32_e32 v0, v2
+; GFX942-IR-NEXT:    v_mov_b32_e32 v1, v3
+; GFX942-IR-NEXT:    s_setpc_b64 s[30:31]
   %result = udiv i64 %x, 24
   ret i64 %result
 }
@@ -1468,6 +2811,48 @@ define amdgpu_kernel void @s_test_udiv24_k_num_i64(ptr addrspace(1) %out, i64 %x
 ; GCN-IR-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
 ; GCN-IR-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GCN-IR-NEXT:    s_endpgm
+;
+; GFX942-LABEL: s_test_udiv24_k_num_i64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_lshr_b32 s2, s3, 8
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v0, s2
+; GFX942-NEXT:    s_mov_b32 s2, 0x41c00000
+; GFX942-NEXT:    v_rcp_iflag_f32_e32 v2, v0
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_mul_f32_e32 v2, 0x41c00000, v2
+; GFX942-NEXT:    v_trunc_f32_e32 v2, v2
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v3, v2
+; GFX942-NEXT:    v_fma_f32 v2, -v2, v0, s2
+; GFX942-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v0
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v3, vcc
+; GFX942-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
+; GFX942-NEXT:    global_store_dwordx2 v1, v[0:1], s[0:1]
+; GFX942-NEXT:    s_endpgm
+;
+; GFX942-IR-LABEL: s_test_udiv24_k_num_i64:
+; GFX942-IR:       ; %bb.0:
+; GFX942-IR-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-IR-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-IR-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-IR-NEXT:    s_lshr_b32 s2, s3, 8
+; GFX942-IR-NEXT:    v_cvt_f32_u32_e32 v0, s2
+; GFX942-IR-NEXT:    s_mov_b32 s2, 0x41c00000
+; GFX942-IR-NEXT:    v_rcp_iflag_f32_e32 v2, v0
+; GFX942-IR-NEXT:    s_nop 0
+; GFX942-IR-NEXT:    v_mul_f32_e32 v2, 0x41c00000, v2
+; GFX942-IR-NEXT:    v_trunc_f32_e32 v2, v2
+; GFX942-IR-NEXT:    v_cvt_u32_f32_e32 v3, v2
+; GFX942-IR-NEXT:    v_fma_f32 v2, -v2, v0, s2
+; GFX942-IR-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v0
+; GFX942-IR-NEXT:    s_nop 1
+; GFX942-IR-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v3, vcc
+; GFX942-IR-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
+; GFX942-IR-NEXT:    global_store_dwordx2 v1, v[0:1], s[0:1]
+; GFX942-IR-NEXT:    s_endpgm
   %x.shr = lshr i64 %x, 40
   %result = udiv i64 24, %x.shr
   store i64 %result, ptr addrspace(1) %out
@@ -1518,6 +2903,44 @@ define amdgpu_kernel void @s_test_udiv24_k_den_i64(ptr addrspace(1) %out, i64 %x
 ; GCN-IR-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
 ; GCN-IR-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GCN-IR-NEXT:    s_endpgm
+;
+; GFX942-LABEL: s_test_udiv24_k_den_i64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_lshr_b32 s2, s3, 8
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v0, s2
+; GFX942-NEXT:    s_mov_b32 s2, 0x46b6fe00
+; GFX942-NEXT:    v_mul_f32_e32 v2, 0x38331158, v0
+; GFX942-NEXT:    v_trunc_f32_e32 v2, v2
+; GFX942-NEXT:    v_fmamk_f32 v0, v2, 0xc6b6fe00, v0
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; GFX942-NEXT:    v_cmp_ge_f32_e64 vcc, |v0|, s2
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v2, vcc
+; GFX942-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
+; GFX942-NEXT:    global_store_dwordx2 v1, v[0:1], s[0:1]
+; GFX942-NEXT:    s_endpgm
+;
+; GFX942-IR-LABEL: s_test_udiv24_k_den_i64:
+; GFX942-IR:       ; %bb.0:
+; GFX942-IR-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-IR-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-IR-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-IR-NEXT:    s_lshr_b32 s2, s3, 8
+; GFX942-IR-NEXT:    v_cvt_f32_u32_e32 v0, s2
+; GFX942-IR-NEXT:    s_mov_b32 s2, 0x46b6fe00
+; GFX942-IR-NEXT:    v_mul_f32_e32 v2, 0x38331158, v0
+; GFX942-IR-NEXT:    v_trunc_f32_e32 v2, v2
+; GFX942-IR-NEXT:    v_fmamk_f32 v0, v2, 0xc6b6fe00, v0
+; GFX942-IR-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; GFX942-IR-NEXT:    v_cmp_ge_f32_e64 vcc, |v0|, s2
+; GFX942-IR-NEXT:    s_nop 1
+; GFX942-IR-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v2, vcc
+; GFX942-IR-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
+; GFX942-IR-NEXT:    global_store_dwordx2 v1, v[0:1], s[0:1]
+; GFX942-IR-NEXT:    s_endpgm
   %x.shr = lshr i64 %x, 40
   %result = udiv i64 %x.shr, 23423
   store i64 %result, ptr addrspace(1) %out
@@ -1558,6 +2981,44 @@ define i64 @v_test_udiv24_k_num_i64(i64 %x) {
 ; GCN-IR-NEXT:    v_addc_u32_e32 v0, vcc, 0, v2, vcc
 ; GCN-IR-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
 ; GCN-IR-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: v_test_udiv24_k_num_i64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_lshrrev_b32_e32 v0, 8, v1
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v0, v0
+; GFX942-NEXT:    s_mov_b32 s0, 0x41c00000
+; GFX942-NEXT:    v_rcp_iflag_f32_e32 v1, v0
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_mul_f32_e32 v1, 0x41c00000, v1
+; GFX942-NEXT:    v_trunc_f32_e32 v1, v1
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v2, v1
+; GFX942-NEXT:    v_fma_f32 v1, -v1, v0, s0
+; GFX942-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
+; GFX942-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v2, vcc
+; GFX942-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-IR-LABEL: v_test_udiv24_k_num_i64:
+; GFX942-IR:       ; %bb.0:
+; GFX942-IR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-IR-NEXT:    v_lshrrev_b32_e32 v0, 8, v1
+; GFX942-IR-NEXT:    v_cvt_f32_u32_e32 v0, v0
+; GFX942-IR-NEXT:    s_mov_b32 s0, 0x41c00000
+; GFX942-IR-NEXT:    v_rcp_iflag_f32_e32 v1, v0
+; GFX942-IR-NEXT:    s_nop 0
+; GFX942-IR-NEXT:    v_mul_f32_e32 v1, 0x41c00000, v1
+; GFX942-IR-NEXT:    v_trunc_f32_e32 v1, v1
+; GFX942-IR-NEXT:    v_cvt_u32_f32_e32 v2, v1
+; GFX942-IR-NEXT:    v_fma_f32 v1, -v1, v0, s0
+; GFX942-IR-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
+; GFX942-IR-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-IR-NEXT:    s_nop 0
+; GFX942-IR-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v2, vcc
+; GFX942-IR-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
+; GFX942-IR-NEXT:    s_setpc_b64 s[30:31]
   %x.shr = lshr i64 %x, 40
   %result = udiv i64 24, %x.shr
   ret i64 %result
@@ -1597,6 +3058,44 @@ define i64 @v_test_udiv24_pow2_k_num_i64(i64 %x) {
 ; GCN-IR-NEXT:    v_addc_u32_e32 v0, vcc, 0, v2, vcc
 ; GCN-IR-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
 ; GCN-IR-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: v_test_udiv24_pow2_k_num_i64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_lshrrev_b32_e32 v0, 8, v1
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v0, v0
+; GFX942-NEXT:    s_mov_b32 s0, 0x47000000
+; GFX942-NEXT:    v_rcp_iflag_f32_e32 v1, v0
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_mul_f32_e32 v1, 0x47000000, v1
+; GFX942-NEXT:    v_trunc_f32_e32 v1, v1
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v2, v1
+; GFX942-NEXT:    v_fma_f32 v1, -v1, v0, s0
+; GFX942-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
+; GFX942-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v2, vcc
+; GFX942-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-IR-LABEL: v_test_udiv24_pow2_k_num_i64:
+; GFX942-IR:       ; %bb.0:
+; GFX942-IR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-IR-NEXT:    v_lshrrev_b32_e32 v0, 8, v1
+; GFX942-IR-NEXT:    v_cvt_f32_u32_e32 v0, v0
+; GFX942-IR-NEXT:    s_mov_b32 s0, 0x47000000
+; GFX942-IR-NEXT:    v_rcp_iflag_f32_e32 v1, v0
+; GFX942-IR-NEXT:    s_nop 0
+; GFX942-IR-NEXT:    v_mul_f32_e32 v1, 0x47000000, v1
+; GFX942-IR-NEXT:    v_trunc_f32_e32 v1, v1
+; GFX942-IR-NEXT:    v_cvt_u32_f32_e32 v2, v1
+; GFX942-IR-NEXT:    v_fma_f32 v1, -v1, v0, s0
+; GFX942-IR-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v0
+; GFX942-IR-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-IR-NEXT:    s_nop 0
+; GFX942-IR-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v2, vcc
+; GFX942-IR-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
+; GFX942-IR-NEXT:    s_setpc_b64 s[30:31]
   %x.shr = lshr i64 %x, 40
   %result = udiv i64 32768, %x.shr
   ret i64 %result
@@ -1625,6 +3124,30 @@ define i64 @v_test_udiv24_pow2_k_den_i64(i64 %x) {
 ; GCN-IR-NEXT:    v_addc_u32_e32 v0, vcc, 0, v2, vcc
 ; GCN-IR-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
 ; GCN-IR-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: v_test_udiv24_pow2_k_den_i64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_lshrrev_b32_e32 v0, 23, v1
+; GFX942-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-IR-LABEL: v_test_udiv24_pow2_k_den_i64:
+; GFX942-IR:       ; %bb.0:
+; GFX942-IR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-IR-NEXT:    v_lshrrev_b32_e32 v0, 8, v1
+; GFX942-IR-NEXT:    v_cvt_f32_u32_e32 v0, v0
+; GFX942-IR-NEXT:    s_mov_b32 s0, 0x47000000
+; GFX942-IR-NEXT:    v_mul_f32_e32 v1, 0x38000000, v0
+; GFX942-IR-NEXT:    v_trunc_f32_e32 v1, v1
+; GFX942-IR-NEXT:    v_cvt_u32_f32_e32 v2, v1
+; GFX942-IR-NEXT:    v_fmamk_f32 v0, v1, 0xc7000000, v0
+; GFX942-IR-NEXT:    v_cmp_ge_f32_e64 vcc, |v0|, s0
+; GFX942-IR-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-IR-NEXT:    s_nop 0
+; GFX942-IR-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v2, vcc
+; GFX942-IR-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
+; GFX942-IR-NEXT:    s_setpc_b64 s[30:31]
   %x.shr = lshr i64 %x, 40
   %result = udiv i64 %x.shr, 32768
   ret i64 %result
diff --git a/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll b/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll
index ab278c3b63a3e..983acfc2c0699 100644
--- a/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/uint_to_fp.f64.ll
@@ -1,6 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii < %s | FileCheck -enable-var-scope  -check-prefixes=GCN,SI %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 < %s | FileCheck -enable-var-scope -check-prefixes=GFX942 %s
 
 declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
 
@@ -48,6 +49,22 @@ define amdgpu_kernel void @v_uint_to_fp_i64_to_f64(ptr addrspace(1) %out, ptr ad
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
 ; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; VI-NEXT:    s_endpgm
+;
+; GFX942-LABEL: v_uint_to_fp_i64_to_f64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX942-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX942-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
+; GFX942-NEXT:    v_mov_b32_e32 v4, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    global_load_dwordx2 v[0:1], v0, s[2:3]
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    v_cvt_f64_u32_e32 v[2:3], v1
+; GFX942-NEXT:    v_ldexp_f64 v[2:3], v[2:3], 32
+; GFX942-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
+; GFX942-NEXT:    v_add_f64 v[0:1], v[2:3], v[0:1]
+; GFX942-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1]
+; GFX942-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %gep = getelementptr i64, ptr addrspace(1) %in, i32 %tid
   %val = load i64, ptr addrspace(1) %gep, align 8
@@ -88,6 +105,18 @@ define amdgpu_kernel void @s_uint_to_fp_i64_to_f64(ptr addrspace(1) %out, i64 %i
 ; VI-NEXT:    v_mov_b32_e32 v3, s1
 ; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; VI-NEXT:    s_endpgm
+;
+; GFX942-LABEL: s_uint_to_fp_i64_to_f64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX942-NEXT:    v_mov_b32_e32 v4, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    v_cvt_f64_u32_e32 v[0:1], s3
+; GFX942-NEXT:    v_ldexp_f64 v[0:1], v[0:1], 32
+; GFX942-NEXT:    v_cvt_f64_u32_e32 v[2:3], s2
+; GFX942-NEXT:    v_add_f64 v[0:1], v[0:1], v[2:3]
+; GFX942-NEXT:    global_store_dwordx2 v4, v[0:1], s[0:1]
+; GFX942-NEXT:    s_endpgm
   %cast = uitofp i64 %in to double
   store double %cast, ptr addrspace(1) %out, align 8
   ret void
@@ -136,6 +165,23 @@ define amdgpu_kernel void @s_uint_to_fp_v2i64_to_v2f64(ptr addrspace(1) %out, <2
 ; VI-NEXT:    v_mov_b32_e32 v4, s0
 ; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; VI-NEXT:    s_endpgm
+;
+; GFX942-LABEL: s_uint_to_fp_v2i64_to_v2f64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
+; GFX942-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX942-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    v_cvt_f64_u32_e32 v[0:1], s3
+; GFX942-NEXT:    v_cvt_f64_u32_e32 v[2:3], s2
+; GFX942-NEXT:    v_cvt_f64_u32_e32 v[4:5], s1
+; GFX942-NEXT:    v_ldexp_f64 v[0:1], v[0:1], 32
+; GFX942-NEXT:    v_add_f64 v[2:3], v[0:1], v[2:3]
+; GFX942-NEXT:    v_ldexp_f64 v[0:1], v[4:5], 32
+; GFX942-NEXT:    v_cvt_f64_u32_e32 v[4:5], s0
+; GFX942-NEXT:    v_add_f64 v[0:1], v[0:1], v[4:5]
+; GFX942-NEXT:    global_store_dwordx4 v6, v[0:3], s[6:7]
+; GFX942-NEXT:    s_endpgm
   %cast = uitofp <2 x i64> %in to <2 x double>
   store <2 x double> %cast, ptr addrspace(1) %out, align 16
   ret void
@@ -210,6 +256,32 @@ define amdgpu_kernel void @s_uint_to_fp_v4i64_to_v4f64(ptr addrspace(1) %out, <4
 ; VI-NEXT:    flat_store_dwordx4 v[10:11], v[4:7]
 ; VI-NEXT:    flat_store_dwordx4 v[8:9], v[0:3]
 ; VI-NEXT:    s_endpgm
+;
+; GFX942-LABEL: s_uint_to_fp_v4i64_to_v4f64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x20
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NEXT:    v_mov_b32_e32 v10, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    v_cvt_f64_u32_e32 v[0:1], s11
+; GFX942-NEXT:    v_cvt_f64_u32_e32 v[2:3], s10
+; GFX942-NEXT:    v_cvt_f64_u32_e32 v[4:5], s9
+; GFX942-NEXT:    v_ldexp_f64 v[0:1], v[0:1], 32
+; GFX942-NEXT:    v_add_f64 v[2:3], v[0:1], v[2:3]
+; GFX942-NEXT:    v_ldexp_f64 v[0:1], v[4:5], 32
+; GFX942-NEXT:    v_cvt_f64_u32_e32 v[4:5], s8
+; GFX942-NEXT:    v_add_f64 v[0:1], v[0:1], v[4:5]
+; GFX942-NEXT:    v_cvt_f64_u32_e32 v[4:5], s15
+; GFX942-NEXT:    v_ldexp_f64 v[4:5], v[4:5], 32
+; GFX942-NEXT:    v_cvt_f64_u32_e32 v[6:7], s14
+; GFX942-NEXT:    v_add_f64 v[6:7], v[4:5], v[6:7]
+; GFX942-NEXT:    v_cvt_f64_u32_e32 v[4:5], s13
+; GFX942-NEXT:    v_ldexp_f64 v[4:5], v[4:5], 32
+; GFX942-NEXT:    v_cvt_f64_u32_e32 v[8:9], s12
+; GFX942-NEXT:    v_add_f64 v[4:5], v[4:5], v[8:9]
+; GFX942-NEXT:    global_store_dwordx4 v10, v[4:7], s[0:1] offset:16
+; GFX942-NEXT:    global_store_dwordx4 v10, v[0:3], s[0:1]
+; GFX942-NEXT:    s_endpgm
   %cast = uitofp <4 x i64> %in to <4 x double>
   store <4 x double> %cast, ptr addrspace(1) %out, align 16
   ret void
@@ -243,6 +315,16 @@ define amdgpu_kernel void @s_uint_to_fp_i32_to_f64(ptr addrspace(1) %out, i32 %i
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; VI-NEXT:    s_endpgm
+;
+; GFX942-LABEL: s_uint_to_fp_i32_to_f64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    v_cvt_f64_u32_e32 v[0:1], s2
+; GFX942-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX942-NEXT:    s_endpgm
   %cast = uitofp i32 %in to double
   store double %cast, ptr addrspace(1) %out, align 8
   ret void
@@ -262,6 +344,16 @@ define amdgpu_kernel void @s_uint_to_fp_v2i32_to_v2f64(ptr addrspace(1) %out, <2
 ; GCN-NEXT:    v_mov_b32_e32 v4, s0
 ; GCN-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; GCN-NEXT:    s_endpgm
+;
+; GFX942-LABEL: s_uint_to_fp_v2i32_to_v2f64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x0
+; GFX942-NEXT:    v_mov_b32_e32 v4, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    v_cvt_f64_u32_e32 v[2:3], s3
+; GFX942-NEXT:    v_cvt_f64_u32_e32 v[0:1], s2
+; GFX942-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
+; GFX942-NEXT:    s_endpgm
   %cast = uitofp <2 x i32> %in to <2 x double>
   store <2 x double> %cast, ptr addrspace(1) %out, align 16
   ret void
@@ -313,6 +405,20 @@ define amdgpu_kernel void @s_uint_to_fp_v4i32_to_v4f64(ptr addrspace(1) %out, <4
 ; VI-NEXT:    v_mov_b32_e32 v5, s5
 ; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
 ; VI-NEXT:    s_endpgm
+;
+; GFX942-LABEL: s_uint_to_fp_v4i32_to_v4f64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x10
+; GFX942-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x0
+; GFX942-NEXT:    v_mov_b32_e32 v8, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    v_cvt_f64_u32_e32 v[6:7], s3
+; GFX942-NEXT:    v_cvt_f64_u32_e32 v[4:5], s2
+; GFX942-NEXT:    v_cvt_f64_u32_e32 v[2:3], s1
+; GFX942-NEXT:    v_cvt_f64_u32_e32 v[0:1], s0
+; GFX942-NEXT:    global_store_dwordx4 v8, v[4:7], s[6:7] offset:16
+; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[6:7]
+; GFX942-NEXT:    s_endpgm
   %cast = uitofp <4 x i32> %in to <4 x double>
   store <4 x double> %cast, ptr addrspace(1) %out, align 16
   ret void
@@ -354,6 +460,18 @@ define amdgpu_kernel void @uint_to_fp_i1_to_f64(ptr addrspace(1) %out, i32 %in)
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; VI-NEXT:    s_endpgm
+;
+; GFX942-LABEL: uint_to_fp_i1_to_f64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_cmp_eq_u32 s2, 0
+; GFX942-NEXT:    s_cselect_b32 s2, 0x3ff00000, 0
+; GFX942-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-NEXT:    global_store_dwordx2 v0, v[0:1], s[0:1]
+; GFX942-NEXT:    s_endpgm
   %cmp = icmp eq i32 %in, 0
   %fp = uitofp i1 %cmp to double
   store double %fp, ptr addrspace(1) %out, align 4
@@ -394,6 +512,19 @@ define amdgpu_kernel void @uint_to_fp_i1_to_f64_load(ptr addrspace(1) %out, i1 %
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; VI-NEXT:    s_endpgm
+;
+; GFX942-LABEL: uint_to_fp_i1_to_f64_load:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_bitcmp1_b32 s2, 0
+; GFX942-NEXT:    s_cselect_b64 s[2:3], -1, 0
+; GFX942-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[2:3]
+; GFX942-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
+; GFX942-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX942-NEXT:    s_endpgm
   %fp = uitofp i1 %in to double
   store double %fp, ptr addrspace(1) %out, align 8
   ret void
@@ -429,6 +560,17 @@ define amdgpu_kernel void @s_uint_to_fp_i8_to_f64(ptr addrspace(1) %out, i8 %in)
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; VI-NEXT:    s_endpgm
+;
+; GFX942-LABEL: s_uint_to_fp_i8_to_f64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_and_b32 s2, s2, 0xff
+; GFX942-NEXT:    v_cvt_f64_u32_e32 v[0:1], s2
+; GFX942-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX942-NEXT:    s_endpgm
   %fp = uitofp i8 %in to double
   store double %fp, ptr addrspace(1) %out
   ret void
@@ -450,6 +592,14 @@ define double @v_uint_to_fp_i8_to_f64(i8 %in) {
 ; VI-NEXT:    v_and_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; VI-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
 ; VI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: v_uint_to_fp_i8_to_f64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    s_mov_b32 s0, 0xffff
+; GFX942-NEXT:    v_and_b32_sdwa v0, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX942-NEXT:    v_cvt_f64_u32_e32 v[0:1], v0
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %fp = uitofp i8 %in to double
   ret double %fp
 }
@@ -488,6 +638,18 @@ define amdgpu_kernel void @s_select_uint_to_fp_i1_vals_f64(ptr addrspace(1) %out
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; VI-NEXT:    s_endpgm
+;
+; GFX942-LABEL: s_select_uint_to_fp_i1_vals_f64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_cmp_eq_u32 s2, 0
+; GFX942-NEXT:    s_cselect_b32 s2, 0x3ff00000, 0
+; GFX942-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-NEXT:    global_store_dwordx2 v0, v[0:1], s[0:1]
+; GFX942-NEXT:    s_endpgm
   %cmp = icmp eq i32 %in, 0
   %select = select i1 %cmp, double 1.0, double 0.0
   store double %select, ptr addrspace(1) %out, align 8
@@ -505,6 +667,18 @@ define void @v_select_uint_to_fp_i1_vals_f64(ptr addrspace(1) %out, i32 %in) {
 ; GCN-NEXT:    flat_store_dwordx2 v[0:1], v[3:4]
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: v_select_uint_to_fp_i1_vals_f64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b32_e32 v3, 0x3ff00000
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX942-NEXT:    v_mov_b32_e32 v4, 0
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_cndmask_b32_e32 v5, 0, v3, vcc
+; GFX942-NEXT:    global_store_dwordx2 v[0:1], v[4:5], off
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %in, 0
   %select = select i1 %cmp, double 1.0, double 0.0
   store double %select, ptr addrspace(1) %out, align 8
@@ -545,6 +719,18 @@ define amdgpu_kernel void @s_select_uint_to_fp_i1_vals_i64(ptr addrspace(1) %out
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; VI-NEXT:    s_endpgm
+;
+; GFX942-LABEL: s_select_uint_to_fp_i1_vals_i64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_cmp_eq_u32 s2, 0
+; GFX942-NEXT:    s_cselect_b32 s2, 0x3ff00000, 0
+; GFX942-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-NEXT:    global_store_dwordx2 v0, v[0:1], s[0:1]
+; GFX942-NEXT:    s_endpgm
   %cmp = icmp eq i32 %in, 0
   %select = select i1 %cmp, i64 u0x3ff0000000000000, i64 0
   store i64 %select, ptr addrspace(1) %out, align 8
@@ -562,6 +748,18 @@ define void @v_select_uint_to_fp_i1_vals_i64(ptr addrspace(1) %out, i32 %in) {
 ; GCN-NEXT:    flat_store_dwordx2 v[0:1], v[3:4]
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: v_select_uint_to_fp_i1_vals_i64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b32_e32 v3, 0x3ff00000
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX942-NEXT:    v_mov_b32_e32 v4, 0
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_cndmask_b32_e32 v5, 0, v3, vcc
+; GFX942-NEXT:    global_store_dwordx2 v[0:1], v[4:5], off
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %in, 0
   %select = select i1 %cmp, i64 u0x3ff0000000000000, i64 0
   store i64 %select, ptr addrspace(1) %out, align 8
@@ -603,6 +801,18 @@ define amdgpu_kernel void @s_swap_select_uint_to_fp_i1_vals_f64(ptr addrspace(1)
 ; VI-NEXT:    v_mov_b32_e32 v2, s0
 ; VI-NEXT:    flat_store_dwordx2 v[2:3], v[0:1]
 ; VI-NEXT:    s_endpgm
+;
+; GFX942-LABEL: s_swap_select_uint_to_fp_i1_vals_f64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dword s2, s[4:5], 0x8
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x0
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_cmp_eq_u32 s2, 0
+; GFX942-NEXT:    s_cselect_b32 s2, 0, 0x3ff00000
+; GFX942-NEXT:    v_mov_b32_e32 v1, s2
+; GFX942-NEXT:    global_store_dwordx2 v0, v[0:1], s[0:1]
+; GFX942-NEXT:    s_endpgm
   %cmp = icmp eq i32 %in, 0
   %select = select i1 %cmp, double 0.0, double 1.0
   store double %select, ptr addrspace(1) %out, align 8
@@ -620,6 +830,18 @@ define void @v_swap_select_uint_to_fp_i1_vals_f64(ptr addrspace(1) %out, i32 %in
 ; GCN-NEXT:    flat_store_dwordx2 v[0:1], v[3:4]
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: v_swap_select_uint_to_fp_i1_vals_f64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b32_e32 v3, 0x3ff00000
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, 0, v2
+; GFX942-NEXT:    v_mov_b32_e32 v4, 0
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_cndmask_b32_e64 v5, v3, 0, vcc
+; GFX942-NEXT:    global_store_dwordx2 v[0:1], v[4:5], off
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %cmp = icmp eq i32 %in, 0
   %select = select i1 %cmp, double 0.0, double 1.0
   store double %select, ptr addrspace(1) %out, align 8
diff --git a/llvm/test/CodeGen/AMDGPU/urem64.ll b/llvm/test/CodeGen/AMDGPU/urem64.ll
index 464dad83f47c9..d3149991a7e6f 100644
--- a/llvm/test/CodeGen/AMDGPU/urem64.ll
+++ b/llvm/test/CodeGen/AMDGPU/urem64.ll
@@ -1,6 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx600 -amdgpu-bypass-slow-div=0 < %s | FileCheck -check-prefix=GCN %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx600 -amdgpu-bypass-slow-div=0 -amdgpu-codegenprepare-expand-div64 < %s | FileCheck -check-prefix=GCN-IR %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -amdgpu-bypass-slow-div=0 < %s | FileCheck -check-prefix=GFX942 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -amdgpu-bypass-slow-div=0 -amdgpu-codegenprepare-expand-div64 < %s | FileCheck -check-prefix=GFX942-IR %s
 
 define amdgpu_kernel void @s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y) {
 ; GCN-LABEL: s_test_urem_i64:
@@ -202,6 +204,219 @@ define amdgpu_kernel void @s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y
 ; GCN-IR-NEXT:    v_subb_u32_e32 v1, vcc, v2, v1, vcc
 ; GCN-IR-NEXT:    buffer_store_dwordx2 v[0:1], off, s[12:15], 0
 ; GCN-IR-NEXT:    s_endpgm
+;
+; GFX942-LABEL: s_test_urem_i64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX942-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x24
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v0, s6
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v1, s7
+; GFX942-NEXT:    s_sub_u32 s0, 0, s6
+; GFX942-NEXT:    s_subb_u32 s1, 0, s7
+; GFX942-NEXT:    v_fmamk_f32 v0, v1, 0x4f800000, v0
+; GFX942-NEXT:    v_rcp_f32_e32 v1, v0
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    v_mul_f32_e32 v1, 0x5f7ffffc, v1
+; GFX942-NEXT:    v_mul_f32_e32 v2, 0x2f800000, v1
+; GFX942-NEXT:    v_trunc_f32_e32 v2, v2
+; GFX942-NEXT:    v_fmamk_f32 v1, v2, 0xcf800000, v1
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX942-NEXT:    v_readfirstlane_b32 s2, v2
+; GFX942-NEXT:    v_readfirstlane_b32 s3, v1
+; GFX942-NEXT:    s_mul_i32 s4, s0, s2
+; GFX942-NEXT:    s_mul_hi_u32 s12, s0, s3
+; GFX942-NEXT:    s_mul_i32 s5, s1, s3
+; GFX942-NEXT:    s_add_i32 s4, s12, s4
+; GFX942-NEXT:    s_mul_i32 s13, s0, s3
+; GFX942-NEXT:    s_add_i32 s4, s4, s5
+; GFX942-NEXT:    s_mul_hi_u32 s12, s3, s13
+; GFX942-NEXT:    s_mul_hi_u32 s5, s3, s4
+; GFX942-NEXT:    s_mul_i32 s3, s3, s4
+; GFX942-NEXT:    s_add_u32 s3, s12, s3
+; GFX942-NEXT:    s_addc_u32 s5, 0, s5
+; GFX942-NEXT:    s_mul_hi_u32 s12, s2, s13
+; GFX942-NEXT:    s_mul_i32 s13, s2, s13
+; GFX942-NEXT:    s_add_u32 s3, s3, s13
+; GFX942-NEXT:    s_mul_hi_u32 s14, s2, s4
+; GFX942-NEXT:    s_addc_u32 s3, s5, s12
+; GFX942-NEXT:    s_addc_u32 s5, s14, 0
+; GFX942-NEXT:    s_mul_i32 s4, s2, s4
+; GFX942-NEXT:    s_add_u32 s3, s3, s4
+; GFX942-NEXT:    s_addc_u32 s4, 0, s5
+; GFX942-NEXT:    v_add_co_u32_e32 v1, vcc, s3, v1
+; GFX942-NEXT:    s_cmp_lg_u64 vcc, 0
+; GFX942-NEXT:    s_addc_u32 s2, s2, s4
+; GFX942-NEXT:    v_readfirstlane_b32 s4, v1
+; GFX942-NEXT:    s_mul_i32 s3, s0, s2
+; GFX942-NEXT:    s_mul_hi_u32 s5, s0, s4
+; GFX942-NEXT:    s_add_i32 s3, s5, s3
+; GFX942-NEXT:    s_mul_i32 s1, s1, s4
+; GFX942-NEXT:    s_add_i32 s3, s3, s1
+; GFX942-NEXT:    s_mul_i32 s0, s0, s4
+; GFX942-NEXT:    s_mul_hi_u32 s5, s2, s0
+; GFX942-NEXT:    s_mul_i32 s12, s2, s0
+; GFX942-NEXT:    s_mul_i32 s14, s4, s3
+; GFX942-NEXT:    s_mul_hi_u32 s0, s4, s0
+; GFX942-NEXT:    s_mul_hi_u32 s13, s4, s3
+; GFX942-NEXT:    s_add_u32 s0, s0, s14
+; GFX942-NEXT:    s_addc_u32 s4, 0, s13
+; GFX942-NEXT:    s_add_u32 s0, s0, s12
+; GFX942-NEXT:    s_mul_hi_u32 s1, s2, s3
+; GFX942-NEXT:    s_addc_u32 s0, s4, s5
+; GFX942-NEXT:    s_addc_u32 s1, s1, 0
+; GFX942-NEXT:    s_mul_i32 s3, s2, s3
+; GFX942-NEXT:    s_add_u32 s0, s0, s3
+; GFX942-NEXT:    s_addc_u32 s1, 0, s1
+; GFX942-NEXT:    v_add_co_u32_e32 v1, vcc, s0, v1
+; GFX942-NEXT:    s_cmp_lg_u64 vcc, 0
+; GFX942-NEXT:    s_addc_u32 s0, s2, s1
+; GFX942-NEXT:    v_readfirstlane_b32 s3, v1
+; GFX942-NEXT:    s_mul_i32 s2, s10, s0
+; GFX942-NEXT:    s_mul_hi_u32 s4, s10, s3
+; GFX942-NEXT:    s_mul_hi_u32 s1, s10, s0
+; GFX942-NEXT:    s_add_u32 s2, s4, s2
+; GFX942-NEXT:    s_addc_u32 s1, 0, s1
+; GFX942-NEXT:    s_mul_hi_u32 s5, s11, s3
+; GFX942-NEXT:    s_mul_i32 s3, s11, s3
+; GFX942-NEXT:    s_add_u32 s2, s2, s3
+; GFX942-NEXT:    s_mul_hi_u32 s4, s11, s0
+; GFX942-NEXT:    s_addc_u32 s1, s1, s5
+; GFX942-NEXT:    s_addc_u32 s2, s4, 0
+; GFX942-NEXT:    s_mul_i32 s0, s11, s0
+; GFX942-NEXT:    s_add_u32 s0, s1, s0
+; GFX942-NEXT:    s_addc_u32 s1, 0, s2
+; GFX942-NEXT:    s_mul_i32 s1, s6, s1
+; GFX942-NEXT:    s_mul_hi_u32 s2, s6, s0
+; GFX942-NEXT:    s_add_i32 s1, s2, s1
+; GFX942-NEXT:    s_mul_i32 s2, s7, s0
+; GFX942-NEXT:    s_mul_i32 s0, s6, s0
+; GFX942-NEXT:    s_add_i32 s4, s1, s2
+; GFX942-NEXT:    v_mov_b32_e32 v1, s0
+; GFX942-NEXT:    s_sub_i32 s1, s11, s4
+; GFX942-NEXT:    v_sub_co_u32_e32 v1, vcc, s10, v1
+; GFX942-NEXT:    s_cmp_lg_u64 vcc, 0
+; GFX942-NEXT:    s_subb_u32 s5, s1, s7
+; GFX942-NEXT:    v_subrev_co_u32_e64 v2, s[0:1], s6, v1
+; GFX942-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX942-NEXT:    s_subb_u32 s10, s5, 0
+; GFX942-NEXT:    s_cmp_ge_u32 s10, s7
+; GFX942-NEXT:    s_cselect_b32 s12, -1, 0
+; GFX942-NEXT:    v_cmp_le_u32_e64 s[2:3], s6, v2
+; GFX942-NEXT:    s_cmp_eq_u32 s10, s7
+; GFX942-NEXT:    v_mov_b32_e32 v4, s12
+; GFX942-NEXT:    v_cndmask_b32_e64 v3, 0, -1, s[2:3]
+; GFX942-NEXT:    s_cselect_b64 s[2:3], -1, 0
+; GFX942-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX942-NEXT:    v_cndmask_b32_e64 v3, v4, v3, s[2:3]
+; GFX942-NEXT:    s_subb_u32 s2, s5, s7
+; GFX942-NEXT:    v_subrev_co_u32_e64 v4, s[0:1], s6, v2
+; GFX942-NEXT:    s_cmp_lg_u64 s[0:1], 0
+; GFX942-NEXT:    s_subb_u32 s2, s2, 0
+; GFX942-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v3
+; GFX942-NEXT:    v_mov_b32_e32 v3, s10
+; GFX942-NEXT:    s_cmp_lg_u64 vcc, 0
+; GFX942-NEXT:    v_cndmask_b32_e64 v2, v2, v4, s[0:1]
+; GFX942-NEXT:    v_mov_b32_e32 v4, s2
+; GFX942-NEXT:    v_cndmask_b32_e64 v3, v3, v4, s[0:1]
+; GFX942-NEXT:    s_subb_u32 s0, s11, s4
+; GFX942-NEXT:    s_cmp_ge_u32 s0, s7
+; GFX942-NEXT:    s_cselect_b32 s1, -1, 0
+; GFX942-NEXT:    v_cmp_le_u32_e32 vcc, s6, v1
+; GFX942-NEXT:    s_cmp_eq_u32 s0, s7
+; GFX942-NEXT:    v_mov_b32_e32 v5, s1
+; GFX942-NEXT:    v_cndmask_b32_e64 v4, 0, -1, vcc
+; GFX942-NEXT:    s_cselect_b64 vcc, -1, 0
+; GFX942-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
+; GFX942-NEXT:    v_mov_b32_e32 v5, s0
+; GFX942-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX942-NEXT:    v_cndmask_b32_e32 v2, v1, v2, vcc
+; GFX942-NEXT:    global_store_dwordx2 v0, v[2:3], s[8:9]
+; GFX942-NEXT:    s_endpgm
+;
+; GFX942-IR-LABEL: s_test_urem_i64:
+; GFX942-IR:       ; %bb.0: ; %_udiv-special-cases
+; GFX942-IR-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x34
+; GFX942-IR-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-IR-NEXT:    s_mov_b64 s[4:5], 0
+; GFX942-IR-NEXT:    s_mov_b32 s11, 0
+; GFX942-IR-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-IR-NEXT:    s_cmp_eq_u64 s[6:7], 0
+; GFX942-IR-NEXT:    s_cselect_b64 s[8:9], -1, 0
+; GFX942-IR-NEXT:    s_cmp_eq_u64 s[2:3], 0
+; GFX942-IR-NEXT:    s_cselect_b64 s[12:13], -1, 0
+; GFX942-IR-NEXT:    s_flbit_i32_b64 s10, s[6:7]
+; GFX942-IR-NEXT:    s_or_b64 s[8:9], s[8:9], s[12:13]
+; GFX942-IR-NEXT:    s_flbit_i32_b64 s18, s[2:3]
+; GFX942-IR-NEXT:    s_sub_u32 s12, s10, s18
+; GFX942-IR-NEXT:    s_subb_u32 s13, 0, 0
+; GFX942-IR-NEXT:    v_cmp_gt_u64_e64 s[14:15], s[12:13], 63
+; GFX942-IR-NEXT:    s_or_b64 s[14:15], s[8:9], s[14:15]
+; GFX942-IR-NEXT:    s_cmp_eq_u64 s[12:13], 63
+; GFX942-IR-NEXT:    s_cselect_b64 s[16:17], -1, 0
+; GFX942-IR-NEXT:    s_and_b64 s[8:9], s[14:15], exec
+; GFX942-IR-NEXT:    s_cselect_b32 s9, 0, s3
+; GFX942-IR-NEXT:    s_cselect_b32 s8, 0, s2
+; GFX942-IR-NEXT:    s_or_b64 s[14:15], s[14:15], s[16:17]
+; GFX942-IR-NEXT:    s_andn2_b64 vcc, exec, s[14:15]
+; GFX942-IR-NEXT:    s_cbranch_vccz .LBB0_5
+; GFX942-IR-NEXT:  ; %bb.1: ; %udiv-bb1
+; GFX942-IR-NEXT:    s_add_u32 s14, s12, 1
+; GFX942-IR-NEXT:    s_addc_u32 s15, s13, 0
+; GFX942-IR-NEXT:    s_cmp_eq_u64 s[14:15], 0
+; GFX942-IR-NEXT:    s_cselect_b64 s[8:9], -1, 0
+; GFX942-IR-NEXT:    s_sub_i32 s12, 63, s12
+; GFX942-IR-NEXT:    s_andn2_b64 vcc, exec, s[8:9]
+; GFX942-IR-NEXT:    s_lshl_b64 s[8:9], s[2:3], s12
+; GFX942-IR-NEXT:    s_cbranch_vccz .LBB0_4
+; GFX942-IR-NEXT:  ; %bb.2: ; %udiv-preheader
+; GFX942-IR-NEXT:    s_lshr_b64 s[12:13], s[2:3], s14
+; GFX942-IR-NEXT:    s_add_u32 s16, s6, -1
+; GFX942-IR-NEXT:    s_addc_u32 s17, s7, -1
+; GFX942-IR-NEXT:    s_not_b64 s[4:5], s[10:11]
+; GFX942-IR-NEXT:    s_add_u32 s10, s4, s18
+; GFX942-IR-NEXT:    s_addc_u32 s11, s5, 0
+; GFX942-IR-NEXT:    s_mov_b64 s[14:15], 0
+; GFX942-IR-NEXT:    s_mov_b32 s5, 0
+; GFX942-IR-NEXT:  .LBB0_3: ; %udiv-do-while
+; GFX942-IR-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-IR-NEXT:    s_lshl_b64 s[12:13], s[12:13], 1
+; GFX942-IR-NEXT:    s_lshr_b32 s4, s9, 31
+; GFX942-IR-NEXT:    s_lshl_b64 s[8:9], s[8:9], 1
+; GFX942-IR-NEXT:    s_or_b64 s[12:13], s[12:13], s[4:5]
+; GFX942-IR-NEXT:    s_or_b64 s[8:9], s[14:15], s[8:9]
+; GFX942-IR-NEXT:    s_sub_u32 s4, s16, s12
+; GFX942-IR-NEXT:    s_subb_u32 s4, s17, s13
+; GFX942-IR-NEXT:    s_ashr_i32 s14, s4, 31
+; GFX942-IR-NEXT:    s_mov_b32 s15, s14
+; GFX942-IR-NEXT:    s_and_b32 s4, s14, 1
+; GFX942-IR-NEXT:    s_and_b64 s[14:15], s[14:15], s[6:7]
+; GFX942-IR-NEXT:    s_sub_u32 s12, s12, s14
+; GFX942-IR-NEXT:    s_subb_u32 s13, s13, s15
+; GFX942-IR-NEXT:    s_add_u32 s10, s10, 1
+; GFX942-IR-NEXT:    s_addc_u32 s11, s11, 0
+; GFX942-IR-NEXT:    s_cmp_eq_u64 s[10:11], 0
+; GFX942-IR-NEXT:    s_mov_b64 s[14:15], s[4:5]
+; GFX942-IR-NEXT:    s_cbranch_scc0 .LBB0_3
+; GFX942-IR-NEXT:  .LBB0_4: ; %Flow7
+; GFX942-IR-NEXT:    s_lshl_b64 s[8:9], s[8:9], 1
+; GFX942-IR-NEXT:    s_or_b64 s[8:9], s[4:5], s[8:9]
+; GFX942-IR-NEXT:  .LBB0_5: ; %udiv-end
+; GFX942-IR-NEXT:    s_mul_i32 s4, s6, s9
+; GFX942-IR-NEXT:    s_mul_hi_u32 s5, s6, s8
+; GFX942-IR-NEXT:    s_add_i32 s4, s5, s4
+; GFX942-IR-NEXT:    s_mul_i32 s5, s7, s8
+; GFX942-IR-NEXT:    s_add_i32 s4, s4, s5
+; GFX942-IR-NEXT:    s_mul_i32 s5, s6, s8
+; GFX942-IR-NEXT:    s_sub_u32 s2, s2, s5
+; GFX942-IR-NEXT:    s_subb_u32 s3, s3, s4
+; GFX942-IR-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-IR-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX942-IR-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX942-IR-NEXT:    s_endpgm
   %result = urem i64 %x, %y
   store i64 %result, ptr addrspace(1) %out
   ret void
@@ -405,6 +620,196 @@ define i64 @v_test_urem_i64(i64 %x, i64 %y) {
 ; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
 ; GCN-IR-NEXT:    v_subb_u32_e32 v1, vcc, v1, v3, vcc
 ; GCN-IR-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: v_test_urem_i64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v4, v2
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v5, v3
+; GFX942-NEXT:    v_sub_co_u32_e32 v13, vcc, 0, v2
+; GFX942-NEXT:    v_mov_b32_e32 v9, 0
+; GFX942-NEXT:    v_fmamk_f32 v4, v5, 0x4f800000, v4
+; GFX942-NEXT:    v_rcp_f32_e32 v4, v4
+; GFX942-NEXT:    v_subb_co_u32_e32 v14, vcc, 0, v3, vcc
+; GFX942-NEXT:    v_mul_f32_e32 v4, 0x5f7ffffc, v4
+; GFX942-NEXT:    v_mul_f32_e32 v5, 0x2f800000, v4
+; GFX942-NEXT:    v_trunc_f32_e32 v5, v5
+; GFX942-NEXT:    v_fmamk_f32 v4, v5, 0xcf800000, v4
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v12, v4
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v15, v5
+; GFX942-NEXT:    v_mul_lo_u32 v6, v14, v12
+; GFX942-NEXT:    v_mad_u64_u32 v[4:5], s[0:1], v13, v12, 0
+; GFX942-NEXT:    v_mul_lo_u32 v7, v13, v15
+; GFX942-NEXT:    v_add3_u32 v5, v5, v7, v6
+; GFX942-NEXT:    v_mad_u64_u32 v[6:7], s[0:1], v12, v5, 0
+; GFX942-NEXT:    v_mul_hi_u32 v8, v12, v4
+; GFX942-NEXT:    v_lshl_add_u64 v[6:7], v[8:9], 0, v[6:7]
+; GFX942-NEXT:    v_mad_u64_u32 v[10:11], s[0:1], v15, v5, 0
+; GFX942-NEXT:    v_mad_u64_u32 v[4:5], s[0:1], v15, v4, 0
+; GFX942-NEXT:    v_add_co_u32_e32 v4, vcc, v6, v4
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_addc_co_u32_e32 v8, vcc, v7, v5, vcc
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_addc_co_u32_e32 v11, vcc, 0, v11, vcc
+; GFX942-NEXT:    v_lshl_add_u64 v[4:5], v[8:9], 0, v[10:11]
+; GFX942-NEXT:    v_add_co_u32_e32 v16, vcc, v12, v4
+; GFX942-NEXT:    v_mul_lo_u32 v7, v14, v16
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_addc_co_u32_e32 v15, vcc, v15, v5, vcc
+; GFX942-NEXT:    v_mul_lo_u32 v6, v13, v15
+; GFX942-NEXT:    v_mad_u64_u32 v[4:5], s[0:1], v13, v16, 0
+; GFX942-NEXT:    v_add3_u32 v5, v5, v6, v7
+; GFX942-NEXT:    v_mad_u64_u32 v[12:13], s[0:1], v16, v5, 0
+; GFX942-NEXT:    v_mul_hi_u32 v8, v16, v4
+; GFX942-NEXT:    v_mad_u64_u32 v[6:7], s[0:1], v15, v5, 0
+; GFX942-NEXT:    v_mad_u64_u32 v[10:11], s[0:1], v15, v4, 0
+; GFX942-NEXT:    v_lshl_add_u64 v[4:5], v[8:9], 0, v[12:13]
+; GFX942-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v10
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_addc_co_u32_e32 v8, vcc, v5, v11, vcc
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v7, vcc
+; GFX942-NEXT:    v_lshl_add_u64 v[4:5], v[8:9], 0, v[6:7]
+; GFX942-NEXT:    v_add_co_u32_e32 v10, vcc, v16, v4
+; GFX942-NEXT:    v_mul_hi_u32 v8, v0, v10
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_addc_co_u32_e32 v6, vcc, v15, v5, vcc
+; GFX942-NEXT:    v_mad_u64_u32 v[4:5], s[0:1], v0, v6, 0
+; GFX942-NEXT:    v_lshl_add_u64 v[4:5], v[8:9], 0, v[4:5]
+; GFX942-NEXT:    v_mad_u64_u32 v[10:11], s[0:1], v1, v10, 0
+; GFX942-NEXT:    v_add_co_u32_e32 v4, vcc, v4, v10
+; GFX942-NEXT:    v_mad_u64_u32 v[6:7], s[0:1], v1, v6, 0
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_addc_co_u32_e32 v8, vcc, v5, v11, vcc
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_addc_co_u32_e32 v7, vcc, 0, v7, vcc
+; GFX942-NEXT:    v_lshl_add_u64 v[4:5], v[8:9], 0, v[6:7]
+; GFX942-NEXT:    v_mul_lo_u32 v6, v3, v4
+; GFX942-NEXT:    v_mul_lo_u32 v7, v2, v5
+; GFX942-NEXT:    v_mad_u64_u32 v[4:5], s[0:1], v2, v4, 0
+; GFX942-NEXT:    v_add3_u32 v5, v5, v7, v6
+; GFX942-NEXT:    v_sub_u32_e32 v6, v1, v5
+; GFX942-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v4
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_subb_co_u32_e64 v4, s[0:1], v6, v3, vcc
+; GFX942-NEXT:    v_sub_co_u32_e64 v6, s[0:1], v0, v2
+; GFX942-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v5, vcc
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_subbrev_co_u32_e64 v7, s[2:3], 0, v4, s[0:1]
+; GFX942-NEXT:    v_cmp_ge_u32_e64 s[2:3], v7, v3
+; GFX942-NEXT:    v_subb_co_u32_e64 v4, s[0:1], v4, v3, s[0:1]
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[2:3]
+; GFX942-NEXT:    v_cmp_ge_u32_e64 s[2:3], v6, v2
+; GFX942-NEXT:    v_cmp_ge_u32_e32 vcc, v1, v3
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_cndmask_b32_e64 v9, 0, -1, s[2:3]
+; GFX942-NEXT:    v_cmp_eq_u32_e64 s[2:3], v7, v3
+; GFX942-NEXT:    v_cndmask_b32_e64 v5, 0, -1, vcc
+; GFX942-NEXT:    v_cmp_ge_u32_e32 vcc, v0, v2
+; GFX942-NEXT:    v_cndmask_b32_e64 v8, v8, v9, s[2:3]
+; GFX942-NEXT:    v_sub_co_u32_e64 v9, s[0:1], v6, v2
+; GFX942-NEXT:    v_cndmask_b32_e64 v2, 0, -1, vcc
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_subbrev_co_u32_e64 v4, s[0:1], 0, v4, s[0:1]
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, v1, v3
+; GFX942-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v8
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_cndmask_b32_e32 v2, v5, v2, vcc
+; GFX942-NEXT:    v_cndmask_b32_e64 v6, v6, v9, s[0:1]
+; GFX942-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v2
+; GFX942-NEXT:    v_cndmask_b32_e64 v2, v7, v4, s[0:1]
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, v0, v6, vcc
+; GFX942-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-IR-LABEL: v_test_urem_i64:
+; GFX942-IR:       ; %bb.0: ; %_udiv-special-cases
+; GFX942-IR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-IR-NEXT:    v_ffbh_u32_e32 v4, v2
+; GFX942-IR-NEXT:    v_add_u32_e32 v4, 32, v4
+; GFX942-IR-NEXT:    v_ffbh_u32_e32 v5, v3
+; GFX942-IR-NEXT:    v_min_u32_e32 v12, v4, v5
+; GFX942-IR-NEXT:    v_ffbh_u32_e32 v4, v0
+; GFX942-IR-NEXT:    v_add_u32_e32 v4, 32, v4
+; GFX942-IR-NEXT:    v_ffbh_u32_e32 v5, v1
+; GFX942-IR-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[2:3]
+; GFX942-IR-NEXT:    v_cmp_eq_u64_e64 s[0:1], 0, v[0:1]
+; GFX942-IR-NEXT:    v_min_u32_e32 v10, v4, v5
+; GFX942-IR-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-IR-NEXT:    v_sub_co_u32_e32 v8, vcc, v12, v10
+; GFX942-IR-NEXT:    v_mov_b32_e32 v11, 0
+; GFX942-IR-NEXT:    s_nop 0
+; GFX942-IR-NEXT:    v_subb_co_u32_e64 v9, s[2:3], 0, 0, vcc
+; GFX942-IR-NEXT:    v_cmp_lt_u64_e32 vcc, 63, v[8:9]
+; GFX942-IR-NEXT:    s_or_b64 s[0:1], s[0:1], vcc
+; GFX942-IR-NEXT:    s_xor_b64 s[2:3], s[0:1], -1
+; GFX942-IR-NEXT:    v_cmp_ne_u64_e32 vcc, 63, v[8:9]
+; GFX942-IR-NEXT:    v_cndmask_b32_e64 v5, v1, 0, s[0:1]
+; GFX942-IR-NEXT:    v_cndmask_b32_e64 v4, v0, 0, s[0:1]
+; GFX942-IR-NEXT:    s_and_b64 s[2:3], s[2:3], vcc
+; GFX942-IR-NEXT:    s_and_saveexec_b64 s[0:1], s[2:3]
+; GFX942-IR-NEXT:    s_cbranch_execz .LBB1_6
+; GFX942-IR-NEXT:  ; %bb.1: ; %udiv-bb1
+; GFX942-IR-NEXT:    v_lshl_add_u64 v[14:15], v[8:9], 0, 1
+; GFX942-IR-NEXT:    v_sub_u32_e32 v4, 63, v8
+; GFX942-IR-NEXT:    v_mov_b64_e32 v[6:7], 0
+; GFX942-IR-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[14:15]
+; GFX942-IR-NEXT:    v_lshlrev_b64 v[4:5], v4, v[0:1]
+; GFX942-IR-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX942-IR-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
+; GFX942-IR-NEXT:    s_cbranch_execz .LBB1_5
+; GFX942-IR-NEXT:  ; %bb.2: ; %udiv-preheader
+; GFX942-IR-NEXT:    v_mov_b32_e32 v7, -1
+; GFX942-IR-NEXT:    v_not_b32_e32 v6, v12
+; GFX942-IR-NEXT:    v_lshrrev_b64 v[14:15], v14, v[0:1]
+; GFX942-IR-NEXT:    v_lshl_add_u64 v[8:9], v[2:3], 0, -1
+; GFX942-IR-NEXT:    v_lshl_add_u64 v[10:11], v[6:7], 0, v[10:11]
+; GFX942-IR-NEXT:    s_mov_b64 s[4:5], 0
+; GFX942-IR-NEXT:    v_mov_b64_e32 v[12:13], 0
+; GFX942-IR-NEXT:    v_mov_b32_e32 v7, 0
+; GFX942-IR-NEXT:  .LBB1_3: ; %udiv-do-while
+; GFX942-IR-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-IR-NEXT:    v_lshlrev_b64 v[14:15], 1, v[14:15]
+; GFX942-IR-NEXT:    v_lshrrev_b32_e32 v6, 31, v5
+; GFX942-IR-NEXT:    v_or_b32_e32 v14, v14, v6
+; GFX942-IR-NEXT:    v_sub_co_u32_e32 v6, vcc, v8, v14
+; GFX942-IR-NEXT:    v_lshlrev_b64 v[4:5], 1, v[4:5]
+; GFX942-IR-NEXT:    s_nop 0
+; GFX942-IR-NEXT:    v_subb_co_u32_e32 v6, vcc, v9, v15, vcc
+; GFX942-IR-NEXT:    v_or_b32_e32 v4, v12, v4
+; GFX942-IR-NEXT:    v_ashrrev_i32_e32 v12, 31, v6
+; GFX942-IR-NEXT:    v_or_b32_e32 v5, v13, v5
+; GFX942-IR-NEXT:    v_and_b32_e32 v6, 1, v12
+; GFX942-IR-NEXT:    v_and_b32_e32 v13, v12, v3
+; GFX942-IR-NEXT:    v_and_b32_e32 v12, v12, v2
+; GFX942-IR-NEXT:    v_sub_co_u32_e32 v14, vcc, v14, v12
+; GFX942-IR-NEXT:    v_lshl_add_u64 v[10:11], v[10:11], 0, 1
+; GFX942-IR-NEXT:    s_nop 0
+; GFX942-IR-NEXT:    v_subb_co_u32_e32 v15, vcc, v15, v13, vcc
+; GFX942-IR-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[10:11]
+; GFX942-IR-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX942-IR-NEXT:    v_mov_b64_e32 v[12:13], v[6:7]
+; GFX942-IR-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX942-IR-NEXT:    s_cbranch_execnz .LBB1_3
+; GFX942-IR-NEXT:  ; %bb.4: ; %Flow
+; GFX942-IR-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX942-IR-NEXT:  .LBB1_5: ; %Flow4
+; GFX942-IR-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX942-IR-NEXT:    v_lshlrev_b64 v[4:5], 1, v[4:5]
+; GFX942-IR-NEXT:    v_or_b32_e32 v5, v7, v5
+; GFX942-IR-NEXT:    v_or_b32_e32 v4, v6, v4
+; GFX942-IR-NEXT:  .LBB1_6: ; %Flow5
+; GFX942-IR-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX942-IR-NEXT:    v_mul_lo_u32 v6, v3, v4
+; GFX942-IR-NEXT:    v_mul_lo_u32 v5, v2, v5
+; GFX942-IR-NEXT:    v_mad_u64_u32 v[2:3], s[0:1], v2, v4, 0
+; GFX942-IR-NEXT:    v_add3_u32 v3, v3, v5, v6
+; GFX942-IR-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v2
+; GFX942-IR-NEXT:    s_nop 1
+; GFX942-IR-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX942-IR-NEXT:    s_setpc_b64 s[30:31]
   %result = urem i64 %x, %y
   ret i64 %result
 }
@@ -479,6 +884,70 @@ define amdgpu_kernel void @s_test_urem31_i64(ptr addrspace(1) %out, i64 %x, i64
 ; GCN-IR-NEXT:    v_mov_b32_e32 v0, s0
 ; GCN-IR-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GCN-IR-NEXT:    s_endpgm
+;
+; GFX942-LABEL: s_test_urem31_i64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dword s0, s[4:5], 0x38
+; GFX942-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_lshr_b32 s6, s0, 1
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v0, s6
+; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_lshr_b32 s2, s3, 1
+; GFX942-NEXT:    s_sub_i32 s3, 0, s6
+; GFX942-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX942-NEXT:    s_mul_i32 s3, s3, s4
+; GFX942-NEXT:    s_mul_hi_u32 s3, s4, s3
+; GFX942-NEXT:    s_add_i32 s4, s4, s3
+; GFX942-NEXT:    s_mul_hi_u32 s3, s2, s4
+; GFX942-NEXT:    s_mul_i32 s3, s3, s6
+; GFX942-NEXT:    s_sub_i32 s2, s2, s3
+; GFX942-NEXT:    s_sub_i32 s3, s2, s6
+; GFX942-NEXT:    s_cmp_ge_u32 s2, s6
+; GFX942-NEXT:    s_cselect_b32 s2, s3, s2
+; GFX942-NEXT:    s_sub_i32 s3, s2, s6
+; GFX942-NEXT:    s_cmp_ge_u32 s2, s6
+; GFX942-NEXT:    s_cselect_b32 s2, s3, s2
+; GFX942-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-NEXT:    global_store_dwordx2 v1, v[0:1], s[0:1]
+; GFX942-NEXT:    s_endpgm
+;
+; GFX942-IR-LABEL: s_test_urem31_i64:
+; GFX942-IR:       ; %bb.0:
+; GFX942-IR-NEXT:    s_load_dword s0, s[4:5], 0x38
+; GFX942-IR-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-IR-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-IR-NEXT:    s_lshr_b32 s6, s0, 1
+; GFX942-IR-NEXT:    v_cvt_f32_u32_e32 v0, s6
+; GFX942-IR-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-IR-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; GFX942-IR-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-IR-NEXT:    s_lshr_b32 s2, s3, 1
+; GFX942-IR-NEXT:    s_sub_i32 s3, 0, s6
+; GFX942-IR-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX942-IR-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX942-IR-NEXT:    s_nop 0
+; GFX942-IR-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX942-IR-NEXT:    s_mul_i32 s3, s3, s4
+; GFX942-IR-NEXT:    s_mul_hi_u32 s3, s4, s3
+; GFX942-IR-NEXT:    s_add_i32 s4, s4, s3
+; GFX942-IR-NEXT:    s_mul_hi_u32 s3, s2, s4
+; GFX942-IR-NEXT:    s_mul_i32 s3, s3, s6
+; GFX942-IR-NEXT:    s_sub_i32 s2, s2, s3
+; GFX942-IR-NEXT:    s_sub_i32 s3, s2, s6
+; GFX942-IR-NEXT:    s_cmp_ge_u32 s2, s6
+; GFX942-IR-NEXT:    s_cselect_b32 s2, s3, s2
+; GFX942-IR-NEXT:    s_sub_i32 s3, s2, s6
+; GFX942-IR-NEXT:    s_cmp_ge_u32 s2, s6
+; GFX942-IR-NEXT:    s_cselect_b32 s2, s3, s2
+; GFX942-IR-NEXT:    v_mov_b32_e32 v0, s2
+; GFX942-IR-NEXT:    global_store_dwordx2 v1, v[0:1], s[0:1]
+; GFX942-IR-NEXT:    s_endpgm
   %1 = lshr i64 %x, 33
   %2 = lshr i64 %y, 33
   %result = urem i64 %1, %2
@@ -596,6 +1065,110 @@ define amdgpu_kernel void @s_test_urem31_v2i64(ptr addrspace(1) %out, <2 x i64>
 ; GCN-IR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-IR-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; GCN-IR-NEXT:    s_endpgm
+;
+; GFX942-LABEL: s_test_urem31_v2i64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x34
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_lshr_b32 s3, s13, 1
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v0, s3
+; GFX942-NEXT:    s_lshr_b32 s2, s15, 1
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v2, s2
+; GFX942-NEXT:    s_sub_i32 s6, 0, s3
+; GFX942-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; GFX942-NEXT:    s_lshr_b32 s5, s9, 1
+; GFX942-NEXT:    v_rcp_iflag_f32_e32 v2, v2
+; GFX942-NEXT:    s_lshr_b32 s4, s11, 1
+; GFX942-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX942-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v2
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; GFX942-NEXT:    v_readfirstlane_b32 s7, v0
+; GFX942-NEXT:    s_mul_i32 s6, s6, s7
+; GFX942-NEXT:    s_mul_hi_u32 s6, s7, s6
+; GFX942-NEXT:    s_add_i32 s7, s7, s6
+; GFX942-NEXT:    s_mul_hi_u32 s6, s5, s7
+; GFX942-NEXT:    s_mul_i32 s6, s6, s3
+; GFX942-NEXT:    s_sub_i32 s5, s5, s6
+; GFX942-NEXT:    s_sub_i32 s6, s5, s3
+; GFX942-NEXT:    s_cmp_ge_u32 s5, s3
+; GFX942-NEXT:    s_cselect_b32 s5, s6, s5
+; GFX942-NEXT:    s_sub_i32 s6, s5, s3
+; GFX942-NEXT:    s_cmp_ge_u32 s5, s3
+; GFX942-NEXT:    v_readfirstlane_b32 s8, v2
+; GFX942-NEXT:    s_cselect_b32 s3, s6, s5
+; GFX942-NEXT:    s_sub_i32 s5, 0, s2
+; GFX942-NEXT:    s_mul_i32 s5, s5, s8
+; GFX942-NEXT:    v_mov_b32_e32 v0, s3
+; GFX942-NEXT:    s_mul_hi_u32 s3, s8, s5
+; GFX942-NEXT:    s_add_i32 s8, s8, s3
+; GFX942-NEXT:    s_mul_hi_u32 s3, s4, s8
+; GFX942-NEXT:    s_mul_i32 s3, s3, s2
+; GFX942-NEXT:    s_sub_i32 s3, s4, s3
+; GFX942-NEXT:    s_sub_i32 s4, s3, s2
+; GFX942-NEXT:    s_cmp_ge_u32 s3, s2
+; GFX942-NEXT:    s_cselect_b32 s3, s4, s3
+; GFX942-NEXT:    s_sub_i32 s4, s3, s2
+; GFX942-NEXT:    s_cmp_ge_u32 s3, s2
+; GFX942-NEXT:    s_cselect_b32 s2, s4, s3
+; GFX942-NEXT:    v_mov_b32_e32 v2, s2
+; GFX942-NEXT:    global_store_dwordx4 v1, v[0:3], s[0:1]
+; GFX942-NEXT:    s_endpgm
+;
+; GFX942-IR-LABEL: s_test_urem31_v2i64:
+; GFX942-IR:       ; %bb.0:
+; GFX942-IR-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x34
+; GFX942-IR-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-IR-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-IR-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-IR-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-IR-NEXT:    s_lshr_b32 s3, s13, 1
+; GFX942-IR-NEXT:    v_cvt_f32_u32_e32 v0, s3
+; GFX942-IR-NEXT:    s_lshr_b32 s2, s15, 1
+; GFX942-IR-NEXT:    v_cvt_f32_u32_e32 v2, s2
+; GFX942-IR-NEXT:    s_sub_i32 s6, 0, s3
+; GFX942-IR-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; GFX942-IR-NEXT:    s_lshr_b32 s5, s9, 1
+; GFX942-IR-NEXT:    v_rcp_iflag_f32_e32 v2, v2
+; GFX942-IR-NEXT:    s_lshr_b32 s4, s11, 1
+; GFX942-IR-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX942-IR-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX942-IR-NEXT:    v_mul_f32_e32 v2, 0x4f7ffffe, v2
+; GFX942-IR-NEXT:    v_cvt_u32_f32_e32 v2, v2
+; GFX942-IR-NEXT:    v_readfirstlane_b32 s7, v0
+; GFX942-IR-NEXT:    s_mul_i32 s6, s6, s7
+; GFX942-IR-NEXT:    s_mul_hi_u32 s6, s7, s6
+; GFX942-IR-NEXT:    s_add_i32 s7, s7, s6
+; GFX942-IR-NEXT:    s_mul_hi_u32 s6, s5, s7
+; GFX942-IR-NEXT:    s_mul_i32 s6, s6, s3
+; GFX942-IR-NEXT:    s_sub_i32 s5, s5, s6
+; GFX942-IR-NEXT:    s_sub_i32 s6, s5, s3
+; GFX942-IR-NEXT:    s_cmp_ge_u32 s5, s3
+; GFX942-IR-NEXT:    s_cselect_b32 s5, s6, s5
+; GFX942-IR-NEXT:    s_sub_i32 s6, s5, s3
+; GFX942-IR-NEXT:    s_cmp_ge_u32 s5, s3
+; GFX942-IR-NEXT:    v_readfirstlane_b32 s8, v2
+; GFX942-IR-NEXT:    s_cselect_b32 s3, s6, s5
+; GFX942-IR-NEXT:    s_sub_i32 s5, 0, s2
+; GFX942-IR-NEXT:    s_mul_i32 s5, s5, s8
+; GFX942-IR-NEXT:    v_mov_b32_e32 v0, s3
+; GFX942-IR-NEXT:    s_mul_hi_u32 s3, s8, s5
+; GFX942-IR-NEXT:    s_add_i32 s8, s8, s3
+; GFX942-IR-NEXT:    s_mul_hi_u32 s3, s4, s8
+; GFX942-IR-NEXT:    s_mul_i32 s3, s3, s2
+; GFX942-IR-NEXT:    s_sub_i32 s3, s4, s3
+; GFX942-IR-NEXT:    s_sub_i32 s4, s3, s2
+; GFX942-IR-NEXT:    s_cmp_ge_u32 s3, s2
+; GFX942-IR-NEXT:    s_cselect_b32 s3, s4, s3
+; GFX942-IR-NEXT:    s_sub_i32 s4, s3, s2
+; GFX942-IR-NEXT:    s_cmp_ge_u32 s3, s2
+; GFX942-IR-NEXT:    s_cselect_b32 s2, s4, s3
+; GFX942-IR-NEXT:    v_mov_b32_e32 v2, s2
+; GFX942-IR-NEXT:    global_store_dwordx4 v1, v[0:3], s[0:1]
+; GFX942-IR-NEXT:    s_endpgm
   %1 = lshr <2 x i64> %x, <i64 33, i64 33>
   %2 = lshr <2 x i64> %y, <i64 33, i64 33>
   %result = urem <2 x i64> %1, %2
@@ -653,6 +1226,56 @@ define amdgpu_kernel void @s_test_urem24_i64(ptr addrspace(1) %out, i64 %x, i64
 ; GCN-IR-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
 ; GCN-IR-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GCN-IR-NEXT:    s_endpgm
+;
+; GFX942-LABEL: s_test_urem24_i64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dword s6, s[4:5], 0x38
+; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_lshr_b32 s2, s6, 8
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v0, s2
+; GFX942-NEXT:    s_lshr_b32 s3, s3, 8
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v2, s3
+; GFX942-NEXT:    v_rcp_iflag_f32_e32 v3, v0
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_mul_f32_e32 v3, v2, v3
+; GFX942-NEXT:    v_trunc_f32_e32 v3, v3
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v4, v3
+; GFX942-NEXT:    v_fma_f32 v2, -v3, v0, v2
+; GFX942-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v0
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v4, vcc
+; GFX942-NEXT:    v_mul_lo_u32 v0, v0, s2
+; GFX942-NEXT:    v_sub_u32_e32 v0, s3, v0
+; GFX942-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
+; GFX942-NEXT:    global_store_dwordx2 v1, v[0:1], s[0:1]
+; GFX942-NEXT:    s_endpgm
+;
+; GFX942-IR-LABEL: s_test_urem24_i64:
+; GFX942-IR:       ; %bb.0:
+; GFX942-IR-NEXT:    s_load_dword s6, s[4:5], 0x38
+; GFX942-IR-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-IR-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-IR-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-IR-NEXT:    s_lshr_b32 s2, s6, 8
+; GFX942-IR-NEXT:    v_cvt_f32_u32_e32 v0, s2
+; GFX942-IR-NEXT:    s_lshr_b32 s3, s3, 8
+; GFX942-IR-NEXT:    v_cvt_f32_u32_e32 v2, s3
+; GFX942-IR-NEXT:    v_rcp_iflag_f32_e32 v3, v0
+; GFX942-IR-NEXT:    s_nop 0
+; GFX942-IR-NEXT:    v_mul_f32_e32 v3, v2, v3
+; GFX942-IR-NEXT:    v_trunc_f32_e32 v3, v3
+; GFX942-IR-NEXT:    v_cvt_u32_f32_e32 v4, v3
+; GFX942-IR-NEXT:    v_fma_f32 v2, -v3, v0, v2
+; GFX942-IR-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v0
+; GFX942-IR-NEXT:    s_nop 1
+; GFX942-IR-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v4, vcc
+; GFX942-IR-NEXT:    v_mul_lo_u32 v0, v0, s2
+; GFX942-IR-NEXT:    v_sub_u32_e32 v0, s3, v0
+; GFX942-IR-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
+; GFX942-IR-NEXT:    global_store_dwordx2 v1, v[0:1], s[0:1]
+; GFX942-IR-NEXT:    s_endpgm
   %1 = lshr i64 %x, 40
   %2 = lshr i64 %y, 40
   %result = urem i64 %1, %2
@@ -756,6 +1379,96 @@ define amdgpu_kernel void @s_test_urem23_64_v2i64(ptr addrspace(1) %out, <2 x i6
 ; GCN-IR-NEXT:    s_waitcnt lgkmcnt(0)
 ; GCN-IR-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0
 ; GCN-IR-NEXT:    s_endpgm
+;
+; GFX942-LABEL: s_test_urem23_64_v2i64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x34
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_lshr_b32 s3, s13, 1
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v0, s3
+; GFX942-NEXT:    s_lshr_b32 s2, s15, 9
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v2, s2
+; GFX942-NEXT:    s_lshr_b32 s4, s11, 9
+; GFX942-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v3, s4
+; GFX942-NEXT:    v_rcp_iflag_f32_e32 v4, v2
+; GFX942-NEXT:    s_sub_i32 s6, 0, s3
+; GFX942-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX942-NEXT:    s_lshr_b32 s5, s9, 1
+; GFX942-NEXT:    v_mul_f32_e32 v4, v3, v4
+; GFX942-NEXT:    v_trunc_f32_e32 v4, v4
+; GFX942-NEXT:    v_readfirstlane_b32 s7, v0
+; GFX942-NEXT:    s_mul_i32 s6, s6, s7
+; GFX942-NEXT:    s_mul_hi_u32 s6, s7, s6
+; GFX942-NEXT:    s_add_i32 s7, s7, s6
+; GFX942-NEXT:    s_mul_hi_u32 s6, s5, s7
+; GFX942-NEXT:    s_mul_i32 s6, s6, s3
+; GFX942-NEXT:    v_fma_f32 v3, -v4, v2, v3
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v4, v4
+; GFX942-NEXT:    s_sub_i32 s5, s5, s6
+; GFX942-NEXT:    s_sub_i32 s6, s5, s3
+; GFX942-NEXT:    s_cmp_ge_u32 s5, s3
+; GFX942-NEXT:    s_cselect_b32 s5, s6, s5
+; GFX942-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v2
+; GFX942-NEXT:    s_sub_i32 s6, s5, s3
+; GFX942-NEXT:    s_cmp_ge_u32 s5, s3
+; GFX942-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v4, vcc
+; GFX942-NEXT:    v_mul_lo_u32 v0, v0, s2
+; GFX942-NEXT:    s_cselect_b32 s3, s6, s5
+; GFX942-NEXT:    v_sub_u32_e32 v0, s4, v0
+; GFX942-NEXT:    v_and_b32_e32 v2, 0x7fffff, v0
+; GFX942-NEXT:    v_mov_b32_e32 v0, s3
+; GFX942-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-NEXT:    global_store_dwordx4 v1, v[0:3], s[0:1]
+; GFX942-NEXT:    s_endpgm
+;
+; GFX942-IR-LABEL: s_test_urem23_64_v2i64:
+; GFX942-IR:       ; %bb.0:
+; GFX942-IR-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x34
+; GFX942-IR-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-IR-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-IR-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-IR-NEXT:    s_lshr_b32 s3, s13, 1
+; GFX942-IR-NEXT:    v_cvt_f32_u32_e32 v0, s3
+; GFX942-IR-NEXT:    s_lshr_b32 s2, s15, 9
+; GFX942-IR-NEXT:    v_cvt_f32_u32_e32 v2, s2
+; GFX942-IR-NEXT:    s_lshr_b32 s4, s11, 9
+; GFX942-IR-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; GFX942-IR-NEXT:    v_cvt_f32_u32_e32 v3, s4
+; GFX942-IR-NEXT:    v_rcp_iflag_f32_e32 v4, v2
+; GFX942-IR-NEXT:    s_sub_i32 s6, 0, s3
+; GFX942-IR-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX942-IR-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX942-IR-NEXT:    s_lshr_b32 s5, s9, 1
+; GFX942-IR-NEXT:    v_mul_f32_e32 v4, v3, v4
+; GFX942-IR-NEXT:    v_trunc_f32_e32 v4, v4
+; GFX942-IR-NEXT:    v_readfirstlane_b32 s7, v0
+; GFX942-IR-NEXT:    s_mul_i32 s6, s6, s7
+; GFX942-IR-NEXT:    s_mul_hi_u32 s6, s7, s6
+; GFX942-IR-NEXT:    s_add_i32 s7, s7, s6
+; GFX942-IR-NEXT:    s_mul_hi_u32 s6, s5, s7
+; GFX942-IR-NEXT:    s_mul_i32 s6, s6, s3
+; GFX942-IR-NEXT:    v_fma_f32 v3, -v4, v2, v3
+; GFX942-IR-NEXT:    v_cvt_u32_f32_e32 v4, v4
+; GFX942-IR-NEXT:    s_sub_i32 s5, s5, s6
+; GFX942-IR-NEXT:    s_sub_i32 s6, s5, s3
+; GFX942-IR-NEXT:    s_cmp_ge_u32 s5, s3
+; GFX942-IR-NEXT:    s_cselect_b32 s5, s6, s5
+; GFX942-IR-NEXT:    v_cmp_ge_f32_e64 vcc, |v3|, v2
+; GFX942-IR-NEXT:    s_sub_i32 s6, s5, s3
+; GFX942-IR-NEXT:    s_cmp_ge_u32 s5, s3
+; GFX942-IR-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v4, vcc
+; GFX942-IR-NEXT:    v_mul_lo_u32 v0, v0, s2
+; GFX942-IR-NEXT:    s_cselect_b32 s3, s6, s5
+; GFX942-IR-NEXT:    v_sub_u32_e32 v0, s4, v0
+; GFX942-IR-NEXT:    v_and_b32_e32 v2, 0x7fffff, v0
+; GFX942-IR-NEXT:    v_mov_b32_e32 v0, s3
+; GFX942-IR-NEXT:    v_mov_b32_e32 v3, v1
+; GFX942-IR-NEXT:    global_store_dwordx4 v1, v[0:3], s[0:1]
+; GFX942-IR-NEXT:    s_endpgm
   %1 = lshr <2 x i64> %x, <i64 33, i64 41>
   %2 = lshr <2 x i64> %y, <i64 33, i64 41>
   %result = urem <2 x i64> %1, %2
@@ -942,6 +1655,201 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(ptr addrspace(1) %out, i64 %x)
 ; GCN-IR-NEXT:    v_subb_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN-IR-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
 ; GCN-IR-NEXT:    s_endpgm
+;
+; GFX942-LABEL: s_test_urem_k_num_i64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx4 s[8:11], s[4:5], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v0, s10
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v1, s11
+; GFX942-NEXT:    s_sub_u32 s0, 0, s10
+; GFX942-NEXT:    s_subb_u32 s1, 0, s11
+; GFX942-NEXT:    v_fmamk_f32 v0, v1, 0x4f800000, v0
+; GFX942-NEXT:    v_rcp_f32_e32 v0, v0
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_mul_f32_e32 v0, 0x5f7ffffc, v0
+; GFX942-NEXT:    v_mul_f32_e32 v1, 0x2f800000, v0
+; GFX942-NEXT:    v_trunc_f32_e32 v1, v1
+; GFX942-NEXT:    v_fmamk_f32 v0, v1, 0xcf800000, v0
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v1, v1
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX942-NEXT:    v_readfirstlane_b32 s2, v1
+; GFX942-NEXT:    v_readfirstlane_b32 s3, v0
+; GFX942-NEXT:    s_mul_i32 s4, s0, s2
+; GFX942-NEXT:    s_mul_hi_u32 s6, s0, s3
+; GFX942-NEXT:    s_mul_i32 s5, s1, s3
+; GFX942-NEXT:    s_add_i32 s4, s6, s4
+; GFX942-NEXT:    s_mul_i32 s7, s0, s3
+; GFX942-NEXT:    s_add_i32 s4, s4, s5
+; GFX942-NEXT:    s_mul_hi_u32 s6, s3, s7
+; GFX942-NEXT:    s_mul_hi_u32 s5, s3, s4
+; GFX942-NEXT:    s_mul_i32 s3, s3, s4
+; GFX942-NEXT:    s_add_u32 s3, s6, s3
+; GFX942-NEXT:    s_addc_u32 s5, 0, s5
+; GFX942-NEXT:    s_mul_i32 s6, s2, s7
+; GFX942-NEXT:    s_mul_hi_u32 s12, s2, s7
+; GFX942-NEXT:    s_add_u32 s3, s3, s6
+; GFX942-NEXT:    s_mul_hi_u32 s13, s2, s4
+; GFX942-NEXT:    s_addc_u32 s3, s5, s12
+; GFX942-NEXT:    s_addc_u32 s5, s13, 0
+; GFX942-NEXT:    s_mul_i32 s4, s2, s4
+; GFX942-NEXT:    s_add_u32 s3, s3, s4
+; GFX942-NEXT:    s_addc_u32 s4, 0, s5
+; GFX942-NEXT:    v_add_co_u32_e32 v0, vcc, s3, v0
+; GFX942-NEXT:    s_cmp_lg_u64 vcc, 0
+; GFX942-NEXT:    s_addc_u32 s2, s2, s4
+; GFX942-NEXT:    v_readfirstlane_b32 s4, v0
+; GFX942-NEXT:    s_mul_i32 s3, s0, s2
+; GFX942-NEXT:    s_mul_hi_u32 s5, s0, s4
+; GFX942-NEXT:    s_add_i32 s3, s5, s3
+; GFX942-NEXT:    s_mul_i32 s1, s1, s4
+; GFX942-NEXT:    s_add_i32 s3, s3, s1
+; GFX942-NEXT:    s_mul_i32 s0, s0, s4
+; GFX942-NEXT:    s_mul_hi_u32 s5, s2, s0
+; GFX942-NEXT:    s_mul_i32 s6, s2, s0
+; GFX942-NEXT:    s_mul_i32 s12, s4, s3
+; GFX942-NEXT:    s_mul_hi_u32 s0, s4, s0
+; GFX942-NEXT:    s_mul_hi_u32 s7, s4, s3
+; GFX942-NEXT:    s_add_u32 s0, s0, s12
+; GFX942-NEXT:    s_addc_u32 s4, 0, s7
+; GFX942-NEXT:    s_add_u32 s0, s0, s6
+; GFX942-NEXT:    s_mul_hi_u32 s1, s2, s3
+; GFX942-NEXT:    s_addc_u32 s0, s4, s5
+; GFX942-NEXT:    s_addc_u32 s1, s1, 0
+; GFX942-NEXT:    s_mul_i32 s3, s2, s3
+; GFX942-NEXT:    s_add_u32 s0, s0, s3
+; GFX942-NEXT:    s_addc_u32 s1, 0, s1
+; GFX942-NEXT:    v_add_co_u32_e32 v0, vcc, s0, v0
+; GFX942-NEXT:    s_cmp_lg_u64 vcc, 0
+; GFX942-NEXT:    s_addc_u32 s0, s2, s1
+; GFX942-NEXT:    v_readfirstlane_b32 s2, v0
+; GFX942-NEXT:    s_mul_hi_u32 s1, s0, 24
+; GFX942-NEXT:    s_mul_i32 s0, s0, 24
+; GFX942-NEXT:    s_mul_hi_u32 s2, s2, 24
+; GFX942-NEXT:    s_add_u32 s0, s2, s0
+; GFX942-NEXT:    s_addc_u32 s0, 0, s1
+; GFX942-NEXT:    s_mul_i32 s1, s11, s0
+; GFX942-NEXT:    s_mul_hi_u32 s2, s10, s0
+; GFX942-NEXT:    s_add_i32 s4, s2, s1
+; GFX942-NEXT:    s_mul_i32 s0, s10, s0
+; GFX942-NEXT:    s_sub_i32 s1, 0, s4
+; GFX942-NEXT:    v_sub_co_u32_e64 v0, s[2:3], 24, s0
+; GFX942-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX942-NEXT:    s_subb_u32 s5, s1, s11
+; GFX942-NEXT:    v_subrev_co_u32_e32 v1, vcc, s10, v0
+; GFX942-NEXT:    s_cmp_lg_u64 vcc, 0
+; GFX942-NEXT:    s_subb_u32 s6, s5, 0
+; GFX942-NEXT:    s_cmp_ge_u32 s6, s11
+; GFX942-NEXT:    s_cselect_b32 s7, -1, 0
+; GFX942-NEXT:    v_cmp_le_u32_e64 s[0:1], s10, v1
+; GFX942-NEXT:    s_cmp_eq_u32 s6, s11
+; GFX942-NEXT:    v_mov_b32_e32 v4, s7
+; GFX942-NEXT:    v_cndmask_b32_e64 v3, 0, -1, s[0:1]
+; GFX942-NEXT:    s_cselect_b64 s[0:1], -1, 0
+; GFX942-NEXT:    s_cmp_lg_u64 vcc, 0
+; GFX942-NEXT:    v_cndmask_b32_e64 v3, v4, v3, s[0:1]
+; GFX942-NEXT:    s_subb_u32 s0, s5, s11
+; GFX942-NEXT:    v_subrev_co_u32_e32 v4, vcc, s10, v1
+; GFX942-NEXT:    s_cmp_lg_u64 vcc, 0
+; GFX942-NEXT:    s_subb_u32 s0, s0, 0
+; GFX942-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v3
+; GFX942-NEXT:    s_cmp_lg_u64 s[2:3], 0
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_cndmask_b32_e32 v3, v1, v4, vcc
+; GFX942-NEXT:    v_mov_b32_e32 v4, s0
+; GFX942-NEXT:    s_subb_u32 s0, 0, s4
+; GFX942-NEXT:    v_mov_b32_e32 v1, s6
+; GFX942-NEXT:    s_cmp_ge_u32 s0, s11
+; GFX942-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
+; GFX942-NEXT:    s_cselect_b32 s1, -1, 0
+; GFX942-NEXT:    v_cmp_le_u32_e32 vcc, s10, v0
+; GFX942-NEXT:    s_cmp_eq_u32 s0, s11
+; GFX942-NEXT:    v_mov_b32_e32 v5, s1
+; GFX942-NEXT:    v_cndmask_b32_e64 v4, 0, -1, vcc
+; GFX942-NEXT:    s_cselect_b64 vcc, -1, 0
+; GFX942-NEXT:    v_cndmask_b32_e32 v4, v5, v4, vcc
+; GFX942-NEXT:    v_mov_b32_e32 v5, s0
+; GFX942-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v4
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v1, v5, v1, vcc
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc
+; GFX942-NEXT:    global_store_dwordx2 v2, v[0:1], s[8:9]
+; GFX942-NEXT:    s_endpgm
+;
+; GFX942-IR-LABEL: s_test_urem_k_num_i64:
+; GFX942-IR:       ; %bb.0: ; %_udiv-special-cases
+; GFX942-IR-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-IR-NEXT:    s_mov_b64 s[4:5], 0
+; GFX942-IR-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-IR-NEXT:    s_cmp_eq_u64 s[2:3], 0
+; GFX942-IR-NEXT:    s_flbit_i32_b64 s12, s[2:3]
+; GFX942-IR-NEXT:    s_cselect_b64 s[6:7], -1, 0
+; GFX942-IR-NEXT:    s_add_u32 s8, s12, 0xffffffc5
+; GFX942-IR-NEXT:    s_addc_u32 s9, 0, -1
+; GFX942-IR-NEXT:    v_cmp_gt_u64_e64 s[10:11], s[8:9], 63
+; GFX942-IR-NEXT:    s_or_b64 s[10:11], s[6:7], s[10:11]
+; GFX942-IR-NEXT:    s_cmp_eq_u64 s[8:9], 63
+; GFX942-IR-NEXT:    s_cselect_b64 s[14:15], -1, 0
+; GFX942-IR-NEXT:    s_and_b64 s[6:7], s[10:11], exec
+; GFX942-IR-NEXT:    s_cselect_b32 s6, 0, 24
+; GFX942-IR-NEXT:    s_or_b64 s[10:11], s[10:11], s[14:15]
+; GFX942-IR-NEXT:    s_andn2_b64 vcc, exec, s[10:11]
+; GFX942-IR-NEXT:    s_mov_b32 s7, 0
+; GFX942-IR-NEXT:    s_cbranch_vccz .LBB6_5
+; GFX942-IR-NEXT:  ; %bb.1: ; %udiv-bb1
+; GFX942-IR-NEXT:    s_add_u32 s10, s8, 1
+; GFX942-IR-NEXT:    s_addc_u32 s11, s9, 0
+; GFX942-IR-NEXT:    s_cmp_eq_u64 s[10:11], 0
+; GFX942-IR-NEXT:    s_cselect_b64 s[6:7], -1, 0
+; GFX942-IR-NEXT:    s_sub_i32 s8, 63, s8
+; GFX942-IR-NEXT:    s_andn2_b64 vcc, exec, s[6:7]
+; GFX942-IR-NEXT:    s_lshl_b64 s[6:7], 24, s8
+; GFX942-IR-NEXT:    s_cbranch_vccz .LBB6_4
+; GFX942-IR-NEXT:  ; %bb.2: ; %udiv-preheader
+; GFX942-IR-NEXT:    s_lshr_b64 s[10:11], 24, s10
+; GFX942-IR-NEXT:    s_add_u32 s14, s2, -1
+; GFX942-IR-NEXT:    s_addc_u32 s15, s3, -1
+; GFX942-IR-NEXT:    s_sub_u32 s8, 58, s12
+; GFX942-IR-NEXT:    s_subb_u32 s9, 0, 0
+; GFX942-IR-NEXT:    s_mov_b64 s[12:13], 0
+; GFX942-IR-NEXT:    s_mov_b32 s5, 0
+; GFX942-IR-NEXT:  .LBB6_3: ; %udiv-do-while
+; GFX942-IR-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-IR-NEXT:    s_lshl_b64 s[10:11], s[10:11], 1
+; GFX942-IR-NEXT:    s_lshr_b32 s4, s7, 31
+; GFX942-IR-NEXT:    s_lshl_b64 s[6:7], s[6:7], 1
+; GFX942-IR-NEXT:    s_or_b64 s[10:11], s[10:11], s[4:5]
+; GFX942-IR-NEXT:    s_or_b64 s[6:7], s[12:13], s[6:7]
+; GFX942-IR-NEXT:    s_sub_u32 s4, s14, s10
+; GFX942-IR-NEXT:    s_subb_u32 s4, s15, s11
+; GFX942-IR-NEXT:    s_ashr_i32 s12, s4, 31
+; GFX942-IR-NEXT:    s_mov_b32 s13, s12
+; GFX942-IR-NEXT:    s_and_b32 s4, s12, 1
+; GFX942-IR-NEXT:    s_and_b64 s[12:13], s[12:13], s[2:3]
+; GFX942-IR-NEXT:    s_sub_u32 s10, s10, s12
+; GFX942-IR-NEXT:    s_subb_u32 s11, s11, s13
+; GFX942-IR-NEXT:    s_add_u32 s8, s8, 1
+; GFX942-IR-NEXT:    s_addc_u32 s9, s9, 0
+; GFX942-IR-NEXT:    s_cmp_eq_u64 s[8:9], 0
+; GFX942-IR-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; GFX942-IR-NEXT:    s_cbranch_scc0 .LBB6_3
+; GFX942-IR-NEXT:  .LBB6_4: ; %Flow6
+; GFX942-IR-NEXT:    s_lshl_b64 s[6:7], s[6:7], 1
+; GFX942-IR-NEXT:    s_or_b64 s[6:7], s[4:5], s[6:7]
+; GFX942-IR-NEXT:  .LBB6_5: ; %udiv-end
+; GFX942-IR-NEXT:    s_mul_i32 s4, s2, s7
+; GFX942-IR-NEXT:    s_mul_hi_u32 s5, s2, s6
+; GFX942-IR-NEXT:    s_add_i32 s4, s5, s4
+; GFX942-IR-NEXT:    s_mul_i32 s3, s3, s6
+; GFX942-IR-NEXT:    s_add_i32 s4, s4, s3
+; GFX942-IR-NEXT:    s_mul_i32 s2, s2, s6
+; GFX942-IR-NEXT:    s_sub_u32 s2, 24, s2
+; GFX942-IR-NEXT:    s_subb_u32 s3, 0, s4
+; GFX942-IR-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-IR-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX942-IR-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX942-IR-NEXT:    s_endpgm
   %result = urem i64 24, %x
   store i64 %result, ptr addrspace(1) %out
   ret void
@@ -1055,6 +1963,109 @@ define amdgpu_kernel void @s_test_urem_k_den_i64(ptr addrspace(1) %out, i64 %x)
 ; GCN-IR-NEXT:    v_subb_u32_e32 v1, vcc, v2, v1, vcc
 ; GCN-IR-NEXT:    buffer_store_dwordx2 v[0:1], off, s[8:11], 0
 ; GCN-IR-NEXT:    s_endpgm
+;
+; GFX942-LABEL: s_test_urem_k_den_i64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_mul_i32 s7, s3, 0xaaaaaaab
+; GFX942-NEXT:    s_mul_hi_u32 s8, s2, 0xaaaaaaab
+; GFX942-NEXT:    s_mul_hi_u32 s6, s3, 0xaaaaaaab
+; GFX942-NEXT:    s_add_u32 s7, s7, s8
+; GFX942-NEXT:    s_mul_i32 s5, s2, 0xaaaaaaaa
+; GFX942-NEXT:    s_addc_u32 s6, s6, 0
+; GFX942-NEXT:    s_mul_hi_u32 s4, s2, 0xaaaaaaaa
+; GFX942-NEXT:    s_add_u32 s5, s5, s7
+; GFX942-NEXT:    s_addc_u32 s4, s4, 0
+; GFX942-NEXT:    s_add_u32 s4, s6, s4
+; GFX942-NEXT:    s_addc_u32 s5, 0, 0
+; GFX942-NEXT:    s_mul_i32 s7, s3, 0xaaaaaaaa
+; GFX942-NEXT:    s_mul_hi_u32 s6, s3, 0xaaaaaaaa
+; GFX942-NEXT:    s_add_u32 s4, s7, s4
+; GFX942-NEXT:    s_addc_u32 s5, s6, s5
+; GFX942-NEXT:    v_mov_b32_e32 v0, s4
+; GFX942-NEXT:    v_alignbit_b32 v0, s5, v0, 4
+; GFX942-NEXT:    s_lshr_b32 s4, s5, 4
+; GFX942-NEXT:    v_mul_lo_u32 v1, v0, 24
+; GFX942-NEXT:    v_mul_hi_u32 v0, v0, 24
+; GFX942-NEXT:    s_mul_i32 s4, s4, 24
+; GFX942-NEXT:    v_add_u32_e32 v3, s4, v0
+; GFX942-NEXT:    v_mov_b32_e32 v4, s3
+; GFX942-NEXT:    v_sub_co_u32_e32 v0, vcc, s2, v1
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_subb_co_u32_e32 v1, vcc, v4, v3, vcc
+; GFX942-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX942-NEXT:    s_endpgm
+;
+; GFX942-IR-LABEL: s_test_urem_k_den_i64:
+; GFX942-IR:       ; %bb.0: ; %_udiv-special-cases
+; GFX942-IR-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-IR-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-IR-NEXT:    s_cmp_eq_u64 s[2:3], 0
+; GFX942-IR-NEXT:    s_flbit_i32_b64 s12, s[2:3]
+; GFX942-IR-NEXT:    s_cselect_b64 s[4:5], -1, 0
+; GFX942-IR-NEXT:    s_sub_u32 s8, 59, s12
+; GFX942-IR-NEXT:    s_subb_u32 s9, 0, 0
+; GFX942-IR-NEXT:    v_cmp_gt_u64_e64 s[6:7], s[8:9], 63
+; GFX942-IR-NEXT:    s_or_b64 s[4:5], s[4:5], s[6:7]
+; GFX942-IR-NEXT:    s_cmp_eq_u64 s[8:9], 63
+; GFX942-IR-NEXT:    s_cselect_b64 s[10:11], -1, 0
+; GFX942-IR-NEXT:    s_and_b64 s[6:7], s[4:5], exec
+; GFX942-IR-NEXT:    s_cselect_b32 s7, 0, s3
+; GFX942-IR-NEXT:    s_cselect_b32 s6, 0, s2
+; GFX942-IR-NEXT:    s_or_b64 s[4:5], s[4:5], s[10:11]
+; GFX942-IR-NEXT:    s_andn2_b64 vcc, exec, s[4:5]
+; GFX942-IR-NEXT:    s_mov_b64 s[4:5], 0
+; GFX942-IR-NEXT:    s_cbranch_vccz .LBB7_5
+; GFX942-IR-NEXT:  ; %bb.1: ; %udiv-bb1
+; GFX942-IR-NEXT:    s_add_u32 s10, s8, 1
+; GFX942-IR-NEXT:    s_addc_u32 s11, s9, 0
+; GFX942-IR-NEXT:    s_cmp_eq_u64 s[10:11], 0
+; GFX942-IR-NEXT:    s_cselect_b64 s[6:7], -1, 0
+; GFX942-IR-NEXT:    s_sub_i32 s8, 63, s8
+; GFX942-IR-NEXT:    s_andn2_b64 vcc, exec, s[6:7]
+; GFX942-IR-NEXT:    s_lshl_b64 s[6:7], s[2:3], s8
+; GFX942-IR-NEXT:    s_cbranch_vccz .LBB7_4
+; GFX942-IR-NEXT:  ; %bb.2: ; %udiv-preheader
+; GFX942-IR-NEXT:    s_lshr_b64 s[10:11], s[2:3], s10
+; GFX942-IR-NEXT:    s_add_u32 s8, s12, 0xffffffc4
+; GFX942-IR-NEXT:    s_addc_u32 s9, 0, -1
+; GFX942-IR-NEXT:    s_mov_b64 s[12:13], 0
+; GFX942-IR-NEXT:    s_mov_b32 s5, 0
+; GFX942-IR-NEXT:  .LBB7_3: ; %udiv-do-while
+; GFX942-IR-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-IR-NEXT:    s_lshl_b64 s[10:11], s[10:11], 1
+; GFX942-IR-NEXT:    s_lshr_b32 s4, s7, 31
+; GFX942-IR-NEXT:    s_lshl_b64 s[6:7], s[6:7], 1
+; GFX942-IR-NEXT:    s_or_b64 s[10:11], s[10:11], s[4:5]
+; GFX942-IR-NEXT:    s_or_b64 s[6:7], s[12:13], s[6:7]
+; GFX942-IR-NEXT:    s_sub_u32 s4, 23, s10
+; GFX942-IR-NEXT:    s_subb_u32 s4, 0, s11
+; GFX942-IR-NEXT:    s_ashr_i32 s12, s4, 31
+; GFX942-IR-NEXT:    s_and_b32 s4, s12, 1
+; GFX942-IR-NEXT:    s_and_b32 s12, s12, 24
+; GFX942-IR-NEXT:    s_sub_u32 s10, s10, s12
+; GFX942-IR-NEXT:    s_subb_u32 s11, s11, 0
+; GFX942-IR-NEXT:    s_add_u32 s8, s8, 1
+; GFX942-IR-NEXT:    s_addc_u32 s9, s9, 0
+; GFX942-IR-NEXT:    s_cmp_eq_u64 s[8:9], 0
+; GFX942-IR-NEXT:    s_mov_b64 s[12:13], s[4:5]
+; GFX942-IR-NEXT:    s_cbranch_scc0 .LBB7_3
+; GFX942-IR-NEXT:  .LBB7_4: ; %Flow6
+; GFX942-IR-NEXT:    s_lshl_b64 s[6:7], s[6:7], 1
+; GFX942-IR-NEXT:    s_or_b64 s[6:7], s[4:5], s[6:7]
+; GFX942-IR-NEXT:  .LBB7_5: ; %udiv-end
+; GFX942-IR-NEXT:    s_mul_i32 s4, s7, 24
+; GFX942-IR-NEXT:    s_mul_hi_u32 s5, s6, 24
+; GFX942-IR-NEXT:    s_add_i32 s5, s5, s4
+; GFX942-IR-NEXT:    s_mul_i32 s4, s6, 24
+; GFX942-IR-NEXT:    s_sub_u32 s2, s2, s4
+; GFX942-IR-NEXT:    s_subb_u32 s3, s3, s5
+; GFX942-IR-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-IR-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
+; GFX942-IR-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX942-IR-NEXT:    s_endpgm
   %result = urem i64 %x, 24
   store i64 %result, ptr addrspace(1) %out
   ret void
@@ -1244,6 +2255,182 @@ define i64 @v_test_urem_pow2_k_num_i64(i64 %x) {
 ; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, 0x8000, v0
 ; GCN-IR-NEXT:    v_subb_u32_e32 v1, vcc, 0, v1, vcc
 ; GCN-IR-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: v_test_urem_pow2_k_num_i64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v2, v0
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v3, v1
+; GFX942-NEXT:    v_sub_co_u32_e32 v11, vcc, 0, v0
+; GFX942-NEXT:    v_mov_b32_e32 v7, 0
+; GFX942-NEXT:    v_fmamk_f32 v2, v3, 0x4f800000, v2
+; GFX942-NEXT:    v_rcp_f32_e32 v2, v2
+; GFX942-NEXT:    v_subb_co_u32_e32 v12, vcc, 0, v1, vcc
+; GFX942-NEXT:    v_mul_f32_e32 v2, 0x5f7ffffc, v2
+; GFX942-NEXT:    v_mul_f32_e32 v3, 0x2f800000, v2
+; GFX942-NEXT:    v_trunc_f32_e32 v3, v3
+; GFX942-NEXT:    v_fmamk_f32 v2, v3, 0xcf800000, v2
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v10, v2
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v13, v3
+; GFX942-NEXT:    v_mul_lo_u32 v4, v12, v10
+; GFX942-NEXT:    v_mad_u64_u32 v[2:3], s[0:1], v11, v10, 0
+; GFX942-NEXT:    v_mul_lo_u32 v5, v11, v13
+; GFX942-NEXT:    v_add3_u32 v3, v3, v5, v4
+; GFX942-NEXT:    v_mad_u64_u32 v[4:5], s[0:1], v10, v3, 0
+; GFX942-NEXT:    v_mul_hi_u32 v6, v10, v2
+; GFX942-NEXT:    v_lshl_add_u64 v[4:5], v[6:7], 0, v[4:5]
+; GFX942-NEXT:    v_mad_u64_u32 v[8:9], s[0:1], v13, v3, 0
+; GFX942-NEXT:    v_mad_u64_u32 v[2:3], s[0:1], v13, v2, 0
+; GFX942-NEXT:    v_add_co_u32_e32 v2, vcc, v4, v2
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_addc_co_u32_e32 v6, vcc, v5, v3, vcc
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_addc_co_u32_e32 v9, vcc, 0, v9, vcc
+; GFX942-NEXT:    v_lshl_add_u64 v[2:3], v[6:7], 0, v[8:9]
+; GFX942-NEXT:    v_add_co_u32_e32 v14, vcc, v10, v2
+; GFX942-NEXT:    v_mul_lo_u32 v5, v12, v14
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_addc_co_u32_e32 v13, vcc, v13, v3, vcc
+; GFX942-NEXT:    v_mul_lo_u32 v4, v11, v13
+; GFX942-NEXT:    v_mad_u64_u32 v[2:3], s[0:1], v11, v14, 0
+; GFX942-NEXT:    v_add3_u32 v3, v3, v4, v5
+; GFX942-NEXT:    v_mad_u64_u32 v[10:11], s[0:1], v14, v3, 0
+; GFX942-NEXT:    v_mul_hi_u32 v6, v14, v2
+; GFX942-NEXT:    v_mad_u64_u32 v[4:5], s[0:1], v13, v3, 0
+; GFX942-NEXT:    v_mad_u64_u32 v[8:9], s[0:1], v13, v2, 0
+; GFX942-NEXT:    v_lshl_add_u64 v[2:3], v[6:7], 0, v[10:11]
+; GFX942-NEXT:    v_add_co_u32_e32 v2, vcc, v2, v8
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_addc_co_u32_e32 v6, vcc, v3, v9, vcc
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
+; GFX942-NEXT:    v_lshl_add_u64 v[2:3], v[6:7], 0, v[4:5]
+; GFX942-NEXT:    v_add_co_u32_e32 v2, vcc, v14, v2
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_addc_co_u32_e32 v2, vcc, v13, v3, vcc
+; GFX942-NEXT:    v_lshrrev_b32_e32 v5, 17, v2
+; GFX942-NEXT:    v_mad_u64_u32 v[2:3], s[0:1], v0, v5, 0
+; GFX942-NEXT:    v_mov_b32_e32 v4, v3
+; GFX942-NEXT:    v_mad_u64_u32 v[4:5], s[0:1], v1, v5, v[4:5]
+; GFX942-NEXT:    v_sub_u32_e32 v3, 0, v4
+; GFX942-NEXT:    v_sub_co_u32_e32 v2, vcc, 0x8000, v2
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_subb_co_u32_e64 v3, s[0:1], v3, v1, vcc
+; GFX942-NEXT:    v_sub_co_u32_e64 v5, s[0:1], v2, v0
+; GFX942-NEXT:    v_subb_co_u32_e32 v4, vcc, 0, v4, vcc
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_subbrev_co_u32_e64 v6, s[2:3], 0, v3, s[0:1]
+; GFX942-NEXT:    v_cmp_ge_u32_e64 s[2:3], v6, v1
+; GFX942-NEXT:    v_subb_co_u32_e64 v3, s[0:1], v3, v1, s[0:1]
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_cndmask_b32_e64 v7, 0, -1, s[2:3]
+; GFX942-NEXT:    v_cmp_ge_u32_e64 s[2:3], v5, v0
+; GFX942-NEXT:    v_cmp_ge_u32_e32 vcc, v4, v1
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_cndmask_b32_e64 v8, 0, -1, s[2:3]
+; GFX942-NEXT:    v_cmp_eq_u32_e64 s[2:3], v6, v1
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e64 v7, v7, v8, s[2:3]
+; GFX942-NEXT:    v_sub_co_u32_e64 v8, s[0:1], v5, v0
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_subbrev_co_u32_e64 v3, s[0:1], 0, v3, s[0:1]
+; GFX942-NEXT:    v_cmp_ne_u32_e64 s[0:1], 0, v7
+; GFX942-NEXT:    v_cndmask_b32_e64 v7, 0, -1, vcc
+; GFX942-NEXT:    v_cmp_ge_u32_e32 vcc, v2, v0
+; GFX942-NEXT:    v_cndmask_b32_e64 v5, v5, v8, s[0:1]
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_cndmask_b32_e64 v0, 0, -1, vcc
+; GFX942-NEXT:    v_cmp_eq_u32_e32 vcc, v4, v1
+; GFX942-NEXT:    v_cndmask_b32_e64 v1, v6, v3, s[0:1]
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, v7, v0, vcc
+; GFX942-NEXT:    v_cmp_ne_u32_e32 vcc, 0, v0
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e32 v0, v2, v5, vcc
+; GFX942-NEXT:    v_cndmask_b32_e32 v1, v4, v1, vcc
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-IR-LABEL: v_test_urem_pow2_k_num_i64:
+; GFX942-IR:       ; %bb.0: ; %_udiv-special-cases
+; GFX942-IR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-IR-NEXT:    v_ffbh_u32_e32 v2, v0
+; GFX942-IR-NEXT:    v_add_u32_e32 v2, 32, v2
+; GFX942-IR-NEXT:    v_ffbh_u32_e32 v3, v1
+; GFX942-IR-NEXT:    s_movk_i32 s0, 0xffd0
+; GFX942-IR-NEXT:    v_min_u32_e32 v8, v2, v3
+; GFX942-IR-NEXT:    v_mov_b32_e32 v9, 0
+; GFX942-IR-NEXT:    s_mov_b32 s1, -1
+; GFX942-IR-NEXT:    v_lshl_add_u64 v[6:7], v[8:9], 0, s[0:1]
+; GFX942-IR-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[0:1]
+; GFX942-IR-NEXT:    v_cmp_lt_u64_e64 s[0:1], 63, v[6:7]
+; GFX942-IR-NEXT:    v_mov_b32_e32 v2, 0x8000
+; GFX942-IR-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-IR-NEXT:    v_cndmask_b32_e64 v2, v2, 0, s[0:1]
+; GFX942-IR-NEXT:    s_xor_b64 s[0:1], s[0:1], -1
+; GFX942-IR-NEXT:    v_cmp_ne_u64_e32 vcc, 63, v[6:7]
+; GFX942-IR-NEXT:    s_and_b64 s[2:3], s[0:1], vcc
+; GFX942-IR-NEXT:    s_and_saveexec_b64 s[0:1], s[2:3]
+; GFX942-IR-NEXT:    s_cbranch_execz .LBB8_6
+; GFX942-IR-NEXT:  ; %bb.1: ; %udiv-bb1
+; GFX942-IR-NEXT:    v_lshl_add_u64 v[10:11], v[6:7], 0, 1
+; GFX942-IR-NEXT:    v_sub_u32_e32 v2, 63, v6
+; GFX942-IR-NEXT:    s_mov_b64 s[4:5], 0x8000
+; GFX942-IR-NEXT:    v_mov_b64_e32 v[4:5], 0
+; GFX942-IR-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[10:11]
+; GFX942-IR-NEXT:    v_lshlrev_b64 v[2:3], v2, s[4:5]
+; GFX942-IR-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX942-IR-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
+; GFX942-IR-NEXT:    s_cbranch_execz .LBB8_5
+; GFX942-IR-NEXT:  ; %bb.2: ; %udiv-preheader
+; GFX942-IR-NEXT:    v_sub_co_u32_e32 v8, vcc, 47, v8
+; GFX942-IR-NEXT:    v_lshrrev_b64 v[10:11], v10, s[4:5]
+; GFX942-IR-NEXT:    s_nop 0
+; GFX942-IR-NEXT:    v_subb_co_u32_e64 v9, s[4:5], 0, 0, vcc
+; GFX942-IR-NEXT:    v_lshl_add_u64 v[6:7], v[0:1], 0, -1
+; GFX942-IR-NEXT:    s_mov_b64 s[4:5], 0
+; GFX942-IR-NEXT:    v_mov_b64_e32 v[12:13], 0
+; GFX942-IR-NEXT:    v_mov_b32_e32 v5, 0
+; GFX942-IR-NEXT:  .LBB8_3: ; %udiv-do-while
+; GFX942-IR-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-IR-NEXT:    v_lshlrev_b64 v[10:11], 1, v[10:11]
+; GFX942-IR-NEXT:    v_lshrrev_b32_e32 v4, 31, v3
+; GFX942-IR-NEXT:    v_or_b32_e32 v10, v10, v4
+; GFX942-IR-NEXT:    v_sub_co_u32_e32 v4, vcc, v6, v10
+; GFX942-IR-NEXT:    v_lshlrev_b64 v[2:3], 1, v[2:3]
+; GFX942-IR-NEXT:    s_nop 0
+; GFX942-IR-NEXT:    v_subb_co_u32_e32 v4, vcc, v7, v11, vcc
+; GFX942-IR-NEXT:    v_or_b32_e32 v2, v12, v2
+; GFX942-IR-NEXT:    v_ashrrev_i32_e32 v12, 31, v4
+; GFX942-IR-NEXT:    v_or_b32_e32 v3, v13, v3
+; GFX942-IR-NEXT:    v_and_b32_e32 v4, 1, v12
+; GFX942-IR-NEXT:    v_and_b32_e32 v13, v12, v1
+; GFX942-IR-NEXT:    v_and_b32_e32 v12, v12, v0
+; GFX942-IR-NEXT:    v_sub_co_u32_e32 v10, vcc, v10, v12
+; GFX942-IR-NEXT:    v_lshl_add_u64 v[8:9], v[8:9], 0, 1
+; GFX942-IR-NEXT:    s_nop 0
+; GFX942-IR-NEXT:    v_subb_co_u32_e32 v11, vcc, v11, v13, vcc
+; GFX942-IR-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[8:9]
+; GFX942-IR-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX942-IR-NEXT:    v_mov_b64_e32 v[12:13], v[4:5]
+; GFX942-IR-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX942-IR-NEXT:    s_cbranch_execnz .LBB8_3
+; GFX942-IR-NEXT:  ; %bb.4: ; %Flow
+; GFX942-IR-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX942-IR-NEXT:  .LBB8_5: ; %Flow4
+; GFX942-IR-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX942-IR-NEXT:    v_lshlrev_b64 v[2:3], 1, v[2:3]
+; GFX942-IR-NEXT:    v_or_b32_e32 v9, v5, v3
+; GFX942-IR-NEXT:    v_or_b32_e32 v2, v4, v2
+; GFX942-IR-NEXT:  .LBB8_6: ; %Flow5
+; GFX942-IR-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX942-IR-NEXT:    v_mul_lo_u32 v3, v1, v2
+; GFX942-IR-NEXT:    v_mul_lo_u32 v4, v0, v9
+; GFX942-IR-NEXT:    v_mad_u64_u32 v[0:1], s[0:1], v0, v2, 0
+; GFX942-IR-NEXT:    v_add3_u32 v1, v1, v4, v3
+; GFX942-IR-NEXT:    v_sub_co_u32_e32 v0, vcc, 0x8000, v0
+; GFX942-IR-NEXT:    s_nop 1
+; GFX942-IR-NEXT:    v_subb_co_u32_e32 v1, vcc, 0, v1, vcc
+; GFX942-IR-NEXT:    s_setpc_b64 s[30:31]
   %result = urem i64 32768, %x
   ret i64 %result
 }
@@ -1331,6 +2518,90 @@ define i64 @v_test_urem_pow2_k_den_i64(i64 %x) {
 ; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, v0, v2
 ; GCN-IR-NEXT:    v_subb_u32_e32 v1, vcc, v1, v3, vcc
 ; GCN-IR-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: v_test_urem_pow2_k_den_i64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_and_b32_e32 v0, 0x7fff, v0
+; GFX942-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-IR-LABEL: v_test_urem_pow2_k_den_i64:
+; GFX942-IR:       ; %bb.0: ; %_udiv-special-cases
+; GFX942-IR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-IR-NEXT:    v_ffbh_u32_e32 v2, v0
+; GFX942-IR-NEXT:    v_add_u32_e32 v2, 32, v2
+; GFX942-IR-NEXT:    v_ffbh_u32_e32 v3, v1
+; GFX942-IR-NEXT:    v_min_u32_e32 v6, v2, v3
+; GFX942-IR-NEXT:    v_sub_co_u32_e64 v2, s[0:1], 48, v6
+; GFX942-IR-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[0:1]
+; GFX942-IR-NEXT:    s_nop 0
+; GFX942-IR-NEXT:    v_subb_co_u32_e64 v3, s[0:1], 0, 0, s[0:1]
+; GFX942-IR-NEXT:    v_cmp_lt_u64_e64 s[0:1], 63, v[2:3]
+; GFX942-IR-NEXT:    s_or_b64 s[0:1], vcc, s[0:1]
+; GFX942-IR-NEXT:    s_xor_b64 s[2:3], s[0:1], -1
+; GFX942-IR-NEXT:    v_cmp_ne_u64_e32 vcc, 63, v[2:3]
+; GFX942-IR-NEXT:    v_mov_b32_e32 v7, 0
+; GFX942-IR-NEXT:    v_cndmask_b32_e64 v5, v1, 0, s[0:1]
+; GFX942-IR-NEXT:    v_cndmask_b32_e64 v4, v0, 0, s[0:1]
+; GFX942-IR-NEXT:    s_and_b64 s[2:3], s[2:3], vcc
+; GFX942-IR-NEXT:    s_and_saveexec_b64 s[0:1], s[2:3]
+; GFX942-IR-NEXT:    s_cbranch_execz .LBB9_6
+; GFX942-IR-NEXT:  ; %bb.1: ; %udiv-bb1
+; GFX942-IR-NEXT:    v_lshl_add_u64 v[8:9], v[2:3], 0, 1
+; GFX942-IR-NEXT:    v_sub_u32_e32 v2, 63, v2
+; GFX942-IR-NEXT:    v_mov_b64_e32 v[4:5], 0
+; GFX942-IR-NEXT:    v_cmp_ne_u64_e32 vcc, 0, v[8:9]
+; GFX942-IR-NEXT:    v_lshlrev_b64 v[2:3], v2, v[0:1]
+; GFX942-IR-NEXT:    s_and_saveexec_b64 s[2:3], vcc
+; GFX942-IR-NEXT:    s_xor_b64 s[2:3], exec, s[2:3]
+; GFX942-IR-NEXT:    s_cbranch_execz .LBB9_5
+; GFX942-IR-NEXT:  ; %bb.2: ; %udiv-preheader
+; GFX942-IR-NEXT:    s_movk_i32 s4, 0xffcf
+; GFX942-IR-NEXT:    s_mov_b32 s5, -1
+; GFX942-IR-NEXT:    v_lshrrev_b64 v[8:9], v8, v[0:1]
+; GFX942-IR-NEXT:    v_lshl_add_u64 v[6:7], v[6:7], 0, s[4:5]
+; GFX942-IR-NEXT:    s_mov_b64 s[4:5], 0
+; GFX942-IR-NEXT:    v_mov_b64_e32 v[10:11], 0
+; GFX942-IR-NEXT:    s_movk_i32 s6, 0x7fff
+; GFX942-IR-NEXT:    v_mov_b32_e32 v5, 0
+; GFX942-IR-NEXT:  .LBB9_3: ; %udiv-do-while
+; GFX942-IR-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-IR-NEXT:    v_lshlrev_b64 v[8:9], 1, v[8:9]
+; GFX942-IR-NEXT:    v_lshrrev_b32_e32 v4, 31, v3
+; GFX942-IR-NEXT:    v_or_b32_e32 v8, v8, v4
+; GFX942-IR-NEXT:    v_sub_co_u32_e32 v4, vcc, s6, v8
+; GFX942-IR-NEXT:    v_lshlrev_b64 v[2:3], 1, v[2:3]
+; GFX942-IR-NEXT:    s_nop 0
+; GFX942-IR-NEXT:    v_subb_co_u32_e32 v4, vcc, 0, v9, vcc
+; GFX942-IR-NEXT:    v_or_b32_e32 v2, v10, v2
+; GFX942-IR-NEXT:    v_ashrrev_i32_e32 v10, 31, v4
+; GFX942-IR-NEXT:    v_and_b32_e32 v4, 1, v10
+; GFX942-IR-NEXT:    v_and_b32_e32 v10, 0x8000, v10
+; GFX942-IR-NEXT:    v_sub_co_u32_e32 v8, vcc, v8, v10
+; GFX942-IR-NEXT:    v_lshl_add_u64 v[6:7], v[6:7], 0, 1
+; GFX942-IR-NEXT:    s_nop 0
+; GFX942-IR-NEXT:    v_subbrev_co_u32_e32 v9, vcc, 0, v9, vcc
+; GFX942-IR-NEXT:    v_cmp_eq_u64_e32 vcc, 0, v[6:7]
+; GFX942-IR-NEXT:    v_or_b32_e32 v3, v11, v3
+; GFX942-IR-NEXT:    s_or_b64 s[4:5], vcc, s[4:5]
+; GFX942-IR-NEXT:    v_mov_b64_e32 v[10:11], v[4:5]
+; GFX942-IR-NEXT:    s_andn2_b64 exec, exec, s[4:5]
+; GFX942-IR-NEXT:    s_cbranch_execnz .LBB9_3
+; GFX942-IR-NEXT:  ; %bb.4: ; %Flow
+; GFX942-IR-NEXT:    s_or_b64 exec, exec, s[4:5]
+; GFX942-IR-NEXT:  .LBB9_5: ; %Flow4
+; GFX942-IR-NEXT:    s_or_b64 exec, exec, s[2:3]
+; GFX942-IR-NEXT:    v_lshlrev_b64 v[2:3], 1, v[2:3]
+; GFX942-IR-NEXT:    v_or_b32_e32 v5, v5, v3
+; GFX942-IR-NEXT:    v_or_b32_e32 v4, v4, v2
+; GFX942-IR-NEXT:  .LBB9_6: ; %Flow5
+; GFX942-IR-NEXT:    s_or_b64 exec, exec, s[0:1]
+; GFX942-IR-NEXT:    v_lshlrev_b64 v[2:3], 15, v[4:5]
+; GFX942-IR-NEXT:    v_sub_co_u32_e32 v0, vcc, v0, v2
+; GFX942-IR-NEXT:    s_nop 1
+; GFX942-IR-NEXT:    v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX942-IR-NEXT:    s_setpc_b64 s[30:31]
   %result = urem i64 %x, 32768
   ret i64 %result
 }
@@ -1381,6 +2652,52 @@ define amdgpu_kernel void @s_test_urem24_k_num_i64(ptr addrspace(1) %out, i64 %x
 ; GCN-IR-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
 ; GCN-IR-NEXT:    buffer_store_dwordx2 v[0:1], off, s[0:3], 0
 ; GCN-IR-NEXT:    s_endpgm
+;
+; GFX942-LABEL: s_test_urem24_k_num_i64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_lshr_b32 s2, s3, 8
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v0, s2
+; GFX942-NEXT:    s_mov_b32 s3, 0x41c00000
+; GFX942-NEXT:    v_rcp_iflag_f32_e32 v2, v0
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_mul_f32_e32 v2, 0x41c00000, v2
+; GFX942-NEXT:    v_trunc_f32_e32 v2, v2
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v3, v2
+; GFX942-NEXT:    v_fma_f32 v2, -v2, v0, s3
+; GFX942-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v0
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v3, vcc
+; GFX942-NEXT:    v_mul_lo_u32 v0, v0, s2
+; GFX942-NEXT:    v_sub_u32_e32 v0, 24, v0
+; GFX942-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
+; GFX942-NEXT:    global_store_dwordx2 v1, v[0:1], s[0:1]
+; GFX942-NEXT:    s_endpgm
+;
+; GFX942-IR-LABEL: s_test_urem24_k_num_i64:
+; GFX942-IR:       ; %bb.0:
+; GFX942-IR-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-IR-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-IR-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-IR-NEXT:    s_lshr_b32 s2, s3, 8
+; GFX942-IR-NEXT:    v_cvt_f32_u32_e32 v0, s2
+; GFX942-IR-NEXT:    s_mov_b32 s3, 0x41c00000
+; GFX942-IR-NEXT:    v_rcp_iflag_f32_e32 v2, v0
+; GFX942-IR-NEXT:    s_nop 0
+; GFX942-IR-NEXT:    v_mul_f32_e32 v2, 0x41c00000, v2
+; GFX942-IR-NEXT:    v_trunc_f32_e32 v2, v2
+; GFX942-IR-NEXT:    v_cvt_u32_f32_e32 v3, v2
+; GFX942-IR-NEXT:    v_fma_f32 v2, -v2, v0, s3
+; GFX942-IR-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v0
+; GFX942-IR-NEXT:    s_nop 1
+; GFX942-IR-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v3, vcc
+; GFX942-IR-NEXT:    v_mul_lo_u32 v0, v0, s2
+; GFX942-IR-NEXT:    v_sub_u32_e32 v0, 24, v0
+; GFX942-IR-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
+; GFX942-IR-NEXT:    global_store_dwordx2 v1, v[0:1], s[0:1]
+; GFX942-IR-NEXT:    s_endpgm
   %x.shr = lshr i64 %x, 40
   %result = urem i64 24, %x.shr
   store i64 %result, ptr addrspace(1) %out
@@ -1437,6 +2754,50 @@ define amdgpu_kernel void @s_test_urem24_k_den_i64(ptr addrspace(1) %out, i64 %x
 ; GCN-IR-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
 ; GCN-IR-NEXT:    buffer_store_dwordx2 v[0:1], off, s[4:7], 0
 ; GCN-IR-NEXT:    s_endpgm
+;
+; GFX942-LABEL: s_test_urem24_k_den_i64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_lshr_b32 s2, s3, 8
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v0, s2
+; GFX942-NEXT:    s_mov_b32 s3, 0x46b6fe00
+; GFX942-NEXT:    v_mul_f32_e32 v2, 0x38331158, v0
+; GFX942-NEXT:    v_trunc_f32_e32 v2, v2
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v3, v2
+; GFX942-NEXT:    v_fmamk_f32 v0, v2, 0xc6b6fe00, v0
+; GFX942-NEXT:    v_cmp_ge_f32_e64 vcc, |v0|, s3
+; GFX942-NEXT:    s_movk_i32 s3, 0x5b7f
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v3, vcc
+; GFX942-NEXT:    v_mul_lo_u32 v0, v0, s3
+; GFX942-NEXT:    v_sub_u32_e32 v0, s2, v0
+; GFX942-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
+; GFX942-NEXT:    global_store_dwordx2 v1, v[0:1], s[0:1]
+; GFX942-NEXT:    s_endpgm
+;
+; GFX942-IR-LABEL: s_test_urem24_k_den_i64:
+; GFX942-IR:       ; %bb.0:
+; GFX942-IR-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX942-IR-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-IR-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-IR-NEXT:    s_lshr_b32 s2, s3, 8
+; GFX942-IR-NEXT:    v_cvt_f32_u32_e32 v0, s2
+; GFX942-IR-NEXT:    s_mov_b32 s3, 0x46b6fe00
+; GFX942-IR-NEXT:    v_mul_f32_e32 v2, 0x38331158, v0
+; GFX942-IR-NEXT:    v_trunc_f32_e32 v2, v2
+; GFX942-IR-NEXT:    v_cvt_u32_f32_e32 v3, v2
+; GFX942-IR-NEXT:    v_fmamk_f32 v0, v2, 0xc6b6fe00, v0
+; GFX942-IR-NEXT:    v_cmp_ge_f32_e64 vcc, |v0|, s3
+; GFX942-IR-NEXT:    s_movk_i32 s3, 0x5b7f
+; GFX942-IR-NEXT:    s_nop 0
+; GFX942-IR-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v3, vcc
+; GFX942-IR-NEXT:    v_mul_lo_u32 v0, v0, s3
+; GFX942-IR-NEXT:    v_sub_u32_e32 v0, s2, v0
+; GFX942-IR-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
+; GFX942-IR-NEXT:    global_store_dwordx2 v1, v[0:1], s[0:1]
+; GFX942-IR-NEXT:    s_endpgm
   %x.shr = lshr i64 %x, 40
   %result = urem i64 %x.shr, 23423
   store i64 %result, ptr addrspace(1) %out
@@ -1481,6 +2842,48 @@ define i64 @v_test_urem24_k_num_i64(i64 %x) {
 ; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, 24, v0
 ; GCN-IR-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
 ; GCN-IR-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: v_test_urem24_k_num_i64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_lshrrev_b32_e32 v0, 8, v1
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v1, v0
+; GFX942-NEXT:    s_mov_b32 s0, 0x41c00000
+; GFX942-NEXT:    v_rcp_iflag_f32_e32 v2, v1
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_mul_f32_e32 v2, 0x41c00000, v2
+; GFX942-NEXT:    v_trunc_f32_e32 v2, v2
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v3, v2
+; GFX942-NEXT:    v_fma_f32 v2, -v2, v1, s0
+; GFX942-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v1
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
+; GFX942-NEXT:    v_mul_lo_u32 v0, v1, v0
+; GFX942-NEXT:    v_sub_u32_e32 v0, 24, v0
+; GFX942-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
+; GFX942-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-IR-LABEL: v_test_urem24_k_num_i64:
+; GFX942-IR:       ; %bb.0:
+; GFX942-IR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-IR-NEXT:    v_lshrrev_b32_e32 v0, 8, v1
+; GFX942-IR-NEXT:    v_cvt_f32_u32_e32 v1, v0
+; GFX942-IR-NEXT:    s_mov_b32 s0, 0x41c00000
+; GFX942-IR-NEXT:    v_rcp_iflag_f32_e32 v2, v1
+; GFX942-IR-NEXT:    s_nop 0
+; GFX942-IR-NEXT:    v_mul_f32_e32 v2, 0x41c00000, v2
+; GFX942-IR-NEXT:    v_trunc_f32_e32 v2, v2
+; GFX942-IR-NEXT:    v_cvt_u32_f32_e32 v3, v2
+; GFX942-IR-NEXT:    v_fma_f32 v2, -v2, v1, s0
+; GFX942-IR-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v1
+; GFX942-IR-NEXT:    s_nop 1
+; GFX942-IR-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
+; GFX942-IR-NEXT:    v_mul_lo_u32 v0, v1, v0
+; GFX942-IR-NEXT:    v_sub_u32_e32 v0, 24, v0
+; GFX942-IR-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
+; GFX942-IR-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-IR-NEXT:    s_setpc_b64 s[30:31]
   %x.shr = lshr i64 %x, 40
   %result = urem i64 24, %x.shr
   ret i64 %result
@@ -1524,6 +2927,48 @@ define i64 @v_test_urem24_pow2_k_num_i64(i64 %x) {
 ; GCN-IR-NEXT:    v_sub_i32_e32 v0, vcc, 0x8000, v0
 ; GCN-IR-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
 ; GCN-IR-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: v_test_urem24_pow2_k_num_i64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_lshrrev_b32_e32 v0, 8, v1
+; GFX942-NEXT:    v_cvt_f32_u32_e32 v1, v0
+; GFX942-NEXT:    s_mov_b32 s0, 0x47000000
+; GFX942-NEXT:    v_rcp_iflag_f32_e32 v2, v1
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_mul_f32_e32 v2, 0x47000000, v2
+; GFX942-NEXT:    v_trunc_f32_e32 v2, v2
+; GFX942-NEXT:    v_cvt_u32_f32_e32 v3, v2
+; GFX942-NEXT:    v_fma_f32 v2, -v2, v1, s0
+; GFX942-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v1
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
+; GFX942-NEXT:    v_mul_lo_u32 v0, v1, v0
+; GFX942-NEXT:    v_sub_u32_e32 v0, 0x8000, v0
+; GFX942-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
+; GFX942-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-IR-LABEL: v_test_urem24_pow2_k_num_i64:
+; GFX942-IR:       ; %bb.0:
+; GFX942-IR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-IR-NEXT:    v_lshrrev_b32_e32 v0, 8, v1
+; GFX942-IR-NEXT:    v_cvt_f32_u32_e32 v1, v0
+; GFX942-IR-NEXT:    s_mov_b32 s0, 0x47000000
+; GFX942-IR-NEXT:    v_rcp_iflag_f32_e32 v2, v1
+; GFX942-IR-NEXT:    s_nop 0
+; GFX942-IR-NEXT:    v_mul_f32_e32 v2, 0x47000000, v2
+; GFX942-IR-NEXT:    v_trunc_f32_e32 v2, v2
+; GFX942-IR-NEXT:    v_cvt_u32_f32_e32 v3, v2
+; GFX942-IR-NEXT:    v_fma_f32 v2, -v2, v1, s0
+; GFX942-IR-NEXT:    v_cmp_ge_f32_e64 vcc, |v2|, v1
+; GFX942-IR-NEXT:    s_nop 1
+; GFX942-IR-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
+; GFX942-IR-NEXT:    v_mul_lo_u32 v0, v1, v0
+; GFX942-IR-NEXT:    v_sub_u32_e32 v0, 0x8000, v0
+; GFX942-IR-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
+; GFX942-IR-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-IR-NEXT:    s_setpc_b64 s[30:31]
   %x.shr = lshr i64 %x, 40
   %result = urem i64 32768, %x.shr
   ret i64 %result
@@ -1554,6 +2999,32 @@ define i64 @v_test_urem24_pow2_k_den_i64(i64 %x) {
 ; GCN-IR-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
 ; GCN-IR-NEXT:    v_mov_b32_e32 v1, 0
 ; GCN-IR-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: v_test_urem24_pow2_k_den_i64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_bfe_u32 v0, v1, 8, 15
+; GFX942-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-IR-LABEL: v_test_urem24_pow2_k_den_i64:
+; GFX942-IR:       ; %bb.0:
+; GFX942-IR-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-IR-NEXT:    v_lshrrev_b32_e32 v0, 8, v1
+; GFX942-IR-NEXT:    v_cvt_f32_u32_e32 v1, v0
+; GFX942-IR-NEXT:    s_mov_b32 s0, 0x47000000
+; GFX942-IR-NEXT:    v_mul_f32_e32 v2, 0x38000000, v1
+; GFX942-IR-NEXT:    v_trunc_f32_e32 v2, v2
+; GFX942-IR-NEXT:    v_cvt_u32_f32_e32 v3, v2
+; GFX942-IR-NEXT:    v_fmamk_f32 v1, v2, 0xc7000000, v1
+; GFX942-IR-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, s0
+; GFX942-IR-NEXT:    s_nop 1
+; GFX942-IR-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v3, vcc
+; GFX942-IR-NEXT:    v_lshlrev_b32_e32 v1, 15, v1
+; GFX942-IR-NEXT:    v_sub_u32_e32 v0, v0, v1
+; GFX942-IR-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
+; GFX942-IR-NEXT:    v_mov_b32_e32 v1, 0
+; GFX942-IR-NEXT:    s_setpc_b64 s[30:31]
   %x.shr = lshr i64 %x, 40
   %result = urem i64 %x.shr, 32768
   ret i64 %result
diff --git a/llvm/test/CodeGen/AMDGPU/vector_range_metadata.ll b/llvm/test/CodeGen/AMDGPU/vector_range_metadata.ll
index d496634ae474f..2e84b215de3d9 100644
--- a/llvm/test/CodeGen/AMDGPU/vector_range_metadata.ll
+++ b/llvm/test/CodeGen/AMDGPU/vector_range_metadata.ll
@@ -1,6 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; Ensure that range metadata is handled correctly for vector loads.
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck --check-prefix=GFX9 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 < %s | FileCheck --check-prefix=GFX942 %s
 
 define <2 x i16> @test_add2x16(ptr %a_ptr, ptr %b_ptr) {
 ; CHECK-LABEL: test_add2x16:
@@ -8,6 +9,17 @@ define <2 x i16> @test_add2x16(ptr %a_ptr, ptr %b_ptr) {
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_mov_b32_e32 v0, 0x300030
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-LABEL: test_add2x16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mov_b32_e32 v0, 0x300030
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: test_add2x16:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0x300030
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %a = load <2 x i16>, ptr %a_ptr, !range !0, !noundef !{}
   %b = load <2 x i16>, ptr %b_ptr, !range !1, !noundef !{}
   %result = add <2 x i16> %a, %b
@@ -24,6 +36,25 @@ define <2 x i32> @test_add2x32(ptr %a_ptr, ptr %b_ptr) {
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_or_b32_e32 v0, v5, v4
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-LABEL: test_add2x32:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    flat_load_dword v4, v[2:3]
+; GFX9-NEXT:    flat_load_dword v5, v[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v1, 48
+; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_or_b32_e32 v0, v5, v4
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: test_add2x32:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    flat_load_dword v4, v[2:3]
+; GFX942-NEXT:    flat_load_dword v5, v[0:1]
+; GFX942-NEXT:    v_mov_b32_e32 v1, 48
+; GFX942-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_or_b32_e32 v0, v5, v4
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %a = load <2 x i32>, ptr %a_ptr, !range !2, !noundef !{}
   %b = load <2 x i32>, ptr %b_ptr, !range !3, !noundef !{}
   %result = add <2 x i32> %a, %b
@@ -45,6 +76,35 @@ define <2 x i64> @test_add2x64(ptr %a_ptr, ptr %b_ptr) {
 ; CHECK-NEXT:    v_or_b32_e32 v1, v5, v7
 ; CHECK-NEXT:    v_or_b32_e32 v0, v4, v6
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-LABEL: test_add2x64:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    flat_load_dwordx4 v[4:7], v[0:1]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    flat_load_dwordx4 v[6:9], v[2:3]
+; GFX9-NEXT:    ; kill: killed $vgpr2 killed $vgpr3
+; GFX9-NEXT:    ; kill: killed $vgpr0 killed $vgpr1
+; GFX9-NEXT:    v_mov_b32_e32 v2, 48
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_or_b32_e32 v1, v5, v7
+; GFX9-NEXT:    v_or_b32_e32 v0, v4, v6
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: test_add2x64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    flat_load_dwordx4 v[4:7], v[0:1]
+; GFX942-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    flat_load_dwordx4 v[6:9], v[2:3]
+; GFX942-NEXT:    ; kill: killed $vgpr2 killed $vgpr3
+; GFX942-NEXT:    ; kill: killed $vgpr0 killed $vgpr1
+; GFX942-NEXT:    v_mov_b32_e32 v2, 48
+; GFX942-NEXT:    v_mov_b32_e32 v3, 0
+; GFX942-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_or_b32_e32 v1, v5, v7
+; GFX942-NEXT:    v_or_b32_e32 v0, v4, v6
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %a = load <2 x i64>, ptr %a_ptr, !range !4, !noundef !{}
   %b = load <2 x i64>, ptr %b_ptr, !range !5, !noundef !{}
   %result = add <2 x i64> %a, %b
@@ -61,6 +121,25 @@ define <3 x i16> @test_add3x16(ptr %a_ptr, ptr %b_ptr) {
 ; CHECK-NEXT:    v_or_b32_e32 v1, v5, v7
 ; CHECK-NEXT:    v_or_b32_e32 v0, v4, v6
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-LABEL: test_add3x16:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    flat_load_dwordx2 v[4:5], v[0:1]
+; GFX9-NEXT:    flat_load_dwordx2 v[6:7], v[2:3]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_or_b32_e32 v1, v5, v7
+; GFX9-NEXT:    v_or_b32_e32 v0, v4, v6
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: test_add3x16:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    flat_load_dwordx2 v[4:5], v[0:1]
+; GFX942-NEXT:    flat_load_dwordx2 v[6:7], v[2:3]
+; GFX942-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_or_b32_e32 v1, v5, v7
+; GFX942-NEXT:    v_or_b32_e32 v0, v4, v6
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %a = load <3 x i16>, ptr %a_ptr, !range !0, !noundef !{}
   %b = load <3 x i16>, ptr %b_ptr, !range !1, !noundef !{}
   %result = add <3 x i16> %a, %b
@@ -78,6 +157,27 @@ define <3 x i32> @test_add3x32(ptr %a_ptr, ptr %b_ptr) {
 ; CHECK-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
 ; CHECK-NEXT:    v_or_b32_e32 v0, v5, v4
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-LABEL: test_add3x32:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    flat_load_dword v4, v[2:3]
+; GFX9-NEXT:    flat_load_dword v5, v[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v1, 48
+; GFX9-NEXT:    v_mov_b32_e32 v2, 48
+; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_or_b32_e32 v0, v5, v4
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: test_add3x32:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    flat_load_dword v4, v[2:3]
+; GFX942-NEXT:    flat_load_dword v5, v[0:1]
+; GFX942-NEXT:    v_mov_b32_e32 v1, 48
+; GFX942-NEXT:    v_mov_b32_e32 v2, 48
+; GFX942-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_or_b32_e32 v0, v5, v4
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %a = load <3 x i32>, ptr %a_ptr, !range !2, !noundef !{}
   %b = load <3 x i32>, ptr %b_ptr, !range !3, !noundef !{}
   %result = add <3 x i32> %a, %b
@@ -101,6 +201,39 @@ define <3 x i64> @test_add3x64(ptr %a_ptr, ptr %b_ptr) {
 ; CHECK-NEXT:    v_mov_b32_e32 v4, 48
 ; CHECK-NEXT:    v_mov_b32_e32 v5, 0
 ; CHECK-NEXT:    s_setpc_b64 s[30:31]
+; GFX9-LABEL: test_add3x64:
+; GFX9:       ; %bb.0:
+; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    flat_load_dwordx4 v[4:7], v[0:1]
+; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    flat_load_dwordx4 v[6:9], v[2:3]
+; GFX9-NEXT:    ; kill: killed $vgpr2 killed $vgpr3
+; GFX9-NEXT:    ; kill: killed $vgpr0 killed $vgpr1
+; GFX9-NEXT:    v_mov_b32_e32 v2, 48
+; GFX9-NEXT:    v_mov_b32_e32 v3, 0
+; GFX9-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_or_b32_e32 v1, v5, v7
+; GFX9-NEXT:    v_or_b32_e32 v0, v4, v6
+; GFX9-NEXT:    v_mov_b32_e32 v4, 48
+; GFX9-NEXT:    v_mov_b32_e32 v5, 0
+; GFX9-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX942-LABEL: test_add3x64:
+; GFX942:       ; %bb.0:
+; GFX942-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    flat_load_dwordx4 v[4:7], v[0:1]
+; GFX942-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    flat_load_dwordx4 v[6:9], v[2:3]
+; GFX942-NEXT:    ; kill: killed $vgpr2 killed $vgpr3
+; GFX942-NEXT:    ; kill: killed $vgpr0 killed $vgpr1
+; GFX942-NEXT:    v_mov_b32_e32 v2, 48
+; GFX942-NEXT:    v_mov_b32_e32 v3, 0
+; GFX942-NEXT:    s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX942-NEXT:    v_or_b32_e32 v1, v5, v7
+; GFX942-NEXT:    v_or_b32_e32 v0, v4, v6
+; GFX942-NEXT:    v_mov_b32_e32 v4, 48
+; GFX942-NEXT:    v_mov_b32_e32 v5, 0
+; GFX942-NEXT:    s_setpc_b64 s[30:31]
   %a = load <3 x i64>, ptr %a_ptr, !range !4, !noundef !{}
   %b = load <3 x i64>, ptr %b_ptr, !range !5, !noundef !{}
   %result = add <3 x i64> %a, %b
diff --git a/llvm/test/CodeGen/AMDGPU/waterfall_kills_scc.ll b/llvm/test/CodeGen/AMDGPU/waterfall_kills_scc.ll
index ddb6afa34ab22..71ed94b3559b5 100644
--- a/llvm/test/CodeGen/AMDGPU/waterfall_kills_scc.ll
+++ b/llvm/test/CodeGen/AMDGPU/waterfall_kills_scc.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
 ; RUN:  llc -mtriple=amdgcn -mcpu=gfx906 < %s | FileCheck -check-prefix=GFX906 %s
+; RUN:  llc -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck -check-prefix=GFX942 %s
 declare float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32>, i32, i32, i32 immarg) #0
 declare void @llvm.amdgcn.raw.buffer.store.f32(float, <4 x i32>, i32, i32, i32 immarg) #1
 
@@ -68,6 +69,52 @@ define amdgpu_kernel void  @foo(i1 %cmp1) {
 ; GFX906-NEXT:    s_mov_b32 s7, s4
 ; GFX906-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; GFX906-NEXT:    s_endpgm
+;
+; GFX942-LABEL: foo:
+; GFX942:       ; %bb.0: ; %entry
+; GFX942-NEXT:    s_mov_b32 s8, 0
+; GFX942-NEXT:    scratch_load_dwordx4 v[2:5], off, s8
+; GFX942-NEXT:    s_load_dword s2, s[4:5], 0x24
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x1c
+; GFX942-NEXT:    v_bfe_u32 v1, v0, 10, 10
+; GFX942-NEXT:    v_and_b32_e32 v6, 0x3ff, v0
+; GFX942-NEXT:    v_bfe_u32 v0, v0, 20, 10
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_bitcmp1_b32 s2, 0
+; GFX942-NEXT:    s_mul_i32 s0, s0, s1
+; GFX942-NEXT:    v_mul_u32_u24_e32 v1, s1, v1
+; GFX942-NEXT:    v_mad_u32_u24 v1, s0, v6, v1
+; GFX942-NEXT:    v_mov_b32_e32 v6, 0
+; GFX942-NEXT:    v_add_lshl_u32 v0, v1, v0, 4
+; GFX942-NEXT:    v_mov_b32_e32 v7, v6
+; GFX942-NEXT:    s_cselect_b32 s4, 1, 0
+; GFX942-NEXT:    s_mov_b64 s[2:3], exec
+; GFX942-NEXT:    ds_write_b64 v0, v[6:7]
+; GFX942-NEXT:  .LBB0_1: ; =>This Inner Loop Header: Depth=1
+; GFX942-NEXT:    s_waitcnt vmcnt(0)
+; GFX942-NEXT:    v_readfirstlane_b32 s0, v2
+; GFX942-NEXT:    v_readfirstlane_b32 s1, v3
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cmp_eq_u64_e32 vcc, s[0:1], v[2:3]
+; GFX942-NEXT:    v_readfirstlane_b32 s0, v4
+; GFX942-NEXT:    v_readfirstlane_b32 s1, v5
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cmp_eq_u64_e64 s[0:1], s[0:1], v[4:5]
+; GFX942-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT:    s_and_saveexec_b64 s[0:1], s[0:1]
+; GFX942-NEXT:    ; implicit-def: $vgpr2_vgpr3_vgpr4_vgpr5
+; GFX942-NEXT:    s_xor_b64 exec, exec, s[0:1]
+; GFX942-NEXT:    s_cbranch_execnz .LBB0_1
+; GFX942-NEXT:  ; %bb.2:
+; GFX942-NEXT:    s_cmp_lg_u32 s4, 0
+; GFX942-NEXT:    s_mov_b64 exec, s[2:3]
+; GFX942-NEXT:    s_cselect_b32 s9, 0x3ff00000, 0
+; GFX942-NEXT:    v_cvt_f32_f64_e32 v0, s[8:9]
+; GFX942-NEXT:    s_mov_b32 s9, s8
+; GFX942-NEXT:    s_mov_b32 s10, s8
+; GFX942-NEXT:    s_mov_b32 s11, s8
+; GFX942-NEXT:    buffer_store_dword v0, off, s[8:11], 0
+; GFX942-NEXT:    s_endpgm
 entry:
   %wbr = alloca <4 x i32>, align 16, addrspace(5)
   store ptr null, ptr addrspace(5) %wbr, align 16
diff --git a/llvm/test/CodeGen/AMDGPU/widen-vselect-and-mask.ll b/llvm/test/CodeGen/AMDGPU/widen-vselect-and-mask.ll
index 1a8f198ecf70a..3d8ddc35833e9 100644
--- a/llvm/test/CodeGen/AMDGPU/widen-vselect-and-mask.ll
+++ b/llvm/test/CodeGen/AMDGPU/widen-vselect-and-mask.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc -mtriple=amdgcn < %s | FileCheck -enable-var-scope -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck -enable-var-scope -check-prefix=GFX942 %s
 
 ; Check that DAGTypeLegalizer::WidenVSELECTAndMask doesn't try to
 ; create vselects with i64 condition masks.
@@ -32,6 +33,31 @@ define amdgpu_kernel void @widen_vselect_and_mask_v4f64(<4 x double> %arg) #0 {
 ; GCN-NEXT:    v_mov_b32_e32 v1, s0
 ; GCN-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; GCN-NEXT:    s_endpgm
+;
+; GFX942-LABEL: widen_vselect_and_mask_v4f64:
+; GFX942:       ; %bb.0: ; %bb
+; GFX942-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v2, 0
+; GFX942-NEXT:    v_mov_b32_e32 v3, v2
+; GFX942-NEXT:    v_mov_b32_e32 v4, v2
+; GFX942-NEXT:    v_mov_b32_e32 v5, v2
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    v_cmp_u_f64_e64 s[0:1], s[8:9], s[8:9]
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[0:1]
+; GFX942-NEXT:    v_mov_b32_e32 v1, v0
+; GFX942-NEXT:    v_cmp_neq_f64_e64 s[0:1], s[8:9], 0
+; GFX942-NEXT:    v_cmp_lt_i64_e32 vcc, -1, v[0:1]
+; GFX942-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT:    s_and_b64 s[0:1], s[0:1], exec
+; GFX942-NEXT:    s_cselect_b32 s0, 0x3ff00000, 0
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], 16
+; GFX942-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off
+; GFX942-NEXT:    v_mov_b64_e32 v[0:1], 0
+; GFX942-NEXT:    s_nop 0
+; GFX942-NEXT:    v_mov_b32_e32 v3, s0
+; GFX942-NEXT:    global_store_dwordx4 v[0:1], v[2:5], off
+; GFX942-NEXT:    s_endpgm
 bb:
   %tmp = extractelement <4 x double> %arg, i64 0
   %tmp1 = fcmp uno double %tmp, 0.000000e+00
@@ -76,6 +102,33 @@ define amdgpu_kernel void @widen_vselect_and_mask_v4i64(<4 x i64> %arg) #0 {
 ; GCN-NEXT:    s_mov_b32 s7, s11
 ; GCN-NEXT:    buffer_store_dwordx4 v[4:7], off, s[4:7], 0
 ; GCN-NEXT:    s_endpgm
+;
+; GFX942-LABEL: widen_vselect_and_mask_v4i64:
+; GFX942:       ; %bb.0: ; %bb
+; GFX942-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v4, 0
+; GFX942-NEXT:    v_mov_b32_e32 v5, v4
+; GFX942-NEXT:    v_mov_b32_e32 v6, v4
+; GFX942-NEXT:    v_mov_b32_e32 v7, v4
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_cmp_eq_u64 s[8:9], 0
+; GFX942-NEXT:    s_cselect_b64 s[0:1], -1, 0
+; GFX942-NEXT:    v_cndmask_b32_e64 v0, 0, -1, s[0:1]
+; GFX942-NEXT:    s_cmp_lg_u64 s[8:9], 0
+; GFX942-NEXT:    v_mov_b32_e32 v1, v0
+; GFX942-NEXT:    s_cselect_b64 s[0:1], -1, 0
+; GFX942-NEXT:    v_cmp_lt_i64_e32 vcc, -1, v[0:1]
+; GFX942-NEXT:    s_and_b64 s[0:1], vcc, s[0:1]
+; GFX942-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
+; GFX942-NEXT:    v_mov_b64_e32 v[2:3], 16
+; GFX942-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX942-NEXT:    global_store_dwordx4 v[2:3], v[4:7], off
+; GFX942-NEXT:    v_mov_b32_e32 v1, v4
+; GFX942-NEXT:    v_mov_b32_e32 v2, v4
+; GFX942-NEXT:    v_mov_b32_e32 v3, v4
+; GFX942-NEXT:    v_mov_b64_e32 v[4:5], 0
+; GFX942-NEXT:    global_store_dwordx4 v[4:5], v[0:3], off
+; GFX942-NEXT:    s_endpgm
 bb:
   %tmp = extractelement <4 x i64> %arg, i64 0
   %tmp1 = icmp eq i64 %tmp, 0



More information about the llvm-commits mailing list