[llvm] a2dc4e0 - [AMDGPU] Enable multi-group xnack replay in hardware (GFX1250) (#169016)

via llvm-commits llvm-commits at lists.llvm.org
Fri Nov 21 06:12:27 PST 2025


Author: Christudasan Devadasan
Date: 2025-11-21T19:42:17+05:30
New Revision: a2dc4e02e7ba77ddcb0afca0304535d8f142c98b

URL: https://github.com/llvm/llvm-project/commit/a2dc4e02e7ba77ddcb0afca0304535d8f142c98b
DIFF: https://github.com/llvm/llvm-project/commit/a2dc4e02e7ba77ddcb0afca0304535d8f142c98b.diff

LOG: [AMDGPU] Enable multi-group xnack replay in hardware (GFX1250) (#169016)

This patch enables the multi-group xnack replay mode by
configuring the hardware MODE register at kernel entry.
This aligns the hardware behavior with the compiler's
existing multi-group s_wait_xcnt insertion logic.

Added: 
    

Modified: 
    llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
    llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.abs.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/minmaxabs-i64.ll
    llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
    llvm/test/CodeGen/AMDGPU/add-max.ll
    llvm/test/CodeGen/AMDGPU/add_u64.ll
    llvm/test/CodeGen/AMDGPU/addrspacecast-gas.ll
    llvm/test/CodeGen/AMDGPU/bf16-conversions.ll
    llvm/test/CodeGen/AMDGPU/bf16-math.ll
    llvm/test/CodeGen/AMDGPU/bf16.ll
    llvm/test/CodeGen/AMDGPU/bitop3.ll
    llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx1250.ll
    llvm/test/CodeGen/AMDGPU/calling-conventions.ll
    llvm/test/CodeGen/AMDGPU/carryout-selection.ll
    llvm/test/CodeGen/AMDGPU/code-size-estimate.ll
    llvm/test/CodeGen/AMDGPU/ds_read2-gfx1250.ll
    llvm/test/CodeGen/AMDGPU/ds_write2.ll
    llvm/test/CodeGen/AMDGPU/fcanonicalize.bf16.ll
    llvm/test/CodeGen/AMDGPU/flat-load-saddr-to-vaddr.ll
    llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll
    llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll
    llvm/test/CodeGen/AMDGPU/flat-saddr-store.ll
    llvm/test/CodeGen/AMDGPU/fmax3.ll
    llvm/test/CodeGen/AMDGPU/fmin3.ll
    llvm/test/CodeGen/AMDGPU/fp-atomics-gfx942.ll
    llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll
    llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll
    llvm/test/CodeGen/AMDGPU/global-address.ll
    llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll
    llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll
    llvm/test/CodeGen/AMDGPU/literal64.ll
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.bitop3.ll
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cluster.id.ll
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cluster.load.async.to.lds.ll
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cluster.load.ll
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cluster.workgroup.id.ll
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cluster.workgroup.max.flat.id.ll
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cluster.workgroup.max.id.ll
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.f16.fp8.ll
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.dpp.ll
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.e5m3.ll
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.f16.ll
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pk.f16.ll
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.sat.pk.ll
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scale.pk.ll
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk16.gfx1250.ll
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk8.ll
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.sr.pk.gfx1250.ll
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.sr.pk16.ll
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.sr.pk.bf16.ll
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.flat.prefetch.ll
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.async.to.lds.ll
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.prefetch.ll
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.store.async.from.lds.ll
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.load.monitor.gfx1250.ll
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.load.tr.gfx1250.w32.ll
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.gfx1250.ll
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.atomic.buffer.load.ll
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.tfe.ll
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.ll
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.atomic.buffer.load.ll
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.bf16.ll
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.bf16.ll
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.bf16.ll
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.wait.gfx1250.ll
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sendmsg.rtn.ll
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.atomic.buffer.load.ll
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.tfe.ll
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.ll
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.atomic.buffer.load.ll
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tanh.ll
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tensor.load.store.ll
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.id.ll
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.gfx1250.w32.ll
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.imm.gfx1250.w32.ll
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.imod.gfx1250.w32.ll
    llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.index.gfx1250.w32.ll
    llvm/test/CodeGen/AMDGPU/llvm.cos.bf16.ll
    llvm/test/CodeGen/AMDGPU/llvm.prefetch.ll
    llvm/test/CodeGen/AMDGPU/llvm.sin.bf16.ll
    llvm/test/CodeGen/AMDGPU/llvm.sqrt.bf16.ll
    llvm/test/CodeGen/AMDGPU/load-constant-i1.ll
    llvm/test/CodeGen/AMDGPU/load-constant-i32.ll
    llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll
    llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics.ll
    llvm/test/CodeGen/AMDGPU/mad-mix-bf16.ll
    llvm/test/CodeGen/AMDGPU/mad_64_32.ll
    llvm/test/CodeGen/AMDGPU/mad_u64_u32.ll
    llvm/test/CodeGen/AMDGPU/max.ll
    llvm/test/CodeGen/AMDGPU/memory-legalizer-barriers.ll
    llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-global.ll
    llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-local.ll
    llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll
    llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll
    llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-cluster.ll
    llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-lastuse.ll
    llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll
    llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll
    llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll
    llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll
    llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll
    llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll
    llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll
    llvm/test/CodeGen/AMDGPU/memory-legalizer-global-cluster.ll
    llvm/test/CodeGen/AMDGPU/memory-legalizer-global-lastuse.ll
    llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll
    llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll
    llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll
    llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll
    llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll
    llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll
    llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll
    llvm/test/CodeGen/AMDGPU/memory-legalizer-local-cluster.ll
    llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll
    llvm/test/CodeGen/AMDGPU/memory-legalizer-local-singlethread.ll
    llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll
    llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll
    llvm/test/CodeGen/AMDGPU/memory-legalizer-local-wavefront.ll
    llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll
    llvm/test/CodeGen/AMDGPU/memory-legalizer-private-agent.ll
    llvm/test/CodeGen/AMDGPU/memory-legalizer-private-cluster.ll
    llvm/test/CodeGen/AMDGPU/memory-legalizer-private-lastuse.ll
    llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll
    llvm/test/CodeGen/AMDGPU/memory-legalizer-private-singlethread.ll
    llvm/test/CodeGen/AMDGPU/memory-legalizer-private-system.ll
    llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll
    llvm/test/CodeGen/AMDGPU/memory-legalizer-private-wavefront.ll
    llvm/test/CodeGen/AMDGPU/memory-legalizer-private-workgroup.ll
    llvm/test/CodeGen/AMDGPU/min.ll
    llvm/test/CodeGen/AMDGPU/minmax.ll
    llvm/test/CodeGen/AMDGPU/mul.ll
    llvm/test/CodeGen/AMDGPU/packed-fp32.ll
    llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs.ll
    llvm/test/CodeGen/AMDGPU/preload-kernargs.ll
    llvm/test/CodeGen/AMDGPU/reassoc-mul-add-1-to-mad.ll
    llvm/test/CodeGen/AMDGPU/s-cluster-barrier.ll
    llvm/test/CodeGen/AMDGPU/scale-offset-flat.ll
    llvm/test/CodeGen/AMDGPU/scale-offset-global.ll
    llvm/test/CodeGen/AMDGPU/scale-offset-scratch.ll
    llvm/test/CodeGen/AMDGPU/scale-offset-smem.ll
    llvm/test/CodeGen/AMDGPU/sub_u64.ll
    llvm/test/CodeGen/AMDGPU/v_ashr_pk.ll
    llvm/test/CodeGen/AMDGPU/wait-before-stores-with-scope_sys.ll
    llvm/test/CodeGen/AMDGPU/whole-wave-functions.ll
    llvm/test/CodeGen/AMDGPU/workgroup-id-in-arch-sgprs.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
index aa5ea77f17291..ffbb111d42221 100644
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -772,6 +772,17 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
                                          PreloadedScratchRsrcReg,
                                          ScratchRsrcReg, ScratchWaveOffsetReg);
   }
+
+  if (ST.hasWaitXCnt()) {
+    // Set REPLAY_MODE (bit 25) in MODE register to enable multi-group XNACK
+    // replay. This aligns hardware behavior with the compiler's s_wait_xcnt
+    // insertion logic, which assumes multi-group mode by default.
+    unsigned RegEncoding =
+        AMDGPU::Hwreg::HwregEncoding::encode(AMDGPU::Hwreg::ID_MODE, 25, 1);
+    BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SETREG_IMM32_B32))
+        .addImm(1)
+        .addImm(RegEncoding);
+  }
 }
 
 // Emit scratch RSRC setup code, assuming `ScratchRsrcReg != AMDGPU::NoReg`

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll
index b5d593a9c15ed..58586129fb4e9 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll
@@ -41,6 +41,7 @@ define amdgpu_kernel void @raw_buffer_atomic_add_noret_f64(<4 x i32> %rsrc, doub
 ;
 ; GFX1250-LABEL: raw_buffer_atomic_add_noret_f64:
 ; GFX1250:       ; %bb.0: ; %main_body
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    s_load_b96 s[8:10], s[4:5], 0x34
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
@@ -71,6 +72,7 @@ define amdgpu_ps void @raw_buffer_atomic_add_rtn_f64(<4 x i32> inreg %rsrc, doub
 ;
 ; GFX1250-LABEL: raw_buffer_atomic_add_rtn_f64:
 ; GFX1250:       ; %bb.0: ; %main_body
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    buffer_atomic_add_f64 v[0:1], v2, s[0:3], null offen th:TH_ATOMIC_RETURN
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
 ; GFX1250-NEXT:    flat_store_b64 v[0:1], v[0:1]
@@ -114,6 +116,7 @@ define amdgpu_kernel void @raw_buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> %rsr
 ;
 ; GFX1250-LABEL: raw_buffer_atomic_add_rtn_f64_off4_slc:
 ; GFX1250:       ; %bb.0: ; %main_body
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    s_load_b96 s[8:10], s[4:5], 0x34
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
@@ -160,6 +163,7 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_add_noret_f64(ptr addrspace(8)
 ;
 ; GFX1250-LABEL: raw_ptr_buffer_atomic_add_noret_f64:
 ; GFX1250:       ; %bb.0: ; %main_body
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    s_load_b96 s[8:10], s[4:5], 0x34
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
@@ -190,6 +194,7 @@ define amdgpu_ps void @raw_ptr_buffer_atomic_add_rtn_f64(ptr addrspace(8) inreg
 ;
 ; GFX1250-LABEL: raw_ptr_buffer_atomic_add_rtn_f64:
 ; GFX1250:       ; %bb.0: ; %main_body
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    buffer_atomic_add_f64 v[0:1], v2, s[0:3], null offen th:TH_ATOMIC_RETURN
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
 ; GFX1250-NEXT:    flat_store_b64 v[0:1], v[0:1]
@@ -233,6 +238,7 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_add_rtn_f64_off4_slc(ptr addrsp
 ;
 ; GFX1250-LABEL: raw_ptr_buffer_atomic_add_rtn_f64_off4_slc:
 ; GFX1250:       ; %bb.0: ; %main_body
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    s_load_b96 s[8:10], s[4:5], 0x34
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
@@ -279,6 +285,7 @@ define amdgpu_kernel void @struct_buffer_atomic_add_noret_f64(<4 x i32> %rsrc, d
 ;
 ; GFX1250-LABEL: struct_buffer_atomic_add_noret_f64:
 ; GFX1250:       ; %bb.0: ; %main_body
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    s_load_b96 s[8:10], s[4:5], 0x34
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
@@ -309,6 +316,7 @@ define amdgpu_ps void @struct_buffer_atomic_add_rtn_f64(<4 x i32> inreg %rsrc, d
 ;
 ; GFX1250-LABEL: struct_buffer_atomic_add_rtn_f64:
 ; GFX1250:       ; %bb.0: ; %main_body
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    buffer_atomic_add_f64 v[0:1], v2, s[0:3], null idxen th:TH_ATOMIC_RETURN
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
 ; GFX1250-NEXT:    flat_store_b64 v[0:1], v[0:1]
@@ -352,6 +360,7 @@ define amdgpu_kernel void @struct_buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> %
 ;
 ; GFX1250-LABEL: struct_buffer_atomic_add_rtn_f64_off4_slc:
 ; GFX1250:       ; %bb.0: ; %main_body
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    s_load_b96 s[8:10], s[4:5], 0x34
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
@@ -397,6 +406,7 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_add_noret_f64(ptr addrspace(
 ;
 ; GFX1250-LABEL: struct_ptr_buffer_atomic_add_noret_f64:
 ; GFX1250:       ; %bb.0: ; %main_body
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    s_load_b96 s[8:10], s[4:5], 0x34
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
@@ -427,6 +437,7 @@ define amdgpu_ps void @struct_ptr_buffer_atomic_add_rtn_f64(ptr addrspace(8) inr
 ;
 ; GFX1250-LABEL: struct_ptr_buffer_atomic_add_rtn_f64:
 ; GFX1250:       ; %bb.0: ; %main_body
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    buffer_atomic_add_f64 v[0:1], v2, s[0:3], null idxen th:TH_ATOMIC_RETURN
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
 ; GFX1250-NEXT:    flat_store_b64 v[0:1], v[0:1]
@@ -470,6 +481,7 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_add_rtn_f64_off4_slc(ptr add
 ;
 ; GFX1250-LABEL: struct_ptr_buffer_atomic_add_rtn_f64_off4_slc:
 ; GFX1250:       ; %bb.0: ; %main_body
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    s_load_b96 s[8:10], s[4:5], 0x34
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
@@ -515,6 +527,7 @@ define amdgpu_kernel void @raw_buffer_atomic_min_noret_f64(<4 x i32> %rsrc, doub
 ;
 ; GFX1250-LABEL: raw_buffer_atomic_min_noret_f64:
 ; GFX1250:       ; %bb.0: ; %main_body
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    s_load_b96 s[8:10], s[4:5], 0x34
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
@@ -545,6 +558,7 @@ define amdgpu_ps void @raw_buffer_atomic_min_rtn_f64(<4 x i32> inreg %rsrc, doub
 ;
 ; GFX1250-LABEL: raw_buffer_atomic_min_rtn_f64:
 ; GFX1250:       ; %bb.0: ; %main_body
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    buffer_atomic_min_num_f64 v[0:1], v2, s[0:3], null offen th:TH_ATOMIC_RETURN
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
 ; GFX1250-NEXT:    flat_store_b64 v[0:1], v[0:1]
@@ -588,6 +602,7 @@ define amdgpu_kernel void @raw_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> %rsr
 ;
 ; GFX1250-LABEL: raw_buffer_atomic_min_rtn_f64_off4_slc:
 ; GFX1250:       ; %bb.0: ; %main_body
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    s_load_b96 s[8:10], s[4:5], 0x34
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
@@ -634,6 +649,7 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_noret_f64(ptr addrspace(8)
 ;
 ; GFX1250-LABEL: raw_ptr_buffer_atomic_min_noret_f64:
 ; GFX1250:       ; %bb.0: ; %main_body
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    s_load_b96 s[8:10], s[4:5], 0x34
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
@@ -664,6 +680,7 @@ define amdgpu_ps void @raw_ptr_buffer_atomic_min_rtn_f64(ptr addrspace(8) inreg
 ;
 ; GFX1250-LABEL: raw_ptr_buffer_atomic_min_rtn_f64:
 ; GFX1250:       ; %bb.0: ; %main_body
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    buffer_atomic_min_num_f64 v[0:1], v2, s[0:3], null offen th:TH_ATOMIC_RETURN
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
 ; GFX1250-NEXT:    flat_store_b64 v[0:1], v[0:1]
@@ -707,6 +724,7 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_rtn_f64_off4_slc(ptr addrsp
 ;
 ; GFX1250-LABEL: raw_ptr_buffer_atomic_min_rtn_f64_off4_slc:
 ; GFX1250:       ; %bb.0: ; %main_body
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    s_load_b96 s[8:10], s[4:5], 0x34
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
@@ -753,6 +771,7 @@ define amdgpu_kernel void @struct_buffer_atomic_min_noret_f64(<4 x i32> %rsrc, d
 ;
 ; GFX1250-LABEL: struct_buffer_atomic_min_noret_f64:
 ; GFX1250:       ; %bb.0: ; %main_body
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    s_load_b96 s[8:10], s[4:5], 0x34
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
@@ -783,6 +802,7 @@ define amdgpu_ps void @struct_buffer_atomic_min_rtn_f64(<4 x i32> inreg %rsrc, d
 ;
 ; GFX1250-LABEL: struct_buffer_atomic_min_rtn_f64:
 ; GFX1250:       ; %bb.0: ; %main_body
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    buffer_atomic_min_num_f64 v[0:1], v2, s[0:3], null idxen th:TH_ATOMIC_RETURN
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
 ; GFX1250-NEXT:    flat_store_b64 v[0:1], v[0:1]
@@ -826,6 +846,7 @@ define amdgpu_kernel void @struct_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> %
 ;
 ; GFX1250-LABEL: struct_buffer_atomic_min_rtn_f64_off4_slc:
 ; GFX1250:       ; %bb.0: ; %main_body
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    s_load_b96 s[8:10], s[4:5], 0x34
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
@@ -871,6 +892,7 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_min_noret_f64(ptr addrspace(
 ;
 ; GFX1250-LABEL: struct_ptr_buffer_atomic_min_noret_f64:
 ; GFX1250:       ; %bb.0: ; %main_body
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    s_load_b96 s[8:10], s[4:5], 0x34
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
@@ -901,6 +923,7 @@ define amdgpu_ps void @struct_ptr_buffer_atomic_min_rtn_f64(ptr addrspace(8) inr
 ;
 ; GFX1250-LABEL: struct_ptr_buffer_atomic_min_rtn_f64:
 ; GFX1250:       ; %bb.0: ; %main_body
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    buffer_atomic_min_num_f64 v[0:1], v2, s[0:3], null idxen th:TH_ATOMIC_RETURN
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
 ; GFX1250-NEXT:    flat_store_b64 v[0:1], v[0:1]
@@ -944,6 +967,7 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_min_rtn_f64_off4_slc(ptr add
 ;
 ; GFX1250-LABEL: struct_ptr_buffer_atomic_min_rtn_f64_off4_slc:
 ; GFX1250:       ; %bb.0: ; %main_body
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    s_load_b96 s[8:10], s[4:5], 0x34
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
@@ -989,6 +1013,7 @@ define amdgpu_kernel void @raw_buffer_atomic_max_noret_f64(<4 x i32> %rsrc, doub
 ;
 ; GFX1250-LABEL: raw_buffer_atomic_max_noret_f64:
 ; GFX1250:       ; %bb.0: ; %main_body
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    s_load_b96 s[8:10], s[4:5], 0x34
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
@@ -1019,6 +1044,7 @@ define amdgpu_ps void @raw_buffer_atomic_max_rtn_f64(<4 x i32> inreg %rsrc, doub
 ;
 ; GFX1250-LABEL: raw_buffer_atomic_max_rtn_f64:
 ; GFX1250:       ; %bb.0: ; %main_body
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    buffer_atomic_max_num_f64 v[0:1], v2, s[0:3], null offen th:TH_ATOMIC_RETURN
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
 ; GFX1250-NEXT:    flat_store_b64 v[0:1], v[0:1]
@@ -1062,6 +1088,7 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> %rsr
 ;
 ; GFX1250-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc:
 ; GFX1250:       ; %bb.0: ; %main_body
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    s_load_b96 s[8:10], s[4:5], 0x34
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
@@ -1108,6 +1135,7 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_noret_f64(ptr addrspace(8)
 ;
 ; GFX1250-LABEL: raw_ptr_buffer_atomic_max_noret_f64:
 ; GFX1250:       ; %bb.0: ; %main_body
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    s_load_b96 s[8:10], s[4:5], 0x34
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
@@ -1138,6 +1166,7 @@ define amdgpu_ps void @raw_ptr_buffer_atomic_max_rtn_f64(ptr addrspace(8) inreg
 ;
 ; GFX1250-LABEL: raw_ptr_buffer_atomic_max_rtn_f64:
 ; GFX1250:       ; %bb.0: ; %main_body
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    buffer_atomic_max_num_f64 v[0:1], v2, s[0:3], null offen th:TH_ATOMIC_RETURN
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
 ; GFX1250-NEXT:    flat_store_b64 v[0:1], v[0:1]
@@ -1181,6 +1210,7 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr addrsp
 ;
 ; GFX1250-LABEL: raw_ptr_buffer_atomic_max_rtn_f64_off4_slc:
 ; GFX1250:       ; %bb.0: ; %main_body
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    s_load_b96 s[8:10], s[4:5], 0x34
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
@@ -1227,6 +1257,7 @@ define amdgpu_kernel void @struct_buffer_atomic_max_noret_f64(<4 x i32> %rsrc, d
 ;
 ; GFX1250-LABEL: struct_buffer_atomic_max_noret_f64:
 ; GFX1250:       ; %bb.0: ; %main_body
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    s_load_b96 s[8:10], s[4:5], 0x34
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
@@ -1257,6 +1288,7 @@ define amdgpu_ps void @struct_buffer_atomic_max_rtn_f64(<4 x i32> inreg %rsrc, d
 ;
 ; GFX1250-LABEL: struct_buffer_atomic_max_rtn_f64:
 ; GFX1250:       ; %bb.0: ; %main_body
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    buffer_atomic_max_num_f64 v[0:1], v2, s[0:3], null idxen th:TH_ATOMIC_RETURN
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
 ; GFX1250-NEXT:    flat_store_b64 v[0:1], v[0:1]
@@ -1300,6 +1332,7 @@ define amdgpu_kernel void @struct_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> %
 ;
 ; GFX1250-LABEL: struct_buffer_atomic_max_rtn_f64_off4_slc:
 ; GFX1250:       ; %bb.0: ; %main_body
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    s_load_b96 s[8:10], s[4:5], 0x34
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
@@ -1345,6 +1378,7 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_max_noret_f64(ptr addrspace(
 ;
 ; GFX1250-LABEL: struct_ptr_buffer_atomic_max_noret_f64:
 ; GFX1250:       ; %bb.0: ; %main_body
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    s_load_b96 s[8:10], s[4:5], 0x34
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
@@ -1375,6 +1409,7 @@ define amdgpu_ps void @struct_ptr_buffer_atomic_max_rtn_f64(ptr addrspace(8) inr
 ;
 ; GFX1250-LABEL: struct_ptr_buffer_atomic_max_rtn_f64:
 ; GFX1250:       ; %bb.0: ; %main_body
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    buffer_atomic_max_num_f64 v[0:1], v2, s[0:3], null idxen th:TH_ATOMIC_RETURN
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
 ; GFX1250-NEXT:    flat_store_b64 v[0:1], v[0:1]
@@ -1418,6 +1453,7 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr add
 ;
 ; GFX1250-LABEL: struct_ptr_buffer_atomic_max_rtn_f64_off4_slc:
 ; GFX1250:       ; %bb.0: ; %main_body
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    s_load_b96 s[8:10], s[4:5], 0x34
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
@@ -1486,6 +1522,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat(ptr addrspace(1) %pt
 ;
 ; GFX1250-LABEL: global_atomic_fadd_f64_noret_pat:
 ; GFX1250:       ; %bb.0: ; %main_body
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1250-NEXT:    s_mov_b32 s1, exec_lo
 ; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, s0, 0
@@ -1558,6 +1595,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent(ptr addrspace(
 ;
 ; GFX1250-LABEL: global_atomic_fadd_f64_noret_pat_agent:
 ; GFX1250:       ; %bb.0: ; %main_body
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1250-NEXT:    s_mov_b32 s1, exec_lo
 ; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, s0, 0
@@ -1632,6 +1670,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_system(ptr addrspace
 ;
 ; GFX1250-LABEL: global_atomic_fadd_f64_noret_pat_system:
 ; GFX1250:       ; %bb.0: ; %main_body
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1250-NEXT:    s_mov_b32 s1, exec_lo
 ; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, s0, 0
@@ -1704,6 +1743,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_flush(ptr addrspace(
 ;
 ; GFX1250-LABEL: global_atomic_fadd_f64_noret_pat_flush:
 ; GFX1250:       ; %bb.0: ; %main_body
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1250-NEXT:    s_mov_b32 s1, exec_lo
 ; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, s0, 0
@@ -1894,6 +1934,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent_safe(ptr addrs
 ;
 ; GFX1250-LABEL: global_atomic_fadd_f64_noret_pat_agent_safe:
 ; GFX1250:       ; %bb.0: ; %main_body
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1250-NEXT:    s_mov_b32 s1, exec_lo
 ; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, s0, 0
@@ -1949,6 +1990,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat(ptr %ptr) #1 {
 ;
 ; GFX1250-LABEL: flat_atomic_fadd_f64_noret_pat:
 ; GFX1250:       ; %bb.0: ; %main_body
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1250-NEXT:    v_mov_b64_e32 v[0:1], 4.0
 ; GFX1250-NEXT:    v_mov_b32_e32 v2, 0
@@ -1991,6 +2033,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent(ptr %ptr) #1 {
 ;
 ; GFX1250-LABEL: flat_atomic_fadd_f64_noret_pat_agent:
 ; GFX1250:       ; %bb.0: ; %main_body
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1250-NEXT:    v_mov_b64_e32 v[0:1], 4.0
 ; GFX1250-NEXT:    v_mov_b32_e32 v2, 0
@@ -2035,6 +2078,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_system(ptr %ptr) #1 {
 ;
 ; GFX1250-LABEL: flat_atomic_fadd_f64_noret_pat_system:
 ; GFX1250:       ; %bb.0: ; %main_body
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1250-NEXT:    v_mov_b64_e32 v[0:1], 4.0
 ; GFX1250-NEXT:    v_mov_b32_e32 v2, 0
@@ -2197,6 +2241,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent_safe(ptr %ptr) {
 ;
 ; GFX1250-LABEL: flat_atomic_fadd_f64_noret_pat_agent_safe:
 ; GFX1250:       ; %bb.0: ; %main_body
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1250-NEXT:    v_mov_b64_e32 v[0:1], 4.0
 ; GFX1250-NEXT:    v_mov_b32_e32 v2, 0
@@ -2255,6 +2300,7 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat(ptr addrspace(3) %ptr
 ;
 ; GFX1250-LABEL: local_atomic_fadd_f64_noret_pat:
 ; GFX1250:       ; %bb.0: ; %main_body
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1250-NEXT:    s_mov_b32 s1, exec_lo
 ; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, s0, 0
@@ -2320,6 +2366,7 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush(ptr addrspace(3
 ;
 ; GFX1250-LABEL: local_atomic_fadd_f64_noret_pat_flush:
 ; GFX1250:       ; %bb.0: ; %main_body
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1250-NEXT:    s_mov_b32 s1, exec_lo
 ; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, s0, 0
@@ -2385,6 +2432,7 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush_safe(ptr addrsp
 ;
 ; GFX1250-LABEL: local_atomic_fadd_f64_noret_pat_flush_safe:
 ; GFX1250:       ; %bb.0: ; %main_body
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_mov_b32 s0, exec_lo
 ; GFX1250-NEXT:    s_mov_b32 s1, exec_lo
 ; GFX1250-NEXT:    v_mbcnt_lo_u32_b32 v0, s0, 0

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.abs.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.abs.ll
index 6facdfdec64ae..d16dc348209e2 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.abs.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.abs.ll
@@ -34,6 +34,7 @@ define amdgpu_cs i16 @abs_sgpr_i16(i16 inreg %arg) {
 ;
 ; GFX1250-LABEL: abs_sgpr_i16:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_sext_i32_i16 s0, s0
 ; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1250-NEXT:    s_abs_i32 s0, s0
@@ -43,10 +44,26 @@ define amdgpu_cs i16 @abs_sgpr_i16(i16 inreg %arg) {
 }
 
 define amdgpu_cs i32 @abs_sgpr_i32(i32 inreg %arg) {
-; GFX-LABEL: abs_sgpr_i32:
-; GFX:       ; %bb.0:
-; GFX-NEXT:    s_abs_i32 s0, s0
-; GFX-NEXT:    ; return to shader part epilog
+; GFX6-LABEL: abs_sgpr_i32:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_abs_i32 s0, s0
+; GFX6-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: abs_sgpr_i32:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_abs_i32 s0, s0
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX10-LABEL: abs_sgpr_i32:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_abs_i32 s0, s0
+; GFX10-NEXT:    ; return to shader part epilog
+;
+; GFX1250-LABEL: abs_sgpr_i32:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-NEXT:    s_abs_i32 s0, s0
+; GFX1250-NEXT:    ; return to shader part epilog
   %res = call i32 @llvm.abs.i32(i32 %arg, i1 false)
   ret i32 %res
 }
@@ -81,6 +98,7 @@ define amdgpu_cs i64 @abs_sgpr_i64(i64 inreg %arg) {
 ;
 ; GFX1250-LABEL: abs_sgpr_i64:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_ashr_i32 s2, s1, 31
 ; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX1250-NEXT:    s_mov_b32 s3, s2
@@ -93,13 +111,38 @@ define amdgpu_cs i64 @abs_sgpr_i64(i64 inreg %arg) {
 }
 
 define amdgpu_cs <4 x i32> @abs_sgpr_v4i32(<4 x i32> inreg %arg) {
-; GFX-LABEL: abs_sgpr_v4i32:
-; GFX:       ; %bb.0:
-; GFX-NEXT:    s_abs_i32 s0, s0
-; GFX-NEXT:    s_abs_i32 s1, s1
-; GFX-NEXT:    s_abs_i32 s2, s2
-; GFX-NEXT:    s_abs_i32 s3, s3
-; GFX-NEXT:    ; return to shader part epilog
+; GFX6-LABEL: abs_sgpr_v4i32:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_abs_i32 s0, s0
+; GFX6-NEXT:    s_abs_i32 s1, s1
+; GFX6-NEXT:    s_abs_i32 s2, s2
+; GFX6-NEXT:    s_abs_i32 s3, s3
+; GFX6-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: abs_sgpr_v4i32:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_abs_i32 s0, s0
+; GFX8-NEXT:    s_abs_i32 s1, s1
+; GFX8-NEXT:    s_abs_i32 s2, s2
+; GFX8-NEXT:    s_abs_i32 s3, s3
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX10-LABEL: abs_sgpr_v4i32:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_abs_i32 s0, s0
+; GFX10-NEXT:    s_abs_i32 s1, s1
+; GFX10-NEXT:    s_abs_i32 s2, s2
+; GFX10-NEXT:    s_abs_i32 s3, s3
+; GFX10-NEXT:    ; return to shader part epilog
+;
+; GFX1250-LABEL: abs_sgpr_v4i32:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-NEXT:    s_abs_i32 s0, s0
+; GFX1250-NEXT:    s_abs_i32 s1, s1
+; GFX1250-NEXT:    s_abs_i32 s2, s2
+; GFX1250-NEXT:    s_abs_i32 s3, s3
+; GFX1250-NEXT:    ; return to shader part epilog
   %res = call <4 x i32> @llvm.abs.v4i32(<4 x i32> %arg, i1 false)
   ret <4 x i32> %res
 }
@@ -278,13 +321,38 @@ define <4 x i32> @abs_vgpr_v4i32(<4 x i32> %arg) {
 }
 
 define amdgpu_cs <2 x i8> @abs_sgpr_v2i8(<2 x i8> inreg %arg) {
-; GFX-LABEL: abs_sgpr_v2i8:
-; GFX:       ; %bb.0:
-; GFX-NEXT:    s_sext_i32_i8 s0, s0
-; GFX-NEXT:    s_sext_i32_i8 s1, s1
-; GFX-NEXT:    s_abs_i32 s0, s0
-; GFX-NEXT:    s_abs_i32 s1, s1
-; GFX-NEXT:    ; return to shader part epilog
+; GFX6-LABEL: abs_sgpr_v2i8:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_sext_i32_i8 s0, s0
+; GFX6-NEXT:    s_sext_i32_i8 s1, s1
+; GFX6-NEXT:    s_abs_i32 s0, s0
+; GFX6-NEXT:    s_abs_i32 s1, s1
+; GFX6-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: abs_sgpr_v2i8:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_sext_i32_i8 s0, s0
+; GFX8-NEXT:    s_sext_i32_i8 s1, s1
+; GFX8-NEXT:    s_abs_i32 s0, s0
+; GFX8-NEXT:    s_abs_i32 s1, s1
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX10-LABEL: abs_sgpr_v2i8:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_sext_i32_i8 s0, s0
+; GFX10-NEXT:    s_sext_i32_i8 s1, s1
+; GFX10-NEXT:    s_abs_i32 s0, s0
+; GFX10-NEXT:    s_abs_i32 s1, s1
+; GFX10-NEXT:    ; return to shader part epilog
+;
+; GFX1250-LABEL: abs_sgpr_v2i8:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-NEXT:    s_sext_i32_i8 s0, s0
+; GFX1250-NEXT:    s_sext_i32_i8 s1, s1
+; GFX1250-NEXT:    s_abs_i32 s0, s0
+; GFX1250-NEXT:    s_abs_i32 s1, s1
+; GFX1250-NEXT:    ; return to shader part epilog
   %res = call <2 x i8> @llvm.abs.v2i8(<2 x i8> %arg, i1 false)
   ret <2 x i8> %res
 }
@@ -340,15 +408,46 @@ define <2 x i8> @abs_vgpr_v2i8(<2 x i8> %arg) {
 }
 
 define amdgpu_cs <3 x i8> @abs_sgpr_v3i8(<3 x i8> inreg %arg) {
-; GFX-LABEL: abs_sgpr_v3i8:
-; GFX:       ; %bb.0:
-; GFX-NEXT:    s_sext_i32_i8 s0, s0
-; GFX-NEXT:    s_sext_i32_i8 s1, s1
-; GFX-NEXT:    s_sext_i32_i8 s2, s2
-; GFX-NEXT:    s_abs_i32 s0, s0
-; GFX-NEXT:    s_abs_i32 s1, s1
-; GFX-NEXT:    s_abs_i32 s2, s2
-; GFX-NEXT:    ; return to shader part epilog
+; GFX6-LABEL: abs_sgpr_v3i8:
+; GFX6:       ; %bb.0:
+; GFX6-NEXT:    s_sext_i32_i8 s0, s0
+; GFX6-NEXT:    s_sext_i32_i8 s1, s1
+; GFX6-NEXT:    s_sext_i32_i8 s2, s2
+; GFX6-NEXT:    s_abs_i32 s0, s0
+; GFX6-NEXT:    s_abs_i32 s1, s1
+; GFX6-NEXT:    s_abs_i32 s2, s2
+; GFX6-NEXT:    ; return to shader part epilog
+;
+; GFX8-LABEL: abs_sgpr_v3i8:
+; GFX8:       ; %bb.0:
+; GFX8-NEXT:    s_sext_i32_i8 s0, s0
+; GFX8-NEXT:    s_sext_i32_i8 s1, s1
+; GFX8-NEXT:    s_sext_i32_i8 s2, s2
+; GFX8-NEXT:    s_abs_i32 s0, s0
+; GFX8-NEXT:    s_abs_i32 s1, s1
+; GFX8-NEXT:    s_abs_i32 s2, s2
+; GFX8-NEXT:    ; return to shader part epilog
+;
+; GFX10-LABEL: abs_sgpr_v3i8:
+; GFX10:       ; %bb.0:
+; GFX10-NEXT:    s_sext_i32_i8 s0, s0
+; GFX10-NEXT:    s_sext_i32_i8 s1, s1
+; GFX10-NEXT:    s_sext_i32_i8 s2, s2
+; GFX10-NEXT:    s_abs_i32 s0, s0
+; GFX10-NEXT:    s_abs_i32 s1, s1
+; GFX10-NEXT:    s_abs_i32 s2, s2
+; GFX10-NEXT:    ; return to shader part epilog
+;
+; GFX1250-LABEL: abs_sgpr_v3i8:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-NEXT:    s_sext_i32_i8 s0, s0
+; GFX1250-NEXT:    s_sext_i32_i8 s1, s1
+; GFX1250-NEXT:    s_sext_i32_i8 s2, s2
+; GFX1250-NEXT:    s_abs_i32 s0, s0
+; GFX1250-NEXT:    s_abs_i32 s1, s1
+; GFX1250-NEXT:    s_abs_i32 s2, s2
+; GFX1250-NEXT:    ; return to shader part epilog
   %res = call <3 x i8> @llvm.abs.v3i8(<3 x i8> %arg, i1 false)
   ret <3 x i8> %res
 }
@@ -446,6 +545,7 @@ define amdgpu_cs <2 x i16> @abs_sgpr_v2i16(<2 x i16> inreg %arg) {
 ;
 ; GFX1250-LABEL: abs_sgpr_v2i16:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_sext_i32_i16 s1, s0
 ; GFX1250-NEXT:    s_ashr_i32 s0, s0, 16
 ; GFX1250-NEXT:    s_abs_i32 s1, s1
@@ -536,6 +636,7 @@ define amdgpu_cs <3 x i16> @abs_sgpr_v3i16(<3 x i16> inreg %arg) {
 ;
 ; GFX1250-LABEL: abs_sgpr_v3i16:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_sext_i32_i16 s2, s0
 ; GFX1250-NEXT:    s_ashr_i32 s0, s0, 16
 ; GFX1250-NEXT:    s_abs_i32 s2, s2
@@ -598,3 +699,5 @@ define <3 x i16> @abs_vgpr_v3i16(<3 x i16> %arg) {
   %res = call <3 x i16> @llvm.abs.v3i16(<3 x i16> %arg, i1 false)
   ret <3 x i16> %res
 }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX: {{.*}}

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll
index efa51ead0d196..1de5e136c400d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll
@@ -930,6 +930,7 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align1(ptr addrspace(4) inreg
 ;
 ; GFX1250-UNALIGNED-LABEL: s_load_constant_v3i32_align1:
 ; GFX1250-UNALIGNED:       ; %bb.0:
+; GFX1250-UNALIGNED-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-UNALIGNED-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-UNALIGNED-NEXT:    global_load_b96 v[0:2], v0, s[0:1]
 ; GFX1250-UNALIGNED-NEXT:    s_wait_loadcnt 0x0
@@ -940,6 +941,7 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align1(ptr addrspace(4) inreg
 ;
 ; GFX1250-NOUNALIGNED-LABEL: s_load_constant_v3i32_align1:
 ; GFX1250-NOUNALIGNED:       ; %bb.0:
+; GFX1250-NOUNALIGNED-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NOUNALIGNED-NEXT:    s_clause 0xb
 ; GFX1250-NOUNALIGNED-NEXT:    s_load_u8 s2, s[0:1], 0x1
 ; GFX1250-NOUNALIGNED-NEXT:    s_load_u8 s3, s[0:1], 0x3
@@ -1208,6 +1210,7 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align2(ptr addrspace(4) inreg
 ;
 ; GFX1250-UNALIGNED-LABEL: s_load_constant_v3i32_align2:
 ; GFX1250-UNALIGNED:       ; %bb.0:
+; GFX1250-UNALIGNED-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-UNALIGNED-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-UNALIGNED-NEXT:    global_load_b96 v[0:2], v0, s[0:1]
 ; GFX1250-UNALIGNED-NEXT:    s_wait_loadcnt 0x0
@@ -1218,6 +1221,7 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align2(ptr addrspace(4) inreg
 ;
 ; GFX1250-NOUNALIGNED-LABEL: s_load_constant_v3i32_align2:
 ; GFX1250-NOUNALIGNED:       ; %bb.0:
+; GFX1250-NOUNALIGNED-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NOUNALIGNED-NEXT:    s_clause 0x5
 ; GFX1250-NOUNALIGNED-NEXT:    s_load_u16 s2, s[0:1], 0x2
 ; GFX1250-NOUNALIGNED-NEXT:    s_load_u16 s3, s[0:1], 0x6
@@ -1362,6 +1366,7 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align4(ptr addrspace(4) inreg
 ;
 ; GFX1250-LABEL: s_load_constant_v3i32_align4:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_mov_b32 s4, s0
 ; GFX1250-NEXT:    s_mov_b32 s5, s1
 ; GFX1250-NEXT:    s_load_b96 s[0:2], s[4:5], 0x0
@@ -1413,6 +1418,7 @@ define amdgpu_ps i96 @s_load_constant_i96_align8(ptr addrspace(4) inreg %ptr) {
 ;
 ; GFX1250-LABEL: s_load_constant_i96_align8:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_mov_b32 s4, s0
 ; GFX1250-NEXT:    s_mov_b32 s5, s1
 ; GFX1250-NEXT:    s_load_b96 s[0:2], s[4:5], 0x0
@@ -1464,6 +1470,7 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align8(ptr addrspace(4) inreg
 ;
 ; GFX1250-LABEL: s_load_constant_v3i32_align8:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_mov_b32 s4, s0
 ; GFX1250-NEXT:    s_mov_b32 s5, s1
 ; GFX1250-NEXT:    s_load_b96 s[0:2], s[4:5], 0x0
@@ -1515,6 +1522,7 @@ define amdgpu_ps <3 x i32> @s_load_constant_v6i16_align8(ptr addrspace(4) inreg
 ;
 ; GFX1250-LABEL: s_load_constant_v6i16_align8:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_mov_b32 s4, s0
 ; GFX1250-NEXT:    s_mov_b32 s5, s1
 ; GFX1250-NEXT:    s_load_b96 s[0:2], s[4:5], 0x0
@@ -1593,6 +1601,7 @@ define amdgpu_ps <12 x i8> @s_load_constant_v12i8_align8(ptr addrspace(4) inreg
 ;
 ; GFX1250-LABEL: s_load_constant_v12i8_align8:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_mov_b32 s4, s0
 ; GFX1250-NEXT:    s_mov_b32 s5, s1
 ; GFX1250-NEXT:    s_load_b96 s[0:2], s[4:5], 0x0
@@ -1670,11 +1679,24 @@ define amdgpu_ps <12 x i8> @s_load_constant_v12i8_align8(ptr addrspace(4) inreg
 }
 
 define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align16(ptr addrspace(4) inreg %ptr) {
-; GFX12-LABEL: s_load_constant_v3i32_align16:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
-; GFX12-NEXT:    s_wait_kmcnt 0x0
-; GFX12-NEXT:    ; return to shader part epilog
+; GFX12-UNALIGNED-LABEL: s_load_constant_v3i32_align16:
+; GFX12-UNALIGNED:       ; %bb.0:
+; GFX12-UNALIGNED-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-UNALIGNED-NEXT:    s_wait_kmcnt 0x0
+; GFX12-UNALIGNED-NEXT:    ; return to shader part epilog
+;
+; GFX12-NOUNALIGNED-LABEL: s_load_constant_v3i32_align16:
+; GFX12-NOUNALIGNED:       ; %bb.0:
+; GFX12-NOUNALIGNED-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX12-NOUNALIGNED-NEXT:    s_wait_kmcnt 0x0
+; GFX12-NOUNALIGNED-NEXT:    ; return to shader part epilog
+;
+; GFX1250-LABEL: s_load_constant_v3i32_align16:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    ; return to shader part epilog
 ;
 ; GCN-LABEL: s_load_constant_v3i32_align16:
 ; GCN:       ; %bb.0:
@@ -1684,3 +1706,5 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align16(ptr addrspace(4) inreg
   %load = load <3 x i32>, ptr addrspace(4) %ptr, align 16
   ret <3 x i32> %load
 }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GFX12: {{.*}}

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/minmaxabs-i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/minmaxabs-i64.ll
index 43c8f46f98cfc..62b8b55072089 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/minmaxabs-i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/minmaxabs-i64.ll
@@ -132,6 +132,7 @@ define i64 @test_abs_i64(i64 %a) {
 define amdgpu_ps i64 @test_umin_i64_s(i64 inreg %a, i64 inreg %b) {
 ; CHECK-LABEL: test_umin_i64_s:
 ; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; CHECK-NEXT:    v_min_u64 v[0:1], s[0:1], s[2:3]
 ; CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; CHECK-NEXT:    v_readfirstlane_b32 s0, v0
@@ -144,6 +145,7 @@ define amdgpu_ps i64 @test_umin_i64_s(i64 inreg %a, i64 inreg %b) {
 define amdgpu_ps i64 @test_umax_i64_s(i64 inreg %a, i64 inreg %b) {
 ; CHECK-LABEL: test_umax_i64_s:
 ; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; CHECK-NEXT:    v_max_u64 v[0:1], s[0:1], s[2:3]
 ; CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; CHECK-NEXT:    v_readfirstlane_b32 s0, v0
@@ -156,6 +158,7 @@ define amdgpu_ps i64 @test_umax_i64_s(i64 inreg %a, i64 inreg %b) {
 define amdgpu_ps i64 @test_smin_i64_s(i64 inreg %a, i64 inreg %b) {
 ; CHECK-LABEL: test_smin_i64_s:
 ; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; CHECK-NEXT:    v_min_i64 v[0:1], s[0:1], s[2:3]
 ; CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; CHECK-NEXT:    v_readfirstlane_b32 s0, v0
@@ -168,6 +171,7 @@ define amdgpu_ps i64 @test_smin_i64_s(i64 inreg %a, i64 inreg %b) {
 define amdgpu_ps i64 @test_smax_i64_s(i64 inreg %a, i64 inreg %b) {
 ; CHECK-LABEL: test_smax_i64_s:
 ; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; CHECK-NEXT:    v_max_i64 v[0:1], s[0:1], s[2:3]
 ; CHECK-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; CHECK-NEXT:    v_readfirstlane_b32 s0, v0
@@ -180,6 +184,7 @@ define amdgpu_ps i64 @test_smax_i64_s(i64 inreg %a, i64 inreg %b) {
 define amdgpu_ps i64 @test_abs_i64_s(i64 inreg %a) {
 ; CHECK-LABEL: test_abs_i64_s:
 ; CHECK:       ; %bb.0:
+; CHECK-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; CHECK-NEXT:    s_ashr_i32 s2, s1, 31
 ; CHECK-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; CHECK-NEXT:    s_mov_b32 s3, s2

diff  --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
index 3eecaccf0308f..34e4931674cce 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mul.ll
@@ -26,6 +26,7 @@ define amdgpu_ps i16 @s_mul_i16(i16 inreg %num, i16 inreg %den) {
 ;
 ; GFX1250-LABEL: s_mul_i16:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_mul_i32 s0, s0, s1
 ; GFX1250-NEXT:    ; return to shader part epilog
   %result = mul i16 %num, %den
@@ -125,6 +126,7 @@ define amdgpu_ps zeroext i16 @s_mul_i16_zeroext(i16 inreg zeroext %num, i16 inre
 ;
 ; GFX1250-LABEL: s_mul_i16_zeroext:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_mul_i32 s0, s0, s1
 ; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1250-NEXT:    s_and_b32 s0, 0xffff, s0
@@ -220,6 +222,7 @@ define amdgpu_ps signext i16 @s_mul_i16_signext(i16 inreg signext %num, i16 inre
 ;
 ; GFX1250-LABEL: s_mul_i16_signext:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_mul_i32 s0, s0, s1
 ; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1250-NEXT:    s_sext_i32_i16 s0, s0
@@ -315,6 +318,7 @@ define amdgpu_ps i32 @s_mul_i32(i32 inreg %num, i32 inreg %den) {
 ;
 ; GFX1250-LABEL: s_mul_i32:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_mul_i32 s0, s0, s1
 ; GFX1250-NEXT:    ; return to shader part epilog
   %result = mul i32 %num, %den
@@ -375,6 +379,7 @@ define amdgpu_ps <2 x i32> @s_mul_v2i32(<2 x i32> inreg %num, <2 x i32> inreg %d
 ;
 ; GFX1250-LABEL: s_mul_v2i32:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_mul_i32 s0, s0, s2
 ; GFX1250-NEXT:    s_mul_i32 s1, s1, s3
 ; GFX1250-NEXT:    ; return to shader part epilog
@@ -474,6 +479,7 @@ define amdgpu_cs i33 @s_mul_i33(i33 inreg %num,  i33 inreg %den) {
 ;
 ; GFX1250-LABEL: s_mul_i33:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_mul_u64 s[0:1], s[0:1], s[2:3]
 ; GFX1250-NEXT:    ; return to shader part epilog
   %result = mul i33 %num, %den
@@ -535,6 +541,7 @@ define amdgpu_ps i64 @s_mul_i64(i64 inreg %num, i64 inreg %den) {
 ;
 ; GFX1250-LABEL: s_mul_i64:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_mul_u64 s[0:1], s[0:1], s[2:3]
 ; GFX1250-NEXT:    ; return to shader part epilog
   %result = mul i64 %num, %den
@@ -712,6 +719,7 @@ define amdgpu_ps <3 x i32> @s_mul_i96(i96 inreg %num, i96 inreg %den) {
 ;
 ; GFX1250-LABEL: s_mul_i96:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_mul_i32 s6, s0, s5
 ; GFX1250-NEXT:    s_mul_i32 s7, s1, s4
 ; GFX1250-NEXT:    s_mul_i32 s2, s2, s3
@@ -1027,6 +1035,7 @@ define amdgpu_ps <4 x i32> @s_mul_i128(i128 inreg %num, i128 inreg %den) {
 ;
 ; GFX1250-LABEL: s_mul_i128:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_mul_i32 s9, s0, s6
 ; GFX1250-NEXT:    s_mul_i32 s11, s1, s5
 ; GFX1250-NEXT:    s_mul_hi_u32 s10, s0, s6
@@ -2218,6 +2227,7 @@ define amdgpu_ps <8 x i32> @s_mul_i256(i256 inreg %num, i256 inreg %den) {
 ;
 ; GFX1250-LABEL: s_mul_i256:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_mul_i32 s17, s0, s10
 ; GFX1250-NEXT:    s_mul_i32 s19, s1, s9
 ; GFX1250-NEXT:    s_mul_hi_u32 s18, s0, s10
@@ -2997,6 +3007,7 @@ define amdgpu_ps void @s_mul_u64_zext_with_vregs(ptr addrspace(1) %out, ptr addr
 ;
 ; GFX1250-LABEL: s_mul_u64_zext_with_vregs:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    global_load_b32 v4, v[2:3], off
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
 ; GFX1250-NEXT:    v_mad_nc_u64_u32 v[2:3], 0x50, v4, 0
@@ -3101,6 +3112,7 @@ define amdgpu_kernel void @s_mul_u64_zext_with_sregs(ptr addrspace(1) %out, ptr
 ;
 ; GFX1250-LABEL: s_mul_u64_zext_with_sregs:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1250-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -3191,6 +3203,7 @@ define amdgpu_ps void @s_mul_u64_sext_with_vregs(ptr addrspace(1) %out, ptr addr
 ;
 ; GFX1250-LABEL: s_mul_u64_sext_with_vregs:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    global_load_b32 v4, v[2:3], off
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
 ; GFX1250-NEXT:    v_mad_nc_i64_i32 v[2:3], 0x50, v4, 0
@@ -3310,6 +3323,7 @@ define amdgpu_kernel void @s_mul_u64_sext_with_sregs(ptr addrspace(1) %out, ptr
 ;
 ; GFX1250-LABEL: s_mul_u64_sext_with_sregs:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1250-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0

diff  --git a/llvm/test/CodeGen/AMDGPU/add-max.ll b/llvm/test/CodeGen/AMDGPU/add-max.ll
index c55137574a9a4..a1513663fb331 100644
--- a/llvm/test/CodeGen/AMDGPU/add-max.ll
+++ b/llvm/test/CodeGen/AMDGPU/add-max.ll
@@ -5,6 +5,7 @@
 define amdgpu_ps float @add_max_u32_vvv(i32 %a, i32 %b, i32 %c) {
 ; GCN-LABEL: add_max_u32_vvv:
 ; GCN:       ; %bb.0:
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    v_add_max_u32 v0, v0, v1, v2
 ; GCN-NEXT:    ; return to shader part epilog
   %add = call i32 @llvm.uadd.sat.i32(i32 %a, i32 %b)
@@ -16,6 +17,7 @@ define amdgpu_ps float @add_max_u32_vvv(i32 %a, i32 %b, i32 %c) {
 define amdgpu_ps float @add_max_u32_svv(i32 inreg %a, i32 %b, i32 %c) {
 ; GCN-LABEL: add_max_u32_svv:
 ; GCN:       ; %bb.0:
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    v_add_max_u32 v0, s0, v0, v1
 ; GCN-NEXT:    ; return to shader part epilog
   %add = call i32 @llvm.uadd.sat.i32(i32 %a, i32 %b)
@@ -27,6 +29,7 @@ define amdgpu_ps float @add_max_u32_svv(i32 inreg %a, i32 %b, i32 %c) {
 define amdgpu_ps float @add_max_u32_ssv(i32 inreg %a, i32 inreg %b, i32 %c) {
 ; GCN-LABEL: add_max_u32_ssv:
 ; GCN:       ; %bb.0:
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    v_add_max_u32 v0, s0, s1, v0
 ; GCN-NEXT:    ; return to shader part epilog
   %add = call i32 @llvm.uadd.sat.i32(i32 %a, i32 %b)
@@ -38,6 +41,7 @@ define amdgpu_ps float @add_max_u32_ssv(i32 inreg %a, i32 inreg %b, i32 %c) {
 define amdgpu_ps float @add_max_u32_sss(i32 inreg %a, i32 inreg %b, i32 inreg %c) {
 ; SDAG-LABEL: add_max_u32_sss:
 ; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; SDAG-NEXT:    v_add_nc_u32_e64 v0, s0, s1 clamp
 ; SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-NEXT:    v_max_u32_e32 v0, s2, v0
@@ -45,6 +49,7 @@ define amdgpu_ps float @add_max_u32_sss(i32 inreg %a, i32 inreg %b, i32 inreg %c
 ;
 ; GISEL-LABEL: add_max_u32_sss:
 ; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-NEXT:    v_add_max_u32 v0, s0, s1, v0
@@ -58,6 +63,7 @@ define amdgpu_ps float @add_max_u32_sss(i32 inreg %a, i32 inreg %b, i32 inreg %c
 define amdgpu_ps float @add_max_u32_vsi(i32 %a, i32 inreg %b) {
 ; GCN-LABEL: add_max_u32_vsi:
 ; GCN:       ; %bb.0:
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    v_add_max_u32 v0, v0, s0, 4
 ; GCN-NEXT:    ; return to shader part epilog
   %add = call i32 @llvm.uadd.sat.i32(i32 %a, i32 %b)
@@ -69,6 +75,7 @@ define amdgpu_ps float @add_max_u32_vsi(i32 %a, i32 inreg %b) {
 define amdgpu_ps float @add_max_u32_svl(i32 inreg %a, i32 %b) {
 ; GCN-LABEL: add_max_u32_svl:
 ; GCN:       ; %bb.0:
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    v_add_max_u32 v0, s0, v0, 0x64
 ; GCN-NEXT:    ; return to shader part epilog
   %add = call i32 @llvm.uadd.sat.i32(i32 %a, i32 %b)
@@ -80,6 +87,7 @@ define amdgpu_ps float @add_max_u32_svl(i32 inreg %a, i32 %b) {
 define amdgpu_ps float @add_max_u32_slv(i32 inreg %a, i32 %b, i32 %c) {
 ; GCN-LABEL: add_max_u32_slv:
 ; GCN:       ; %bb.0:
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    v_add_max_u32 v0, s0, v0, v1
 ; GCN-NEXT:    ; return to shader part epilog
   %add = call i32 @llvm.uadd.sat.i32(i32 %a, i32 %b)
@@ -91,6 +99,7 @@ define amdgpu_ps float @add_max_u32_slv(i32 inreg %a, i32 %b, i32 %c) {
 define amdgpu_ps float @add_max_i32_vvv(i32 %a, i32 %b, i32 %c) {
 ; GCN-LABEL: add_max_i32_vvv:
 ; GCN:       ; %bb.0:
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    v_add_max_i32 v0, v0, v1, v2
 ; GCN-NEXT:    ; return to shader part epilog
   %add = call i32 @llvm.sadd.sat.i32(i32 %a, i32 %b)
@@ -102,6 +111,7 @@ define amdgpu_ps float @add_max_i32_vvv(i32 %a, i32 %b, i32 %c) {
 define amdgpu_ps float @add_min_u32_vvv(i32 %a, i32 %b, i32 %c) {
 ; GCN-LABEL: add_min_u32_vvv:
 ; GCN:       ; %bb.0:
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    v_add_min_u32 v0, v0, v1, v2
 ; GCN-NEXT:    ; return to shader part epilog
   %add = call i32 @llvm.uadd.sat.i32(i32 %a, i32 %b)
@@ -113,6 +123,7 @@ define amdgpu_ps float @add_min_u32_vvv(i32 %a, i32 %b, i32 %c) {
 define amdgpu_ps float @add_min_i32_vvv(i32 %a, i32 %b, i32 %c) {
 ; GCN-LABEL: add_min_i32_vvv:
 ; GCN:       ; %bb.0:
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    v_add_min_i32 v0, v0, v1, v2
 ; GCN-NEXT:    ; return to shader part epilog
   %add = call i32 @llvm.sadd.sat.i32(i32 %a, i32 %b)
@@ -124,6 +135,7 @@ define amdgpu_ps float @add_min_i32_vvv(i32 %a, i32 %b, i32 %c) {
 define amdgpu_ps float @add_max_v2u16_vvv(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c) {
 ; GCN-LABEL: add_max_v2u16_vvv:
 ; GCN:       ; %bb.0:
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    v_pk_add_max_u16 v0, v0, v1, v2
 ; GCN-NEXT:    ; return to shader part epilog
   %add = call <2 x i16> @llvm.uadd.sat.i32(<2 x i16> %a, <2 x i16> %b)
@@ -135,6 +147,7 @@ define amdgpu_ps float @add_max_v2u16_vvv(<2 x i16> %a, <2 x i16> %b, <2 x i16>
 define amdgpu_ps float @add_max_v2u16_svv(<2 x i16> inreg %a, <2 x i16> %b, <2 x i16> %c) {
 ; GCN-LABEL: add_max_v2u16_svv:
 ; GCN:       ; %bb.0:
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    v_pk_add_max_u16 v0, s0, v0, v1
 ; GCN-NEXT:    ; return to shader part epilog
   %add = call <2 x i16> @llvm.uadd.sat.i32(<2 x i16> %a, <2 x i16> %b)
@@ -146,6 +159,7 @@ define amdgpu_ps float @add_max_v2u16_svv(<2 x i16> inreg %a, <2 x i16> %b, <2 x
 define amdgpu_ps float @add_max_v2u16_ssv(<2 x i16> inreg %a, <2 x i16> inreg %b, <2 x i16> %c) {
 ; GCN-LABEL: add_max_v2u16_ssv:
 ; GCN:       ; %bb.0:
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    v_pk_add_max_u16 v0, s0, s1, v0
 ; GCN-NEXT:    ; return to shader part epilog
   %add = call <2 x i16> @llvm.uadd.sat.i32(<2 x i16> %a, <2 x i16> %b)
@@ -157,6 +171,7 @@ define amdgpu_ps float @add_max_v2u16_ssv(<2 x i16> inreg %a, <2 x i16> inreg %b
 define amdgpu_ps float @add_max_v2u16_sss(<2 x i16> inreg %a, <2 x i16> inreg %b, <2 x i16> inreg %c) {
 ; SDAG-LABEL: add_max_v2u16_sss:
 ; SDAG:       ; %bb.0:
+; SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; SDAG-NEXT:    v_pk_add_u16 v0, s0, s1 clamp
 ; SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; SDAG-NEXT:    v_pk_max_u16 v0, v0, s2
@@ -164,6 +179,7 @@ define amdgpu_ps float @add_max_v2u16_sss(<2 x i16> inreg %a, <2 x i16> inreg %b
 ;
 ; GISEL-LABEL: add_max_v2u16_sss:
 ; GISEL:       ; %bb.0:
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_mov_b32_e32 v0, s2
 ; GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-NEXT:    v_pk_add_max_u16 v0, s0, s1, v0
@@ -177,6 +193,7 @@ define amdgpu_ps float @add_max_v2u16_sss(<2 x i16> inreg %a, <2 x i16> inreg %b
 define amdgpu_ps float @add_max_v2u16_vsi(<2 x i16> %a, <2 x i16> inreg %b) {
 ; GCN-LABEL: add_max_v2u16_vsi:
 ; GCN:       ; %bb.0:
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    v_pk_add_max_u16 v0, v0, s0, 4
 ; GCN-NEXT:    ; return to shader part epilog
   %add = call <2 x i16> @llvm.uadd.sat.i32(<2 x i16> %a, <2 x i16> %b)
@@ -188,6 +205,7 @@ define amdgpu_ps float @add_max_v2u16_vsi(<2 x i16> %a, <2 x i16> inreg %b) {
 define amdgpu_ps float @add_max_v2u16_svl(<2 x i16> inreg %a, <2 x i16> %b) {
 ; GCN-LABEL: add_max_v2u16_svl:
 ; GCN:       ; %bb.0:
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    v_pk_add_max_u16 v0, s0, v0, 0x650064
 ; GCN-NEXT:    ; return to shader part epilog
   %add = call <2 x i16> @llvm.uadd.sat.i32(<2 x i16> %a, <2 x i16> %b)
@@ -199,6 +217,7 @@ define amdgpu_ps float @add_max_v2u16_svl(<2 x i16> inreg %a, <2 x i16> %b) {
 define amdgpu_ps float @add_max_v2u16_slv(<2 x i16> inreg %a, <2 x i16> %b) {
 ; GCN-LABEL: add_max_v2u16_slv:
 ; GCN:       ; %bb.0:
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    v_pk_add_max_u16 v0, 0x640064, s0, v0
 ; GCN-NEXT:    ; return to shader part epilog
   %add = call <2 x i16> @llvm.uadd.sat.i32(<2 x i16> %a, <2 x i16> <i16 100, i16 100>)
@@ -210,6 +229,7 @@ define amdgpu_ps float @add_max_v2u16_slv(<2 x i16> inreg %a, <2 x i16> %b) {
 define amdgpu_ps float @add_max_v2s16_vvv(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c) {
 ; GCN-LABEL: add_max_v2s16_vvv:
 ; GCN:       ; %bb.0:
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    v_pk_add_max_i16 v0, v0, v1, v2
 ; GCN-NEXT:    ; return to shader part epilog
   %add = call <2 x i16> @llvm.sadd.sat.i32(<2 x i16> %a, <2 x i16> %b)
@@ -221,6 +241,7 @@ define amdgpu_ps float @add_max_v2s16_vvv(<2 x i16> %a, <2 x i16> %b, <2 x i16>
 define amdgpu_ps float @add_min_v2u16_vvv(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c) {
 ; GCN-LABEL: add_min_v2u16_vvv:
 ; GCN:       ; %bb.0:
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    v_pk_add_min_u16 v0, v0, v1, v2
 ; GCN-NEXT:    ; return to shader part epilog
   %add = call <2 x i16> @llvm.uadd.sat.i32(<2 x i16> %a, <2 x i16> %b)
@@ -232,6 +253,7 @@ define amdgpu_ps float @add_min_v2u16_vvv(<2 x i16> %a, <2 x i16> %b, <2 x i16>
 define amdgpu_ps float @add_min_v2s16_vvv(<2 x i16> %a, <2 x i16> %b, <2 x i16> %c) {
 ; GCN-LABEL: add_min_v2s16_vvv:
 ; GCN:       ; %bb.0:
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    v_pk_add_min_i16 v0, v0, v1, v2
 ; GCN-NEXT:    ; return to shader part epilog
   %add = call <2 x i16> @llvm.sadd.sat.i32(<2 x i16> %a, <2 x i16> %b)

diff  --git a/llvm/test/CodeGen/AMDGPU/add_u64.ll b/llvm/test/CodeGen/AMDGPU/add_u64.ll
index 22acedc4d6e25..bd43737ca5def 100644
--- a/llvm/test/CodeGen/AMDGPU/add_u64.ll
+++ b/llvm/test/CodeGen/AMDGPU/add_u64.ll
@@ -12,6 +12,7 @@ define amdgpu_ps <2 x float> @test_add_u64_vv(i64 %a, i64 %b) {
 ;
 ; GFX1250-LABEL: test_add_u64_vv:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_add_nc_u64_e32 v[0:1], v[0:1], v[2:3]
 ; GFX1250-NEXT:    ; return to shader part epilog
   %add = add i64 %a, %b
@@ -29,6 +30,7 @@ define amdgpu_ps <2 x float> @test_add_u64_vs(i64 %a, i64 inreg %b) {
 ;
 ; GFX1250-LABEL: test_add_u64_vs:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1]
 ; GFX1250-NEXT:    ; return to shader part epilog
   %add = add i64 %a, %b
@@ -46,6 +48,7 @@ define amdgpu_ps <2 x float> @test_add_u64_sv(i64 inreg %a, i64 %b) {
 ;
 ; GFX1250-LABEL: test_add_u64_sv:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1]
 ; GFX1250-NEXT:    ; return to shader part epilog
   %add = add i64 %a, %b
@@ -54,12 +57,20 @@ define amdgpu_ps <2 x float> @test_add_u64_sv(i64 inreg %a, i64 %b) {
 }
 
 define amdgpu_ps <2 x float> @test_add_u64_ss(i64 inreg %a, i64 inreg %b) {
-; GCN-LABEL: test_add_u64_ss:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[2:3]
-; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GCN-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GCN-NEXT:    ; return to shader part epilog
+; GFX12-LABEL: test_add_u64_ss:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[2:3]
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT:    ; return to shader part epilog
+;
+; GFX1250-LABEL: test_add_u64_ss:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[2:3]
+; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX1250-NEXT:    ; return to shader part epilog
   %add = add i64 %a, %b
   %ret = bitcast i64 %add to <2 x float>
   ret <2 x float> %ret
@@ -75,6 +86,7 @@ define amdgpu_ps <2 x float> @test_add_u64_v_inline_lit(i64 %a) {
 ;
 ; GFX1250-LABEL: test_add_u64_v_inline_lit:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_add_nc_u64_e32 v[0:1], 5, v[0:1]
 ; GFX1250-NEXT:    ; return to shader part epilog
   %add = add i64 %a, 5
@@ -92,6 +104,7 @@ define amdgpu_ps <2 x float> @test_add_u64_v_small_imm(i64 %a) {
 ;
 ; GFX1250-LABEL: test_add_u64_v_small_imm:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_add_nc_u64_e32 v[0:1], 0x1f4, v[0:1]
 ; GFX1250-NEXT:    ; return to shader part epilog
   %add = add i64 %a, 500
@@ -109,6 +122,7 @@ define amdgpu_ps <2 x float> @test_add_u64_v_64bit_imm(i64 %a) {
 ;
 ; GFX1250-LABEL: test_add_u64_v_64bit_imm:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_add_nc_u64_e32 v[0:1], 0x13b9ac9ff, v[0:1]
 ; GFX1250-NEXT:    ; return to shader part epilog
   %add = add i64 %a, 5294967295
@@ -117,13 +131,23 @@ define amdgpu_ps <2 x float> @test_add_u64_v_64bit_imm(i64 %a) {
 }
 
 define amdgpu_ps <2 x float> @test_add_u64_s_small_imm(i64 inreg %a) {
-; GCN-LABEL: test_add_u64_s_small_imm:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_add_nc_u64 s[0:1], s[0:1], 0x1f4
-; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GCN-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GCN-NEXT:    ; return to shader part epilog
+; GFX12-LABEL: test_add_u64_s_small_imm:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_add_nc_u64 s[0:1], s[0:1], 0x1f4
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT:    ; return to shader part epilog
+;
+; GFX1250-LABEL: test_add_u64_s_small_imm:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-NEXT:    s_add_nc_u64 s[0:1], s[0:1], 0x1f4
+; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX1250-NEXT:    ; return to shader part epilog
   %add = add i64 %a, 500
   %ret = bitcast i64 %add to <2 x float>
   ret <2 x float> %ret
 }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GCN: {{.*}}

diff  --git a/llvm/test/CodeGen/AMDGPU/addrspacecast-gas.ll b/llvm/test/CodeGen/AMDGPU/addrspacecast-gas.ll
index b486fabb19497..e5992e398ddbd 100644
--- a/llvm/test/CodeGen/AMDGPU/addrspacecast-gas.ll
+++ b/llvm/test/CodeGen/AMDGPU/addrspacecast-gas.ll
@@ -9,6 +9,7 @@ target triple = "amdgcn-amd-amdhsa"
 define amdgpu_kernel void @use_private_to_flat_addrspacecast(ptr addrspace(5) %ptr) {
 ; GFX1250-SDAG-LABEL: use_private_to_flat_addrspacecast:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    s_load_b32 s0, s[4:5], 0x24
 ; GFX1250-SDAG-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, 0
 ; GFX1250-SDAG-NEXT:    s_wait_kmcnt 0x0
@@ -26,6 +27,7 @@ define amdgpu_kernel void @use_private_to_flat_addrspacecast(ptr addrspace(5) %p
 ;
 ; GFX1250-GISEL-LABEL: use_private_to_flat_addrspacecast:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    s_load_b32 s0, s[4:5], 0x24
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[0:1], src_flat_scratch_base_lo
 ; GFX1250-GISEL-NEXT:    v_mbcnt_lo_u32_b32 v2, -1, 0
@@ -53,6 +55,7 @@ define amdgpu_kernel void @use_private_to_flat_addrspacecast(ptr addrspace(5) %p
 define amdgpu_kernel void @use_private_to_flat_addrspacecast_nonnull(ptr addrspace(5) %ptr) {
 ; GFX1250-SDAG-LABEL: use_private_to_flat_addrspacecast_nonnull:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    s_load_b32 s0, s[4:5], 0x24
 ; GFX1250-SDAG-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, 0
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
@@ -66,6 +69,7 @@ define amdgpu_kernel void @use_private_to_flat_addrspacecast_nonnull(ptr addrspa
 ;
 ; GFX1250-GISEL-LABEL: use_private_to_flat_addrspacecast_nonnull:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    s_load_b32 s0, s[4:5], 0x24
 ; GFX1250-GISEL-NEXT:    v_mbcnt_lo_u32_b32 v2, -1, 0
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[0:1], src_flat_scratch_base_lo
@@ -87,6 +91,7 @@ define amdgpu_kernel void @use_private_to_flat_addrspacecast_nonnull(ptr addrspa
 define amdgpu_kernel void @use_flat_to_private_addrspacecast(ptr %ptr) {
 ; GFX1250-LABEL: use_flat_to_private_addrspacecast:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -104,6 +109,7 @@ define amdgpu_kernel void @use_flat_to_private_addrspacecast(ptr %ptr) {
 define amdgpu_kernel void @use_flat_to_private_addrspacecast_nonnull(ptr %ptr) {
 ; GFX1250-SDAG-LABEL: use_flat_to_private_addrspacecast_nonnull:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    s_load_b32 s0, s[4:5], 0x24
 ; GFX1250-SDAG-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-SDAG-NEXT:    s_wait_kmcnt 0x0
@@ -114,6 +120,7 @@ define amdgpu_kernel void @use_flat_to_private_addrspacecast_nonnull(ptr %ptr) {
 ;
 ; GFX1250-GISEL-LABEL: use_flat_to_private_addrspacecast_nonnull:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1250-GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-GISEL-NEXT:    s_wait_kmcnt 0x0

diff  --git a/llvm/test/CodeGen/AMDGPU/bf16-conversions.ll b/llvm/test/CodeGen/AMDGPU/bf16-conversions.ll
index 9bc1986719df8..b78678bbfc9f6 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16-conversions.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16-conversions.ll
@@ -13,6 +13,7 @@ define amdgpu_ps float @v_test_cvt_bf16_f32_v(bfloat %v) {
 ;
 ; GFX1250-LABEL: v_test_cvt_bf16_f32_v:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
 ; GFX1250-NEXT:    ; return to shader part epilog
   %cvt = fpext bfloat %v to float
@@ -28,6 +29,7 @@ define amdgpu_ps float @v_test_cvt_bf16_f32_s(bfloat inreg %v) {
 ;
 ; GFX1250-LABEL: v_test_cvt_bf16_f32_s:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_lshl_b32 s0, s0, 16
 ; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, s0
@@ -63,6 +65,7 @@ define amdgpu_ps float @v_test_cvt_v2f32_v2bf16_v(<2 x float> %src) {
 ;
 ; GFX1250-LABEL: v_test_cvt_v2f32_v2bf16_v:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_cvt_pk_bf16_f32 v0, v0, v1
 ; GFX1250-NEXT:    ; return to shader part epilog
   %res = fptrunc <2 x float> %src to <2 x bfloat>
@@ -101,6 +104,7 @@ define amdgpu_ps float @v_test_cvt_v2f32_v2bf16_s(<2 x float> inreg %src) {
 ;
 ; GFX1250-LABEL: v_test_cvt_v2f32_v2bf16_s:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_cvt_pk_bf16_f32 v0, s0, s1
 ; GFX1250-NEXT:    ; return to shader part epilog
   %res = fptrunc <2 x float> %src to <2 x bfloat>
@@ -129,6 +133,7 @@ define amdgpu_ps float @v_test_cvt_f32_bf16_v(float %src) {
 ;
 ; GFX1250-LABEL: v_test_cvt_f32_bf16_v:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_cvt_pk_bf16_f32 v0, v0, s0
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
@@ -205,6 +210,7 @@ define amdgpu_ps float @v_test_cvt_v2f64_v2bf16_v(<2 x double> %src) {
 ;
 ; GFX1250-LABEL: v_test_cvt_v2f64_v2bf16_v:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_cvt_f32_f64_e32 v8, v[2:3]
 ; GFX1250-NEXT:    v_cvt_f32_f64_e32 v9, v[0:1]
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
@@ -264,6 +270,7 @@ define amdgpu_ps float @fptrunc_f32_f32_to_v2bf16(float %a, float %b) {
 ;
 ; GFX1250-LABEL: fptrunc_f32_f32_to_v2bf16:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_cvt_pk_bf16_f32 v0, v0, v1
 ; GFX1250-NEXT:    ; return to shader part epilog
 entry:
@@ -304,6 +311,7 @@ define amdgpu_ps float @fptrunc_f32_f32_to_v2bf16_mods(float %a, float %b) {
 ;
 ; GFX1250-LABEL: fptrunc_f32_f32_to_v2bf16_mods:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_cvt_pk_bf16_f32 v0, -v0, |v1|
 ; GFX1250-NEXT:    ; return to shader part epilog
 entry:
@@ -342,6 +350,7 @@ define amdgpu_ps void @fptrunc_f32_to_bf16(float %a, ptr %out) {
 ;
 ; GFX1250-LABEL: fptrunc_f32_to_bf16:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
 ; GFX1250-NEXT:    v_cvt_pk_bf16_f32 v0, v0, s0
 ; GFX1250-NEXT:    flat_store_b16 v[2:3], v0
@@ -378,6 +387,7 @@ define amdgpu_ps void @fptrunc_f32_to_bf16_abs(float %a, ptr %out) {
 ;
 ; GFX1250-LABEL: fptrunc_f32_to_bf16_abs:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
 ; GFX1250-NEXT:    v_cvt_pk_bf16_f32 v0, |v0|, s0
 ; GFX1250-NEXT:    flat_store_b16 v[2:3], v0
@@ -415,6 +425,7 @@ define amdgpu_ps void @fptrunc_f32_to_bf16_neg(float %a, ptr %out) {
 ;
 ; GFX1250-LABEL: fptrunc_f32_to_bf16_neg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
 ; GFX1250-NEXT:    v_cvt_pk_bf16_f32 v0, -v0, s0
 ; GFX1250-NEXT:    flat_store_b16 v[2:3], v0
@@ -467,6 +478,7 @@ define amdgpu_ps void @fptrunc_f64_to_bf16(double %a, ptr %out) {
 ;
 ; GFX1250-LABEL: fptrunc_f64_to_bf16:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_cvt_f32_f64_e32 v6, v[0:1]
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1250-NEXT:    v_cvt_f64_f32_e32 v[4:5], v6
@@ -529,6 +541,7 @@ define amdgpu_ps void @fptrunc_f64_to_bf16_neg(double %a, ptr %out) {
 ;
 ; GFX1250-LABEL: fptrunc_f64_to_bf16_neg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_cvt_f32_f64_e64 v6, -v[0:1]
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1250-NEXT:    v_cvt_f64_f32_e32 v[4:5], v6
@@ -593,6 +606,7 @@ define amdgpu_ps void @fptrunc_f64_to_bf16_abs(double %a, ptr %out) {
 ;
 ; GFX1250-LABEL: fptrunc_f64_to_bf16_abs:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_cvt_f32_f64_e64 v6, |v[0:1]|
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1250-NEXT:    v_cvt_f64_f32_e32 v[4:5], v6

diff  --git a/llvm/test/CodeGen/AMDGPU/bf16-math.ll b/llvm/test/CodeGen/AMDGPU/bf16-math.ll
index 39618b05e6c71..94213096b9d0c 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16-math.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16-math.ll
@@ -5,6 +5,7 @@
 define amdgpu_ps void @llvm_sqrt_bf16_v(ptr addrspace(1) %out, bfloat %src) {
 ; GCN-LABEL: llvm_sqrt_bf16_v:
 ; GCN:       ; %bb.0:
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    v_sqrt_bf16_e32 v2, v2
 ; GCN-NEXT:    global_store_b16 v[0:1], v2, off
 ; GCN-NEXT:    s_endpgm
@@ -16,6 +17,7 @@ define amdgpu_ps void @llvm_sqrt_bf16_v(ptr addrspace(1) %out, bfloat %src) {
 define amdgpu_ps void @llvm_sqrt_bf16_s(ptr addrspace(1) %out, bfloat inreg %src) {
 ; GCN-LABEL: llvm_sqrt_bf16_s:
 ; GCN:       ; %bb.0:
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    v_sqrt_bf16_e32 v2, s0
 ; GCN-NEXT:    global_store_b16 v[0:1], v2, off
 ; GCN-NEXT:    s_endpgm
@@ -27,6 +29,7 @@ define amdgpu_ps void @llvm_sqrt_bf16_s(ptr addrspace(1) %out, bfloat inreg %src
 define amdgpu_ps void @v_test_add_v2bf16_vv(ptr addrspace(1) %out, <2 x bfloat> %a, <2 x bfloat> %b) {
 ; GCN-LABEL: v_test_add_v2bf16_vv:
 ; GCN:       ; %bb.0:
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    v_pk_add_bf16 v2, v2, v3
 ; GCN-NEXT:    global_store_b32 v[0:1], v2, off
 ; GCN-NEXT:    s_endpgm
@@ -38,6 +41,7 @@ define amdgpu_ps void @v_test_add_v2bf16_vv(ptr addrspace(1) %out, <2 x bfloat>
 define amdgpu_ps void @v_test_add_v2bf16_vs(ptr addrspace(1) %out, <2 x bfloat> %a, <2 x bfloat> inreg %b) {
 ; GCN-LABEL: v_test_add_v2bf16_vs:
 ; GCN:       ; %bb.0:
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    v_pk_add_bf16 v2, v2, s0
 ; GCN-NEXT:    global_store_b32 v[0:1], v2, off
 ; GCN-NEXT:    s_endpgm
@@ -49,6 +53,7 @@ define amdgpu_ps void @v_test_add_v2bf16_vs(ptr addrspace(1) %out, <2 x bfloat>
 define amdgpu_ps void @v_test_add_v2bf16_ss(ptr addrspace(1) %out, <2 x bfloat> inreg %a, <2 x bfloat> inreg %b) {
 ; GCN-LABEL: v_test_add_v2bf16_ss:
 ; GCN:       ; %bb.0:
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    v_pk_add_bf16 v2, s0, s1
 ; GCN-NEXT:    global_store_b32 v[0:1], v2, off
 ; GCN-NEXT:    s_endpgm
@@ -60,6 +65,7 @@ define amdgpu_ps void @v_test_add_v2bf16_ss(ptr addrspace(1) %out, <2 x bfloat>
 define amdgpu_ps void @v_test_add_v2bf16_vc(ptr addrspace(1) %out, <2 x bfloat> %a) {
 ; GCN-LABEL: v_test_add_v2bf16_vc:
 ; GCN:       ; %bb.0:
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    v_pk_add_bf16 v2, v2, 2.0 op_sel_hi:[1,0]
 ; GCN-NEXT:    global_store_b32 v[0:1], v2, off
 ; GCN-NEXT:    s_endpgm
@@ -71,6 +77,7 @@ define amdgpu_ps void @v_test_add_v2bf16_vc(ptr addrspace(1) %out, <2 x bfloat>
 define amdgpu_ps void @v_test_add_v2bf16_vl(ptr addrspace(1) %out, <2 x bfloat> %a) {
 ; GCN-LABEL: v_test_add_v2bf16_vl:
 ; GCN:       ; %bb.0:
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    v_pk_add_bf16 v2, 0x42c83f80, v2
 ; GCN-NEXT:    global_store_b32 v[0:1], v2, off
 ; GCN-NEXT:    s_endpgm
@@ -82,6 +89,7 @@ define amdgpu_ps void @v_test_add_v2bf16_vl(ptr addrspace(1) %out, <2 x bfloat>
 define amdgpu_ps void @v_test_sub_v2bf16_vv(ptr addrspace(1) %out, <2 x bfloat> %a, <2 x bfloat> %b) {
 ; GCN-LABEL: v_test_sub_v2bf16_vv:
 ; GCN:       ; %bb.0:
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    v_pk_add_bf16 v2, v2, v3 neg_lo:[0,1] neg_hi:[0,1]
 ; GCN-NEXT:    global_store_b32 v[0:1], v2, off
 ; GCN-NEXT:    s_endpgm
@@ -93,6 +101,7 @@ define amdgpu_ps void @v_test_sub_v2bf16_vv(ptr addrspace(1) %out, <2 x bfloat>
 define amdgpu_ps void @v_test_sub_v2bf16_vs(ptr addrspace(1) %out, <2 x bfloat> %a, <2 x bfloat> inreg %b) {
 ; GCN-LABEL: v_test_sub_v2bf16_vs:
 ; GCN:       ; %bb.0:
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    v_pk_add_bf16 v2, v2, s0 neg_lo:[0,1] neg_hi:[0,1]
 ; GCN-NEXT:    global_store_b32 v[0:1], v2, off
 ; GCN-NEXT:    s_endpgm
@@ -104,6 +113,7 @@ define amdgpu_ps void @v_test_sub_v2bf16_vs(ptr addrspace(1) %out, <2 x bfloat>
 define amdgpu_ps void @v_test_sub_v2bf16_ss(ptr addrspace(1) %out, <2 x bfloat> inreg %a, <2 x bfloat> inreg %b) {
 ; GCN-LABEL: v_test_sub_v2bf16_ss:
 ; GCN:       ; %bb.0:
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    v_pk_add_bf16 v2, s0, s1 neg_lo:[0,1] neg_hi:[0,1]
 ; GCN-NEXT:    global_store_b32 v[0:1], v2, off
 ; GCN-NEXT:    s_endpgm
@@ -115,6 +125,7 @@ define amdgpu_ps void @v_test_sub_v2bf16_ss(ptr addrspace(1) %out, <2 x bfloat>
 define amdgpu_ps void @v_test_sub_v2bf16_vc(ptr addrspace(1) %out, <2 x bfloat> %a) {
 ; GCN-LABEL: v_test_sub_v2bf16_vc:
 ; GCN:       ; %bb.0:
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    v_pk_add_bf16 v2, v2, -2.0 op_sel_hi:[1,0]
 ; GCN-NEXT:    global_store_b32 v[0:1], v2, off
 ; GCN-NEXT:    s_endpgm
@@ -126,6 +137,7 @@ define amdgpu_ps void @v_test_sub_v2bf16_vc(ptr addrspace(1) %out, <2 x bfloat>
 define amdgpu_ps void @v_test_sub_v2bf16_vl(ptr addrspace(1) %out, <2 x bfloat> %a) {
 ; GCN-LABEL: v_test_sub_v2bf16_vl:
 ; GCN:       ; %bb.0:
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    v_pk_add_bf16 v2, 0xc2c8bf80, v2
 ; GCN-NEXT:    global_store_b32 v[0:1], v2, off
 ; GCN-NEXT:    s_endpgm
@@ -137,6 +149,7 @@ define amdgpu_ps void @v_test_sub_v2bf16_vl(ptr addrspace(1) %out, <2 x bfloat>
 define amdgpu_ps void @v_test_sub_v2bf16_lv(ptr addrspace(1) %out, <2 x bfloat> %a) {
 ; GCN-LABEL: v_test_sub_v2bf16_lv:
 ; GCN:       ; %bb.0:
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    v_pk_add_bf16 v2, 0x42c83f80, v2 neg_lo:[0,1] neg_hi:[0,1]
 ; GCN-NEXT:    global_store_b32 v[0:1], v2, off
 ; GCN-NEXT:    s_endpgm
@@ -148,6 +161,7 @@ define amdgpu_ps void @v_test_sub_v2bf16_lv(ptr addrspace(1) %out, <2 x bfloat>
 define amdgpu_ps void @v_test_sub_v2bf16_iv(ptr addrspace(1) %out, <2 x bfloat> %a) {
 ; GCN-LABEL: v_test_sub_v2bf16_iv:
 ; GCN:       ; %bb.0:
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    v_pk_add_bf16 v2, v2, 1.0 op_sel_hi:[1,0] neg_lo:[1,0] neg_hi:[1,0]
 ; GCN-NEXT:    global_store_b32 v[0:1], v2, off
 ; GCN-NEXT:    s_endpgm
@@ -159,6 +173,7 @@ define amdgpu_ps void @v_test_sub_v2bf16_iv(ptr addrspace(1) %out, <2 x bfloat>
 define amdgpu_ps void @v_test_mul_v2bf16_vv(ptr addrspace(1) %out, <2 x bfloat> %a, <2 x bfloat> %b) {
 ; GCN-LABEL: v_test_mul_v2bf16_vv:
 ; GCN:       ; %bb.0:
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    v_pk_mul_bf16 v2, v2, v3
 ; GCN-NEXT:    global_store_b32 v[0:1], v2, off
 ; GCN-NEXT:    s_endpgm
@@ -170,6 +185,7 @@ define amdgpu_ps void @v_test_mul_v2bf16_vv(ptr addrspace(1) %out, <2 x bfloat>
 define amdgpu_ps void @v_test_mul_v2bf16_vs(ptr addrspace(1) %out, <2 x bfloat> %a, <2 x bfloat> inreg %b) {
 ; GCN-LABEL: v_test_mul_v2bf16_vs:
 ; GCN:       ; %bb.0:
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    v_pk_mul_bf16 v2, v2, s0
 ; GCN-NEXT:    global_store_b32 v[0:1], v2, off
 ; GCN-NEXT:    s_endpgm
@@ -181,6 +197,7 @@ define amdgpu_ps void @v_test_mul_v2bf16_vs(ptr addrspace(1) %out, <2 x bfloat>
 define amdgpu_ps void @v_test_mul_v2bf16_ss(ptr addrspace(1) %out, <2 x bfloat> inreg %a, <2 x bfloat> inreg %b) {
 ; GCN-LABEL: v_test_mul_v2bf16_ss:
 ; GCN:       ; %bb.0:
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    v_pk_mul_bf16 v2, s0, s1
 ; GCN-NEXT:    global_store_b32 v[0:1], v2, off
 ; GCN-NEXT:    s_endpgm
@@ -194,6 +211,7 @@ define amdgpu_ps void @v_test_mul_v2bf16_ss(ptr addrspace(1) %out, <2 x bfloat>
 define amdgpu_ps void @v_test_mul_v2bf16_vc(ptr addrspace(1) %out, <2 x bfloat> %a) {
 ; GCN-LABEL: v_test_mul_v2bf16_vc:
 ; GCN:       ; %bb.0:
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    v_pk_mul_bf16 v2, v2, 0.5 op_sel_hi:[1,0]
 ; GCN-NEXT:    global_store_b32 v[0:1], v2, off
 ; GCN-NEXT:    s_endpgm
@@ -205,6 +223,7 @@ define amdgpu_ps void @v_test_mul_v2bf16_vc(ptr addrspace(1) %out, <2 x bfloat>
 define amdgpu_ps void @v_test_mul_v2bf16_vl(ptr addrspace(1) %out, <2 x bfloat> %a) {
 ; GCN-LABEL: v_test_mul_v2bf16_vl:
 ; GCN:       ; %bb.0:
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    v_pk_mul_bf16 v2, 0x42c83f80, v2
 ; GCN-NEXT:    global_store_b32 v[0:1], v2, off
 ; GCN-NEXT:    s_endpgm
@@ -216,6 +235,7 @@ define amdgpu_ps void @v_test_mul_v2bf16_vl(ptr addrspace(1) %out, <2 x bfloat>
 define amdgpu_ps void @v_test_min_v2bf16_vv(ptr addrspace(1) %out, <2 x bfloat> %a, <2 x bfloat> %b) {
 ; GCN-LABEL: v_test_min_v2bf16_vv:
 ; GCN:       ; %bb.0:
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    v_pk_min_num_bf16 v2, v2, v3
 ; GCN-NEXT:    global_store_b32 v[0:1], v2, off
 ; GCN-NEXT:    s_endpgm
@@ -227,6 +247,7 @@ define amdgpu_ps void @v_test_min_v2bf16_vv(ptr addrspace(1) %out, <2 x bfloat>
 define amdgpu_ps void @v_test_min_v2bf16_vs(ptr addrspace(1) %out, <2 x bfloat> %a, <2 x bfloat> inreg %b) {
 ; GCN-LABEL: v_test_min_v2bf16_vs:
 ; GCN:       ; %bb.0:
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    v_pk_min_num_bf16 v2, v2, s0
 ; GCN-NEXT:    global_store_b32 v[0:1], v2, off
 ; GCN-NEXT:    s_endpgm
@@ -238,6 +259,7 @@ define amdgpu_ps void @v_test_min_v2bf16_vs(ptr addrspace(1) %out, <2 x bfloat>
 define amdgpu_ps void @v_test_min_v2bf16_ss(ptr addrspace(1) %out, <2 x bfloat> inreg %a, <2 x bfloat> inreg %b) {
 ; GCN-LABEL: v_test_min_v2bf16_ss:
 ; GCN:       ; %bb.0:
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    v_pk_min_num_bf16 v2, s0, s1
 ; GCN-NEXT:    global_store_b32 v[0:1], v2, off
 ; GCN-NEXT:    s_endpgm
@@ -249,6 +271,7 @@ define amdgpu_ps void @v_test_min_v2bf16_ss(ptr addrspace(1) %out, <2 x bfloat>
 define amdgpu_ps void @v_test_min_v2bf16_vc(ptr addrspace(1) %out, <2 x bfloat> %a) {
 ; GCN-LABEL: v_test_min_v2bf16_vc:
 ; GCN:       ; %bb.0:
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    v_pk_min_num_bf16 v2, v2, 0.5 op_sel_hi:[1,0]
 ; GCN-NEXT:    global_store_b32 v[0:1], v2, off
 ; GCN-NEXT:    s_endpgm
@@ -260,6 +283,7 @@ define amdgpu_ps void @v_test_min_v2bf16_vc(ptr addrspace(1) %out, <2 x bfloat>
 define amdgpu_ps void @v_test_min_v2bf16_vl(ptr addrspace(1) %out, <2 x bfloat> %a) {
 ; GCN-LABEL: v_test_min_v2bf16_vl:
 ; GCN:       ; %bb.0:
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    v_pk_min_num_bf16 v2, 0x42c83f80, v2
 ; GCN-NEXT:    global_store_b32 v[0:1], v2, off
 ; GCN-NEXT:    s_endpgm
@@ -271,6 +295,7 @@ define amdgpu_ps void @v_test_min_v2bf16_vl(ptr addrspace(1) %out, <2 x bfloat>
 define amdgpu_ps void @v_test_max_v2bf16_vv(ptr addrspace(1) %out, <2 x bfloat> %a, <2 x bfloat> %b) {
 ; GCN-LABEL: v_test_max_v2bf16_vv:
 ; GCN:       ; %bb.0:
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    v_pk_max_num_bf16 v2, v2, v3
 ; GCN-NEXT:    global_store_b32 v[0:1], v2, off
 ; GCN-NEXT:    s_endpgm
@@ -282,6 +307,7 @@ define amdgpu_ps void @v_test_max_v2bf16_vv(ptr addrspace(1) %out, <2 x bfloat>
 define amdgpu_ps void @v_test_max_v2bf16_vs(ptr addrspace(1) %out, <2 x bfloat> %a, <2 x bfloat> inreg %b) {
 ; GCN-LABEL: v_test_max_v2bf16_vs:
 ; GCN:       ; %bb.0:
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    v_pk_max_num_bf16 v2, v2, s0
 ; GCN-NEXT:    global_store_b32 v[0:1], v2, off
 ; GCN-NEXT:    s_endpgm
@@ -293,6 +319,7 @@ define amdgpu_ps void @v_test_max_v2bf16_vs(ptr addrspace(1) %out, <2 x bfloat>
 define amdgpu_ps void @v_test_max_v2bf16_ss(ptr addrspace(1) %out, <2 x bfloat> inreg %a, <2 x bfloat> inreg %b) {
 ; GCN-LABEL: v_test_max_v2bf16_ss:
 ; GCN:       ; %bb.0:
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    v_pk_max_num_bf16 v2, s0, s1
 ; GCN-NEXT:    global_store_b32 v[0:1], v2, off
 ; GCN-NEXT:    s_endpgm
@@ -304,6 +331,7 @@ define amdgpu_ps void @v_test_max_v2bf16_ss(ptr addrspace(1) %out, <2 x bfloat>
 define amdgpu_ps void @v_test_max_v2bf16_vc(ptr addrspace(1) %out, <2 x bfloat> %a) {
 ; GCN-LABEL: v_test_max_v2bf16_vc:
 ; GCN:       ; %bb.0:
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    v_pk_max_num_bf16 v2, v2, 0.5 op_sel_hi:[1,0]
 ; GCN-NEXT:    global_store_b32 v[0:1], v2, off
 ; GCN-NEXT:    s_endpgm
@@ -315,6 +343,7 @@ define amdgpu_ps void @v_test_max_v2bf16_vc(ptr addrspace(1) %out, <2 x bfloat>
 define amdgpu_ps void @v_test_max_v2bf16_vl(ptr addrspace(1) %out, <2 x bfloat> %a) {
 ; GCN-LABEL: v_test_max_v2bf16_vl:
 ; GCN:       ; %bb.0:
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    v_pk_max_num_bf16 v2, 0x42c83f80, v2
 ; GCN-NEXT:    global_store_b32 v[0:1], v2, off
 ; GCN-NEXT:    s_endpgm
@@ -326,6 +355,7 @@ define amdgpu_ps void @v_test_max_v2bf16_vl(ptr addrspace(1) %out, <2 x bfloat>
 define amdgpu_ps bfloat @test_clamp_bf16(bfloat %src) {
 ; GCN-LABEL: test_clamp_bf16:
 ; GCN:       ; %bb.0:
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    v_pk_max_num_bf16 v0, v0, v0 clamp
 ; GCN-NEXT:    ; return to shader part epilog
   %max = call bfloat @llvm.maxnum.bf16(bfloat %src, bfloat 0.0)
@@ -336,6 +366,7 @@ define amdgpu_ps bfloat @test_clamp_bf16(bfloat %src) {
 define amdgpu_ps bfloat @test_clamp_bf16_s(bfloat inreg %src) {
 ; GCN-LABEL: test_clamp_bf16_s:
 ; GCN:       ; %bb.0:
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    v_pk_max_num_bf16 v0, s0, s0 clamp
 ; GCN-NEXT:    ; return to shader part epilog
   %max = call bfloat @llvm.maxnum.bf16(bfloat %src, bfloat 0.0)
@@ -346,6 +377,7 @@ define amdgpu_ps bfloat @test_clamp_bf16_s(bfloat inreg %src) {
 define amdgpu_ps float @test_clamp_v2bf16(<2 x bfloat> %src) {
 ; GCN-LABEL: test_clamp_v2bf16:
 ; GCN:       ; %bb.0:
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    v_pk_max_num_bf16 v0, v0, v0 clamp
 ; GCN-NEXT:    ; return to shader part epilog
   %max = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> %src, <2 x bfloat> <bfloat 0.0, bfloat 0.0>)
@@ -357,6 +389,7 @@ define amdgpu_ps float @test_clamp_v2bf16(<2 x bfloat> %src) {
 define amdgpu_ps float @test_clamp_v2bf16_s(<2 x bfloat> inreg %src) {
 ; GCN-LABEL: test_clamp_v2bf16_s:
 ; GCN:       ; %bb.0:
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    v_pk_max_num_bf16 v0, s0, s0 clamp
 ; GCN-NEXT:    ; return to shader part epilog
   %max = call <2 x bfloat> @llvm.maxnum.v2bf16(<2 x bfloat> %src, <2 x bfloat> <bfloat 0.0, bfloat 0.0>)
@@ -368,6 +401,7 @@ define amdgpu_ps float @test_clamp_v2bf16_s(<2 x bfloat> inreg %src) {
 define amdgpu_ps bfloat @test_clamp_bf16_folding(bfloat %src) {
 ; GCN-LABEL: test_clamp_bf16_folding:
 ; GCN:       ; %bb.0:
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    v_exp_bf16_e32 v0, v0
 ; GCN-NEXT:    v_nop
 ; GCN-NEXT:    s_delay_alu instid0(TRANS32_DEP_1)
@@ -382,6 +416,7 @@ define amdgpu_ps bfloat @test_clamp_bf16_folding(bfloat %src) {
 define amdgpu_ps float @test_clamp_v2bf16_folding(<2 x bfloat> %src0, <2 x bfloat> %src1) {
 ; GCN-LABEL: test_clamp_v2bf16_folding:
 ; GCN:       ; %bb.0:
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    v_pk_mul_bf16 v0, v0, v1 clamp
 ; GCN-NEXT:    ; return to shader part epilog
   %mul = fmul <2 x bfloat> %src0, %src1
@@ -394,6 +429,7 @@ define amdgpu_ps float @test_clamp_v2bf16_folding(<2 x bfloat> %src0, <2 x bfloa
 define amdgpu_ps void @v_test_mul_add_v2bf16_vvv(ptr addrspace(1) %out, <2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c) {
 ; GCN-LABEL: v_test_mul_add_v2bf16_vvv:
 ; GCN:       ; %bb.0:
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    v_pk_fma_bf16 v2, v2, v3, v4
 ; GCN-NEXT:    global_store_b32 v[0:1], v2, off
 ; GCN-NEXT:    s_endpgm
@@ -406,6 +442,7 @@ define amdgpu_ps void @v_test_mul_add_v2bf16_vvv(ptr addrspace(1) %out, <2 x bfl
 define amdgpu_ps void @v_test_mul_add_v2bf16_vss(ptr addrspace(1) %out, <2 x bfloat> %a, <2 x bfloat> inreg %b, <2 x bfloat> inreg %c) {
 ; GCN-LABEL: v_test_mul_add_v2bf16_vss:
 ; GCN:       ; %bb.0:
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    v_pk_fma_bf16 v2, v2, s0, s1
 ; GCN-NEXT:    global_store_b32 v[0:1], v2, off
 ; GCN-NEXT:    s_endpgm
@@ -418,6 +455,7 @@ define amdgpu_ps void @v_test_mul_add_v2bf16_vss(ptr addrspace(1) %out, <2 x bfl
 define amdgpu_ps void @v_test_mul_add_v2bf16_sss(ptr addrspace(1) %out, <2 x bfloat> inreg %a, <2 x bfloat> inreg %b, <2 x bfloat> inreg %c) {
 ; GCN-LABEL: v_test_mul_add_v2bf16_sss:
 ; GCN:       ; %bb.0:
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    v_mov_b32_e32 v2, s2
 ; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GCN-NEXT:    v_pk_fma_bf16 v2, s0, s1, v2
@@ -432,6 +470,7 @@ define amdgpu_ps void @v_test_mul_add_v2bf16_sss(ptr addrspace(1) %out, <2 x bfl
 define amdgpu_ps void @v_test_mul_add_v2bf16_vsc(ptr addrspace(1) %out, <2 x bfloat> %a, <2 x bfloat> inreg %b) {
 ; GCN-LABEL: v_test_mul_add_v2bf16_vsc:
 ; GCN:       ; %bb.0:
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    v_pk_fma_bf16 v2, v2, s0, 0.5 op_sel_hi:[1,1,0]
 ; GCN-NEXT:    global_store_b32 v[0:1], v2, off
 ; GCN-NEXT:    s_endpgm
@@ -444,6 +483,7 @@ define amdgpu_ps void @v_test_mul_add_v2bf16_vsc(ptr addrspace(1) %out, <2 x bfl
 define amdgpu_ps void @v_test_mul_add_v2bf16_vll(ptr addrspace(1) %out, <2 x bfloat> %a) {
 ; GCN-LABEL: v_test_mul_add_v2bf16_vll:
 ; GCN:       ; %bb.0:
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    s_mov_b32 s0, 0x43484000
 ; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GCN-NEXT:    v_pk_fma_bf16 v2, 0x42c83f80, v2, s0
@@ -458,6 +498,7 @@ define amdgpu_ps void @v_test_mul_add_v2bf16_vll(ptr addrspace(1) %out, <2 x bfl
 define amdgpu_ps void @v_test_fma_v2bf16_vvv(ptr addrspace(1) %out, <2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c) {
 ; GCN-LABEL: v_test_fma_v2bf16_vvv:
 ; GCN:       ; %bb.0:
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    v_pk_fma_bf16 v2, v2, v3, v4
 ; GCN-NEXT:    global_store_b32 v[0:1], v2, off
 ; GCN-NEXT:    s_endpgm
@@ -469,6 +510,7 @@ define amdgpu_ps void @v_test_fma_v2bf16_vvv(ptr addrspace(1) %out, <2 x bfloat>
 define amdgpu_ps void @v_test_fma_v2bf16_vss(ptr addrspace(1) %out, <2 x bfloat> %a, <2 x bfloat> inreg %b, <2 x bfloat> inreg %c) {
 ; GCN-LABEL: v_test_fma_v2bf16_vss:
 ; GCN:       ; %bb.0:
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    v_pk_fma_bf16 v2, v2, s0, s1
 ; GCN-NEXT:    global_store_b32 v[0:1], v2, off
 ; GCN-NEXT:    s_endpgm
@@ -480,6 +522,7 @@ define amdgpu_ps void @v_test_fma_v2bf16_vss(ptr addrspace(1) %out, <2 x bfloat>
 define amdgpu_ps void @v_test_fma_v2bf16_sss(ptr addrspace(1) %out, <2 x bfloat> inreg %a, <2 x bfloat> inreg %b, <2 x bfloat> inreg %c) {
 ; GCN-LABEL: v_test_fma_v2bf16_sss:
 ; GCN:       ; %bb.0:
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    v_mov_b32_e32 v2, s2
 ; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GCN-NEXT:    v_pk_fma_bf16 v2, s0, s1, v2
@@ -493,6 +536,7 @@ define amdgpu_ps void @v_test_fma_v2bf16_sss(ptr addrspace(1) %out, <2 x bfloat>
 define amdgpu_ps void @v_test_fma_v2bf16_vsc(ptr addrspace(1) %out, <2 x bfloat> %a, <2 x bfloat> inreg %b) {
 ; GCN-LABEL: v_test_fma_v2bf16_vsc:
 ; GCN:       ; %bb.0:
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    v_pk_fma_bf16 v2, v2, s0, 0.5 op_sel_hi:[1,1,0]
 ; GCN-NEXT:    global_store_b32 v[0:1], v2, off
 ; GCN-NEXT:    s_endpgm
@@ -504,6 +548,7 @@ define amdgpu_ps void @v_test_fma_v2bf16_vsc(ptr addrspace(1) %out, <2 x bfloat>
 define amdgpu_ps void @v_test_fma_v2bf16_vll(ptr addrspace(1) %out, <2 x bfloat> %a) {
 ; GCN-LABEL: v_test_fma_v2bf16_vll:
 ; GCN:       ; %bb.0:
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    s_mov_b32 s0, 0x42c83f80
 ; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GCN-NEXT:    v_pk_fma_bf16 v2, v2, s0, 0x43484000
@@ -517,6 +562,7 @@ define amdgpu_ps void @v_test_fma_v2bf16_vll(ptr addrspace(1) %out, <2 x bfloat>
 define amdgpu_ps void @llvm_log2_bf16_v(ptr addrspace(1) %out, bfloat %src) {
 ; GCN-LABEL: llvm_log2_bf16_v:
 ; GCN:       ; %bb.0:
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    v_log_bf16_e32 v2, v2
 ; GCN-NEXT:    global_store_b16 v[0:1], v2, off
 ; GCN-NEXT:    s_endpgm
@@ -528,6 +574,7 @@ define amdgpu_ps void @llvm_log2_bf16_v(ptr addrspace(1) %out, bfloat %src) {
 define amdgpu_ps void @llvm_log2_bf16_s(ptr addrspace(1) %out, bfloat inreg %src) {
 ; GCN-LABEL: llvm_log2_bf16_s:
 ; GCN:       ; %bb.0:
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    v_log_bf16_e32 v2, s0
 ; GCN-NEXT:    global_store_b16 v[0:1], v2, off
 ; GCN-NEXT:    s_endpgm
@@ -539,6 +586,7 @@ define amdgpu_ps void @llvm_log2_bf16_s(ptr addrspace(1) %out, bfloat inreg %src
 define amdgpu_ps void @llvm_exp2_bf16_v(ptr addrspace(1) %out, bfloat %src) {
 ; GCN-LABEL: llvm_exp2_bf16_v:
 ; GCN:       ; %bb.0:
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    v_exp_bf16_e32 v2, v2
 ; GCN-NEXT:    global_store_b16 v[0:1], v2, off
 ; GCN-NEXT:    s_endpgm
@@ -550,6 +598,7 @@ define amdgpu_ps void @llvm_exp2_bf16_v(ptr addrspace(1) %out, bfloat %src) {
 define amdgpu_ps void @llvm_exp2_bf16_s(ptr addrspace(1) %out, bfloat inreg %src) {
 ; GCN-LABEL: llvm_exp2_bf16_s:
 ; GCN:       ; %bb.0:
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    v_exp_bf16_e32 v2, s0
 ; GCN-NEXT:    global_store_b16 v[0:1], v2, off
 ; GCN-NEXT:    s_endpgm

diff  --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll
index 28d7e6916e519..ab9cd8e037734 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16.ll
@@ -21317,6 +21317,7 @@ define amdgpu_ps i32 @s_fabs_bf16(bfloat inreg %a) {
 ;
 ; GFX1250-LABEL: s_fabs_bf16:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_and_b32 s0, s0, 0x7fff
 ; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1250-NEXT:    s_and_b32 s0, 0xffff, s0
@@ -21433,6 +21434,7 @@ define amdgpu_ps i32 @s_fneg_bf16(bfloat inreg %a) {
 ;
 ; GFX1250-LABEL: s_fneg_bf16:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_xor_b32 s0, s0, 0x8000
 ; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1250-NEXT:    s_and_b32 s0, 0xffff, s0
@@ -21554,6 +21556,7 @@ define amdgpu_ps i32 @s_fneg_fabs_bf16(bfloat inreg %a) {
 ;
 ; GFX1250-LABEL: s_fneg_fabs_bf16:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_bitset1_b32 s0, 15
 ; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1250-NEXT:    s_and_b32 s0, 0xffff, s0
@@ -44236,6 +44239,7 @@ define amdgpu_ps i32 @s_select_bf16(bfloat inreg %a, bfloat inreg %b, i32 %c) {
 ;
 ; GFX1250TRUE16-LABEL: s_select_bf16:
 ; GFX1250TRUE16:       ; %bb.0:
+; GFX1250TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250TRUE16-NEXT:    v_mov_b16_e32 v1.l, s0
 ; GFX1250TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1250TRUE16-NEXT:    v_mov_b16_e32 v0.h, 0
@@ -44246,6 +44250,7 @@ define amdgpu_ps i32 @s_select_bf16(bfloat inreg %a, bfloat inreg %b, i32 %c) {
 ;
 ; GFX1250FAKE16-LABEL: s_select_bf16:
 ; GFX1250FAKE16:       ; %bb.0:
+; GFX1250FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250FAKE16-NEXT:    v_mov_b32_e32 v1, s0
 ; GFX1250FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1250FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -44392,6 +44397,7 @@ define amdgpu_ps i32 @s_select_v2bf16(<2 x bfloat> inreg %a, <2 x bfloat> inreg
 ;
 ; GFX1250TRUE16-LABEL: s_select_v2bf16:
 ; GFX1250TRUE16:       ; %bb.0:
+; GFX1250TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250TRUE16-NEXT:    s_lshr_b32 s2, s0, 16
 ; GFX1250TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1250TRUE16-NEXT:    v_mov_b16_e32 v1.l, s2
@@ -44406,6 +44412,7 @@ define amdgpu_ps i32 @s_select_v2bf16(<2 x bfloat> inreg %a, <2 x bfloat> inreg
 ;
 ; GFX1250FAKE16-LABEL: s_select_v2bf16:
 ; GFX1250FAKE16:       ; %bb.0:
+; GFX1250FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250FAKE16-NEXT:    s_lshr_b32 s2, s0, 16
 ; GFX1250FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1250FAKE16-NEXT:    v_dual_mov_b32 v1, s2 :: v_dual_mov_b32 v2, s0
@@ -44558,6 +44565,7 @@ define amdgpu_ps i32 @s_vselect_v2bf16(<2 x bfloat> inreg %a, <2 x bfloat> inreg
 ;
 ; GFX1250TRUE16-LABEL: s_vselect_v2bf16:
 ; GFX1250TRUE16:       ; %bb.0:
+; GFX1250TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250TRUE16-NEXT:    s_lshr_b32 s3, s0, 16
 ; GFX1250TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1250TRUE16-NEXT:    v_cmp_eq_u32_e64 s2, 0, v1
@@ -44573,6 +44581,7 @@ define amdgpu_ps i32 @s_vselect_v2bf16(<2 x bfloat> inreg %a, <2 x bfloat> inreg
 ;
 ; GFX1250FAKE16-LABEL: s_vselect_v2bf16:
 ; GFX1250FAKE16:       ; %bb.0:
+; GFX1250FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250FAKE16-NEXT:    s_lshr_b32 s2, s0, 16
 ; GFX1250FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
 ; GFX1250FAKE16-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s0
@@ -46190,6 +46199,7 @@ define amdgpu_ps <2 x i32> @s_select_v3bf16(<3 x bfloat> inreg %a, <3 x bfloat>
 ;
 ; GFX1250-LABEL: s_select_v3bf16:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_dual_mov_b32 v1, s0 :: v_dual_mov_b32 v2, s1
 ; GFX1250-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
@@ -46330,6 +46340,7 @@ define amdgpu_ps <2 x i32> @s_select_v4bf16(<4 x bfloat> inreg %a, <4 x bfloat>
 ;
 ; GFX1250-LABEL: s_select_v4bf16:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s0
 ; GFX1250-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
@@ -46576,6 +46587,7 @@ define amdgpu_ps <2 x i32> @s_vselect_v4bf16(<4 x bfloat> inreg %a, <4 x bfloat>
 ;
 ; GFX1250TRUE16-LABEL: s_vselect_v4bf16:
 ; GFX1250TRUE16:       ; %bb.0:
+; GFX1250TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250TRUE16-NEXT:    s_lshr_b32 s7, s1, 16
 ; GFX1250TRUE16-NEXT:    s_lshr_b32 s9, s0, 16
 ; GFX1250TRUE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
@@ -46599,6 +46611,7 @@ define amdgpu_ps <2 x i32> @s_vselect_v4bf16(<4 x bfloat> inreg %a, <4 x bfloat>
 ;
 ; GFX1250FAKE16-LABEL: s_vselect_v4bf16:
 ; GFX1250FAKE16:       ; %bb.0:
+; GFX1250FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250FAKE16-NEXT:    s_lshr_b32 s4, s1, 16
 ; GFX1250FAKE16-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v3
 ; GFX1250FAKE16-NEXT:    v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s1

diff  --git a/llvm/test/CodeGen/AMDGPU/bitop3.ll b/llvm/test/CodeGen/AMDGPU/bitop3.ll
index 478460595b5b7..c85702021adc9 100644
--- a/llvm/test/CodeGen/AMDGPU/bitop3.ll
+++ b/llvm/test/CodeGen/AMDGPU/bitop3.ll
@@ -9,10 +9,16 @@
 ; ========= Single bit functions =========
 
 define amdgpu_ps float @not_and_not_and_not_and(i32 %a, i32 %b, i32 %c) {
-; GCN-LABEL: not_and_not_and_not_and:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    v_bitop3_b32 v0, v0, v1, v2 bitop3:1
-; GCN-NEXT:    ; return to shader part epilog
+; GFX950-LABEL: not_and_not_and_not_and:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    v_bitop3_b32 v0, v0, v1, v2 bitop3:1
+; GFX950-NEXT:    ; return to shader part epilog
+;
+; GFX1250-LABEL: not_and_not_and_not_and:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-NEXT:    v_bitop3_b32 v0, v0, v1, v2 bitop3:1
+; GFX1250-NEXT:    ; return to shader part epilog
   %nota = xor i32 %a, -1
   %notb = xor i32 %b, -1
   %notc = xor i32 %c, -1
@@ -23,10 +29,16 @@ define amdgpu_ps float @not_and_not_and_not_and(i32 %a, i32 %b, i32 %c) {
 }
 
 define amdgpu_ps float @not_and_not_and_and(i32 %a, i32 %b, i32 %c) {
-; GCN-LABEL: not_and_not_and_and:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    v_bitop3_b32 v0, v0, v1, v2 bitop3:2
-; GCN-NEXT:    ; return to shader part epilog
+; GFX950-LABEL: not_and_not_and_and:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    v_bitop3_b32 v0, v0, v1, v2 bitop3:2
+; GFX950-NEXT:    ; return to shader part epilog
+;
+; GFX1250-LABEL: not_and_not_and_and:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-NEXT:    v_bitop3_b32 v0, v0, v1, v2 bitop3:2
+; GFX1250-NEXT:    ; return to shader part epilog
   %nota = xor i32 %a, -1
   %notb = xor i32 %b, -1
   %and1 = and i32 %nota, %c
@@ -36,10 +48,16 @@ define amdgpu_ps float @not_and_not_and_and(i32 %a, i32 %b, i32 %c) {
 }
 
 define amdgpu_ps float @not_and_and_not_and(i32 %a, i32 %b, i32 %c) {
-; GCN-LABEL: not_and_and_not_and:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    v_bitop3_b32 v0, v0, v1, v2 bitop3:4
-; GCN-NEXT:    ; return to shader part epilog
+; GFX950-LABEL: not_and_and_not_and:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    v_bitop3_b32 v0, v0, v1, v2 bitop3:4
+; GFX950-NEXT:    ; return to shader part epilog
+;
+; GFX1250-LABEL: not_and_and_not_and:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-NEXT:    v_bitop3_b32 v0, v0, v1, v2 bitop3:4
+; GFX1250-NEXT:    ; return to shader part epilog
   %nota = xor i32 %a, -1
   %notc = xor i32 %c, -1
   %and1 = and i32 %nota, %notc
@@ -62,11 +80,13 @@ define amdgpu_ps float @not_and_and_and(i32 %a, i32 %b, i32 %c) {
 ;
 ; GFX1250-SDAG-LABEL: not_and_and_and:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_bitop3_b32 v0, v0, v1, v2 bitop3:8
 ; GFX1250-SDAG-NEXT:    ; return to shader part epilog
 ;
 ; GFX1250-GISEL-LABEL: not_and_and_and:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_bitop3_b32 v0, v0, v2, v0 bitop3:0xc
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-GISEL-NEXT:    v_and_b32_e32 v0, v0, v1
@@ -79,10 +99,16 @@ define amdgpu_ps float @not_and_and_and(i32 %a, i32 %b, i32 %c) {
 }
 
 define amdgpu_ps float @and_not_and_not_and(i32 %a, i32 %b, i32 %c) {
-; GCN-LABEL: and_not_and_not_and:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    v_bitop3_b32 v0, v0, v1, v2 bitop3:0x10
-; GCN-NEXT:    ; return to shader part epilog
+; GFX950-LABEL: and_not_and_not_and:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    v_bitop3_b32 v0, v0, v1, v2 bitop3:0x10
+; GFX950-NEXT:    ; return to shader part epilog
+;
+; GFX1250-LABEL: and_not_and_not_and:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-NEXT:    v_bitop3_b32 v0, v0, v1, v2 bitop3:0x10
+; GFX1250-NEXT:    ; return to shader part epilog
   %notb = xor i32 %b, -1
   %notc = xor i32 %c, -1
   %and1 = and i32 %a, %notc
@@ -105,11 +131,13 @@ define amdgpu_ps float @and_not_and_and(i32 %a, i32 %b, i32 %c) {
 ;
 ; GFX1250-SDAG-LABEL: and_not_and_and:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_bitop3_b32 v0, v0, v1, v2 bitop3:0x20
 ; GFX1250-SDAG-NEXT:    ; return to shader part epilog
 ;
 ; GFX1250-GISEL-LABEL: and_not_and_and:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_and_b32_e32 v0, v0, v2
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-GISEL-NEXT:    v_bfi_b32 v0, v1, 0, v0
@@ -135,11 +163,13 @@ define amdgpu_ps float @and_and_not_and(i32 %a, i32 %b, i32 %c) {
 ;
 ; GFX1250-SDAG-LABEL: and_and_not_and:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_bitop3_b32 v0, v0, v1, v2 bitop3:0x40
 ; GFX1250-SDAG-NEXT:    ; return to shader part epilog
 ;
 ; GFX1250-GISEL-LABEL: and_and_not_and:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_bitop3_b32 v0, v0, v2, v0 bitop3:0x30
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-GISEL-NEXT:    v_and_b32_e32 v0, v0, v1
@@ -152,10 +182,16 @@ define amdgpu_ps float @and_and_not_and(i32 %a, i32 %b, i32 %c) {
 }
 
 define amdgpu_ps float @and_and_and(i32 %a, i32 %b, i32 %c) {
-; GCN-LABEL: and_and_and:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    v_bitop3_b32 v0, v0, v1, v2 bitop3:0x80
-; GCN-NEXT:    ; return to shader part epilog
+; GFX950-LABEL: and_and_and:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    v_bitop3_b32 v0, v0, v1, v2 bitop3:0x80
+; GFX950-NEXT:    ; return to shader part epilog
+;
+; GFX1250-LABEL: and_and_and:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-NEXT:    v_bitop3_b32 v0, v0, v1, v2 bitop3:0x80
+; GFX1250-NEXT:    ; return to shader part epilog
   %and1 = and i32 %a, %c
   %and2 = and i32 %and1, %b
   %ret_cast = bitcast i32 %and2 to float
@@ -165,10 +201,16 @@ define amdgpu_ps float @and_and_and(i32 %a, i32 %b, i32 %c) {
 ; ========= Multi bit functions =========
 
 define amdgpu_ps float @test_12(i32 %a, i32 %b) {
-; GCN-LABEL: test_12:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    v_bitop3_b32 v0, v0, v1, v0 bitop3:0xc
-; GCN-NEXT:    ; return to shader part epilog
+; GFX950-LABEL: test_12:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    v_bitop3_b32 v0, v0, v1, v0 bitop3:0xc
+; GFX950-NEXT:    ; return to shader part epilog
+;
+; GFX1250-LABEL: test_12:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-NEXT:    v_bitop3_b32 v0, v0, v1, v0 bitop3:0xc
+; GFX1250-NEXT:    ; return to shader part epilog
   %nota = xor i32 %a, -1
   %and1 = and i32 %nota, %b
   %ret_cast = bitcast i32 %and1 to float
@@ -189,11 +231,13 @@ define amdgpu_ps float @test_63(i32 %a, i32 %b) {
 ;
 ; GFX1250-SDAG-LABEL: test_63:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_bitop3_b32 v0, v0, v1, v0 bitop3:0x3f
 ; GFX1250-SDAG-NEXT:    ; return to shader part epilog
 ;
 ; GFX1250-GISEL-LABEL: test_63:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_not_b32_e32 v1, v1
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-GISEL-NEXT:    v_bfi_b32 v0, v0, v1, -1
@@ -206,10 +250,16 @@ define amdgpu_ps float @test_63(i32 %a, i32 %b) {
 }
 
 define amdgpu_ps float @test_59(i32 %a, i32 %b, i32 %c) {
-; GCN-LABEL: test_59:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    v_bitop3_b32 v0, v0, v1, v2 bitop3:0x3b
-; GCN-NEXT:    ; return to shader part epilog
+; GFX950-LABEL: test_59:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    v_bitop3_b32 v0, v0, v1, v2 bitop3:0x3b
+; GFX950-NEXT:    ; return to shader part epilog
+;
+; GFX1250-LABEL: test_59:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-NEXT:    v_bitop3_b32 v0, v0, v1, v2 bitop3:0x3b
+; GFX1250-NEXT:    ; return to shader part epilog
   %nota = xor i32 %a, -1
   %notb = xor i32 %b, -1
   %and1 = and i32 %nota, %c
@@ -233,11 +283,13 @@ define amdgpu_ps float @test_126(i32 %a, i32 %b, i32 %c) {
 ;
 ; GFX1250-SDAG-LABEL: test_126:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_bitop3_b32 v0, v0, v2, v1 bitop3:0x7e
 ; GFX1250-SDAG-NEXT:    ; return to shader part epilog
 ;
 ; GFX1250-GISEL-LABEL: test_126:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_xor_b32_e32 v1, v0, v1
 ; GFX1250-GISEL-NEXT:    v_xor_b32_e32 v0, v0, v2
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
@@ -271,11 +323,13 @@ define amdgpu_ps float @test_12_src_overflow(i32 %a, i32 %b, i32 %c) {
 ;
 ; GFX1250-SDAG-LABEL: test_12_src_overflow:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_bitop3_b32 v0, v0, v1, v0 bitop3:0xc
 ; GFX1250-SDAG-NEXT:    ; return to shader part epilog
 ;
 ; GFX1250-GISEL-LABEL: test_12_src_overflow:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_not_b32_e32 v3, v0
 ; GFX1250-GISEL-NEXT:    v_bitop3_b32 v0, v0, v2, v0 bitop3:0xc
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -317,6 +371,7 @@ define amdgpu_ps float @test_100_src_overflow(i32 %a, i32 %b, i32 %c) {
 ;
 ; GFX1250-SDAG-LABEL: test_100_src_overflow:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_bitop3_b32 v3, v1, v2, v0 bitop3:0x10
 ; GFX1250-SDAG-NEXT:    v_bitop3_b32 v4, v0, v2, v1 bitop3:0x40
 ; GFX1250-SDAG-NEXT:    v_bitop3_b32 v0, v1, v2, v0 bitop3:0x20
@@ -326,6 +381,7 @@ define amdgpu_ps float @test_100_src_overflow(i32 %a, i32 %b, i32 %c) {
 ;
 ; GFX1250-GISEL-LABEL: test_100_src_overflow:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_or_b32_e32 v3, v2, v0
 ; GFX1250-GISEL-NEXT:    v_bitop3_b32 v4, v0, v1, v0 bitop3:0x30
 ; GFX1250-GISEL-NEXT:    v_and_b32_e32 v0, v1, v0
@@ -362,6 +418,7 @@ define amdgpu_ps float @test_xor3(i32 %a, i32 %b, i32 %c) {
 ;
 ; GFX1250-LABEL: test_xor3:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_xor3_b32 v0, v0, v1, v2
 ; GFX1250-NEXT:    ; return to shader part epilog
   %xor1 = xor i32 %a, %b
@@ -371,10 +428,16 @@ define amdgpu_ps float @test_xor3(i32 %a, i32 %b, i32 %c) {
 }
 
 define amdgpu_ps float @test_or3(i32 %a, i32 %b, i32 %c) {
-; GCN-LABEL: test_or3:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    v_or3_b32 v0, v0, v1, v2
-; GCN-NEXT:    ; return to shader part epilog
+; GFX950-LABEL: test_or3:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    v_or3_b32 v0, v0, v1, v2
+; GFX950-NEXT:    ; return to shader part epilog
+;
+; GFX1250-LABEL: test_or3:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-NEXT:    v_or3_b32 v0, v0, v1, v2
+; GFX1250-NEXT:    ; return to shader part epilog
   %or1 = or i32 %a, %b
   %or2 = or i32 %or1, %c
   %ret_cast = bitcast i32 %or2 to float
@@ -382,10 +445,16 @@ define amdgpu_ps float @test_or3(i32 %a, i32 %b, i32 %c) {
 }
 
 define amdgpu_ps float @test_and_or(i32 %a, i32 %b, i32 %c) {
-; GCN-LABEL: test_and_or:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    v_and_or_b32 v0, v0, v1, v2
-; GCN-NEXT:    ; return to shader part epilog
+; GFX950-LABEL: test_and_or:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    v_and_or_b32 v0, v0, v1, v2
+; GFX950-NEXT:    ; return to shader part epilog
+;
+; GFX1250-LABEL: test_and_or:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-NEXT:    v_and_or_b32 v0, v0, v1, v2
+; GFX1250-NEXT:    ; return to shader part epilog
   %and1 = and i32 %a, %b
   %or1 = or i32 %and1, %c
   %ret_cast = bitcast i32 %or1 to float
@@ -404,6 +473,7 @@ define amdgpu_ps float @uniform_3_op(i32 inreg %a, i32 inreg %b, i32 inreg %c) {
 ;
 ; GFX1250-LABEL: uniform_3_op:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_and_not1_b32 s0, s2, s0
 ; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX1250-NEXT:    s_and_b32 s0, s0, s1
@@ -433,6 +503,7 @@ define amdgpu_ps float @uniform_4_op(i32 inreg %a, i32 inreg %b, i32 inreg %c) {
 ;
 ; GFX1250-SDAG-LABEL: uniform_4_op:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-SDAG-NEXT:    v_bitop3_b32 v0, s0, s1, v0 bitop3:2
@@ -440,6 +511,7 @@ define amdgpu_ps float @uniform_4_op(i32 inreg %a, i32 inreg %b, i32 inreg %c) {
 ;
 ; GFX1250-GISEL-LABEL: uniform_4_op:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    s_and_not1_b32 s0, s2, s0
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX1250-GISEL-NEXT:    s_and_not1_b32 s0, s0, s1
@@ -463,21 +535,25 @@ define amdgpu_ps half @not_and_not_and_not_and_b16(i16 %a, i16 %b, i16 %c) {
 ;
 ; GFX1250-SDAG-FAKE16-LABEL: not_and_not_and_not_and_b16:
 ; GFX1250-SDAG-FAKE16:       ; %bb.0:
+; GFX1250-SDAG-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-FAKE16-NEXT:    v_bitop3_b16 v0, v0, v1, v2 bitop3:1
 ; GFX1250-SDAG-FAKE16-NEXT:    ; return to shader part epilog
 ;
 ; GFX1250-SDAG-TRUE16-LABEL: not_and_not_and_not_and_b16:
 ; GFX1250-SDAG-TRUE16:       ; %bb.0:
+; GFX1250-SDAG-TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-TRUE16-NEXT:    v_bitop3_b16 v0.l, v0.l, v1.l, v2.l bitop3:1
 ; GFX1250-SDAG-TRUE16-NEXT:    ; return to shader part epilog
 ;
 ; GFX1250-GISEL-FAKE16-LABEL: not_and_not_and_not_and_b16:
 ; GFX1250-GISEL-FAKE16:       ; %bb.0:
+; GFX1250-GISEL-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-FAKE16-NEXT:    v_bitop3_b16 v0, v0, v1, v2 bitop3:1
 ; GFX1250-GISEL-FAKE16-NEXT:    ; return to shader part epilog
 ;
 ; GFX1250-GISEL-TRUE16-LABEL: not_and_not_and_not_and_b16:
 ; GFX1250-GISEL-TRUE16:       ; %bb.0:
+; GFX1250-GISEL-TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-TRUE16-NEXT:    v_bitop3_b16 v0.l, v0.l, v1.l, v2.l bitop3:1
 ; GFX1250-GISEL-TRUE16-NEXT:    ; return to shader part epilog
   %nota = xor i16 %a, -1
@@ -497,21 +573,25 @@ define amdgpu_ps half @not_and_not_and_and_b16(i16 %a, i16 %b, i16 %c) {
 ;
 ; GFX1250-SDAG-FAKE16-LABEL: not_and_not_and_and_b16:
 ; GFX1250-SDAG-FAKE16:       ; %bb.0:
+; GFX1250-SDAG-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-FAKE16-NEXT:    v_bitop3_b16 v0, v0, v1, v2 bitop3:2
 ; GFX1250-SDAG-FAKE16-NEXT:    ; return to shader part epilog
 ;
 ; GFX1250-SDAG-TRUE16-LABEL: not_and_not_and_and_b16:
 ; GFX1250-SDAG-TRUE16:       ; %bb.0:
+; GFX1250-SDAG-TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-TRUE16-NEXT:    v_bitop3_b16 v0.l, v0.l, v1.l, v2.l bitop3:2
 ; GFX1250-SDAG-TRUE16-NEXT:    ; return to shader part epilog
 ;
 ; GFX1250-GISEL-FAKE16-LABEL: not_and_not_and_and_b16:
 ; GFX1250-GISEL-FAKE16:       ; %bb.0:
+; GFX1250-GISEL-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-FAKE16-NEXT:    v_bitop3_b16 v0, v0, v1, v2 bitop3:2
 ; GFX1250-GISEL-FAKE16-NEXT:    ; return to shader part epilog
 ;
 ; GFX1250-GISEL-TRUE16-LABEL: not_and_not_and_and_b16:
 ; GFX1250-GISEL-TRUE16:       ; %bb.0:
+; GFX1250-GISEL-TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-TRUE16-NEXT:    v_bitop3_b16 v0.l, v0.l, v1.l, v2.l bitop3:2
 ; GFX1250-GISEL-TRUE16-NEXT:    ; return to shader part epilog
   %nota = xor i16 %a, -1
@@ -530,21 +610,25 @@ define amdgpu_ps half @not_and_and_not_and_b16(i16 %a, i16 %b, i16 %c) {
 ;
 ; GFX1250-SDAG-FAKE16-LABEL: not_and_and_not_and_b16:
 ; GFX1250-SDAG-FAKE16:       ; %bb.0:
+; GFX1250-SDAG-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-FAKE16-NEXT:    v_bitop3_b16 v0, v0, v1, v2 bitop3:4
 ; GFX1250-SDAG-FAKE16-NEXT:    ; return to shader part epilog
 ;
 ; GFX1250-SDAG-TRUE16-LABEL: not_and_and_not_and_b16:
 ; GFX1250-SDAG-TRUE16:       ; %bb.0:
+; GFX1250-SDAG-TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-TRUE16-NEXT:    v_bitop3_b16 v0.l, v0.l, v1.l, v2.l bitop3:4
 ; GFX1250-SDAG-TRUE16-NEXT:    ; return to shader part epilog
 ;
 ; GFX1250-GISEL-FAKE16-LABEL: not_and_and_not_and_b16:
 ; GFX1250-GISEL-FAKE16:       ; %bb.0:
+; GFX1250-GISEL-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-FAKE16-NEXT:    v_bitop3_b16 v0, v0, v1, v2 bitop3:4
 ; GFX1250-GISEL-FAKE16-NEXT:    ; return to shader part epilog
 ;
 ; GFX1250-GISEL-TRUE16-LABEL: not_and_and_not_and_b16:
 ; GFX1250-GISEL-TRUE16:       ; %bb.0:
+; GFX1250-GISEL-TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-TRUE16-NEXT:    v_bitop3_b16 v0.l, v0.l, v1.l, v2.l bitop3:4
 ; GFX1250-GISEL-TRUE16-NEXT:    ; return to shader part epilog
   %nota = xor i16 %a, -1
@@ -569,16 +653,19 @@ define amdgpu_ps half @test_xor3_b16(i16 %a, i16 %b, i16 %c) {
 ;
 ; GFX1250-SDAG-FAKE16-LABEL: test_xor3_b16:
 ; GFX1250-SDAG-FAKE16:       ; %bb.0:
+; GFX1250-SDAG-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-FAKE16-NEXT:    v_bitop3_b16 v0, v0, v2, v1 bitop3:0x96
 ; GFX1250-SDAG-FAKE16-NEXT:    ; return to shader part epilog
 ;
 ; GFX1250-SDAG-TRUE16-LABEL: test_xor3_b16:
 ; GFX1250-SDAG-TRUE16:       ; %bb.0:
+; GFX1250-SDAG-TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-TRUE16-NEXT:    v_bitop3_b16 v0.l, v0.l, v2.l, v1.l bitop3:0x96
 ; GFX1250-SDAG-TRUE16-NEXT:    ; return to shader part epilog
 ;
 ; GFX1250-GISEL-LABEL: test_xor3_b16:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_xor3_b32 v0, v0, v1, v2
 ; GFX1250-GISEL-NEXT:    ; return to shader part epilog
   %xor1 = xor i16 %a, %b
@@ -600,16 +687,19 @@ define amdgpu_ps half @test_or3_b16(i16 %a, i16 %b, i16 %c) {
 ;
 ; GFX1250-SDAG-FAKE16-LABEL: test_or3_b16:
 ; GFX1250-SDAG-FAKE16:       ; %bb.0:
+; GFX1250-SDAG-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-FAKE16-NEXT:    v_bitop3_b16 v0, v0, v2, v1 bitop3:0xfe
 ; GFX1250-SDAG-FAKE16-NEXT:    ; return to shader part epilog
 ;
 ; GFX1250-SDAG-TRUE16-LABEL: test_or3_b16:
 ; GFX1250-SDAG-TRUE16:       ; %bb.0:
+; GFX1250-SDAG-TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-TRUE16-NEXT:    v_bitop3_b16 v0.l, v0.l, v2.l, v1.l bitop3:0xfe
 ; GFX1250-SDAG-TRUE16-NEXT:    ; return to shader part epilog
 ;
 ; GFX1250-GISEL-LABEL: test_or3_b16:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_or3_b32 v0, v0, v1, v2
 ; GFX1250-GISEL-NEXT:    ; return to shader part epilog
   %or1 = or i16 %a, %b
@@ -631,16 +721,19 @@ define amdgpu_ps half @test_and_or_b16(i16 %a, i16 %b, i16 %c) {
 ;
 ; GFX1250-SDAG-FAKE16-LABEL: test_and_or_b16:
 ; GFX1250-SDAG-FAKE16:       ; %bb.0:
+; GFX1250-SDAG-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-FAKE16-NEXT:    v_bitop3_b16 v0, v0, v2, v1 bitop3:0xec
 ; GFX1250-SDAG-FAKE16-NEXT:    ; return to shader part epilog
 ;
 ; GFX1250-SDAG-TRUE16-LABEL: test_and_or_b16:
 ; GFX1250-SDAG-TRUE16:       ; %bb.0:
+; GFX1250-SDAG-TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-TRUE16-NEXT:    v_bitop3_b16 v0.l, v0.l, v2.l, v1.l bitop3:0xec
 ; GFX1250-SDAG-TRUE16-NEXT:    ; return to shader part epilog
 ;
 ; GFX1250-GISEL-LABEL: test_and_or_b16:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_and_or_b32 v0, v0, v1, v2
 ; GFX1250-GISEL-NEXT:    ; return to shader part epilog
   %and1 = and i16 %a, %b
@@ -649,5 +742,6 @@ define amdgpu_ps half @test_and_or_b16(i16 %a, i16 %b, i16 %c) {
   ret half %ret_cast
 }
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GCN: {{.*}}
 ; GFX1250-FAKE16: {{.*}}
 ; GFX1250-TRUE16: {{.*}}

diff  --git a/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx1250.ll b/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx1250.ll
index 31307b245bafe..f784e0df723e9 100644
--- a/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx1250.ll
+++ b/llvm/test/CodeGen/AMDGPU/branch-relaxation-gfx1250.ll
@@ -22,6 +22,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() #1
 define amdgpu_kernel void @uniform_conditional_max_short_forward_branch(ptr addrspace(1) %arg, i32 %cnd) #0 {
 ; GCN-LABEL: uniform_conditional_max_short_forward_branch:
 ; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    s_load_b32 s0, s[4:5], 0x2c
 ; GCN-NEXT:    s_wait_kmcnt 0x0
 ; GCN-NEXT:    s_cmp_eq_u32 s0, 0
@@ -62,6 +63,7 @@ bb3:
 define amdgpu_kernel void @uniform_conditional_min_long_forward_branch(ptr addrspace(1) %arg, i32 %cnd) #0 {
 ; GCN-LABEL: uniform_conditional_min_long_forward_branch:
 ; GCN:       ; %bb.0: ; %bb0
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    s_load_b32 s0, s[4:5], 0x2c
 ; GCN-NEXT:    s_wait_kmcnt 0x0
 ; GCN-NEXT:    s_cmp_eq_u32 s0, 0
@@ -104,6 +106,7 @@ bb3:
 define amdgpu_kernel void @uniform_conditional_min_long_forward_vcnd_branch(ptr addrspace(1) %arg, float %cnd) #0 {
 ; GCN-LABEL: uniform_conditional_min_long_forward_vcnd_branch:
 ; GCN:       ; %bb.0: ; %bb0
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    s_load_b32 s0, s[4:5], 0x2c
 ; GCN-NEXT:    s_wait_kmcnt 0x0
 ; GCN-NEXT:    s_cmp_eq_f32 s0, 0
@@ -146,6 +149,7 @@ bb3:
 define amdgpu_kernel void @min_long_forward_vbranch(ptr addrspace(1) %arg) #0 {
 ; GCN-LABEL: min_long_forward_vbranch:
 ; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GCN-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GCN-NEXT:    v_mov_b32_e32 v1, 0
@@ -198,6 +202,7 @@ bb3:
 define amdgpu_kernel void @long_backward_sbranch(ptr addrspace(1) %arg) #0 {
 ; GCN-LABEL: long_backward_sbranch:
 ; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    s_mov_b32 s0, 0
 ; GCN-NEXT:  .LBB4_1: ; %bb2
 ; GCN-NEXT:    ; =>This Inner Loop Header: Depth=1
@@ -238,6 +243,7 @@ bb3:
 define amdgpu_kernel void @uniform_unconditional_min_long_forward_branch(ptr addrspace(1) %arg, i32 %arg1) {
 ; GCN-LABEL: uniform_unconditional_min_long_forward_branch:
 ; GCN:       ; %bb.0: ; %bb0
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    s_load_b32 s0, s[4:5], 0x2c
 ; GCN-NEXT:    s_wait_kmcnt 0x0
 ; GCN-NEXT:    s_cmp_eq_u32 s0, 0
@@ -299,6 +305,7 @@ bb4:
 define amdgpu_kernel void @uniform_unconditional_min_long_backward_branch(ptr addrspace(1) %arg, i32 %arg1) {
 ; GCN-LABEL: uniform_unconditional_min_long_backward_branch:
 ; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    s_mov_b32 vcc_lo, exec_lo
 ; GCN-NEXT:  .LBB6_1: ; %loop
 ; GCN-NEXT:    ; =>This Inner Loop Header: Depth=1
@@ -334,6 +341,7 @@ loop:
 define amdgpu_kernel void @expand_requires_expand(i32 %cond0) #0 {
 ; GCN-LABEL: expand_requires_expand:
 ; GCN:       ; %bb.0: ; %bb0
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    s_load_b32 s0, s[4:5], 0x24
 ; GCN-NEXT:    s_wait_kmcnt 0x0
 ; GCN-NEXT:    s_cmp_lt_i32 s0, 0
@@ -401,6 +409,7 @@ bb3:
 define amdgpu_kernel void @uniform_inside_divergent(ptr addrspace(1) %out, i32 %cond) #0 {
 ; GCN-LABEL: uniform_inside_divergent:
 ; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GCN-NEXT:    s_mov_b32 s3, exec_lo
 ; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_1)
@@ -450,6 +459,7 @@ endif:
 define amdgpu_kernel void @analyze_mask_branch() #0 {
 ; GCN-LABEL: analyze_mask_branch:
 ; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    s_mov_b32 s0, exec_lo
 ; GCN-NEXT:    ;;#ASMSTART
 ; GCN-NEXT:    v_mov_b32_e64 v0, 0
@@ -518,6 +528,7 @@ ret:
 define amdgpu_kernel void @long_branch_hang(ptr addrspace(1) nocapture %arg, i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i64 %arg5) #0 {
 ; GCN-LABEL: long_branch_hang:
 ; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    s_load_b128 s[0:3], s[4:5], 0x2c
 ; GCN-NEXT:    s_mov_b32 s7, -1
 ; GCN-NEXT:    s_wait_kmcnt 0x0

diff  --git a/llvm/test/CodeGen/AMDGPU/calling-conventions.ll b/llvm/test/CodeGen/AMDGPU/calling-conventions.ll
index 7dbbeaabeb715..ef5438e63f667 100644
--- a/llvm/test/CodeGen/AMDGPU/calling-conventions.ll
+++ b/llvm/test/CodeGen/AMDGPU/calling-conventions.ll
@@ -39,6 +39,7 @@ define spir_kernel void @kernel(ptr addrspace(1) %out) {
 ;
 ; GFX1250-LABEL: kernel:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -83,11 +84,15 @@ define amdgpu_ps half @ps_ret_cc_f16(half %arg0) {
 ;
 ; GFX1250-TRUE16-LABEL: ps_ret_cc_f16:
 ; GFX1250-TRUE16:       ; %bb.0:
+; GFX1250-TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 2, 2), 0
 ; GFX1250-TRUE16-NEXT:    v_add_f16_e32 v0.l, 1.0, v0.l
 ; GFX1250-TRUE16-NEXT:    ; return to shader part epilog
 ;
 ; GFX1250-FAKE16-LABEL: ps_ret_cc_f16:
 ; GFX1250-FAKE16:       ; %bb.0:
+; GFX1250-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 2, 2), 0
 ; GFX1250-FAKE16-NEXT:    v_add_f16_e32 v0, 1.0, v0
 ; GFX1250-FAKE16-NEXT:    ; return to shader part epilog
   %add = fadd half %arg0, 1.0
@@ -119,6 +124,8 @@ define amdgpu_ps half @ps_ret_cc_inreg_f16(half inreg %arg0) {
 ;
 ; GFX1250-LABEL: ps_ret_cc_inreg_f16:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 2, 2), 0
 ; GFX1250-NEXT:    s_add_f16 s0, s0, 1.0
 ; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_3)
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, s0
@@ -277,6 +284,8 @@ define amdgpu_kernel void @call_coldcc() #0 {
 ;
 ; GFX1250-LABEL: call_coldcc:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_mov_b32 s32, 0
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_mov_b64 s[10:11], s[6:7]
 ; GFX1250-NEXT:    s_get_pc_i64 s[6:7]
 ; GFX1250-NEXT:    s_add_nc_u64 s[6:7], s[6:7], coldcc at gotpcrel+4
@@ -286,7 +295,6 @@ define amdgpu_kernel void @call_coldcc() #0 {
 ; GFX1250-NEXT:    s_mov_b64 s[4:5], s[0:1]
 ; GFX1250-NEXT:    s_wait_xcnt 0x0
 ; GFX1250-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX1250-NEXT:    s_mov_b32 s32, 0
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-NEXT:    s_swap_pc_i64 s[30:31], s[12:13]
 ; GFX1250-NEXT:    global_store_b32 v[0:1], v0, off
@@ -388,6 +396,8 @@ define amdgpu_kernel void @call_fastcc() #0 {
 ;
 ; GFX1250-LABEL: call_fastcc:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_mov_b32 s32, 0
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_mov_b64 s[10:11], s[6:7]
 ; GFX1250-NEXT:    s_get_pc_i64 s[6:7]
 ; GFX1250-NEXT:    s_add_nc_u64 s[6:7], s[6:7], fastcc at gotpcrel+4
@@ -397,7 +407,6 @@ define amdgpu_kernel void @call_fastcc() #0 {
 ; GFX1250-NEXT:    s_mov_b64 s[4:5], s[0:1]
 ; GFX1250-NEXT:    s_wait_xcnt 0x0
 ; GFX1250-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX1250-NEXT:    s_mov_b32 s32, 0
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-NEXT:    s_swap_pc_i64 s[30:31], s[12:13]
 ; GFX1250-NEXT:    global_store_b32 v[0:1], v0, off
@@ -433,11 +442,15 @@ define amdgpu_cs half @cs_mesa(half %arg0) {
 ;
 ; GFX1250-TRUE16-LABEL: cs_mesa:
 ; GFX1250-TRUE16:       ; %bb.0:
+; GFX1250-TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 2, 2), 0
 ; GFX1250-TRUE16-NEXT:    v_add_f16_e32 v0.l, 1.0, v0.l
 ; GFX1250-TRUE16-NEXT:    ; return to shader part epilog
 ;
 ; GFX1250-FAKE16-LABEL: cs_mesa:
 ; GFX1250-FAKE16:       ; %bb.0:
+; GFX1250-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 2, 2), 0
 ; GFX1250-FAKE16-NEXT:    v_add_f16_e32 v0, 1.0, v0
 ; GFX1250-FAKE16-NEXT:    ; return to shader part epilog
   %add = fadd half %arg0, 1.0
@@ -470,11 +483,15 @@ define amdgpu_ps half @ps_mesa_f16(half %arg0) {
 ;
 ; GFX1250-TRUE16-LABEL: ps_mesa_f16:
 ; GFX1250-TRUE16:       ; %bb.0:
+; GFX1250-TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 2, 2), 0
 ; GFX1250-TRUE16-NEXT:    v_add_f16_e32 v0.l, 1.0, v0.l
 ; GFX1250-TRUE16-NEXT:    ; return to shader part epilog
 ;
 ; GFX1250-FAKE16-LABEL: ps_mesa_f16:
 ; GFX1250-FAKE16:       ; %bb.0:
+; GFX1250-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 2, 2), 0
 ; GFX1250-FAKE16-NEXT:    v_add_f16_e32 v0, 1.0, v0
 ; GFX1250-FAKE16-NEXT:    ; return to shader part epilog
   %add = fadd half %arg0, 1.0
@@ -507,11 +524,15 @@ define amdgpu_vs half @vs_mesa(half %arg0) {
 ;
 ; GFX1250-TRUE16-LABEL: vs_mesa:
 ; GFX1250-TRUE16:       ; %bb.0:
+; GFX1250-TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 2, 2), 0
 ; GFX1250-TRUE16-NEXT:    v_add_f16_e32 v0.l, 1.0, v0.l
 ; GFX1250-TRUE16-NEXT:    ; return to shader part epilog
 ;
 ; GFX1250-FAKE16-LABEL: vs_mesa:
 ; GFX1250-FAKE16:       ; %bb.0:
+; GFX1250-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 2, 2), 0
 ; GFX1250-FAKE16-NEXT:    v_add_f16_e32 v0, 1.0, v0
 ; GFX1250-FAKE16-NEXT:    ; return to shader part epilog
   %add = fadd half %arg0, 1.0
@@ -544,11 +565,15 @@ define amdgpu_gs half @gs_mesa(half %arg0) {
 ;
 ; GFX1250-TRUE16-LABEL: gs_mesa:
 ; GFX1250-TRUE16:       ; %bb.0:
+; GFX1250-TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 2, 2), 0
 ; GFX1250-TRUE16-NEXT:    v_add_f16_e32 v0.l, 1.0, v0.l
 ; GFX1250-TRUE16-NEXT:    ; return to shader part epilog
 ;
 ; GFX1250-FAKE16-LABEL: gs_mesa:
 ; GFX1250-FAKE16:       ; %bb.0:
+; GFX1250-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 2, 2), 0
 ; GFX1250-FAKE16-NEXT:    v_add_f16_e32 v0, 1.0, v0
 ; GFX1250-FAKE16-NEXT:    ; return to shader part epilog
   %add = fadd half %arg0, 1.0
@@ -581,11 +606,15 @@ define amdgpu_hs half @hs_mesa(half %arg0) {
 ;
 ; GFX1250-TRUE16-LABEL: hs_mesa:
 ; GFX1250-TRUE16:       ; %bb.0:
+; GFX1250-TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 2, 2), 0
 ; GFX1250-TRUE16-NEXT:    v_add_f16_e32 v0.l, 1.0, v0.l
 ; GFX1250-TRUE16-NEXT:    ; return to shader part epilog
 ;
 ; GFX1250-FAKE16-LABEL: hs_mesa:
 ; GFX1250-FAKE16:       ; %bb.0:
+; GFX1250-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 2, 2), 0
 ; GFX1250-FAKE16-NEXT:    v_add_f16_e32 v0, 1.0, v0
 ; GFX1250-FAKE16-NEXT:    ; return to shader part epilog
   %add = fadd half %arg0, 1.0
@@ -620,6 +649,8 @@ define amdgpu_ps <2 x half> @ps_mesa_v2f16(<2 x half> %arg0) {
 ;
 ; GFX1250-LABEL: ps_mesa_v2f16:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 2, 2), 0
 ; GFX1250-NEXT:    v_pk_add_f16 v0, v0, 1.0 op_sel_hi:[1,0]
 ; GFX1250-NEXT:    ; return to shader part epilog
   %add = fadd <2 x half> %arg0, <half 1.0, half 1.0>
@@ -654,6 +685,8 @@ define amdgpu_ps <2 x half> @ps_mesa_inreg_v2f16(<2 x half> inreg %arg0) {
 ;
 ; GFX1250-LABEL: ps_mesa_inreg_v2f16:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 2, 2), 0
 ; GFX1250-NEXT:    v_pk_add_f16 v0, s0, 1.0 op_sel_hi:[1,0]
 ; GFX1250-NEXT:    ; return to shader part epilog
   %add = fadd <2 x half> %arg0, <half 1.0, half 1.0>
@@ -690,6 +723,7 @@ define amdgpu_ps void @ps_mesa_v2i16(<2 x i16> %arg0) {
 ;
 ; GFX1250-LABEL: ps_mesa_v2i16:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0]
 ; GFX1250-NEXT:    global_store_b32 v[0:1], v0, off
 ; GFX1250-NEXT:    s_endpgm
@@ -731,6 +765,7 @@ define amdgpu_ps void @ps_mesa_inreg_v2i16(<2 x i16> inreg %arg0) {
 ;
 ; GFX1250-LABEL: ps_mesa_inreg_v2i16:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_pk_add_u16 v0, s0, 1 op_sel_hi:[1,0]
 ; GFX1250-NEXT:    global_store_b32 v[0:1], v0, off
 ; GFX1250-NEXT:    s_endpgm
@@ -777,6 +812,8 @@ define amdgpu_ps <4 x half> @ps_mesa_v4f16(<4 x half> %arg0) {
 ;
 ; GFX1250-LABEL: ps_mesa_v4f16:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 2, 2), 0
 ; GFX1250-NEXT:    v_pk_add_f16 v0, v0, 1.0 op_sel_hi:[1,0]
 ; GFX1250-NEXT:    v_pk_add_f16 v1, v1, 1.0 op_sel_hi:[1,0]
 ; GFX1250-NEXT:    ; return to shader part epilog
@@ -824,6 +861,8 @@ define amdgpu_ps <4 x half> @ps_mesa_inreg_v4f16(<4 x half> inreg %arg0) {
 ;
 ; GFX1250-LABEL: ps_mesa_inreg_v4f16:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 2, 2), 0
 ; GFX1250-NEXT:    v_pk_add_f16 v0, s0, 1.0 op_sel_hi:[1,0]
 ; GFX1250-NEXT:    v_pk_add_f16 v1, s1, 1.0 op_sel_hi:[1,0]
 ; GFX1250-NEXT:    ; return to shader part epilog
@@ -871,6 +910,7 @@ define amdgpu_ps void @ps_mesa_inreg_v3i32(<3 x i32> inreg %arg0) {
 ;
 ; GFX1250-LABEL: ps_mesa_inreg_v3i32:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_add_co_i32 s2, s2, 3
 ; GFX1250-NEXT:    s_add_co_i32 s0, s0, 1
 ; GFX1250-NEXT:    s_add_co_i32 s1, s1, 2
@@ -914,6 +954,7 @@ define amdgpu_ps void @ps_mesa_inreg_v3f32(<3 x float> inreg %arg0) {
 ;
 ; GFX1250-LABEL: ps_mesa_inreg_v3f32:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_add_f32 s0, s0, 1.0
 ; GFX1250-NEXT:    s_add_f32 s1, s1, 2.0
 ; GFX1250-NEXT:    s_add_f32 s2, s2, 4.0
@@ -980,6 +1021,7 @@ define amdgpu_ps void @ps_mesa_inreg_v5i32(<5 x i32> inreg %arg0) {
 ;
 ; GFX1250-LABEL: ps_mesa_inreg_v5i32:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_add_co_i32 s3, s3, 4
 ; GFX1250-NEXT:    s_add_co_i32 s2, s2, 3
 ; GFX1250-NEXT:    s_add_co_i32 s1, s1, 2
@@ -1037,6 +1079,7 @@ define amdgpu_ps void @ps_mesa_inreg_v5f32(<5 x float> inreg %arg0) {
 ;
 ; GFX1250-LABEL: ps_mesa_inreg_v5f32:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_add_f32 s3, s3, -1.0
 ; GFX1250-NEXT:    s_add_f32 s4, s4, 0.5
 ; GFX1250-NEXT:    s_add_f32 s0, s0, 1.0
@@ -1085,6 +1128,7 @@ define amdgpu_ps void @ps_mesa_v3i32(<3 x i32> %arg0) {
 ;
 ; GFX1250-LABEL: ps_mesa_v3i32:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_dual_add_nc_u32 v2, 3, v2 :: v_dual_add_nc_u32 v1, 2, v1
 ; GFX1250-NEXT:    v_add_nc_u32_e32 v0, 1, v0
 ; GFX1250-NEXT:    global_store_b96 v[0:1], v[0:2], off
@@ -1123,6 +1167,7 @@ define amdgpu_ps void @ps_mesa_v3f32(<3 x float> %arg0) {
 ;
 ; GFX1250-LABEL: ps_mesa_v3f32:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_dual_add_f32 v2, 4.0, v2 :: v_dual_add_f32 v1, 2.0, v1
 ; GFX1250-NEXT:    v_add_f32_e32 v0, 1.0, v0
 ; GFX1250-NEXT:    global_store_b96 v[0:1], v[0:2], off
@@ -1171,6 +1216,7 @@ define amdgpu_ps void @ps_mesa_v5i32(<5 x i32> %arg0) {
 ;
 ; GFX1250-LABEL: ps_mesa_v5i32:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_dual_add_nc_u32 v3, 4, v3 :: v_dual_add_nc_u32 v2, 3, v2
 ; GFX1250-NEXT:    v_dual_add_nc_u32 v1, 2, v1 :: v_dual_add_nc_u32 v4, 5, v4
 ; GFX1250-NEXT:    v_add_nc_u32_e32 v0, 1, v0
@@ -1220,6 +1266,7 @@ define amdgpu_ps void @ps_mesa_v5f32(<5 x float> %arg0) {
 ;
 ; GFX1250-LABEL: ps_mesa_v5f32:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_dual_add_f32 v3, -1.0, v3 :: v_dual_add_f32 v2, 4.0, v2
 ; GFX1250-NEXT:    v_dual_add_f32 v1, 2.0, v1 :: v_dual_add_f32 v4, 0.5, v4
 ; GFX1250-NEXT:    v_add_f32_e32 v0, 1.0, v0
@@ -1261,12 +1308,14 @@ define amdgpu_ps void @ps_mesa_i16(i16 %arg0) {
 ;
 ; GFX1250-TRUE16-LABEL: ps_mesa_i16:
 ; GFX1250-TRUE16:       ; %bb.0:
+; GFX1250-TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-TRUE16-NEXT:    v_add_nc_u16 v0.l, v0.l, v0.l
 ; GFX1250-TRUE16-NEXT:    global_store_b16 v[0:1], v0, off
 ; GFX1250-TRUE16-NEXT:    s_endpgm
 ;
 ; GFX1250-FAKE16-LABEL: ps_mesa_i16:
 ; GFX1250-FAKE16:       ; %bb.0:
+; GFX1250-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-FAKE16-NEXT:    v_add_nc_u16 v0, v0, v0
 ; GFX1250-FAKE16-NEXT:    global_store_b16 v[0:1], v0, off
 ; GFX1250-FAKE16-NEXT:    s_endpgm
@@ -1302,6 +1351,7 @@ define amdgpu_ps void @ps_mesa_inreg_i16(i16 inreg %arg0) {
 ;
 ; GFX1250-LABEL: ps_mesa_inreg_i16:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_add_co_i32 s0, s0, s0
 ; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, s0
@@ -1313,10 +1363,26 @@ define amdgpu_ps void @ps_mesa_inreg_i16(i16 inreg %arg0) {
 }
 
 define amdgpu_ps i16 @ret_ps_mesa_i16() {
-; GCN-LABEL: ret_ps_mesa_i16:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_movk_i32 s0, 0x7b
-; GCN-NEXT:    ; return to shader part epilog
+; SI-LABEL: ret_ps_mesa_i16:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_movk_i32 s0, 0x7b
+; SI-NEXT:    ; return to shader part epilog
+;
+; VI-LABEL: ret_ps_mesa_i16:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_movk_i32 s0, 0x7b
+; VI-NEXT:    ; return to shader part epilog
+;
+; GFX11-LABEL: ret_ps_mesa_i16:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_movk_i32 s0, 0x7b
+; GFX11-NEXT:    ; return to shader part epilog
+;
+; GFX1250-LABEL: ret_ps_mesa_i16:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-NEXT:    s_movk_i32 s0, 0x7b
+; GFX1250-NEXT:    ; return to shader part epilog
   ret i16 123
 }
 
@@ -1353,6 +1419,7 @@ define amdgpu_kernel void @amd_kernel_i8(i8 %arg0) {
 ;
 ; GFX1250-LABEL: amd_kernel_i8:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x24
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-NEXT:    s_add_co_i32 s0, s0, s0
@@ -1418,6 +1485,7 @@ define amdgpu_kernel void @amd_kernel_v2i8(<2 x i8> %arg0) {
 ;
 ; GFX1250-LABEL: amd_kernel_v2i8:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x24
 ; GFX1250-NEXT:    v_mov_b64_e32 v[0:1], 0
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -1519,6 +1587,7 @@ define amdgpu_kernel void @amd_kernel_v4i8(<4 x i8> %arg0) {
 ;
 ; GFX1250-LABEL: amd_kernel_v4i8:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x24
 ; GFX1250-NEXT:    v_mov_b64_e32 v[0:1], 0
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -1617,6 +1686,7 @@ define amdgpu_kernel void @amd_kernel_v3i8(<3 x i8> %arg0) {
 ;
 ; GFX1250-LABEL: amd_kernel_v3i8:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x24
 ; GFX1250-NEXT:    v_mov_b64_e32 v[0:1], 2
 ; GFX1250-NEXT:    v_mov_b64_e32 v[2:3], 0
@@ -1737,6 +1807,7 @@ define amdgpu_kernel void @amd_kernel_v5i8(<5 x i8> %arg0) {
 ;
 ; GFX1250-LABEL: amd_kernel_v5i8:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1250-NEXT:    v_mov_b64_e32 v[0:1], 4
 ; GFX1250-NEXT:    v_mov_b64_e32 v[2:3], 0
@@ -1902,6 +1973,7 @@ define amdgpu_kernel void @amd_kernel_v8i8(<8 x i8> %arg0) {
 ;
 ; GFX1250-LABEL: amd_kernel_v8i8:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1250-NEXT:    v_mov_b64_e32 v[0:1], 0
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -2179,6 +2251,7 @@ define amdgpu_kernel void @amd_kernel_v16i8(<16 x i8> %arg0) {
 ;
 ; GFX1250-LABEL: amd_kernel_v16i8:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1250-NEXT:    v_mov_b64_e32 v[4:5], 0
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -2700,6 +2773,7 @@ define amdgpu_kernel void @amd_kernel_v32i8(<32 x i8> %arg0) {
 ;
 ; GFX1250-LABEL: amd_kernel_v32i8:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b256 s[8:15], s[4:5], 0x24
 ; GFX1250-NEXT:    v_mov_b64_e32 v[8:9], 16
 ; GFX1250-NEXT:    v_mov_b64_e32 v[10:11], 0
@@ -2869,6 +2943,7 @@ define amdgpu_cs void @amdgpu_cs_i1(i1 %arg0) {
 ;
 ; GFX1250-LABEL: amdgpu_cs_i1:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX1250-NEXT:    global_store_b8 v[0:1], v0, off
 ; GFX1250-NEXT:    s_endpgm
@@ -2993,6 +3068,7 @@ define amdgpu_cs void @amdgpu_cs_v8i1(<8 x i1> %arg0) {
 ;
 ; GFX1250-TRUE16-LABEL: amdgpu_cs_v8i1:
 ; GFX1250-TRUE16:       ; %bb.0:
+; GFX1250-TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-TRUE16-NEXT:    v_lshlrev_b16 v2.h, 3, v3.l
 ; GFX1250-TRUE16-NEXT:    v_and_b16 v3.l, v6.l, 1
 ; GFX1250-TRUE16-NEXT:    v_and_b16 v2.l, v2.l, 1
@@ -3018,6 +3094,7 @@ define amdgpu_cs void @amdgpu_cs_v8i1(<8 x i1> %arg0) {
 ;
 ; GFX1250-FAKE16-LABEL: amdgpu_cs_v8i1:
 ; GFX1250-FAKE16:       ; %bb.0:
+; GFX1250-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-FAKE16-NEXT:    v_and_b32_e32 v6, 1, v6
 ; GFX1250-FAKE16-NEXT:    v_and_b32_e32 v2, 1, v2
 ; GFX1250-FAKE16-NEXT:    v_lshlrev_b16 v7, 3, v7
@@ -3258,6 +3335,7 @@ define amdgpu_cs void @amdgpu_cs_v16i1(<16 x i1> %arg0) {
 ;
 ; GFX1250-TRUE16-LABEL: amdgpu_cs_v16i1:
 ; GFX1250-TRUE16:       ; %bb.0:
+; GFX1250-TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-TRUE16-NEXT:    v_and_b16 v2.h, v6.l, 1
 ; GFX1250-TRUE16-NEXT:    v_lshlrev_b16 v1.h, 3, v7.l
 ; GFX1250-TRUE16-NEXT:    v_and_b16 v4.h, v10.l, 1
@@ -3302,6 +3380,7 @@ define amdgpu_cs void @amdgpu_cs_v16i1(<16 x i1> %arg0) {
 ;
 ; GFX1250-FAKE16-LABEL: amdgpu_cs_v16i1:
 ; GFX1250-FAKE16:       ; %bb.0:
+; GFX1250-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-FAKE16-NEXT:    v_and_b32_e32 v6, 1, v6
 ; GFX1250-FAKE16-NEXT:    v_lshlrev_b16 v7, 3, v7
 ; GFX1250-FAKE16-NEXT:    v_and_b32_e32 v2, 1, v2
@@ -3742,6 +3821,7 @@ define amdgpu_cs void @amdgpu_cs_v32i1(<32 x i1> %arg0) {
 ;
 ; GFX1250-TRUE16-LABEL: amdgpu_cs_v32i1:
 ; GFX1250-TRUE16:       ; %bb.0:
+; GFX1250-TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-TRUE16-NEXT:    v_lshlrev_b16 v18.h, 3, v19.l
 ; GFX1250-TRUE16-NEXT:    v_and_b16 v19.l, v22.l, 1
 ; GFX1250-TRUE16-NEXT:    v_and_b16 v18.l, v18.l, 1
@@ -3819,6 +3899,7 @@ define amdgpu_cs void @amdgpu_cs_v32i1(<32 x i1> %arg0) {
 ;
 ; GFX1250-FAKE16-LABEL: amdgpu_cs_v32i1:
 ; GFX1250-FAKE16:       ; %bb.0:
+; GFX1250-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-FAKE16-NEXT:    v_and_b32_e32 v2, 1, v2
 ; GFX1250-FAKE16-NEXT:    v_and_b32_e32 v6, 1, v6
 ; GFX1250-FAKE16-NEXT:    v_lshlrev_b16 v3, 3, v3
@@ -3930,6 +4011,7 @@ define amdgpu_cs void @amdgpu_cs_inreg_i1(i1 inreg %arg0) {
 ;
 ; GFX1250-LABEL: amdgpu_cs_inreg_i1:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_and_b32 s0, s0, 1
 ; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, s0
@@ -4026,6 +4108,7 @@ define amdgpu_cs void @amdgpu_cs_inreg_v8i1(<8 x i1> inreg %arg0) {
 ;
 ; GFX1250-LABEL: amdgpu_cs_inreg_v8i1:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_and_b32 s6, s6, 1
 ; GFX1250-NEXT:    s_lshl_b32 s5, s5, 1
 ; GFX1250-NEXT:    s_and_b32 s4, s4, 1
@@ -4214,6 +4297,7 @@ define amdgpu_cs void @amdgpu_cs_inreg_v16i1(<16 x i1> inreg %arg0) {
 ;
 ; GFX1250-LABEL: amdgpu_cs_inreg_v16i1:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_and_b32 s10, s10, 1
 ; GFX1250-NEXT:    s_lshl_b32 s9, s9, 1
 ; GFX1250-NEXT:    s_and_b32 s8, s8, 1
@@ -4570,6 +4654,7 @@ define amdgpu_cs void @amdgpu_cs_inreg_v32i1(<32 x i1> inreg %arg0) {
 ;
 ; GFX1250-LABEL: amdgpu_cs_inreg_v32i1:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_and_b32 s10, s10, 1
 ; GFX1250-NEXT:    s_lshl_b32 s9, s9, 1
 ; GFX1250-NEXT:    s_and_b32 s8, s8, 1
@@ -4694,6 +4779,7 @@ define amdgpu_cs void @amdgpu_cs_i1_sext(i1 signext %arg0) {
 ;
 ; GFX1250-LABEL: amdgpu_cs_i1_sext:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX1250-NEXT:    global_store_b8 v[0:1], v0, off
 ; GFX1250-NEXT:    s_endpgm
@@ -4721,6 +4807,7 @@ define amdgpu_cs void @amdgpu_cs_i1_zext(i1 zeroext %arg0) {
 ;
 ; GFX1250-LABEL: amdgpu_cs_i1_zext:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    global_store_b8 v[0:1], v0, off
 ; GFX1250-NEXT:    s_endpgm
   store i1 %arg0, ptr addrspace(1) poison
@@ -4728,3 +4815,5 @@ define amdgpu_cs void @amdgpu_cs_i1_zext(i1 zeroext %arg0) {
 }
 
 attributes #0 = { nounwind noinline }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GCN: {{.*}}

diff  --git a/llvm/test/CodeGen/AMDGPU/carryout-selection.ll b/llvm/test/CodeGen/AMDGPU/carryout-selection.ll
index 8d05317162e9c..a8ef781b62a42 100644
--- a/llvm/test/CodeGen/AMDGPU/carryout-selection.ll
+++ b/llvm/test/CodeGen/AMDGPU/carryout-selection.ll
@@ -117,6 +117,7 @@ define amdgpu_kernel void @sadd64rr(ptr addrspace(1) %out, i64 %a, i64 %b) {
 ;
 ; GFX1250-LABEL: sadd64rr:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1250-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
@@ -228,6 +229,7 @@ define amdgpu_kernel void @sadd64ri(ptr addrspace(1) %out, i64 %a) {
 ;
 ; GFX1250-LABEL: sadd64ri:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1250-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -329,6 +331,7 @@ define amdgpu_kernel void @vadd64rr(ptr addrspace(1) %out, i64 %a) {
 ;
 ; GFX1250-LABEL: vadd64rr:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1250-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1250-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
@@ -430,6 +433,7 @@ define amdgpu_kernel void @vadd64ri(ptr addrspace(1) %out) {
 ;
 ; GFX1250-LABEL: vadd64ri:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1250-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1250-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
@@ -536,6 +540,7 @@ define amdgpu_kernel void @suaddo32(ptr addrspace(1) %out, ptr addrspace(1) %car
 ;
 ; GFX1250-LABEL: suaddo32:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x34
 ; GFX1250-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
@@ -668,6 +673,7 @@ define amdgpu_kernel void @uaddo32_vcc_user(ptr addrspace(1) %out, ptr addrspace
 ;
 ; GFX1250-LABEL: uaddo32_vcc_user:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
@@ -813,6 +819,7 @@ define amdgpu_kernel void @suaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
 ;
 ; GFX1250-LABEL: suaddo64:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b256 s[8:15], s[4:5], 0x24
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-NEXT:    s_add_co_u32 s0, s12, s14
@@ -955,6 +962,7 @@ define amdgpu_kernel void @vuaddo64(ptr addrspace(1) %out, ptr addrspace(1) %car
 ;
 ; GFX1250-LABEL: vuaddo64:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
@@ -1086,6 +1094,7 @@ define amdgpu_kernel void @ssub64rr(ptr addrspace(1) %out, i64 %a, i64 %b) {
 ;
 ; GFX1250-LABEL: ssub64rr:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1250-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
@@ -1197,6 +1206,7 @@ define amdgpu_kernel void @ssub64ri(ptr addrspace(1) %out, i64 %a) {
 ;
 ; GFX1250-LABEL: ssub64ri:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1250-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -1298,6 +1308,7 @@ define amdgpu_kernel void @vsub64rr(ptr addrspace(1) %out, i64 %a) {
 ;
 ; GFX1250-LABEL: vsub64rr:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1250-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1250-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
@@ -1399,6 +1410,7 @@ define amdgpu_kernel void @vsub64ri(ptr addrspace(1) %out) {
 ;
 ; GFX1250-LABEL: vsub64ri:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1250-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1250-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
@@ -1506,6 +1518,7 @@ define amdgpu_kernel void @susubo32(ptr addrspace(1) %out, ptr addrspace(1) %car
 ;
 ; GFX1250-LABEL: susubo32:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x34
 ; GFX1250-NEXT:    s_load_b64 s[2:3], s[4:5], 0x24
@@ -1638,6 +1651,7 @@ define amdgpu_kernel void @usubo32_vcc_user(ptr addrspace(1) %out, ptr addrspace
 ;
 ; GFX1250-LABEL: usubo32_vcc_user:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
@@ -1783,6 +1797,7 @@ define amdgpu_kernel void @susubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
 ;
 ; GFX1250-LABEL: susubo64:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b256 s[8:15], s[4:5], 0x24
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-NEXT:    s_sub_co_u32 s0, s12, s14
@@ -1925,6 +1940,7 @@ define amdgpu_kernel void @vusubo64(ptr addrspace(1) %out, ptr addrspace(1) %car
 ;
 ; GFX1250-LABEL: vusubo64:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
@@ -3041,6 +3057,7 @@ define amdgpu_kernel void @sudiv64(ptr addrspace(1) %out, i64 %x, i64 %y) {
 ;
 ; GFX1250-LABEL: sudiv64:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1250-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34

diff  --git a/llvm/test/CodeGen/AMDGPU/code-size-estimate.ll b/llvm/test/CodeGen/AMDGPU/code-size-estimate.ll
index 79b44d6a92caa..7ccf26f7eaad3 100644
--- a/llvm/test/CodeGen/AMDGPU/code-size-estimate.ll
+++ b/llvm/test/CodeGen/AMDGPU/code-size-estimate.ll
@@ -581,6 +581,7 @@ define amdgpu_ps float @s_fmaak_f32(float inreg %x, float inreg %y) {
 ;
 ; GFX1250-LABEL: s_fmaak_f32:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; encoding: [0x41,0x06,0x80,0xb9,0x01,0x00,0x00,0x00]
 ; GFX1250-NEXT:    s_fmaak_f32 s0, s0, s1, 0x43800000 ; encoding: [0x00,0x01,0x80,0xa2,0x00,0x00,0x80,0x43]
 ; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_3) ; encoding: [0x0b,0x00,0x87,0xbf]
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, s0 ; encoding: [0x00,0x02,0x00,0x7e]
@@ -594,7 +595,7 @@ define amdgpu_ps float @s_fmaak_f32(float inreg %x, float inreg %y) {
 ; GFX1100: codeLenInByte = 16
 ; GFX1150: codeLenInByte = 16
 ; GFX1200: codeLenInByte = 16
-; GFX1250: codeLenInByte = 16
+; GFX1250: codeLenInByte = 24
 
 define double @v_mul_f64_vop2_literal_32(double %x) {
 ; GFX9-LABEL: v_mul_f64_vop2_literal_32:

diff  --git a/llvm/test/CodeGen/AMDGPU/ds_read2-gfx1250.ll b/llvm/test/CodeGen/AMDGPU/ds_read2-gfx1250.ll
index 23d2b18f5311b..8116078c9ee71 100644
--- a/llvm/test/CodeGen/AMDGPU/ds_read2-gfx1250.ll
+++ b/llvm/test/CodeGen/AMDGPU/ds_read2-gfx1250.ll
@@ -10,6 +10,7 @@
 define amdgpu_kernel void @simple_read2_f32(ptr addrspace(1) %out) #0 {
 ; GFX1250-LABEL: simple_read2_f32:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
@@ -35,6 +36,7 @@ define amdgpu_kernel void @simple_read2_f32(ptr addrspace(1) %out) #0 {
 define amdgpu_kernel void @simple_read2_f32_max_offset(ptr addrspace(1) %out) #0 {
 ; GFX1250-LABEL: simple_read2_f32_max_offset:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
@@ -60,6 +62,7 @@ define amdgpu_kernel void @simple_read2_f32_max_offset(ptr addrspace(1) %out) #0
 define amdgpu_kernel void @simple_read2_f32_too_far(ptr addrspace(1) %out) #0 {
 ; GFX1250-LABEL: simple_read2_f32_too_far:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
@@ -86,6 +89,7 @@ define amdgpu_kernel void @simple_read2_f32_too_far(ptr addrspace(1) %out) #0 {
 define amdgpu_kernel void @simple_read2_f32_x2(ptr addrspace(1) %out) #0 {
 ; GFX1250-LABEL: simple_read2_f32_x2:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
@@ -127,6 +131,7 @@ define amdgpu_kernel void @simple_read2_f32_x2(ptr addrspace(1) %out) #0 {
 define amdgpu_kernel void @simple_read2_f32_x2_barrier(ptr addrspace(1) %out) #0 {
 ; GFX1250-LABEL: simple_read2_f32_x2_barrier:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
@@ -174,6 +179,7 @@ define amdgpu_kernel void @simple_read2_f32_x2_barrier(ptr addrspace(1) %out) #0
 define amdgpu_kernel void @simple_read2_f32_x2_nonzero_base(ptr addrspace(1) %out) #0 {
 ; GFX1250-LABEL: simple_read2_f32_x2_nonzero_base:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
@@ -219,6 +225,7 @@ define amdgpu_kernel void @simple_read2_f32_x2_nonzero_base(ptr addrspace(1) %ou
 define amdgpu_kernel void @read2_ptr_is_subreg_arg_f32(ptr addrspace(1) %out, <2 x ptr addrspace(3)> %lds.ptr) #0 {
 ; GFX1250-LABEL: read2_ptr_is_subreg_arg_f32:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
 ; GFX1250-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -251,6 +258,7 @@ define amdgpu_kernel void @read2_ptr_is_subreg_arg_f32(ptr addrspace(1) %out, <2
 define amdgpu_kernel void @read2_ptr_is_subreg_arg_offset_f32(ptr addrspace(1) %out, <2 x ptr addrspace(3)> %lds.ptr) #0 {
 ; GFX1250-LABEL: read2_ptr_is_subreg_arg_offset_f32:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
 ; GFX1250-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -283,6 +291,7 @@ define amdgpu_kernel void @read2_ptr_is_subreg_arg_offset_f32(ptr addrspace(1) %
 define amdgpu_kernel void @read2_ptr_is_subreg_f32(ptr addrspace(1) %out) #0 {
 ; GFX1250-LABEL: read2_ptr_is_subreg_f32:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
@@ -314,6 +323,7 @@ define amdgpu_kernel void @read2_ptr_is_subreg_f32(ptr addrspace(1) %out) #0 {
 define amdgpu_kernel void @simple_read2_f32_volatile_0(ptr addrspace(1) %out) #0 {
 ; GFX1250-LABEL: simple_read2_f32_volatile_0:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
@@ -340,6 +350,7 @@ define amdgpu_kernel void @simple_read2_f32_volatile_0(ptr addrspace(1) %out) #0
 define amdgpu_kernel void @simple_read2_f32_volatile_1(ptr addrspace(1) %out) #0 {
 ; GFX1250-LABEL: simple_read2_f32_volatile_1:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
@@ -367,6 +378,7 @@ define amdgpu_kernel void @simple_read2_f32_volatile_1(ptr addrspace(1) %out) #0
 define amdgpu_kernel void @unaligned_read2_f32(ptr addrspace(1) %out, ptr addrspace(3) %lds) #0 {
 ; GFX1250-LABEL: unaligned_read2_f32:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b96 s[0:2], s[4:5], 0x0
 ; GFX1250-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
@@ -393,6 +405,7 @@ define amdgpu_kernel void @unaligned_read2_f32(ptr addrspace(1) %out, ptr addrsp
 define amdgpu_kernel void @unaligned_offset_read2_f32(ptr addrspace(1) %out, ptr addrspace(3) %lds) #0 {
 ; GFX1250-LABEL: unaligned_offset_read2_f32:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b96 s[0:2], s[4:5], 0x0
 ; GFX1250-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
@@ -419,6 +432,7 @@ define amdgpu_kernel void @unaligned_offset_read2_f32(ptr addrspace(1) %out, ptr
 define amdgpu_kernel void @misaligned_2_simple_read2_f32(ptr addrspace(1) %out, ptr addrspace(3) %lds) #0 {
 ; GFX1250-LABEL: misaligned_2_simple_read2_f32:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b96 s[0:2], s[4:5], 0x0
 ; GFX1250-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
@@ -445,12 +459,14 @@ define amdgpu_kernel void @misaligned_2_simple_read2_f32(ptr addrspace(1) %out,
 define amdgpu_kernel void @simple_read2_f64(ptr addrspace(1) %out) #0 {
 ; GFX1250-LABEL: simple_read2_f64:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-NEXT:    v_and_b32_e32 v4, 0x1ff8, v0
 ; GFX1250-NEXT:    ds_load_2addr_b64 v[0:3], v4 offset1:8
 ; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 2, 2), 0
 ; GFX1250-NEXT:    v_add_f64_e32 v[0:1], v[0:1], v[2:3]
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-NEXT:    global_store_b64 v4, v[0:1], s[0:1]
@@ -470,12 +486,14 @@ define amdgpu_kernel void @simple_read2_f64(ptr addrspace(1) %out) #0 {
 define amdgpu_kernel void @simple_read2_f64_max_offset(ptr addrspace(1) %out) #0 {
 ; GFX1250-LABEL: simple_read2_f64_max_offset:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-NEXT:    v_and_b32_e32 v4, 0x1ff8, v0
 ; GFX1250-NEXT:    ds_load_2addr_b64 v[0:3], v4 offset1:255
 ; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 2, 2), 0
 ; GFX1250-NEXT:    v_add_f64_e32 v[0:1], v[0:1], v[2:3]
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-NEXT:    global_store_b64 v4, v[0:1], s[0:1]
@@ -495,6 +513,7 @@ define amdgpu_kernel void @simple_read2_f64_max_offset(ptr addrspace(1) %out) #0
 define amdgpu_kernel void @simple_read2_f64_too_far(ptr addrspace(1) %out) #0 {
 ; GFX1250-LABEL: simple_read2_f64_too_far:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
@@ -502,6 +521,7 @@ define amdgpu_kernel void @simple_read2_f64_too_far(ptr addrspace(1) %out) #0 {
 ; GFX1250-NEXT:    ds_load_b64 v[0:1], v4
 ; GFX1250-NEXT:    ds_load_b64 v[2:3], v4 offset:2056
 ; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 2, 2), 0
 ; GFX1250-NEXT:    v_add_f64_e32 v[0:1], v[0:1], v[2:3]
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-NEXT:    global_store_b64 v4, v[0:1], s[0:1]
@@ -522,6 +542,7 @@ define amdgpu_kernel void @simple_read2_f64_too_far(ptr addrspace(1) %out) #0 {
 define amdgpu_kernel void @misaligned_read2_f64(ptr addrspace(1) %out, ptr addrspace(3) %lds) #0 {
 ; GFX1250-LABEL: misaligned_read2_f64:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b96 s[0:2], s[4:5], 0x0
 ; GFX1250-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
@@ -531,6 +552,7 @@ define amdgpu_kernel void @misaligned_read2_f64(ptr addrspace(1) %out, ptr addrs
 ; GFX1250-NEXT:    ds_load_2addr_b32 v[0:1], v2 offset1:1
 ; GFX1250-NEXT:    ds_load_2addr_b32 v[2:3], v2 offset0:14 offset1:15
 ; GFX1250-NEXT:    s_wait_dscnt 0x0
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 2, 2), 0
 ; GFX1250-NEXT:    v_add_f64_e32 v[0:1], v[0:1], v[2:3]
 ; GFX1250-NEXT:    global_store_b64 v4, v[0:1], s[0:1]
 ; GFX1250-NEXT:    s_endpgm
@@ -551,6 +573,7 @@ define amdgpu_kernel void @misaligned_read2_f64(ptr addrspace(1) %out, ptr addrs
 define amdgpu_kernel void @load_constant_adjacent_offsets(ptr addrspace(1) %out) {
 ; GFX1250-LABEL: load_constant_adjacent_offsets:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    ds_load_b64 v[0:1], v2
@@ -569,6 +592,7 @@ define amdgpu_kernel void @load_constant_adjacent_offsets(ptr addrspace(1) %out)
 define amdgpu_kernel void @load_constant_disjoint_offsets(ptr addrspace(1) %out) {
 ; GFX1250-LABEL: load_constant_disjoint_offsets:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    ds_load_2addr_b32 v[0:1], v2 offset1:2
@@ -589,6 +613,7 @@ define amdgpu_kernel void @load_constant_disjoint_offsets(ptr addrspace(1) %out)
 define amdgpu_kernel void @load_misaligned64_constant_offsets(ptr addrspace(1) %out) {
 ; GFX1250-LABEL: load_misaligned64_constant_offsets:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    ds_load_b128 v[0:3], v4
@@ -609,6 +634,7 @@ define amdgpu_kernel void @load_misaligned64_constant_offsets(ptr addrspace(1) %
 define amdgpu_kernel void @load_misaligned64_constant_large_offsets(ptr addrspace(1) %out) {
 ; GFX1250-LABEL: load_misaligned64_constant_large_offsets:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    ds_load_b64 v[0:1], v4 offset:16384
@@ -631,6 +657,7 @@ define amdgpu_kernel void @load_misaligned64_constant_large_offsets(ptr addrspac
 define amdgpu_kernel void @sgemm_inner_loop_read2_sequence(ptr addrspace(1) %C, i32 %lda, i32 %ldb) #0 {
 ; GFX1250-LABEL: sgemm_inner_loop_read2_sequence:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_bfe_u32 s0, ttmp6, 0x4000c
 ; GFX1250-NEXT:    s_and_b32 s1, ttmp6, 15
 ; GFX1250-NEXT:    s_add_co_i32 s0, s0, 1
@@ -719,6 +746,7 @@ define amdgpu_kernel void @sgemm_inner_loop_read2_sequence(ptr addrspace(1) %C,
 define amdgpu_kernel void @misaligned_read2_v2i32(ptr addrspace(1) %out, ptr addrspace(3) %in) #0 {
 ; GFX1250-LABEL: misaligned_read2_v2i32:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b96 s[0:2], s[4:5], 0x0
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v0, s2
@@ -734,6 +762,7 @@ define amdgpu_kernel void @misaligned_read2_v2i32(ptr addrspace(1) %out, ptr add
 define amdgpu_kernel void @misaligned_read2_i64(ptr addrspace(1) %out, ptr addrspace(3) %in) #0 {
 ; GFX1250-LABEL: misaligned_read2_i64:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b96 s[0:2], s[4:5], 0x0
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-NEXT:    v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v0, s2
@@ -749,6 +778,7 @@ define amdgpu_kernel void @misaligned_read2_i64(ptr addrspace(1) %out, ptr addrs
 define amdgpu_kernel void @ds_read_
diff _base_interleaving(
 ; GFX1250-LABEL: ds_read_
diff _base_interleaving:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x8
 ; GFX1250-NEXT:    v_dual_lshrrev_b32 v1, 6, v0 :: v_dual_lshlrev_b32 v0, 2, v0
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
@@ -815,6 +845,8 @@ bb:
 define amdgpu_kernel void @ds_read_call_read(ptr addrspace(1) %out, ptr addrspace(3) %arg) {
 ; GFX1250-LABEL: ds_read_call_read:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_mov_b32 s32, 0
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b96 s[36:38], s[4:5], 0x0
 ; GFX1250-NEXT:    v_and_b32_e32 v1, 0x3ff, v0
 ; GFX1250-NEXT:    v_dual_mov_b32 v42, 0 :: v_dual_mov_b32 v31, v0
@@ -824,7 +856,6 @@ define amdgpu_kernel void @ds_read_call_read(ptr addrspace(1) %out, ptr addrspac
 ; GFX1250-NEXT:    s_wait_xcnt 0x0
 ; GFX1250-NEXT:    s_mov_b64 s[4:5], s[0:1]
 ; GFX1250-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX1250-NEXT:    s_mov_b32 s32, 0
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-NEXT:    v_lshl_add_u32 v40, v1, 2, s38
 ; GFX1250-NEXT:    ds_load_b32 v41, v40
@@ -850,6 +881,7 @@ define amdgpu_kernel void @ds_read_call_read(ptr addrspace(1) %out, ptr addrspac
 define amdgpu_kernel void @read2_v2i32_align1_odd_offset(ptr addrspace(1) %out) {
 ; GFX1250-LABEL: read2_v2i32_align1_odd_offset:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    ds_load_b64 v[0:1], v2 offset:65

diff  --git a/llvm/test/CodeGen/AMDGPU/ds_write2.ll b/llvm/test/CodeGen/AMDGPU/ds_write2.ll
index 22a26f373927e..1684437eff580 100644
--- a/llvm/test/CodeGen/AMDGPU/ds_write2.ll
+++ b/llvm/test/CodeGen/AMDGPU/ds_write2.ll
@@ -34,6 +34,7 @@ define amdgpu_kernel void @simple_write2_one_val_f32(ptr addrspace(1) %C, ptr ad
 ;
 ; GFX1250-LABEL: simple_write2_one_val_f32:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
 ; GFX1250-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
@@ -85,6 +86,7 @@ define amdgpu_kernel void @simple_write2_two_val_f32(ptr addrspace(1) %C, ptr ad
 ;
 ; GFX1250-LABEL: simple_write2_two_val_f32:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
 ; GFX1250-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
@@ -145,6 +147,7 @@ define amdgpu_kernel void @simple_write2_two_val_f32_volatile_0(ptr addrspace(1)
 ;
 ; GFX1250-LABEL: simple_write2_two_val_f32_volatile_0:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x8
 ; GFX1250-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
@@ -206,6 +209,7 @@ define amdgpu_kernel void @simple_write2_two_val_f32_volatile_1(ptr addrspace(1)
 ;
 ; GFX1250-LABEL: simple_write2_two_val_f32_volatile_1:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x8
 ; GFX1250-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
@@ -271,6 +275,7 @@ define amdgpu_kernel void @simple_write2_two_val_subreg2_mixed_f32(ptr addrspace
 ;
 ; GFX1250-LABEL: simple_write2_two_val_subreg2_mixed_f32:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
 ; GFX1250-NEXT:    v_and_b32_e32 v4, 0x3ff, v0
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -325,6 +330,7 @@ define amdgpu_kernel void @simple_write2_two_val_subreg2_f32(ptr addrspace(1) %C
 ;
 ; GFX1250-LABEL: simple_write2_two_val_subreg2_f32:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
 ; GFX1250-NEXT:    v_and_b32_e32 v2, 0x3ff, v0
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -376,6 +382,7 @@ define amdgpu_kernel void @simple_write2_two_val_subreg4_f32(ptr addrspace(1) %C
 ;
 ; GFX1250-LABEL: simple_write2_two_val_subreg4_f32:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
 ; GFX1250-NEXT:    v_and_b32_e32 v4, 0x3ff, v0
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -428,6 +435,7 @@ define amdgpu_kernel void @simple_write2_two_val_max_offset_f32(ptr addrspace(1)
 ;
 ; GFX1250-LABEL: simple_write2_two_val_max_offset_f32:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
 ; GFX1250-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
@@ -488,6 +496,7 @@ define amdgpu_kernel void @simple_write2_two_val_too_far_f32(ptr addrspace(1) %C
 ;
 ; GFX1250-LABEL: simple_write2_two_val_too_far_f32:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x8
 ; GFX1250-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
@@ -548,6 +557,7 @@ define amdgpu_kernel void @simple_write2_two_val_f32_x2(ptr addrspace(1) %C, ptr
 ;
 ; GFX1250-LABEL: simple_write2_two_val_f32_x2:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x8
 ; GFX1250-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
@@ -619,6 +629,7 @@ define amdgpu_kernel void @simple_write2_two_val_f32_x2_nonzero_base(ptr addrspa
 ;
 ; GFX1250-LABEL: simple_write2_two_val_f32_x2_nonzero_base:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x8
 ; GFX1250-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
@@ -698,6 +709,7 @@ define amdgpu_kernel void @write2_ptr_subreg_arg_two_val_f32(ptr addrspace(1) %C
 ;
 ; GFX1250-LABEL: write2_ptr_subreg_arg_two_val_f32:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x8
 ; GFX1250-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX1250-NEXT:    s_wait_xcnt 0x0
@@ -761,6 +773,7 @@ define amdgpu_kernel void @simple_write2_one_val_f64(ptr addrspace(1) %C, ptr ad
 ;
 ; GFX1250-LABEL: simple_write2_one_val_f64:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
 ; GFX1250-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
@@ -814,6 +827,7 @@ define amdgpu_kernel void @misaligned_simple_write2_one_val_f64(ptr addrspace(1)
 ;
 ; GFX1250-LABEL: misaligned_simple_write2_one_val_f64:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b96 s[0:2], s[4:5], 0x8
 ; GFX1250-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
@@ -913,6 +927,7 @@ define amdgpu_kernel void @unaligned_offset_simple_write2_one_val_f64(ptr addrsp
 ;
 ; GFX1250-LABEL: unaligned_offset_simple_write2_one_val_f64:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b96 s[0:2], s[4:5], 0x8
 ; GFX1250-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
@@ -967,6 +982,7 @@ define amdgpu_kernel void @simple_write2_two_val_f64(ptr addrspace(1) %C, ptr ad
 ;
 ; GFX1250-LABEL: simple_write2_two_val_f64:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
 ; GFX1250-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
@@ -1013,6 +1029,7 @@ define amdgpu_kernel void @store_constant_adjacent_offsets() {
 ;
 ; GFX1250-LABEL: store_constant_adjacent_offsets:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b64_e32 v[0:1], 0x7b0000007b
 ; GFX1250-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1250-NEXT:    ds_store_b64 v2, v[0:1]
@@ -1040,6 +1057,7 @@ define amdgpu_kernel void @store_constant_disjoint_offsets() {
 ;
 ; GFX1250-LABEL: store_constant_disjoint_offsets:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_dual_mov_b32 v0, 0x7b :: v_dual_mov_b32 v1, 0
 ; GFX1250-NEXT:    ds_store_2addr_b32 v1, v0, v0 offset1:2
 ; GFX1250-NEXT:    s_endpgm
@@ -1072,6 +1090,7 @@ define amdgpu_kernel void @store_misaligned64_constant_offsets() {
 ;
 ; GFX1250-LABEL: store_misaligned64_constant_offsets:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_dual_mov_b32 v0, 0x7b :: v_dual_mov_b32 v1, 0
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-NEXT:    v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, v1
@@ -1108,6 +1127,7 @@ define amdgpu_kernel void @store_misaligned64_constant_large_offsets() {
 ;
 ; GFX1250-LABEL: store_misaligned64_constant_large_offsets:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b64_e32 v[0:1], 0x7b
 ; GFX1250-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1250-NEXT:    ds_store_b64 v2, v[0:1] offset:16384
@@ -1167,6 +1187,7 @@ define amdgpu_kernel void @write2_sgemm_sequence(ptr addrspace(1) %C, i32 %lda,
 ;
 ; GFX1250-LABEL: write2_sgemm_sequence:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x10
 ; GFX1250-NEXT:    s_and_b32 s2, ttmp6, 15
 ; GFX1250-NEXT:    s_getreg_b32 s3, hwreg(HW_REG_IB_STS2, 6, 4)
@@ -1283,6 +1304,7 @@ define amdgpu_kernel void @simple_write2_v4f32_superreg_align4(ptr addrspace(3)
 ;
 ; GFX1250-LABEL: simple_write2_v4f32_superreg_align4:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    s_load_b64 s[6:7], s[4:5], 0x8
 ; GFX1250-NEXT:    s_load_b32 s8, s[4:5], 0x0
@@ -1351,6 +1373,7 @@ define amdgpu_kernel void @write2_v2i32_align1_odd_offset() {
 ;
 ; GFX1250-LABEL: write2_v2i32_align1_odd_offset:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b64_e32 v[0:1], 0x1c80000007b
 ; GFX1250-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1250-NEXT:    ds_store_b64 v2, v[0:1] offset:65

diff  --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize.bf16.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize.bf16.ll
index d747fb7cce7dc..b23645839944e 100644
--- a/llvm/test/CodeGen/AMDGPU/fcanonicalize.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize.bf16.ll
@@ -18,6 +18,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() #0
 define amdgpu_kernel void @test_fold_canonicalize_undef_value_bf16(ptr addrspace(1) %out) #1 {
 ; GFX1250-LABEL: test_fold_canonicalize_undef_value_bf16:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -31,6 +32,7 @@ define amdgpu_kernel void @test_fold_canonicalize_undef_value_bf16(ptr addrspace
 define amdgpu_kernel void @v_test_canonicalize_var_bf16(ptr addrspace(1) %out) #1 {
 ; GFX1250-LABEL: v_test_canonicalize_var_bf16:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -51,6 +53,7 @@ define amdgpu_kernel void @v_test_canonicalize_var_bf16(ptr addrspace(1) %out) #
 define amdgpu_kernel void @s_test_canonicalize_var_bf16(ptr addrspace(1) %out, i16 zeroext %val.arg) #1 {
 ; GFX1250-LABEL: s_test_canonicalize_var_bf16:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b96 s[0:2], s[4:5], 0x24
 ; GFX1250-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -86,6 +89,7 @@ define <2 x bfloat> @v_test_canonicalize_build_vector_v2bf16(bfloat %lo, bfloat
 define amdgpu_kernel void @v_test_canonicalize_fabs_var_bf16(ptr addrspace(1) %out) #1 {
 ; GFX1250-LABEL: v_test_canonicalize_fabs_var_bf16:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -110,6 +114,7 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_bf16(ptr addrspace(1) %o
 define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_bf16(ptr addrspace(1) %out) #1 {
 ; GFX1250-LABEL: v_test_canonicalize_fneg_fabs_var_bf16:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -134,6 +139,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_bf16(ptr addrspace(
 define amdgpu_kernel void @v_test_canonicalize_fneg_var_bf16(ptr addrspace(1) %out) #1 {
 ; GFX1250-LABEL: v_test_canonicalize_fneg_var_bf16:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -157,6 +163,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_bf16(ptr addrspace(1) %o
 define amdgpu_kernel void @v_test_no_denormals_canonicalize_fneg_var_bf16(ptr addrspace(1) %out) #2 {
 ; GFX1250-LABEL: v_test_no_denormals_canonicalize_fneg_var_bf16:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -180,6 +187,7 @@ define amdgpu_kernel void @v_test_no_denormals_canonicalize_fneg_var_bf16(ptr ad
 define amdgpu_kernel void @v_test_no_denormals_canonicalize_fneg_fabs_var_bf16(ptr addrspace(1) %out) #2 {
 ; GFX1250-LABEL: v_test_no_denormals_canonicalize_fneg_fabs_var_bf16:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -204,6 +212,7 @@ define amdgpu_kernel void @v_test_no_denormals_canonicalize_fneg_fabs_var_bf16(p
 define amdgpu_kernel void @test_fold_canonicalize_p0_bf16(ptr addrspace(1) %out) #1 {
 ; GFX1250-LABEL: test_fold_canonicalize_p0_bf16:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -217,6 +226,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p0_bf16(ptr addrspace(1) %out)
 define amdgpu_kernel void @test_fold_canonicalize_n0_bf16(ptr addrspace(1) %out) #1 {
 ; GFX1250-LABEL: test_fold_canonicalize_n0_bf16:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1250-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xffff8000
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -230,6 +240,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n0_bf16(ptr addrspace(1) %out)
 define amdgpu_kernel void @test_fold_canonicalize_p1_bf16(ptr addrspace(1) %out) #1 {
 ; GFX1250-LABEL: test_fold_canonicalize_p1_bf16:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1250-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3f80
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -243,6 +254,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p1_bf16(ptr addrspace(1) %out)
 define amdgpu_kernel void @test_fold_canonicalize_n1_bf16(ptr addrspace(1) %out) #1 {
 ; GFX1250-LABEL: test_fold_canonicalize_n1_bf16:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1250-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xffffbf80
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -256,6 +268,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n1_bf16(ptr addrspace(1) %out)
 define amdgpu_kernel void @test_fold_canonicalize_literal_bf16(ptr addrspace(1) %out) #1 {
 ; GFX1250-LABEL: test_fold_canonicalize_literal_bf16:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1250-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x4180
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -269,6 +282,7 @@ define amdgpu_kernel void @test_fold_canonicalize_literal_bf16(ptr addrspace(1)
 define amdgpu_kernel void @test_default_denormals_fold_canonicalize_denormal0_bf16(ptr addrspace(1) %out) #1 {
 ; GFX1250-LABEL: test_default_denormals_fold_canonicalize_denormal0_bf16:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1250-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3ff
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -282,6 +296,7 @@ define amdgpu_kernel void @test_default_denormals_fold_canonicalize_denormal0_bf
 define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_bf16(ptr addrspace(1) %out) #3 {
 ; GFX1250-LABEL: test_denormals_fold_canonicalize_denormal0_bf16:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1250-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3ff
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -295,6 +310,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_bf16(ptr a
 define amdgpu_kernel void @test_default_denormals_fold_canonicalize_denormal1_bf16(ptr addrspace(1) %out) #1 {
 ; GFX1250-LABEL: test_default_denormals_fold_canonicalize_denormal1_bf16:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1250-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xffff83ff
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -308,6 +324,7 @@ define amdgpu_kernel void @test_default_denormals_fold_canonicalize_denormal1_bf
 define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_bf16(ptr addrspace(1) %out) #3 {
 ; GFX1250-LABEL: test_denormals_fold_canonicalize_denormal1_bf16:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1250-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xffff83ff
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -321,6 +338,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_bf16(ptr a
 define amdgpu_kernel void @test_fold_canonicalize_qnan_bf16(ptr addrspace(1) %out) #1 {
 ; GFX1250-LABEL: test_fold_canonicalize_qnan_bf16:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1250-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7c00
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -334,6 +352,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_bf16(ptr addrspace(1) %ou
 define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_bf16(ptr addrspace(1) %out) #1 {
 ; GFX1250-LABEL: test_fold_canonicalize_qnan_value_neg1_bf16:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1250-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc0
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -347,6 +366,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_bf16(ptr addrs
 define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_bf16(ptr addrspace(1) %out) #1 {
 ; GFX1250-LABEL: test_fold_canonicalize_qnan_value_neg2_bf16:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1250-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc0
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -360,6 +380,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_bf16(ptr addrs
 define amdgpu_kernel void @test_fold_canonicalize_snan0_value_bf16(ptr addrspace(1) %out) #1 {
 ; GFX1250-LABEL: test_fold_canonicalize_snan0_value_bf16:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1250-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7c01
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -373,6 +394,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan0_value_bf16(ptr addrspace
 define amdgpu_kernel void @test_fold_canonicalize_snan1_value_bf16(ptr addrspace(1) %out) #1 {
 ; GFX1250-LABEL: test_fold_canonicalize_snan1_value_bf16:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1250-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7dff
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -386,6 +408,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan1_value_bf16(ptr addrspace
 define amdgpu_kernel void @test_fold_canonicalize_snan2_value_bf16(ptr addrspace(1) %out) #1 {
 ; GFX1250-LABEL: test_fold_canonicalize_snan2_value_bf16:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1250-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xfffffdff
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -399,6 +422,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan2_value_bf16(ptr addrspace
 define amdgpu_kernel void @test_fold_canonicalize_snan3_value_bf16(ptr addrspace(1) %out) #1 {
 ; GFX1250-LABEL: test_fold_canonicalize_snan3_value_bf16:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1250-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xfffffc01
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -412,6 +436,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan3_value_bf16(ptr addrspace
 define amdgpu_kernel void @v_test_canonicalize_var_v2bf16(ptr addrspace(1) %out) #1 {
 ; GFX1250-LABEL: v_test_canonicalize_var_v2bf16:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1250-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX1250-NEXT:    v_mov_b32_e32 v2, 0
@@ -437,6 +462,7 @@ define amdgpu_kernel void @v_test_canonicalize_var_v2bf16(ptr addrspace(1) %out)
 define amdgpu_kernel void @v_test_canonicalize_fabs_var_v2bf16(ptr addrspace(1) %out) #1 {
 ; GFX1250-LABEL: v_test_canonicalize_fabs_var_v2bf16:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1250-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX1250-NEXT:    v_mov_b32_e32 v2, 0
@@ -467,6 +493,7 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_v2bf16(ptr addrspace(1)
 define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_v2bf16(ptr addrspace(1) %out) #1 {
 ; GFX1250-LABEL: v_test_canonicalize_fneg_fabs_var_v2bf16:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1250-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX1250-NEXT:    v_mov_b32_e32 v2, 0
@@ -498,6 +525,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_v2bf16(ptr addrspac
 define amdgpu_kernel void @v_test_canonicalize_fneg_var_v2bf16(ptr addrspace(1) %out) #1 {
 ; GFX1250-LABEL: v_test_canonicalize_fneg_var_v2bf16:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1250-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX1250-NEXT:    v_mov_b32_e32 v2, 0
@@ -528,6 +556,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_v2bf16(ptr addrspace(1)
 define amdgpu_kernel void @s_test_canonicalize_var_v2bf16(ptr addrspace(1) %out, i32 zeroext %val.arg) #1 {
 ; GFX1250-LABEL: s_test_canonicalize_var_v2bf16:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b96 s[0:2], s[4:5], 0x24
 ; GFX1250-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -548,6 +577,7 @@ define amdgpu_kernel void @s_test_canonicalize_var_v2bf16(ptr addrspace(1) %out,
 define amdgpu_kernel void @test_fold_canonicalize_p0_v2bf16(ptr addrspace(1) %out) #1 {
 ; GFX1250-LABEL: test_fold_canonicalize_p0_v2bf16:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -561,6 +591,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p0_v2bf16(ptr addrspace(1) %ou
 define amdgpu_kernel void @test_fold_canonicalize_n0_v2bf16(ptr addrspace(1) %out) #1 {
 ; GFX1250-LABEL: test_fold_canonicalize_n0_v2bf16:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1250-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x80008000
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -574,6 +605,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n0_v2bf16(ptr addrspace(1) %ou
 define amdgpu_kernel void @test_fold_canonicalize_p1_v2bf16(ptr addrspace(1) %out) #1 {
 ; GFX1250-LABEL: test_fold_canonicalize_p1_v2bf16:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1250-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3f803f80
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -587,6 +619,7 @@ define amdgpu_kernel void @test_fold_canonicalize_p1_v2bf16(ptr addrspace(1) %ou
 define amdgpu_kernel void @test_fold_canonicalize_n1_v2bf16(ptr addrspace(1) %out) #1 {
 ; GFX1250-LABEL: test_fold_canonicalize_n1_v2bf16:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1250-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xbf80bf80
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -600,6 +633,7 @@ define amdgpu_kernel void @test_fold_canonicalize_n1_v2bf16(ptr addrspace(1) %ou
 define amdgpu_kernel void @test_fold_canonicalize_literal_v2bf16(ptr addrspace(1) %out) #1 {
 ; GFX1250-LABEL: test_fold_canonicalize_literal_v2bf16:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1250-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x41804180
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -613,6 +647,7 @@ define amdgpu_kernel void @test_fold_canonicalize_literal_v2bf16(ptr addrspace(1
 define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_v2bf16(ptr addrspace(1) %out) #1 {
 ; GFX1250-LABEL: test_no_denormals_fold_canonicalize_denormal0_v2bf16:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1250-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3ff03ff
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -626,6 +661,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_v2bf16(
 define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_v2bf16(ptr addrspace(1) %out) #3 {
 ; GFX1250-LABEL: test_denormals_fold_canonicalize_denormal0_v2bf16:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1250-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3ff03ff
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -639,6 +675,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_v2bf16(ptr
 define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_v2bf16(ptr addrspace(1) %out) #1 {
 ; GFX1250-LABEL: test_no_denormals_fold_canonicalize_denormal1_v2bf16:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1250-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x83ff83ff
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -652,6 +689,7 @@ define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_v2bf16(
 define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_v2bf16(ptr addrspace(1) %out) #3 {
 ; GFX1250-LABEL: test_denormals_fold_canonicalize_denormal1_v2bf16:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1250-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x83ff83ff
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -665,6 +703,7 @@ define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_v2bf16(ptr
 define amdgpu_kernel void @test_fold_canonicalize_qnan_v2bf16(ptr addrspace(1) %out) #1 {
 ; GFX1250-LABEL: test_fold_canonicalize_qnan_v2bf16:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1250-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7c007c00
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -678,6 +717,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_v2bf16(ptr addrspace(1) %
 define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_v2bf16(ptr addrspace(1) %out) #1 {
 ; GFX1250-LABEL: test_fold_canonicalize_qnan_value_neg1_v2bf16:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1250-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc07fc0
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -691,6 +731,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_v2bf16(ptr add
 define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_v2bf16(ptr addrspace(1) %out) #1 {
 ; GFX1250-LABEL: test_fold_canonicalize_qnan_value_neg2_v2bf16:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1250-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc07fc0
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -704,6 +745,7 @@ define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_v2bf16(ptr add
 define amdgpu_kernel void @test_fold_canonicalize_snan0_value_v2bf16(ptr addrspace(1) %out) #1 {
 ; GFX1250-LABEL: test_fold_canonicalize_snan0_value_v2bf16:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1250-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7c017c01
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -717,6 +759,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan0_value_v2bf16(ptr addrspa
 define amdgpu_kernel void @test_fold_canonicalize_snan1_value_v2bf16(ptr addrspace(1) %out) #1 {
 ; GFX1250-LABEL: test_fold_canonicalize_snan1_value_v2bf16:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1250-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7dff7dff
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -730,6 +773,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan1_value_v2bf16(ptr addrspa
 define amdgpu_kernel void @test_fold_canonicalize_snan2_value_v2bf16(ptr addrspace(1) %out) #1 {
 ; GFX1250-LABEL: test_fold_canonicalize_snan2_value_v2bf16:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1250-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xfdfffdff
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -743,6 +787,7 @@ define amdgpu_kernel void @test_fold_canonicalize_snan2_value_v2bf16(ptr addrspa
 define amdgpu_kernel void @test_fold_canonicalize_snan3_value_v2bf16(ptr addrspace(1) %out) #1 {
 ; GFX1250-LABEL: test_fold_canonicalize_snan3_value_v2bf16:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1250-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0xfc01fc01
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -793,6 +838,7 @@ define <4 x bfloat> @v_test_canonicalize_var_v4bf16(<4 x bfloat> %val) #1 {
 define amdgpu_kernel void @s_test_canonicalize_undef_v2bf16(ptr addrspace(1) %out) #1 {
 ; GFX1250-LABEL: s_test_canonicalize_undef_v2bf16:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -928,6 +974,7 @@ define <2 x bfloat> @v_test_canonicalize_k_reg_v2bf16(bfloat %val) #1 {
 define amdgpu_kernel void @s_test_canonicalize_undef_v4bf16(ptr addrspace(1) %out) #1 {
 ; GFX1250-LABEL: s_test_canonicalize_undef_v4bf16:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)

diff  --git a/llvm/test/CodeGen/AMDGPU/flat-load-saddr-to-vaddr.ll b/llvm/test/CodeGen/AMDGPU/flat-load-saddr-to-vaddr.ll
index 213233e802a96..29aad9c5f9dc1 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-load-saddr-to-vaddr.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-load-saddr-to-vaddr.ll
@@ -13,6 +13,7 @@
 define amdgpu_kernel void @test_move_load_address_to_vgpr(ptr addrspace(1) nocapture %arg1, ptr nocapture %arg2) {
 ; GCN-LABEL: test_move_load_address_to_vgpr:
 ; GCN:       ; %bb.0: ; %bb
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GCN-NEXT:    v_mov_b32_e32 v3, 0
 ; GCN-NEXT:    s_wait_kmcnt 0x0

diff  --git a/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll b/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll
index 0c2d51dc9b0c2..6ae3cfb7e106a 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-saddr-atomics.ll
@@ -10,6 +10,7 @@
 define amdgpu_ps void @flat_xchg_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, i32 %data) {
 ; GFX1250-LABEL: flat_xchg_saddr_i32_nortn:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    global_wb scope:SCOPE_DEV
 ; GFX1250-NEXT:    s_wait_storecnt 0x0
 ; GFX1250-NEXT:    flat_atomic_swap_b32 v0, v1, s[2:3] scope:SCOPE_DEV
@@ -49,6 +50,7 @@ define amdgpu_ps void @flat_xchg_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset,
 define amdgpu_ps void @flat_xchg_saddr_i32_nortn_offset_2047(ptr inreg %sbase, i32 %voffset, i32 %data) {
 ; GFX1250-LABEL: flat_xchg_saddr_i32_nortn_offset_2047:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    global_wb scope:SCOPE_DEV
 ; GFX1250-NEXT:    s_wait_storecnt 0x0
 ; GFX1250-NEXT:    flat_atomic_swap_b32 v0, v1, s[2:3] offset:2047 scope:SCOPE_DEV
@@ -89,6 +91,7 @@ define amdgpu_ps void @flat_xchg_saddr_i32_nortn_offset_2047(ptr inreg %sbase, i
 define amdgpu_ps void @flat_xchg_saddr_i32_nortn_offset_neg2048(ptr inreg %sbase, i32 %voffset, i32 %data) {
 ; GFX1250-LABEL: flat_xchg_saddr_i32_nortn_offset_neg2048:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    global_wb scope:SCOPE_DEV
 ; GFX1250-NEXT:    s_wait_storecnt 0x0
 ; GFX1250-NEXT:    flat_atomic_swap_b32 v0, v1, s[2:3] offset:-2048 scope:SCOPE_DEV
@@ -134,6 +137,7 @@ define amdgpu_ps void @flat_xchg_saddr_i32_nortn_offset_neg2048(ptr inreg %sbase
 define amdgpu_ps float @flat_xchg_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i32 %data) {
 ; GFX1250-LABEL: flat_xchg_saddr_i32_rtn:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    global_wb scope:SCOPE_DEV
 ; GFX1250-NEXT:    s_wait_storecnt 0x0
 ; GFX1250-NEXT:    flat_atomic_swap_b32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
@@ -174,6 +178,7 @@ define amdgpu_ps float @flat_xchg_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset,
 define amdgpu_ps float @flat_xchg_saddr_i32_rtn_2048(ptr inreg %sbase, i32 %voffset, i32 %data) {
 ; GFX1250-LABEL: flat_xchg_saddr_i32_rtn_2048:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    global_wb scope:SCOPE_DEV
 ; GFX1250-NEXT:    s_wait_storecnt 0x0
 ; GFX1250-NEXT:    flat_atomic_swap_b32 v0, v0, v1, s[2:3] offset:2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
@@ -215,6 +220,7 @@ define amdgpu_ps float @flat_xchg_saddr_i32_rtn_2048(ptr inreg %sbase, i32 %voff
 define amdgpu_ps float @flat_xchg_saddr_i32_rtn_neg2048(ptr inreg %sbase, i32 %voffset, i32 %data) {
 ; GFX1250-LABEL: flat_xchg_saddr_i32_rtn_neg2048:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    global_wb scope:SCOPE_DEV
 ; GFX1250-NEXT:    s_wait_storecnt 0x0
 ; GFX1250-NEXT:    flat_atomic_swap_b32 v0, v0, v1, s[2:3] offset:-2048 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
@@ -269,6 +275,7 @@ define amdgpu_ps float @flat_xchg_saddr_i32_rtn_neg2048(ptr inreg %sbase, i32 %v
 define amdgpu_ps float @flat_xchg_saddr_uniform_ptr_in_vgprs_rtn(i32 %voffset, i32 %data) {
 ; GFX1250-SDAG-LABEL: flat_xchg_saddr_uniform_ptr_in_vgprs_rtn:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1250-SDAG-NEXT:    ds_load_b64 v[2:3], v2
 ; GFX1250-SDAG-NEXT:    s_wait_dscnt 0x0
@@ -284,6 +291,7 @@ define amdgpu_ps float @flat_xchg_saddr_uniform_ptr_in_vgprs_rtn(i32 %voffset, i
 ;
 ; GFX1250-GISEL-LABEL: flat_xchg_saddr_uniform_ptr_in_vgprs_rtn:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1250-GISEL-NEXT:    ds_load_b64 v[2:3], v2
 ; GFX1250-GISEL-NEXT:    s_wait_dscnt 0x0
@@ -336,6 +344,7 @@ define amdgpu_ps float @flat_xchg_saddr_uniform_ptr_in_vgprs_rtn(i32 %voffset, i
 define amdgpu_ps float @flat_xchg_saddr_uniform_ptr_in_vgprs_rtn_immoffset(i32 %voffset, i32 %data) {
 ; GFX1250-SDAG-LABEL: flat_xchg_saddr_uniform_ptr_in_vgprs_rtn_immoffset:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1250-SDAG-NEXT:    ds_load_b64 v[2:3], v2
 ; GFX1250-SDAG-NEXT:    s_wait_dscnt 0x0
@@ -351,6 +360,7 @@ define amdgpu_ps float @flat_xchg_saddr_uniform_ptr_in_vgprs_rtn_immoffset(i32 %
 ;
 ; GFX1250-GISEL-LABEL: flat_xchg_saddr_uniform_ptr_in_vgprs_rtn_immoffset:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1250-GISEL-NEXT:    ds_load_b64 v[2:3], v2
 ; GFX1250-GISEL-NEXT:    s_wait_dscnt 0x0
@@ -404,6 +414,7 @@ define amdgpu_ps float @flat_xchg_saddr_uniform_ptr_in_vgprs_rtn_immoffset(i32 %
 define amdgpu_ps void @flat_xchg_saddr_uniform_ptr_in_vgprs_nortn(i32 %voffset, i32 %data) {
 ; GFX1250-SDAG-LABEL: flat_xchg_saddr_uniform_ptr_in_vgprs_nortn:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1250-SDAG-NEXT:    ds_load_b64 v[2:3], v2
 ; GFX1250-SDAG-NEXT:    s_wait_dscnt 0x0
@@ -418,6 +429,7 @@ define amdgpu_ps void @flat_xchg_saddr_uniform_ptr_in_vgprs_nortn(i32 %voffset,
 ;
 ; GFX1250-GISEL-LABEL: flat_xchg_saddr_uniform_ptr_in_vgprs_nortn:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1250-GISEL-NEXT:    ds_load_b64 v[2:3], v2
 ; GFX1250-GISEL-NEXT:    s_wait_dscnt 0x0
@@ -468,6 +480,7 @@ define amdgpu_ps void @flat_xchg_saddr_uniform_ptr_in_vgprs_nortn(i32 %voffset,
 define amdgpu_ps void @flat_xchg_saddr_uniform_ptr_in_vgprs_nortn_immoffset(i32 %voffset, i32 %data) {
 ; GFX1250-SDAG-LABEL: flat_xchg_saddr_uniform_ptr_in_vgprs_nortn_immoffset:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1250-SDAG-NEXT:    ds_load_b64 v[2:3], v2
 ; GFX1250-SDAG-NEXT:    s_wait_dscnt 0x0
@@ -482,6 +495,7 @@ define amdgpu_ps void @flat_xchg_saddr_uniform_ptr_in_vgprs_nortn_immoffset(i32
 ;
 ; GFX1250-GISEL-LABEL: flat_xchg_saddr_uniform_ptr_in_vgprs_nortn_immoffset:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1250-GISEL-NEXT:    ds_load_b64 v[2:3], v2
 ; GFX1250-GISEL-NEXT:    s_wait_dscnt 0x0
@@ -540,6 +554,7 @@ define amdgpu_ps void @flat_xchg_saddr_uniform_ptr_in_vgprs_nortn_immoffset(i32
 define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn(ptr inreg %sbase, i32 %voffset, i64 %data) {
 ; GFX1250-SDAG-LABEL: flat_xchg_saddr_i64_rtn:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
 ; GFX1250-SDAG-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -584,6 +599,7 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn(ptr inreg %sbase, i32 %vof
 ;
 ; GFX1250-GISEL-LABEL: flat_xchg_saddr_i64_rtn:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -717,6 +733,7 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn(ptr inreg %sbase, i32 %vof
 define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn_neg128(ptr inreg %sbase, i32 %voffset, i64 %data) {
 ; GFX1250-SDAG-LABEL: flat_xchg_saddr_i64_rtn_neg128:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
 ; GFX1250-SDAG-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1250-SDAG-NEXT:    s_mov_b64 s[0:1], 0xffffffffffffff80
@@ -764,6 +781,7 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn_neg128(ptr inreg %sbase, i
 ;
 ; GFX1250-GISEL-LABEL: flat_xchg_saddr_i64_rtn_neg128:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -907,6 +925,7 @@ define amdgpu_ps <2 x float> @flat_xchg_saddr_i64_rtn_neg128(ptr inreg %sbase, i
 define amdgpu_ps void @flat_xchg_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, i64 %data) {
 ; GFX1250-SDAG-LABEL: flat_xchg_saddr_i64_nortn:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
 ; GFX1250-SDAG-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1250-SDAG-NEXT:    s_mov_b32 s0, exec_lo
@@ -943,6 +962,7 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
 ;
 ; GFX1250-GISEL-LABEL: flat_xchg_saddr_i64_nortn:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX1250-GISEL-NEXT:    s_mov_b32 s0, exec_lo
@@ -1051,6 +1071,7 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
 define amdgpu_ps void @flat_xchg_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %voffset, i64 %data) {
 ; GFX1250-SDAG-LABEL: flat_xchg_saddr_i64_nortn_neg128:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
 ; GFX1250-SDAG-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1250-SDAG-NEXT:    s_mov_b64 s[0:1], 0xffffffffffffff80
@@ -1089,6 +1110,7 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v
 ;
 ; GFX1250-GISEL-LABEL: flat_xchg_saddr_i64_nortn_neg128:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX1250-GISEL-NEXT:    s_mov_b32 s0, exec_lo
@@ -1212,6 +1234,7 @@ define amdgpu_ps void @flat_xchg_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v
 define amdgpu_ps float @flat_add_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i32 %data) {
 ; GFX1250-LABEL: flat_add_saddr_i32_rtn:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    global_wb scope:SCOPE_DEV
 ; GFX1250-NEXT:    s_wait_storecnt 0x0
 ; GFX1250-NEXT:    flat_atomic_add_u32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
@@ -1252,6 +1275,7 @@ define amdgpu_ps float @flat_add_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i
 define amdgpu_ps float @flat_add_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %voffset, i32 %data) {
 ; GFX1250-LABEL: flat_add_saddr_i32_rtn_neg128:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    global_wb scope:SCOPE_DEV
 ; GFX1250-NEXT:    s_wait_storecnt 0x0
 ; GFX1250-NEXT:    flat_atomic_add_u32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
@@ -1299,6 +1323,7 @@ define amdgpu_ps float @flat_add_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %vof
 define amdgpu_ps void @flat_add_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, i32 %data) {
 ; GFX1250-LABEL: flat_add_saddr_i32_nortn:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    global_wb scope:SCOPE_DEV
 ; GFX1250-NEXT:    s_wait_storecnt 0x0
 ; GFX1250-NEXT:    flat_atomic_add_u32 v0, v1, s[2:3] scope:SCOPE_DEV
@@ -1337,6 +1362,7 @@ define amdgpu_ps void @flat_add_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset,
 define amdgpu_ps void @flat_add_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %voffset, i32 %data) {
 ; GFX1250-LABEL: flat_add_saddr_i32_nortn_neg128:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    global_wb scope:SCOPE_DEV
 ; GFX1250-NEXT:    s_wait_storecnt 0x0
 ; GFX1250-NEXT:    flat_atomic_add_u32 v0, v1, s[2:3] offset:-128 scope:SCOPE_DEV
@@ -1382,6 +1408,7 @@ define amdgpu_ps void @flat_add_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %vo
 define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn(ptr inreg %sbase, i32 %voffset, i64 %data) {
 ; GFX1250-SDAG-LABEL: flat_add_saddr_i64_rtn:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
 ; GFX1250-SDAG-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -1426,6 +1453,7 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
 ;
 ; GFX1250-GISEL-LABEL: flat_add_saddr_i64_rtn:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -1563,6 +1591,7 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
 define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn_neg128(ptr inreg %sbase, i32 %voffset, i64 %data) {
 ; GFX1250-SDAG-LABEL: flat_add_saddr_i64_rtn_neg128:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
 ; GFX1250-SDAG-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1250-SDAG-NEXT:    s_mov_b64 s[0:1], 0xffffffffffffff80
@@ -1610,6 +1639,7 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
 ;
 ; GFX1250-GISEL-LABEL: flat_add_saddr_i64_rtn_neg128:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -1757,6 +1787,7 @@ define amdgpu_ps <2 x float> @flat_add_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
 define amdgpu_ps void @flat_add_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, i64 %data) {
 ; GFX1250-SDAG-LABEL: flat_add_saddr_i64_nortn:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
 ; GFX1250-SDAG-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1250-SDAG-NEXT:    s_mov_b32 s0, exec_lo
@@ -1796,6 +1827,7 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
 ;
 ; GFX1250-GISEL-LABEL: flat_add_saddr_i64_nortn:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX1250-GISEL-NEXT:    s_mov_b32 s0, exec_lo
@@ -1915,6 +1947,7 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
 define amdgpu_ps void @flat_add_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %voffset, i64 %data) {
 ; GFX1250-SDAG-LABEL: flat_add_saddr_i64_nortn_neg128:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
 ; GFX1250-SDAG-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1250-SDAG-NEXT:    s_mov_b64 s[0:1], 0xffffffffffffff80
@@ -1956,6 +1989,7 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
 ;
 ; GFX1250-GISEL-LABEL: flat_add_saddr_i64_nortn_neg128:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX1250-GISEL-NEXT:    s_mov_b32 s0, exec_lo
@@ -2090,6 +2124,7 @@ define amdgpu_ps void @flat_add_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
 define amdgpu_ps float @flat_sub_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i32 %data) {
 ; GFX1250-LABEL: flat_sub_saddr_i32_rtn:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    global_wb scope:SCOPE_DEV
 ; GFX1250-NEXT:    s_wait_storecnt 0x0
 ; GFX1250-NEXT:    flat_atomic_sub_u32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
@@ -2130,6 +2165,7 @@ define amdgpu_ps float @flat_sub_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i
 define amdgpu_ps float @flat_sub_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %voffset, i32 %data) {
 ; GFX1250-LABEL: flat_sub_saddr_i32_rtn_neg128:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    global_wb scope:SCOPE_DEV
 ; GFX1250-NEXT:    s_wait_storecnt 0x0
 ; GFX1250-NEXT:    flat_atomic_sub_u32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
@@ -2177,6 +2213,7 @@ define amdgpu_ps float @flat_sub_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %vof
 define amdgpu_ps void @flat_sub_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, i32 %data) {
 ; GFX1250-LABEL: flat_sub_saddr_i32_nortn:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    global_wb scope:SCOPE_DEV
 ; GFX1250-NEXT:    s_wait_storecnt 0x0
 ; GFX1250-NEXT:    flat_atomic_sub_u32 v0, v1, s[2:3] scope:SCOPE_DEV
@@ -2215,6 +2252,7 @@ define amdgpu_ps void @flat_sub_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset,
 define amdgpu_ps void @flat_sub_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %voffset, i32 %data) {
 ; GFX1250-LABEL: flat_sub_saddr_i32_nortn_neg128:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    global_wb scope:SCOPE_DEV
 ; GFX1250-NEXT:    s_wait_storecnt 0x0
 ; GFX1250-NEXT:    flat_atomic_sub_u32 v0, v1, s[2:3] offset:-128 scope:SCOPE_DEV
@@ -2260,6 +2298,7 @@ define amdgpu_ps void @flat_sub_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %vo
 define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn(ptr inreg %sbase, i32 %voffset, i64 %data) {
 ; GFX1250-SDAG-LABEL: flat_sub_saddr_i64_rtn:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
 ; GFX1250-SDAG-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -2304,6 +2343,7 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
 ;
 ; GFX1250-GISEL-LABEL: flat_sub_saddr_i64_rtn:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -2443,6 +2483,7 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
 define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn_neg128(ptr inreg %sbase, i32 %voffset, i64 %data) {
 ; GFX1250-SDAG-LABEL: flat_sub_saddr_i64_rtn_neg128:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
 ; GFX1250-SDAG-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1250-SDAG-NEXT:    s_mov_b64 s[0:1], 0xffffffffffffff80
@@ -2490,6 +2531,7 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
 ;
 ; GFX1250-GISEL-LABEL: flat_sub_saddr_i64_rtn_neg128:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -2639,6 +2681,7 @@ define amdgpu_ps <2 x float> @flat_sub_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
 define amdgpu_ps void @flat_sub_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, i64 %data) {
 ; GFX1250-SDAG-LABEL: flat_sub_saddr_i64_nortn:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
 ; GFX1250-SDAG-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1250-SDAG-NEXT:    s_mov_b32 s0, exec_lo
@@ -2678,6 +2721,7 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
 ;
 ; GFX1250-GISEL-LABEL: flat_sub_saddr_i64_nortn:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX1250-GISEL-NEXT:    s_mov_b32 s0, exec_lo
@@ -2799,6 +2843,7 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
 define amdgpu_ps void @flat_sub_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %voffset, i64 %data) {
 ; GFX1250-SDAG-LABEL: flat_sub_saddr_i64_nortn_neg128:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
 ; GFX1250-SDAG-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1250-SDAG-NEXT:    s_mov_b64 s[0:1], 0xffffffffffffff80
@@ -2840,6 +2885,7 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
 ;
 ; GFX1250-GISEL-LABEL: flat_sub_saddr_i64_nortn_neg128:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX1250-GISEL-NEXT:    s_mov_b32 s0, exec_lo
@@ -2976,6 +3022,7 @@ define amdgpu_ps void @flat_sub_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
 define amdgpu_ps float @flat_and_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i32 %data) {
 ; GFX1250-LABEL: flat_and_saddr_i32_rtn:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    global_wb scope:SCOPE_DEV
 ; GFX1250-NEXT:    s_wait_storecnt 0x0
 ; GFX1250-NEXT:    flat_atomic_and_b32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
@@ -3016,6 +3063,7 @@ define amdgpu_ps float @flat_and_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i
 define amdgpu_ps float @flat_and_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %voffset, i32 %data) {
 ; GFX1250-LABEL: flat_and_saddr_i32_rtn_neg128:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    global_wb scope:SCOPE_DEV
 ; GFX1250-NEXT:    s_wait_storecnt 0x0
 ; GFX1250-NEXT:    flat_atomic_and_b32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
@@ -3063,6 +3111,7 @@ define amdgpu_ps float @flat_and_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %vof
 define amdgpu_ps void @flat_and_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, i32 %data) {
 ; GFX1250-LABEL: flat_and_saddr_i32_nortn:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    global_wb scope:SCOPE_DEV
 ; GFX1250-NEXT:    s_wait_storecnt 0x0
 ; GFX1250-NEXT:    flat_atomic_and_b32 v0, v1, s[2:3] scope:SCOPE_DEV
@@ -3101,6 +3150,7 @@ define amdgpu_ps void @flat_and_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset,
 define amdgpu_ps void @flat_and_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %voffset, i32 %data) {
 ; GFX1250-LABEL: flat_and_saddr_i32_nortn_neg128:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    global_wb scope:SCOPE_DEV
 ; GFX1250-NEXT:    s_wait_storecnt 0x0
 ; GFX1250-NEXT:    flat_atomic_and_b32 v0, v1, s[2:3] offset:-128 scope:SCOPE_DEV
@@ -3146,6 +3196,7 @@ define amdgpu_ps void @flat_and_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %vo
 define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn(ptr inreg %sbase, i32 %voffset, i64 %data) {
 ; GFX1250-SDAG-LABEL: flat_and_saddr_i64_rtn:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
 ; GFX1250-SDAG-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -3191,6 +3242,7 @@ define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
 ;
 ; GFX1250-GISEL-LABEL: flat_and_saddr_i64_rtn:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -3329,6 +3381,7 @@ define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
 define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn_neg128(ptr inreg %sbase, i32 %voffset, i64 %data) {
 ; GFX1250-SDAG-LABEL: flat_and_saddr_i64_rtn_neg128:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
 ; GFX1250-SDAG-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1250-SDAG-NEXT:    s_mov_b64 s[0:1], 0xffffffffffffff80
@@ -3377,6 +3430,7 @@ define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
 ;
 ; GFX1250-GISEL-LABEL: flat_and_saddr_i64_rtn_neg128:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -3525,6 +3579,7 @@ define amdgpu_ps <2 x float> @flat_and_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
 define amdgpu_ps void @flat_and_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, i64 %data) {
 ; GFX1250-SDAG-LABEL: flat_and_saddr_i64_nortn:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
 ; GFX1250-SDAG-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1250-SDAG-NEXT:    s_mov_b32 s0, exec_lo
@@ -3565,6 +3620,7 @@ define amdgpu_ps void @flat_and_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
 ;
 ; GFX1250-GISEL-LABEL: flat_and_saddr_i64_nortn:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX1250-GISEL-NEXT:    s_mov_b32 s0, exec_lo
@@ -3685,6 +3741,7 @@ define amdgpu_ps void @flat_and_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
 define amdgpu_ps void @flat_and_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %voffset, i64 %data) {
 ; GFX1250-SDAG-LABEL: flat_and_saddr_i64_nortn_neg128:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
 ; GFX1250-SDAG-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1250-SDAG-NEXT:    s_mov_b64 s[0:1], 0xffffffffffffff80
@@ -3727,6 +3784,7 @@ define amdgpu_ps void @flat_and_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
 ;
 ; GFX1250-GISEL-LABEL: flat_and_saddr_i64_nortn_neg128:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX1250-GISEL-NEXT:    s_mov_b32 s0, exec_lo
@@ -3862,6 +3920,7 @@ define amdgpu_ps void @flat_and_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
 define amdgpu_ps float @flat_or_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i32 %data) {
 ; GFX1250-LABEL: flat_or_saddr_i32_rtn:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    global_wb scope:SCOPE_DEV
 ; GFX1250-NEXT:    s_wait_storecnt 0x0
 ; GFX1250-NEXT:    flat_atomic_or_b32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
@@ -3902,6 +3961,7 @@ define amdgpu_ps float @flat_or_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i3
 define amdgpu_ps float @flat_or_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %voffset, i32 %data) {
 ; GFX1250-LABEL: flat_or_saddr_i32_rtn_neg128:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    global_wb scope:SCOPE_DEV
 ; GFX1250-NEXT:    s_wait_storecnt 0x0
 ; GFX1250-NEXT:    flat_atomic_or_b32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
@@ -3949,6 +4009,7 @@ define amdgpu_ps float @flat_or_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %voff
 define amdgpu_ps void @flat_or_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, i32 %data) {
 ; GFX1250-LABEL: flat_or_saddr_i32_nortn:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    global_wb scope:SCOPE_DEV
 ; GFX1250-NEXT:    s_wait_storecnt 0x0
 ; GFX1250-NEXT:    flat_atomic_or_b32 v0, v1, s[2:3] scope:SCOPE_DEV
@@ -3987,6 +4048,7 @@ define amdgpu_ps void @flat_or_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, i
 define amdgpu_ps void @flat_or_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %voffset, i32 %data) {
 ; GFX1250-LABEL: flat_or_saddr_i32_nortn_neg128:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    global_wb scope:SCOPE_DEV
 ; GFX1250-NEXT:    s_wait_storecnt 0x0
 ; GFX1250-NEXT:    flat_atomic_or_b32 v0, v1, s[2:3] offset:-128 scope:SCOPE_DEV
@@ -4032,6 +4094,7 @@ define amdgpu_ps void @flat_or_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %vof
 define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn(ptr inreg %sbase, i32 %voffset, i64 %data) {
 ; GFX1250-SDAG-LABEL: flat_or_saddr_i64_rtn:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
 ; GFX1250-SDAG-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -4077,6 +4140,7 @@ define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn(ptr inreg %sbase, i32 %voffs
 ;
 ; GFX1250-GISEL-LABEL: flat_or_saddr_i64_rtn:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -4215,6 +4279,7 @@ define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn(ptr inreg %sbase, i32 %voffs
 define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn_neg128(ptr inreg %sbase, i32 %voffset, i64 %data) {
 ; GFX1250-SDAG-LABEL: flat_or_saddr_i64_rtn_neg128:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
 ; GFX1250-SDAG-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1250-SDAG-NEXT:    s_mov_b64 s[0:1], 0xffffffffffffff80
@@ -4263,6 +4328,7 @@ define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn_neg128(ptr inreg %sbase, i32
 ;
 ; GFX1250-GISEL-LABEL: flat_or_saddr_i64_rtn_neg128:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -4411,6 +4477,7 @@ define amdgpu_ps <2 x float> @flat_or_saddr_i64_rtn_neg128(ptr inreg %sbase, i32
 define amdgpu_ps void @flat_or_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, i64 %data) {
 ; GFX1250-SDAG-LABEL: flat_or_saddr_i64_nortn:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
 ; GFX1250-SDAG-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1250-SDAG-NEXT:    s_mov_b32 s0, exec_lo
@@ -4451,6 +4518,7 @@ define amdgpu_ps void @flat_or_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, i
 ;
 ; GFX1250-GISEL-LABEL: flat_or_saddr_i64_nortn:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX1250-GISEL-NEXT:    s_mov_b32 s0, exec_lo
@@ -4571,6 +4639,7 @@ define amdgpu_ps void @flat_or_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, i
 define amdgpu_ps void @flat_or_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %voffset, i64 %data) {
 ; GFX1250-SDAG-LABEL: flat_or_saddr_i64_nortn_neg128:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
 ; GFX1250-SDAG-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1250-SDAG-NEXT:    s_mov_b64 s[0:1], 0xffffffffffffff80
@@ -4613,6 +4682,7 @@ define amdgpu_ps void @flat_or_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vof
 ;
 ; GFX1250-GISEL-LABEL: flat_or_saddr_i64_nortn_neg128:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX1250-GISEL-NEXT:    s_mov_b32 s0, exec_lo
@@ -4748,6 +4818,7 @@ define amdgpu_ps void @flat_or_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vof
 define amdgpu_ps float @flat_xor_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i32 %data) {
 ; GFX1250-LABEL: flat_xor_saddr_i32_rtn:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    global_wb scope:SCOPE_DEV
 ; GFX1250-NEXT:    s_wait_storecnt 0x0
 ; GFX1250-NEXT:    flat_atomic_xor_b32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
@@ -4788,6 +4859,7 @@ define amdgpu_ps float @flat_xor_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i
 define amdgpu_ps float @flat_xor_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %voffset, i32 %data) {
 ; GFX1250-LABEL: flat_xor_saddr_i32_rtn_neg128:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    global_wb scope:SCOPE_DEV
 ; GFX1250-NEXT:    s_wait_storecnt 0x0
 ; GFX1250-NEXT:    flat_atomic_xor_b32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
@@ -4835,6 +4907,7 @@ define amdgpu_ps float @flat_xor_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %vof
 define amdgpu_ps void @flat_xor_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, i32 %data) {
 ; GFX1250-LABEL: flat_xor_saddr_i32_nortn:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    global_wb scope:SCOPE_DEV
 ; GFX1250-NEXT:    s_wait_storecnt 0x0
 ; GFX1250-NEXT:    flat_atomic_xor_b32 v0, v1, s[2:3] scope:SCOPE_DEV
@@ -4873,6 +4946,7 @@ define amdgpu_ps void @flat_xor_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset,
 define amdgpu_ps void @flat_xor_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %voffset, i32 %data) {
 ; GFX1250-LABEL: flat_xor_saddr_i32_nortn_neg128:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    global_wb scope:SCOPE_DEV
 ; GFX1250-NEXT:    s_wait_storecnt 0x0
 ; GFX1250-NEXT:    flat_atomic_xor_b32 v0, v1, s[2:3] offset:-128 scope:SCOPE_DEV
@@ -4918,6 +4992,7 @@ define amdgpu_ps void @flat_xor_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %vo
 define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn(ptr inreg %sbase, i32 %voffset, i64 %data) {
 ; GFX1250-SDAG-LABEL: flat_xor_saddr_i64_rtn:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
 ; GFX1250-SDAG-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -4963,6 +5038,7 @@ define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
 ;
 ; GFX1250-GISEL-LABEL: flat_xor_saddr_i64_rtn:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -5101,6 +5177,7 @@ define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
 define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn_neg128(ptr inreg %sbase, i32 %voffset, i64 %data) {
 ; GFX1250-SDAG-LABEL: flat_xor_saddr_i64_rtn_neg128:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
 ; GFX1250-SDAG-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1250-SDAG-NEXT:    s_mov_b64 s[0:1], 0xffffffffffffff80
@@ -5149,6 +5226,7 @@ define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
 ;
 ; GFX1250-GISEL-LABEL: flat_xor_saddr_i64_rtn_neg128:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -5297,6 +5375,7 @@ define amdgpu_ps <2 x float> @flat_xor_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
 define amdgpu_ps void @flat_xor_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, i64 %data) {
 ; GFX1250-SDAG-LABEL: flat_xor_saddr_i64_nortn:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
 ; GFX1250-SDAG-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1250-SDAG-NEXT:    s_mov_b32 s0, exec_lo
@@ -5337,6 +5416,7 @@ define amdgpu_ps void @flat_xor_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
 ;
 ; GFX1250-GISEL-LABEL: flat_xor_saddr_i64_nortn:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX1250-GISEL-NEXT:    s_mov_b32 s0, exec_lo
@@ -5457,6 +5537,7 @@ define amdgpu_ps void @flat_xor_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
 define amdgpu_ps void @flat_xor_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %voffset, i64 %data) {
 ; GFX1250-SDAG-LABEL: flat_xor_saddr_i64_nortn_neg128:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
 ; GFX1250-SDAG-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1250-SDAG-NEXT:    s_mov_b64 s[0:1], 0xffffffffffffff80
@@ -5499,6 +5580,7 @@ define amdgpu_ps void @flat_xor_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
 ;
 ; GFX1250-GISEL-LABEL: flat_xor_saddr_i64_nortn_neg128:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX1250-GISEL-NEXT:    s_mov_b32 s0, exec_lo
@@ -5634,6 +5716,7 @@ define amdgpu_ps void @flat_xor_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
 define amdgpu_ps float @flat_max_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i32 %data) {
 ; GFX1250-LABEL: flat_max_saddr_i32_rtn:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    flat_atomic_max_i32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN
 ; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    ; return to shader part epilog
@@ -5666,6 +5749,7 @@ define amdgpu_ps float @flat_max_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i
 define amdgpu_ps float @flat_max_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %voffset, i32 %data) {
 ; GFX1250-LABEL: flat_max_saddr_i32_rtn_neg128:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    flat_atomic_max_i32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN
 ; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    ; return to shader part epilog
@@ -5705,6 +5789,7 @@ define amdgpu_ps float @flat_max_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %vof
 define amdgpu_ps void @flat_max_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, i32 %data) {
 ; GFX1250-LABEL: flat_max_saddr_i32_nortn:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    flat_atomic_max_i32 v0, v1, s[2:3]
 ; GFX1250-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX1250-NEXT:    s_endpgm
@@ -5736,6 +5821,7 @@ define amdgpu_ps void @flat_max_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset,
 define amdgpu_ps void @flat_max_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %voffset, i32 %data) {
 ; GFX1250-LABEL: flat_max_saddr_i32_nortn_neg128:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    flat_atomic_max_i32 v0, v1, s[2:3] offset:-128
 ; GFX1250-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX1250-NEXT:    s_endpgm
@@ -5774,6 +5860,7 @@ define amdgpu_ps void @flat_max_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %vo
 define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn(ptr inreg %sbase, i32 %voffset, i64 %data) {
 ; GFX1250-SDAG-LABEL: flat_max_saddr_i64_rtn:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
 ; GFX1250-SDAG-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -5814,6 +5901,7 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
 ;
 ; GFX1250-GISEL-LABEL: flat_max_saddr_i64_rtn:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -5949,6 +6037,7 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
 define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn_neg128(ptr inreg %sbase, i32 %voffset, i64 %data) {
 ; GFX1250-SDAG-LABEL: flat_max_saddr_i64_rtn_neg128:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
 ; GFX1250-SDAG-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1250-SDAG-NEXT:    s_mov_b64 s[0:1], 0xffffffffffffff80
@@ -5992,6 +6081,7 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
 ;
 ; GFX1250-GISEL-LABEL: flat_max_saddr_i64_rtn_neg128:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -6137,6 +6227,7 @@ define amdgpu_ps <2 x float> @flat_max_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
 define amdgpu_ps void @flat_max_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, i64 %data) {
 ; GFX1250-SDAG-LABEL: flat_max_saddr_i64_nortn:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
 ; GFX1250-SDAG-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1250-SDAG-NEXT:    s_mov_b32 s0, exec_lo
@@ -6173,6 +6264,7 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
 ;
 ; GFX1250-GISEL-LABEL: flat_max_saddr_i64_nortn:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX1250-GISEL-NEXT:    s_mov_b32 s0, exec_lo
@@ -6289,6 +6381,7 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
 define amdgpu_ps void @flat_max_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %voffset, i64 %data) {
 ; GFX1250-SDAG-LABEL: flat_max_saddr_i64_nortn_neg128:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
 ; GFX1250-SDAG-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1250-SDAG-NEXT:    s_mov_b64 s[0:1], 0xffffffffffffff80
@@ -6327,6 +6420,7 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
 ;
 ; GFX1250-GISEL-LABEL: flat_max_saddr_i64_nortn_neg128:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX1250-GISEL-NEXT:    s_mov_b32 s0, exec_lo
@@ -6458,6 +6552,7 @@ define amdgpu_ps void @flat_max_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
 define amdgpu_ps float @flat_min_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i32 %data) {
 ; GFX1250-LABEL: flat_min_saddr_i32_rtn:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    flat_atomic_min_i32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN
 ; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    ; return to shader part epilog
@@ -6490,6 +6585,7 @@ define amdgpu_ps float @flat_min_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i
 define amdgpu_ps float @flat_min_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %voffset, i32 %data) {
 ; GFX1250-LABEL: flat_min_saddr_i32_rtn_neg128:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    flat_atomic_min_i32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN
 ; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    ; return to shader part epilog
@@ -6529,6 +6625,7 @@ define amdgpu_ps float @flat_min_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %vof
 define amdgpu_ps void @flat_min_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, i32 %data) {
 ; GFX1250-LABEL: flat_min_saddr_i32_nortn:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    flat_atomic_min_i32 v0, v1, s[2:3]
 ; GFX1250-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX1250-NEXT:    s_endpgm
@@ -6560,6 +6657,7 @@ define amdgpu_ps void @flat_min_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset,
 define amdgpu_ps void @flat_min_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %voffset, i32 %data) {
 ; GFX1250-LABEL: flat_min_saddr_i32_nortn_neg128:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    flat_atomic_min_i32 v0, v1, s[2:3] offset:-128
 ; GFX1250-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX1250-NEXT:    s_endpgm
@@ -6598,6 +6696,7 @@ define amdgpu_ps void @flat_min_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %vo
 define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn(ptr inreg %sbase, i32 %voffset, i64 %data) {
 ; GFX1250-SDAG-LABEL: flat_min_saddr_i64_rtn:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
 ; GFX1250-SDAG-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -6638,6 +6737,7 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
 ;
 ; GFX1250-GISEL-LABEL: flat_min_saddr_i64_rtn:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -6773,6 +6873,7 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
 define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn_neg128(ptr inreg %sbase, i32 %voffset, i64 %data) {
 ; GFX1250-SDAG-LABEL: flat_min_saddr_i64_rtn_neg128:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
 ; GFX1250-SDAG-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1250-SDAG-NEXT:    s_mov_b64 s[0:1], 0xffffffffffffff80
@@ -6816,6 +6917,7 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
 ;
 ; GFX1250-GISEL-LABEL: flat_min_saddr_i64_rtn_neg128:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -6961,6 +7063,7 @@ define amdgpu_ps <2 x float> @flat_min_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
 define amdgpu_ps void @flat_min_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, i64 %data) {
 ; GFX1250-SDAG-LABEL: flat_min_saddr_i64_nortn:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
 ; GFX1250-SDAG-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1250-SDAG-NEXT:    s_mov_b32 s0, exec_lo
@@ -6997,6 +7100,7 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
 ;
 ; GFX1250-GISEL-LABEL: flat_min_saddr_i64_nortn:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX1250-GISEL-NEXT:    s_mov_b32 s0, exec_lo
@@ -7113,6 +7217,7 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
 define amdgpu_ps void @flat_min_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %voffset, i64 %data) {
 ; GFX1250-SDAG-LABEL: flat_min_saddr_i64_nortn_neg128:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
 ; GFX1250-SDAG-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1250-SDAG-NEXT:    s_mov_b64 s[0:1], 0xffffffffffffff80
@@ -7151,6 +7256,7 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
 ;
 ; GFX1250-GISEL-LABEL: flat_min_saddr_i64_nortn_neg128:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX1250-GISEL-NEXT:    s_mov_b32 s0, exec_lo
@@ -7282,6 +7388,7 @@ define amdgpu_ps void @flat_min_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
 define amdgpu_ps float @flat_umax_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i32 %data) {
 ; GFX1250-LABEL: flat_umax_saddr_i32_rtn:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    flat_atomic_max_u32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN
 ; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    ; return to shader part epilog
@@ -7314,6 +7421,7 @@ define amdgpu_ps float @flat_umax_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset,
 define amdgpu_ps float @flat_umax_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %voffset, i32 %data) {
 ; GFX1250-LABEL: flat_umax_saddr_i32_rtn_neg128:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    flat_atomic_max_u32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN
 ; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    ; return to shader part epilog
@@ -7353,6 +7461,7 @@ define amdgpu_ps float @flat_umax_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %vo
 define amdgpu_ps void @flat_umax_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, i32 %data) {
 ; GFX1250-LABEL: flat_umax_saddr_i32_nortn:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    flat_atomic_max_u32 v0, v1, s[2:3]
 ; GFX1250-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX1250-NEXT:    s_endpgm
@@ -7384,6 +7493,7 @@ define amdgpu_ps void @flat_umax_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset,
 define amdgpu_ps void @flat_umax_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %voffset, i32 %data) {
 ; GFX1250-LABEL: flat_umax_saddr_i32_nortn_neg128:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    flat_atomic_max_u32 v0, v1, s[2:3] offset:-128
 ; GFX1250-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX1250-NEXT:    s_endpgm
@@ -7422,6 +7532,7 @@ define amdgpu_ps void @flat_umax_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %v
 define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn(ptr inreg %sbase, i32 %voffset, i64 %data) {
 ; GFX1250-SDAG-LABEL: flat_umax_saddr_i64_rtn:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
 ; GFX1250-SDAG-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -7462,6 +7573,7 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn(ptr inreg %sbase, i32 %vof
 ;
 ; GFX1250-GISEL-LABEL: flat_umax_saddr_i64_rtn:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -7597,6 +7709,7 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn(ptr inreg %sbase, i32 %vof
 define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn_neg128(ptr inreg %sbase, i32 %voffset, i64 %data) {
 ; GFX1250-SDAG-LABEL: flat_umax_saddr_i64_rtn_neg128:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
 ; GFX1250-SDAG-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1250-SDAG-NEXT:    s_mov_b64 s[0:1], 0xffffffffffffff80
@@ -7640,6 +7753,7 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn_neg128(ptr inreg %sbase, i
 ;
 ; GFX1250-GISEL-LABEL: flat_umax_saddr_i64_rtn_neg128:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -7785,6 +7899,7 @@ define amdgpu_ps <2 x float> @flat_umax_saddr_i64_rtn_neg128(ptr inreg %sbase, i
 define amdgpu_ps void @flat_umax_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, i64 %data) {
 ; GFX1250-SDAG-LABEL: flat_umax_saddr_i64_nortn:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
 ; GFX1250-SDAG-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1250-SDAG-NEXT:    s_mov_b32 s0, exec_lo
@@ -7821,6 +7936,7 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
 ;
 ; GFX1250-GISEL-LABEL: flat_umax_saddr_i64_nortn:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX1250-GISEL-NEXT:    s_mov_b32 s0, exec_lo
@@ -7937,6 +8053,7 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
 define amdgpu_ps void @flat_umax_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %voffset, i64 %data) {
 ; GFX1250-SDAG-LABEL: flat_umax_saddr_i64_nortn_neg128:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
 ; GFX1250-SDAG-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1250-SDAG-NEXT:    s_mov_b64 s[0:1], 0xffffffffffffff80
@@ -7975,6 +8092,7 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v
 ;
 ; GFX1250-GISEL-LABEL: flat_umax_saddr_i64_nortn_neg128:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX1250-GISEL-NEXT:    s_mov_b32 s0, exec_lo
@@ -8106,6 +8224,7 @@ define amdgpu_ps void @flat_umax_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v
 define amdgpu_ps float @flat_umin_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i32 %data) {
 ; GFX1250-LABEL: flat_umin_saddr_i32_rtn:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    flat_atomic_min_u32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN
 ; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    ; return to shader part epilog
@@ -8138,6 +8257,7 @@ define amdgpu_ps float @flat_umin_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset,
 define amdgpu_ps float @flat_umin_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %voffset, i32 %data) {
 ; GFX1250-LABEL: flat_umin_saddr_i32_rtn_neg128:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    flat_atomic_min_u32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN
 ; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    ; return to shader part epilog
@@ -8177,6 +8297,7 @@ define amdgpu_ps float @flat_umin_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %vo
 define amdgpu_ps void @flat_umin_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, i32 %data) {
 ; GFX1250-LABEL: flat_umin_saddr_i32_nortn:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    flat_atomic_min_u32 v0, v1, s[2:3]
 ; GFX1250-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX1250-NEXT:    s_endpgm
@@ -8208,6 +8329,7 @@ define amdgpu_ps void @flat_umin_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset,
 define amdgpu_ps void @flat_umin_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %voffset, i32 %data) {
 ; GFX1250-LABEL: flat_umin_saddr_i32_nortn_neg128:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    flat_atomic_min_u32 v0, v1, s[2:3] offset:-128
 ; GFX1250-NEXT:    s_wait_storecnt_dscnt 0x0
 ; GFX1250-NEXT:    s_endpgm
@@ -8246,6 +8368,7 @@ define amdgpu_ps void @flat_umin_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %v
 define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn(ptr inreg %sbase, i32 %voffset, i64 %data) {
 ; GFX1250-SDAG-LABEL: flat_umin_saddr_i64_rtn:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
 ; GFX1250-SDAG-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -8286,6 +8409,7 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn(ptr inreg %sbase, i32 %vof
 ;
 ; GFX1250-GISEL-LABEL: flat_umin_saddr_i64_rtn:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -8421,6 +8545,7 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn(ptr inreg %sbase, i32 %vof
 define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn_neg128(ptr inreg %sbase, i32 %voffset, i64 %data) {
 ; GFX1250-SDAG-LABEL: flat_umin_saddr_i64_rtn_neg128:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
 ; GFX1250-SDAG-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1250-SDAG-NEXT:    s_mov_b64 s[0:1], 0xffffffffffffff80
@@ -8464,6 +8589,7 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn_neg128(ptr inreg %sbase, i
 ;
 ; GFX1250-GISEL-LABEL: flat_umin_saddr_i64_rtn_neg128:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -8609,6 +8735,7 @@ define amdgpu_ps <2 x float> @flat_umin_saddr_i64_rtn_neg128(ptr inreg %sbase, i
 define amdgpu_ps void @flat_umin_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, i64 %data) {
 ; GFX1250-SDAG-LABEL: flat_umin_saddr_i64_nortn:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
 ; GFX1250-SDAG-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1250-SDAG-NEXT:    s_mov_b32 s0, exec_lo
@@ -8645,6 +8772,7 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
 ;
 ; GFX1250-GISEL-LABEL: flat_umin_saddr_i64_nortn:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX1250-GISEL-NEXT:    s_mov_b32 s0, exec_lo
@@ -8761,6 +8889,7 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
 define amdgpu_ps void @flat_umin_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %voffset, i64 %data) {
 ; GFX1250-SDAG-LABEL: flat_umin_saddr_i64_nortn_neg128:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
 ; GFX1250-SDAG-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1250-SDAG-NEXT:    s_mov_b64 s[0:1], 0xffffffffffffff80
@@ -8799,6 +8928,7 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v
 ;
 ; GFX1250-GISEL-LABEL: flat_umin_saddr_i64_nortn_neg128:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX1250-GISEL-NEXT:    s_mov_b32 s0, exec_lo
@@ -8930,6 +9060,7 @@ define amdgpu_ps void @flat_umin_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %v
 define amdgpu_ps float @flat_cmpxchg_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i32 %cmp, i32 %data) {
 ; GFX1250-LABEL: flat_cmpxchg_saddr_i32_rtn:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX1250-NEXT:    global_wb scope:SCOPE_SYS
 ; GFX1250-NEXT:    s_wait_storecnt 0x0
@@ -8973,6 +9104,7 @@ define amdgpu_ps float @flat_cmpxchg_saddr_i32_rtn(ptr inreg %sbase, i32 %voffse
 define amdgpu_ps float @flat_cmpxchg_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %voffset, i32 %cmp, i32 %data) {
 ; GFX1250-LABEL: flat_cmpxchg_saddr_i32_rtn_neg128:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX1250-NEXT:    global_wb scope:SCOPE_SYS
 ; GFX1250-NEXT:    s_wait_storecnt 0x0
@@ -9023,6 +9155,7 @@ define amdgpu_ps float @flat_cmpxchg_saddr_i32_rtn_neg128(ptr inreg %sbase, i32
 define amdgpu_ps void @flat_cmpxchg_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, i32 %cmp, i32 %data) {
 ; GFX1250-LABEL: flat_cmpxchg_saddr_i32_nortn:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX1250-NEXT:    global_wb scope:SCOPE_SYS
 ; GFX1250-NEXT:    s_wait_storecnt 0x0
@@ -9063,6 +9196,7 @@ define amdgpu_ps void @flat_cmpxchg_saddr_i32_nortn(ptr inreg %sbase, i32 %voffs
 define amdgpu_ps void @flat_cmpxchg_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %voffset, i32 %cmp, i32 %data) {
 ; GFX1250-LABEL: flat_cmpxchg_saddr_i32_nortn_neg128:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v3, v1
 ; GFX1250-NEXT:    global_wb scope:SCOPE_SYS
 ; GFX1250-NEXT:    s_wait_storecnt 0x0
@@ -9110,6 +9244,7 @@ define amdgpu_ps void @flat_cmpxchg_saddr_i32_nortn_neg128(ptr inreg %sbase, i32
 define amdgpu_ps <2 x float> @flat_cmpxchg_saddr_i64_rtn(ptr inreg %sbase, i32 %voffset, i64 %cmp, i64 %data) {
 ; GFX1250-SDAG-LABEL: flat_cmpxchg_saddr_i64_rtn:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v7, v2 :: v_dual_mov_b32 v6, v1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v5, v4
 ; GFX1250-SDAG-NEXT:    v_mov_b32_e32 v4, v3
@@ -9156,6 +9291,7 @@ define amdgpu_ps <2 x float> @flat_cmpxchg_saddr_i64_rtn(ptr inreg %sbase, i32 %
 ;
 ; GFX1250-GISEL-LABEL: flat_cmpxchg_saddr_i64_rtn:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v8, v1
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v9, v2 :: v_dual_mov_b32 v6, v3
@@ -9304,6 +9440,7 @@ define amdgpu_ps <2 x float> @flat_cmpxchg_saddr_i64_rtn(ptr inreg %sbase, i32 %
 define amdgpu_ps <2 x float> @flat_cmpxchg_saddr_i64_rtn_neg128(ptr inreg %sbase, i32 %voffset, i64 %cmp, i64 %data) {
 ; GFX1250-SDAG-LABEL: flat_cmpxchg_saddr_i64_rtn_neg128:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v7, v2 :: v_dual_mov_b32 v6, v1
 ; GFX1250-SDAG-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1250-SDAG-NEXT:    s_mov_b64 s[0:1], 0xffffffffffffff80
@@ -9353,6 +9490,7 @@ define amdgpu_ps <2 x float> @flat_cmpxchg_saddr_i64_rtn_neg128(ptr inreg %sbase
 ;
 ; GFX1250-GISEL-LABEL: flat_cmpxchg_saddr_i64_rtn_neg128:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v8, v1
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v9, v2 :: v_dual_mov_b32 v6, v3
@@ -9511,6 +9649,7 @@ define amdgpu_ps <2 x float> @flat_cmpxchg_saddr_i64_rtn_neg128(ptr inreg %sbase
 define amdgpu_ps void @flat_cmpxchg_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, i64 %cmp, i64 %data) {
 ; GFX1250-SDAG-LABEL: flat_cmpxchg_saddr_i64_nortn:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v7, v2 :: v_dual_mov_b32 v6, v1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v5, v4
 ; GFX1250-SDAG-NEXT:    v_mov_b32_e32 v4, v3
@@ -9552,6 +9691,7 @@ define amdgpu_ps void @flat_cmpxchg_saddr_i64_nortn(ptr inreg %sbase, i32 %voffs
 ;
 ; GFX1250-GISEL-LABEL: flat_cmpxchg_saddr_i64_nortn:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v9, v2
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v6, v3 :: v_dual_mov_b32 v7, v4
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
@@ -9681,6 +9821,7 @@ define amdgpu_ps void @flat_cmpxchg_saddr_i64_nortn(ptr inreg %sbase, i32 %voffs
 define amdgpu_ps void @flat_cmpxchg_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %voffset, i64 %cmp, i64 %data) {
 ; GFX1250-SDAG-LABEL: flat_cmpxchg_saddr_i64_nortn_neg128:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v7, v2 :: v_dual_mov_b32 v6, v1
 ; GFX1250-SDAG-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1250-SDAG-NEXT:    s_mov_b64 s[0:1], 0xffffffffffffff80
@@ -9724,6 +9865,7 @@ define amdgpu_ps void @flat_cmpxchg_saddr_i64_nortn_neg128(ptr inreg %sbase, i32
 ;
 ; GFX1250-GISEL-LABEL: flat_cmpxchg_saddr_i64_nortn_neg128:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v8, v1 :: v_dual_mov_b32 v9, v2
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v6, v3 :: v_dual_mov_b32 v7, v4
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
@@ -9868,6 +10010,7 @@ define amdgpu_ps void @flat_cmpxchg_saddr_i64_nortn_neg128(ptr inreg %sbase, i32
 define amdgpu_ps float @flat_inc_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i32 %data) {
 ; GFX1250-LABEL: flat_inc_saddr_i32_rtn:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    flat_atomic_inc_u32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
 ; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    ; return to shader part epilog
@@ -9900,6 +10043,7 @@ define amdgpu_ps float @flat_inc_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i
 define amdgpu_ps float @flat_inc_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %voffset, i32 %data) {
 ; GFX1250-LABEL: flat_inc_saddr_i32_rtn_neg128:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    flat_atomic_inc_u32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
 ; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    ; return to shader part epilog
@@ -9939,6 +10083,7 @@ define amdgpu_ps float @flat_inc_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %vof
 define amdgpu_ps void @flat_inc_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, i32 %data) {
 ; GFX1250-LABEL: flat_inc_saddr_i32_nortn:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    flat_atomic_inc_u32 v0, v1, s[2:3] scope:SCOPE_DEV
 ; GFX1250-NEXT:    s_endpgm
 ;
@@ -9967,6 +10112,7 @@ define amdgpu_ps void @flat_inc_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset,
 define amdgpu_ps void @flat_inc_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %voffset, i32 %data) {
 ; GFX1250-LABEL: flat_inc_saddr_i32_nortn_neg128:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    flat_atomic_inc_u32 v0, v1, s[2:3] offset:-128 scope:SCOPE_DEV
 ; GFX1250-NEXT:    s_endpgm
 ;
@@ -10002,6 +10148,7 @@ define amdgpu_ps void @flat_inc_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %vo
 define amdgpu_ps <2 x float> @flat_inc_saddr_i64_rtn(ptr inreg %sbase, i32 %voffset, i64 %data) {
 ; GFX1250-SDAG-LABEL: flat_inc_saddr_i64_rtn:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
 ; GFX1250-SDAG-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -10046,6 +10193,7 @@ define amdgpu_ps <2 x float> @flat_inc_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
 ;
 ; GFX1250-GISEL-LABEL: flat_inc_saddr_i64_rtn:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -10191,6 +10339,7 @@ define amdgpu_ps <2 x float> @flat_inc_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
 define amdgpu_ps <2 x float> @flat_inc_saddr_i64_rtn_neg128(ptr inreg %sbase, i32 %voffset, i64 %data) {
 ; GFX1250-SDAG-LABEL: flat_inc_saddr_i64_rtn_neg128:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
 ; GFX1250-SDAG-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1250-SDAG-NEXT:    s_mov_b64 s[0:1], 0xffffffffffffff80
@@ -10238,6 +10387,7 @@ define amdgpu_ps <2 x float> @flat_inc_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
 ;
 ; GFX1250-GISEL-LABEL: flat_inc_saddr_i64_rtn_neg128:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -10393,6 +10543,7 @@ define amdgpu_ps <2 x float> @flat_inc_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
 define amdgpu_ps void @flat_inc_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, i64 %data) {
 ; GFX1250-SDAG-LABEL: flat_inc_saddr_i64_nortn:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
 ; GFX1250-SDAG-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1250-SDAG-NEXT:    s_mov_b32 s0, exec_lo
@@ -10430,6 +10581,7 @@ define amdgpu_ps void @flat_inc_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
 ;
 ; GFX1250-GISEL-LABEL: flat_inc_saddr_i64_nortn:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX1250-GISEL-NEXT:    s_mov_b32 s0, exec_lo
@@ -10551,6 +10703,7 @@ define amdgpu_ps void @flat_inc_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
 define amdgpu_ps void @flat_inc_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %voffset, i64 %data) {
 ; GFX1250-SDAG-LABEL: flat_inc_saddr_i64_nortn_neg128:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
 ; GFX1250-SDAG-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1250-SDAG-NEXT:    s_mov_b64 s[0:1], 0xffffffffffffff80
@@ -10590,6 +10743,7 @@ define amdgpu_ps void @flat_inc_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
 ;
 ; GFX1250-GISEL-LABEL: flat_inc_saddr_i64_nortn_neg128:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX1250-GISEL-NEXT:    s_mov_b32 s0, exec_lo
@@ -10727,6 +10881,7 @@ define amdgpu_ps void @flat_inc_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
 define amdgpu_ps float @flat_dec_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i32 %data) {
 ; GFX1250-LABEL: flat_dec_saddr_i32_rtn:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    flat_atomic_dec_u32 v0, v0, v1, s[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV
 ; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    ; return to shader part epilog
@@ -10759,6 +10914,7 @@ define amdgpu_ps float @flat_dec_saddr_i32_rtn(ptr inreg %sbase, i32 %voffset, i
 define amdgpu_ps float @flat_dec_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %voffset, i32 %data) {
 ; GFX1250-LABEL: flat_dec_saddr_i32_rtn_neg128:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    flat_atomic_dec_u32 v0, v0, v1, s[2:3] offset:-128 th:TH_ATOMIC_RETURN scope:SCOPE_DEV
 ; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    ; return to shader part epilog
@@ -10798,6 +10954,7 @@ define amdgpu_ps float @flat_dec_saddr_i32_rtn_neg128(ptr inreg %sbase, i32 %vof
 define amdgpu_ps void @flat_dec_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset, i32 %data) {
 ; GFX1250-LABEL: flat_dec_saddr_i32_nortn:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    flat_atomic_dec_u32 v0, v1, s[2:3] scope:SCOPE_DEV
 ; GFX1250-NEXT:    s_endpgm
 ;
@@ -10826,6 +10983,7 @@ define amdgpu_ps void @flat_dec_saddr_i32_nortn(ptr inreg %sbase, i32 %voffset,
 define amdgpu_ps void @flat_dec_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %voffset, i32 %data) {
 ; GFX1250-LABEL: flat_dec_saddr_i32_nortn_neg128:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    flat_atomic_dec_u32 v0, v1, s[2:3] offset:-128 scope:SCOPE_DEV
 ; GFX1250-NEXT:    s_endpgm
 ;
@@ -10861,6 +11019,7 @@ define amdgpu_ps void @flat_dec_saddr_i32_nortn_neg128(ptr inreg %sbase, i32 %vo
 define amdgpu_ps <2 x float> @flat_dec_saddr_i64_rtn(ptr inreg %sbase, i32 %voffset, i64 %data) {
 ; GFX1250-SDAG-LABEL: flat_dec_saddr_i64_rtn:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
 ; GFX1250-SDAG-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -10908,6 +11067,7 @@ define amdgpu_ps <2 x float> @flat_dec_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
 ;
 ; GFX1250-GISEL-LABEL: flat_dec_saddr_i64_rtn:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -11056,6 +11216,7 @@ define amdgpu_ps <2 x float> @flat_dec_saddr_i64_rtn(ptr inreg %sbase, i32 %voff
 define amdgpu_ps <2 x float> @flat_dec_saddr_i64_rtn_neg128(ptr inreg %sbase, i32 %voffset, i64 %data) {
 ; GFX1250-SDAG-LABEL: flat_dec_saddr_i64_rtn_neg128:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
 ; GFX1250-SDAG-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1250-SDAG-NEXT:    s_mov_b64 s[0:1], 0xffffffffffffff80
@@ -11106,6 +11267,7 @@ define amdgpu_ps <2 x float> @flat_dec_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
 ;
 ; GFX1250-GISEL-LABEL: flat_dec_saddr_i64_rtn_neg128:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, v1
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -11264,6 +11426,7 @@ define amdgpu_ps <2 x float> @flat_dec_saddr_i64_rtn_neg128(ptr inreg %sbase, i3
 define amdgpu_ps void @flat_dec_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset, i64 %data) {
 ; GFX1250-SDAG-LABEL: flat_dec_saddr_i64_nortn:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
 ; GFX1250-SDAG-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1250-SDAG-NEXT:    s_mov_b32 s0, exec_lo
@@ -11304,6 +11467,7 @@ define amdgpu_ps void @flat_dec_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
 ;
 ; GFX1250-GISEL-LABEL: flat_dec_saddr_i64_nortn:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX1250-GISEL-NEXT:    s_mov_b32 s0, exec_lo
@@ -11428,6 +11592,7 @@ define amdgpu_ps void @flat_dec_saddr_i64_nortn(ptr inreg %sbase, i32 %voffset,
 define amdgpu_ps void @flat_dec_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %voffset, i64 %data) {
 ; GFX1250-SDAG-LABEL: flat_dec_saddr_i64_nortn_neg128:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
 ; GFX1250-SDAG-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1250-SDAG-NEXT:    s_mov_b64 s[0:1], 0xffffffffffffff80
@@ -11470,6 +11635,7 @@ define amdgpu_ps void @flat_dec_saddr_i64_nortn_neg128(ptr inreg %sbase, i32 %vo
 ;
 ; GFX1250-GISEL-LABEL: flat_dec_saddr_i64_nortn_neg128:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX1250-GISEL-NEXT:    s_mov_b32 s0, exec_lo

diff  --git a/llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll b/llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll
index 90e0b194ae74d..8b7c49b5931af 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-saddr-load.ll
@@ -15,6 +15,7 @@
 define amdgpu_ps float @flat_load_saddr_i8_offset_0(ptr inreg %sbase) {
 ; GFX1250-LABEL: flat_load_saddr_i8_offset_0:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    flat_load_u8 v0, v0, s[2:3]
 ; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
@@ -29,6 +30,7 @@ define amdgpu_ps float @flat_load_saddr_i8_offset_0(ptr inreg %sbase) {
 define amdgpu_ps float @flat_load_saddr_i8_offset_8388607(ptr inreg %sbase) {
 ; GFX1250-LABEL: flat_load_saddr_i8_offset_8388607:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    flat_load_u8 v0, v0, s[2:3] offset:8388607
 ; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
@@ -44,6 +46,7 @@ define amdgpu_ps float @flat_load_saddr_i8_offset_8388607(ptr inreg %sbase) {
 define amdgpu_ps float @flat_load_saddr_i8_offset_8388608(ptr inreg %sbase) {
 ; GFX1250-LABEL: flat_load_saddr_i8_offset_8388608:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0x800000
 ; GFX1250-NEXT:    flat_load_u8 v0, v0, s[2:3]
 ; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
@@ -59,6 +62,7 @@ define amdgpu_ps float @flat_load_saddr_i8_offset_8388608(ptr inreg %sbase) {
 define amdgpu_ps float @flat_load_saddr_i8_offset_neg8388608(ptr inreg %sbase) {
 ; GFX1250-LABEL: flat_load_saddr_i8_offset_neg8388608:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    flat_load_u8 v0, v0, s[2:3] offset:-8388608
 ; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
@@ -74,6 +78,7 @@ define amdgpu_ps float @flat_load_saddr_i8_offset_neg8388608(ptr inreg %sbase) {
 define amdgpu_ps float @flat_load_saddr_i8_offset_neg8388609(ptr inreg %sbase) {
 ; GFX1250-SDAG-LABEL: flat_load_saddr_i8_offset_neg8388609:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_add_co_u32 v0, s0, 0xff800000, s2
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, s3, s0
@@ -83,6 +88,7 @@ define amdgpu_ps float @flat_load_saddr_i8_offset_neg8388609(ptr inreg %sbase) {
 ;
 ; GFX1250-GISEL-LABEL: flat_load_saddr_i8_offset_neg8388609:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    s_add_co_u32 s0, s2, 0xff7fffff
 ; GFX1250-GISEL-NEXT:    s_add_co_ci_u32 s1, s3, -1
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
@@ -93,6 +99,7 @@ define amdgpu_ps float @flat_load_saddr_i8_offset_neg8388609(ptr inreg %sbase) {
 ;
 ; GFX1250-NOECC-LABEL: flat_load_saddr_i8_offset_neg8388609:
 ; GFX1250-NOECC:       ; %bb.0:
+; GFX1250-NOECC-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NOECC-NEXT:    v_add_co_u32 v0, s0, 0xff800000, s2
 ; GFX1250-NOECC-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-NOECC-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, s3, s0
@@ -109,6 +116,7 @@ define amdgpu_ps float @flat_load_saddr_i8_offset_neg8388609(ptr inreg %sbase) {
 define amdgpu_ps float @flat_load_saddr_i8_offset_0xFFFFFFFF(ptr inreg %sbase) {
 ; GFX1250-SDAG-LABEL: flat_load_saddr_i8_offset_0xFFFFFFFF:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_add_co_u32 v0, s0, 0xff800000, s2
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, s3, s0
@@ -118,6 +126,7 @@ define amdgpu_ps float @flat_load_saddr_i8_offset_0xFFFFFFFF(ptr inreg %sbase) {
 ;
 ; GFX1250-GISEL-LABEL: flat_load_saddr_i8_offset_0xFFFFFFFF:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    s_add_co_u32 s0, s2, -1
 ; GFX1250-GISEL-NEXT:    s_add_co_ci_u32 s1, s3, 0
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
@@ -128,6 +137,7 @@ define amdgpu_ps float @flat_load_saddr_i8_offset_0xFFFFFFFF(ptr inreg %sbase) {
 ;
 ; GFX1250-NOECC-LABEL: flat_load_saddr_i8_offset_0xFFFFFFFF:
 ; GFX1250-NOECC:       ; %bb.0:
+; GFX1250-NOECC-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NOECC-NEXT:    v_add_co_u32 v0, s0, 0xff800000, s2
 ; GFX1250-NOECC-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-NOECC-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, s3, s0
@@ -144,6 +154,7 @@ define amdgpu_ps float @flat_load_saddr_i8_offset_0xFFFFFFFF(ptr inreg %sbase) {
 define amdgpu_ps float @flat_load_saddr_i8_offset_0x100000000(ptr inreg %sbase) {
 ; GFX1250-SDAG-LABEL: flat_load_saddr_i8_offset_0x100000000:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-SDAG-NEXT:    s_add_co_i32 s3, s3, 1
 ; GFX1250-SDAG-NEXT:    flat_load_u8 v0, v0, s[2:3]
@@ -152,6 +163,7 @@ define amdgpu_ps float @flat_load_saddr_i8_offset_0x100000000(ptr inreg %sbase)
 ;
 ; GFX1250-GISEL-LABEL: flat_load_saddr_i8_offset_0x100000000:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    s_add_co_u32 s0, s2, 0
 ; GFX1250-GISEL-NEXT:    s_add_co_ci_u32 s1, s3, 1
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
@@ -162,6 +174,7 @@ define amdgpu_ps float @flat_load_saddr_i8_offset_0x100000000(ptr inreg %sbase)
 ;
 ; GFX1250-NOECC-LABEL: flat_load_saddr_i8_offset_0x100000000:
 ; GFX1250-NOECC:       ; %bb.0:
+; GFX1250-NOECC-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NOECC-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NOECC-NEXT:    s_add_co_i32 s3, s3, 1
 ; GFX1250-NOECC-NEXT:    flat_load_u8 v0, v0, s[2:3]
@@ -177,6 +190,7 @@ define amdgpu_ps float @flat_load_saddr_i8_offset_0x100000000(ptr inreg %sbase)
 define amdgpu_ps float @flat_load_saddr_i8_offset_0x100000001(ptr inreg %sbase) {
 ; GFX1250-SDAG-LABEL: flat_load_saddr_i8_offset_0x100000001:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_add_co_u32 v0, s0, 0, s2
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 1, s3, s0
@@ -186,6 +200,7 @@ define amdgpu_ps float @flat_load_saddr_i8_offset_0x100000001(ptr inreg %sbase)
 ;
 ; GFX1250-GISEL-LABEL: flat_load_saddr_i8_offset_0x100000001:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    s_add_co_u32 s0, s2, 1
 ; GFX1250-GISEL-NEXT:    s_add_co_ci_u32 s1, s3, 1
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
@@ -196,6 +211,7 @@ define amdgpu_ps float @flat_load_saddr_i8_offset_0x100000001(ptr inreg %sbase)
 ;
 ; GFX1250-NOECC-LABEL: flat_load_saddr_i8_offset_0x100000001:
 ; GFX1250-NOECC:       ; %bb.0:
+; GFX1250-NOECC-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NOECC-NEXT:    v_add_co_u32 v0, s0, 0, s2
 ; GFX1250-NOECC-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-NOECC-NEXT:    v_add_co_ci_u32_e64 v1, null, 1, s3, s0
@@ -212,6 +228,7 @@ define amdgpu_ps float @flat_load_saddr_i8_offset_0x100000001(ptr inreg %sbase)
 define amdgpu_ps float @flat_load_saddr_i8_offset_0x100000FFF(ptr inreg %sbase) {
 ; GFX1250-SDAG-LABEL: flat_load_saddr_i8_offset_0x100000FFF:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_add_co_u32 v0, s0, 0, s2
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 1, s3, s0
@@ -221,6 +238,7 @@ define amdgpu_ps float @flat_load_saddr_i8_offset_0x100000FFF(ptr inreg %sbase)
 ;
 ; GFX1250-GISEL-LABEL: flat_load_saddr_i8_offset_0x100000FFF:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    s_add_co_u32 s0, s2, 0xfff
 ; GFX1250-GISEL-NEXT:    s_add_co_ci_u32 s1, s3, 1
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
@@ -231,6 +249,7 @@ define amdgpu_ps float @flat_load_saddr_i8_offset_0x100000FFF(ptr inreg %sbase)
 ;
 ; GFX1250-NOECC-LABEL: flat_load_saddr_i8_offset_0x100000FFF:
 ; GFX1250-NOECC:       ; %bb.0:
+; GFX1250-NOECC-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NOECC-NEXT:    v_add_co_u32 v0, s0, 0, s2
 ; GFX1250-NOECC-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-NOECC-NEXT:    v_add_co_ci_u32_e64 v1, null, 1, s3, s0
@@ -247,6 +266,7 @@ define amdgpu_ps float @flat_load_saddr_i8_offset_0x100000FFF(ptr inreg %sbase)
 define amdgpu_ps float @flat_load_saddr_i8_offset_0x100001000(ptr inreg %sbase) {
 ; GFX1250-SDAG-LABEL: flat_load_saddr_i8_offset_0x100001000:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_add_co_u32 v0, s0, 0, s2
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, 1, s3, s0
@@ -256,6 +276,7 @@ define amdgpu_ps float @flat_load_saddr_i8_offset_0x100001000(ptr inreg %sbase)
 ;
 ; GFX1250-GISEL-LABEL: flat_load_saddr_i8_offset_0x100001000:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    s_add_co_u32 s0, s2, 0x1000
 ; GFX1250-GISEL-NEXT:    s_add_co_ci_u32 s1, s3, 1
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
@@ -266,6 +287,7 @@ define amdgpu_ps float @flat_load_saddr_i8_offset_0x100001000(ptr inreg %sbase)
 ;
 ; GFX1250-NOECC-LABEL: flat_load_saddr_i8_offset_0x100001000:
 ; GFX1250-NOECC:       ; %bb.0:
+; GFX1250-NOECC-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NOECC-NEXT:    v_add_co_u32 v0, s0, 0, s2
 ; GFX1250-NOECC-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-NOECC-NEXT:    v_add_co_ci_u32_e64 v1, null, 1, s3, s0
@@ -282,6 +304,7 @@ define amdgpu_ps float @flat_load_saddr_i8_offset_0x100001000(ptr inreg %sbase)
 define amdgpu_ps float @flat_load_saddr_i8_offset_neg0xFFFFFFFF(ptr inreg %sbase) {
 ; GFX1250-SDAG-LABEL: flat_load_saddr_i8_offset_neg0xFFFFFFFF:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_add_co_u32 v0, s0, 0x800000, s2
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, s3, s0
@@ -291,6 +314,7 @@ define amdgpu_ps float @flat_load_saddr_i8_offset_neg0xFFFFFFFF(ptr inreg %sbase
 ;
 ; GFX1250-GISEL-LABEL: flat_load_saddr_i8_offset_neg0xFFFFFFFF:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    s_add_co_u32 s0, s2, 1
 ; GFX1250-GISEL-NEXT:    s_add_co_ci_u32 s1, s3, -1
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
@@ -301,6 +325,7 @@ define amdgpu_ps float @flat_load_saddr_i8_offset_neg0xFFFFFFFF(ptr inreg %sbase
 ;
 ; GFX1250-NOECC-LABEL: flat_load_saddr_i8_offset_neg0xFFFFFFFF:
 ; GFX1250-NOECC:       ; %bb.0:
+; GFX1250-NOECC-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NOECC-NEXT:    v_add_co_u32 v0, s0, 0x800000, s2
 ; GFX1250-NOECC-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-NOECC-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, s3, s0
@@ -317,6 +342,7 @@ define amdgpu_ps float @flat_load_saddr_i8_offset_neg0xFFFFFFFF(ptr inreg %sbase
 define amdgpu_ps float @flat_load_saddr_i8_offset_neg0x100000000(ptr inreg %sbase) {
 ; GFX1250-SDAG-LABEL: flat_load_saddr_i8_offset_neg0x100000000:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-SDAG-NEXT:    s_add_co_i32 s3, s3, -1
 ; GFX1250-SDAG-NEXT:    flat_load_u8 v0, v0, s[2:3]
@@ -325,6 +351,7 @@ define amdgpu_ps float @flat_load_saddr_i8_offset_neg0x100000000(ptr inreg %sbas
 ;
 ; GFX1250-GISEL-LABEL: flat_load_saddr_i8_offset_neg0x100000000:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    s_add_co_u32 s0, s2, 0
 ; GFX1250-GISEL-NEXT:    s_add_co_ci_u32 s1, s3, -1
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
@@ -335,6 +362,7 @@ define amdgpu_ps float @flat_load_saddr_i8_offset_neg0x100000000(ptr inreg %sbas
 ;
 ; GFX1250-NOECC-LABEL: flat_load_saddr_i8_offset_neg0x100000000:
 ; GFX1250-NOECC:       ; %bb.0:
+; GFX1250-NOECC-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NOECC-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NOECC-NEXT:    s_add_co_i32 s3, s3, -1
 ; GFX1250-NOECC-NEXT:    flat_load_u8 v0, v0, s[2:3]
@@ -350,6 +378,7 @@ define amdgpu_ps float @flat_load_saddr_i8_offset_neg0x100000000(ptr inreg %sbas
 define amdgpu_ps float @flat_load_saddr_i8_offset_neg0x100000001(ptr inreg %sbase) {
 ; GFX1250-SDAG-LABEL: flat_load_saddr_i8_offset_neg0x100000001:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_add_co_u32 v0, s0, 0, s2
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-SDAG-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, s3, s0
@@ -359,6 +388,7 @@ define amdgpu_ps float @flat_load_saddr_i8_offset_neg0x100000001(ptr inreg %sbas
 ;
 ; GFX1250-GISEL-LABEL: flat_load_saddr_i8_offset_neg0x100000001:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    s_add_co_u32 s0, s2, -1
 ; GFX1250-GISEL-NEXT:    s_add_co_ci_u32 s1, s3, -2
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
@@ -369,6 +399,7 @@ define amdgpu_ps float @flat_load_saddr_i8_offset_neg0x100000001(ptr inreg %sbas
 ;
 ; GFX1250-NOECC-LABEL: flat_load_saddr_i8_offset_neg0x100000001:
 ; GFX1250-NOECC:       ; %bb.0:
+; GFX1250-NOECC-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NOECC-NEXT:    v_add_co_u32 v0, s0, 0, s2
 ; GFX1250-NOECC-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-NOECC-NEXT:    v_add_co_ci_u32_e64 v1, null, -1, s3, s0
@@ -390,6 +421,7 @@ define amdgpu_ps float @flat_load_saddr_i8_offset_neg0x100000001(ptr inreg %sbas
 define amdgpu_ps float @flat_load_saddr_i8_zext_vgpr(ptr inreg %sbase, i32 %voffset) {
 ; GFX1250-LABEL: flat_load_saddr_i8_zext_vgpr:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    flat_load_u8 v0, v0, s[2:3]
 ; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    ; return to shader part epilog
@@ -405,6 +437,7 @@ define amdgpu_ps float @flat_load_saddr_i8_zext_vgpr(ptr inreg %sbase, i32 %voff
 define amdgpu_ps float @flat_load_saddr_i8_zext_vgpr_offset_8388607(ptr inreg %sbase, i32 %voffset) {
 ; GFX1250-LABEL: flat_load_saddr_i8_zext_vgpr_offset_8388607:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    flat_load_u8 v0, v0, s[2:3] offset:8388607
 ; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    ; return to shader part epilog
@@ -421,6 +454,7 @@ define amdgpu_ps float @flat_load_saddr_i8_zext_vgpr_offset_8388607(ptr inreg %s
 define amdgpu_ps float @flat_load_saddr_i8_zext_vgpr_offset_8388608(ptr inreg %sbase, i32 %voffset) {
 ; GFX1250-SDAG-LABEL: flat_load_saddr_i8_zext_vgpr_offset_8388608:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
@@ -433,6 +467,7 @@ define amdgpu_ps float @flat_load_saddr_i8_zext_vgpr_offset_8388608(ptr inreg %s
 ;
 ; GFX1250-GISEL-LABEL: flat_load_saddr_i8_zext_vgpr_offset_8388608:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1250-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v2, v0
@@ -446,6 +481,7 @@ define amdgpu_ps float @flat_load_saddr_i8_zext_vgpr_offset_8388608(ptr inreg %s
 ;
 ; GFX1250-NOECC-LABEL: flat_load_saddr_i8_zext_vgpr_offset_8388608:
 ; GFX1250-NOECC:       ; %bb.0:
+; GFX1250-NOECC-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NOECC-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1250-NOECC-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1250-NOECC-NEXT:    v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
@@ -468,6 +504,7 @@ define amdgpu_ps float @flat_load_saddr_i8_zext_vgpr_offset_8388608(ptr inreg %s
 define amdgpu_ps float @flat_load_saddr_i8_zext_vgpr_offset_neg8388608(ptr inreg %sbase, i32 %voffset) {
 ; GFX1250-LABEL: flat_load_saddr_i8_zext_vgpr_offset_neg8388608:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    flat_load_u8 v0, v0, s[2:3] offset:-8388608
 ; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    ; return to shader part epilog
@@ -484,6 +521,7 @@ define amdgpu_ps float @flat_load_saddr_i8_zext_vgpr_offset_neg8388608(ptr inreg
 define amdgpu_ps float @flat_load_saddr_i8_zext_vgpr_offset_neg8388607(ptr inreg %sbase, i32 %voffset) {
 ; GFX1250-LABEL: flat_load_saddr_i8_zext_vgpr_offset_neg8388607:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    flat_load_u8 v0, v0, s[2:3] offset:-8388607
 ; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    ; return to shader part epilog
@@ -499,6 +537,7 @@ define amdgpu_ps float @flat_load_saddr_i8_zext_vgpr_offset_neg8388607(ptr inreg
 define amdgpu_ps float @flat_load_saddr_i8_zext_vgpr_offset_8388607_gep_order(ptr inreg %sbase, i32 %voffset) {
 ; GFX1250-LABEL: flat_load_saddr_i8_zext_vgpr_offset_8388607_gep_order:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    flat_load_u8 v0, v0, s[2:3] offset:8388607
 ; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    ; return to shader part epilog
@@ -515,6 +554,7 @@ define amdgpu_ps float @flat_load_saddr_i8_zext_vgpr_offset_8388607_gep_order(pt
 define amdgpu_ps float @flat_load_saddr_i8_zext_vgpr_ptrtoint(ptr inreg %sbase, i32 %voffset) {
 ; GFX1250-LABEL: flat_load_saddr_i8_zext_vgpr_ptrtoint:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    flat_load_u8 v0, v0, s[2:3]
 ; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    ; return to shader part epilog
@@ -532,6 +572,7 @@ define amdgpu_ps float @flat_load_saddr_i8_zext_vgpr_ptrtoint(ptr inreg %sbase,
 define amdgpu_ps float @flat_load_saddr_i8_zext_vgpr_ptrtoint_commute_add(ptr inreg %sbase, i32 %voffset) {
 ; GFX1250-LABEL: flat_load_saddr_i8_zext_vgpr_ptrtoint_commute_add:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    flat_load_u8 v0, v0, s[2:3]
 ; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    ; return to shader part epilog
@@ -549,6 +590,7 @@ define amdgpu_ps float @flat_load_saddr_i8_zext_vgpr_ptrtoint_commute_add(ptr in
 define amdgpu_ps float @flat_load_saddr_i8_zext_vgpr_ptrtoint_commute_add_imm_offset0(ptr inreg %sbase, i32 %voffset) {
 ; GFX1250-LABEL: flat_load_saddr_i8_zext_vgpr_ptrtoint_commute_add_imm_offset0:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    flat_load_u8 v0, v0, s[2:3] offset:128
 ; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    ; return to shader part epilog
@@ -567,6 +609,7 @@ define amdgpu_ps float @flat_load_saddr_i8_zext_vgpr_ptrtoint_commute_add_imm_of
 define amdgpu_ps float @flat_load_saddr_i8_zext_vgpr_ptrtoint_commute_add_imm_offset1(ptr inreg %sbase, i32 %voffset) {
 ; GFX1250-LABEL: flat_load_saddr_i8_zext_vgpr_ptrtoint_commute_add_imm_offset1:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    flat_load_u8 v0, v0, s[2:3] offset:128
 ; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    ; return to shader part epilog
@@ -591,6 +634,7 @@ define amdgpu_ps float @flat_load_saddr_i8_zext_vgpr_ptrtoint_commute_add_imm_of
 define amdgpu_ps float @flat_load_saddr_uniform_ptr_in_vgprs(i32 %voffset) {
 ; GFX1250-SDAG-LABEL: flat_load_saddr_uniform_ptr_in_vgprs:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1250-SDAG-NEXT:    ds_load_b64 v[2:3], v1
 ; GFX1250-SDAG-NEXT:    s_wait_dscnt 0x0
@@ -602,6 +646,7 @@ define amdgpu_ps float @flat_load_saddr_uniform_ptr_in_vgprs(i32 %voffset) {
 ;
 ; GFX1250-GISEL-LABEL: flat_load_saddr_uniform_ptr_in_vgprs:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1250-GISEL-NEXT:    ds_load_b64 v[2:3], v1
 ; GFX1250-GISEL-NEXT:    s_wait_dscnt 0x0
@@ -614,6 +659,7 @@ define amdgpu_ps float @flat_load_saddr_uniform_ptr_in_vgprs(i32 %voffset) {
 ;
 ; GFX1250-NOECC-LABEL: flat_load_saddr_uniform_ptr_in_vgprs:
 ; GFX1250-NOECC:       ; %bb.0:
+; GFX1250-NOECC-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NOECC-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1250-NOECC-NEXT:    ds_load_b64 v[2:3], v1
 ; GFX1250-NOECC-NEXT:    s_wait_dscnt 0x0
@@ -635,6 +681,7 @@ define amdgpu_ps float @flat_load_saddr_uniform_ptr_in_vgprs(i32 %voffset) {
 define amdgpu_ps float @flat_load_saddr_uniform_ptr_in_vgprs_immoffset(i32 %voffset) {
 ; GFX1250-SDAG-LABEL: flat_load_saddr_uniform_ptr_in_vgprs_immoffset:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1250-SDAG-NEXT:    ds_load_b64 v[2:3], v1
 ; GFX1250-SDAG-NEXT:    s_wait_dscnt 0x0
@@ -646,6 +693,7 @@ define amdgpu_ps float @flat_load_saddr_uniform_ptr_in_vgprs_immoffset(i32 %voff
 ;
 ; GFX1250-GISEL-LABEL: flat_load_saddr_uniform_ptr_in_vgprs_immoffset:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1250-GISEL-NEXT:    ds_load_b64 v[2:3], v1
 ; GFX1250-GISEL-NEXT:    s_wait_dscnt 0x0
@@ -658,6 +706,7 @@ define amdgpu_ps float @flat_load_saddr_uniform_ptr_in_vgprs_immoffset(i32 %voff
 ;
 ; GFX1250-NOECC-LABEL: flat_load_saddr_uniform_ptr_in_vgprs_immoffset:
 ; GFX1250-NOECC:       ; %bb.0:
+; GFX1250-NOECC-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NOECC-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1250-NOECC-NEXT:    ds_load_b64 v[2:3], v1
 ; GFX1250-NOECC-NEXT:    s_wait_dscnt 0x0
@@ -680,6 +729,7 @@ define amdgpu_ps float @flat_load_saddr_uniform_ptr_in_vgprs_immoffset(i32 %voff
 define amdgpu_ps float @flat_load_saddr_i8_zext_uniform_offset(ptr inreg %sbase, i32 inreg %soffset) {
 ; GFX1250-SDAG-LABEL: flat_load_saddr_i8_zext_uniform_offset:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX1250-SDAG-NEXT:    flat_load_u8 v0, v0, s[2:3]
 ; GFX1250-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
@@ -687,6 +737,7 @@ define amdgpu_ps float @flat_load_saddr_i8_zext_uniform_offset(ptr inreg %sbase,
 ;
 ; GFX1250-GISEL-LABEL: flat_load_saddr_i8_zext_uniform_offset:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-GISEL-NEXT:    s_add_co_u32 s0, s2, s4
 ; GFX1250-GISEL-NEXT:    s_add_co_ci_u32 s1, s3, 0
@@ -696,6 +747,7 @@ define amdgpu_ps float @flat_load_saddr_i8_zext_uniform_offset(ptr inreg %sbase,
 ;
 ; GFX1250-NOECC-LABEL: flat_load_saddr_i8_zext_uniform_offset:
 ; GFX1250-NOECC:       ; %bb.0:
+; GFX1250-NOECC-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NOECC-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX1250-NOECC-NEXT:    flat_load_u8 v0, v0, s[2:3]
 ; GFX1250-NOECC-NEXT:    s_wait_loadcnt_dscnt 0x0
@@ -712,6 +764,7 @@ define amdgpu_ps float @flat_load_saddr_i8_zext_uniform_offset(ptr inreg %sbase,
 define amdgpu_ps float @flat_load_saddr_i8_zext_uniform_offset_immoffset(ptr inreg %sbase, i32 inreg %soffset) {
 ; GFX1250-SDAG-LABEL: flat_load_saddr_i8_zext_uniform_offset_immoffset:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX1250-SDAG-NEXT:    flat_load_u8 v0, v0, s[2:3] offset:-24
 ; GFX1250-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
@@ -719,6 +772,7 @@ define amdgpu_ps float @flat_load_saddr_i8_zext_uniform_offset_immoffset(ptr inr
 ;
 ; GFX1250-GISEL-LABEL: flat_load_saddr_i8_zext_uniform_offset_immoffset:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-GISEL-NEXT:    s_add_co_u32 s0, s2, s4
 ; GFX1250-GISEL-NEXT:    s_add_co_ci_u32 s1, s3, 0
@@ -728,6 +782,7 @@ define amdgpu_ps float @flat_load_saddr_i8_zext_uniform_offset_immoffset(ptr inr
 ;
 ; GFX1250-NOECC-LABEL: flat_load_saddr_i8_zext_uniform_offset_immoffset:
 ; GFX1250-NOECC:       ; %bb.0:
+; GFX1250-NOECC-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NOECC-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX1250-NOECC-NEXT:    flat_load_u8 v0, v0, s[2:3] offset:-24
 ; GFX1250-NOECC-NEXT:    s_wait_loadcnt_dscnt 0x0
@@ -745,6 +800,7 @@ define amdgpu_ps float @flat_load_saddr_i8_zext_uniform_offset_immoffset(ptr inr
 define amdgpu_ps float @flat_load_saddr_i8_zext_sgpr_ptrtoint_commute_add(ptr inreg %sbase, i32 inreg %soffset) {
 ; GFX1250-SDAG-LABEL: flat_load_saddr_i8_zext_sgpr_ptrtoint_commute_add:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX1250-SDAG-NEXT:    flat_load_u8 v0, v0, s[2:3]
 ; GFX1250-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
@@ -752,6 +808,7 @@ define amdgpu_ps float @flat_load_saddr_i8_zext_sgpr_ptrtoint_commute_add(ptr in
 ;
 ; GFX1250-GISEL-LABEL: flat_load_saddr_i8_zext_sgpr_ptrtoint_commute_add:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-GISEL-NEXT:    s_add_co_u32 s0, s2, s4
 ; GFX1250-GISEL-NEXT:    s_add_co_ci_u32 s1, s3, 0
@@ -761,6 +818,7 @@ define amdgpu_ps float @flat_load_saddr_i8_zext_sgpr_ptrtoint_commute_add(ptr in
 ;
 ; GFX1250-NOECC-LABEL: flat_load_saddr_i8_zext_sgpr_ptrtoint_commute_add:
 ; GFX1250-NOECC:       ; %bb.0:
+; GFX1250-NOECC-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NOECC-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX1250-NOECC-NEXT:    flat_load_u8 v0, v0, s[2:3]
 ; GFX1250-NOECC-NEXT:    s_wait_loadcnt_dscnt 0x0
@@ -779,6 +837,7 @@ define amdgpu_ps float @flat_load_saddr_i8_zext_sgpr_ptrtoint_commute_add(ptr in
 define amdgpu_ps float @flat_load_saddr_i8_zext_sgpr_ptrtoint_commute_add_imm_offset0(ptr inreg %sbase, i32 inreg %soffset) {
 ; GFX1250-SDAG-LABEL: flat_load_saddr_i8_zext_sgpr_ptrtoint_commute_add_imm_offset0:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX1250-SDAG-NEXT:    flat_load_u8 v0, v0, s[2:3] offset:128
 ; GFX1250-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
@@ -786,6 +845,7 @@ define amdgpu_ps float @flat_load_saddr_i8_zext_sgpr_ptrtoint_commute_add_imm_of
 ;
 ; GFX1250-GISEL-LABEL: flat_load_saddr_i8_zext_sgpr_ptrtoint_commute_add_imm_offset0:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-GISEL-NEXT:    s_add_co_u32 s0, s2, s4
 ; GFX1250-GISEL-NEXT:    s_add_co_ci_u32 s1, s3, 0
@@ -795,6 +855,7 @@ define amdgpu_ps float @flat_load_saddr_i8_zext_sgpr_ptrtoint_commute_add_imm_of
 ;
 ; GFX1250-NOECC-LABEL: flat_load_saddr_i8_zext_sgpr_ptrtoint_commute_add_imm_offset0:
 ; GFX1250-NOECC:       ; %bb.0:
+; GFX1250-NOECC-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NOECC-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX1250-NOECC-NEXT:    flat_load_u8 v0, v0, s[2:3] offset:128
 ; GFX1250-NOECC-NEXT:    s_wait_loadcnt_dscnt 0x0
@@ -814,6 +875,7 @@ define amdgpu_ps float @flat_load_saddr_i8_zext_sgpr_ptrtoint_commute_add_imm_of
 define amdgpu_ps float @flat_load_i8_vgpr64_sgpr32(ptr %vbase, i32 inreg %soffset) {
 ; GFX1250-SDAG-LABEL: flat_load_i8_vgpr64_sgpr32:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    s_mov_b32 s3, 0
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
@@ -823,6 +885,7 @@ define amdgpu_ps float @flat_load_i8_vgpr64_sgpr32(ptr %vbase, i32 inreg %soffse
 ;
 ; GFX1250-GISEL-LABEL: flat_load_i8_vgpr64_sgpr32:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    s_mov_b32 s3, 0
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
@@ -835,6 +898,7 @@ define amdgpu_ps float @flat_load_i8_vgpr64_sgpr32(ptr %vbase, i32 inreg %soffse
 ;
 ; GFX1250-NOECC-LABEL: flat_load_i8_vgpr64_sgpr32:
 ; GFX1250-NOECC:       ; %bb.0:
+; GFX1250-NOECC-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NOECC-NEXT:    s_mov_b32 s3, 0
 ; GFX1250-NOECC-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1250-NOECC-NEXT:    v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
@@ -853,6 +917,7 @@ define amdgpu_ps float @flat_load_i8_vgpr64_sgpr32(ptr %vbase, i32 inreg %soffse
 define amdgpu_ps float @flat_load_i8_vgpr64_sgpr32_offset_8388607(ptr %vbase, i32 inreg %soffset) {
 ; GFX1250-SDAG-LABEL: flat_load_i8_vgpr64_sgpr32_offset_8388607:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    s_mov_b32 s3, 0
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
@@ -862,6 +927,7 @@ define amdgpu_ps float @flat_load_i8_vgpr64_sgpr32_offset_8388607(ptr %vbase, i3
 ;
 ; GFX1250-GISEL-LABEL: flat_load_i8_vgpr64_sgpr32_offset_8388607:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    s_mov_b32 s3, 0
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[2:3]
@@ -874,6 +940,7 @@ define amdgpu_ps float @flat_load_i8_vgpr64_sgpr32_offset_8388607(ptr %vbase, i3
 ;
 ; GFX1250-NOECC-LABEL: flat_load_i8_vgpr64_sgpr32_offset_8388607:
 ; GFX1250-NOECC:       ; %bb.0:
+; GFX1250-NOECC-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NOECC-NEXT:    s_mov_b32 s3, 0
 ; GFX1250-NOECC-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1250-NOECC-NEXT:    v_add_nc_u64_e32 v[0:1], s[2:3], v[0:1]
@@ -897,6 +964,7 @@ define amdgpu_ps float @flat_load_i8_vgpr64_sgpr32_offset_8388607(ptr %vbase, i3
 define amdgpu_ps float @flat_load_saddr_f32_natural_addressing(ptr inreg %sbase, ptr %voffset.ptr) {
 ; GFX1250-LABEL: flat_load_saddr_f32_natural_addressing:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    flat_load_b32 v0, v[0:1]
 ; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    flat_load_b32 v0, v0, s[2:3] scale_offset
@@ -913,6 +981,7 @@ define amdgpu_ps float @flat_load_saddr_f32_natural_addressing(ptr inreg %sbase,
 define amdgpu_ps float @flat_load_saddr_f32_natural_addressing_immoffset(ptr inreg %sbase, ptr %voffset.ptr) {
 ; GFX1250-LABEL: flat_load_saddr_f32_natural_addressing_immoffset:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    flat_load_b32 v0, v[0:1]
 ; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    flat_load_b32 v0, v0, s[2:3] offset:128
@@ -930,6 +999,7 @@ define amdgpu_ps float @flat_load_saddr_f32_natural_addressing_immoffset(ptr inr
 define amdgpu_ps float @flat_load_f32_saddr_zext_vgpr_range(ptr inreg %sbase, ptr %voffset.ptr) {
 ; GFX1250-LABEL: flat_load_f32_saddr_zext_vgpr_range:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    flat_load_b32 v0, v[0:1]
 ; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    flat_load_b32 v0, v0, s[2:3] scale_offset
@@ -946,6 +1016,7 @@ define amdgpu_ps float @flat_load_f32_saddr_zext_vgpr_range(ptr inreg %sbase, pt
 define amdgpu_ps float @flat_load_f32_saddr_zext_vgpr_range_imm_offset(ptr inreg %sbase, ptr %voffset.ptr) {
 ; GFX1250-LABEL: flat_load_f32_saddr_zext_vgpr_range_imm_offset:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    flat_load_b32 v0, v[0:1]
 ; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    flat_load_b32 v0, v0, s[2:3] offset:400 scale_offset
@@ -963,6 +1034,7 @@ define amdgpu_ps float @flat_load_f32_saddr_zext_vgpr_range_imm_offset(ptr inreg
 define amdgpu_ps float @flat_load_f32_saddr_zext_vgpr_range_too_large(ptr inreg %sbase, ptr %voffset.ptr) {
 ; GFX1250-LABEL: flat_load_f32_saddr_zext_vgpr_range_too_large:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    flat_load_b32 v0, v[0:1]
 ; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    flat_load_b32 v0, v0, s[2:3] scale_offset
@@ -982,24 +1054,28 @@ define amdgpu_ps float @flat_load_f32_saddr_zext_vgpr_range_too_large(ptr inreg
 define amdgpu_ps half @flat_load_saddr_i16(ptr inreg %sbase, i32 %voffset) {
 ; GFX1250-SDAG-LABEL: flat_load_saddr_i16:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    flat_load_u16 v0, v0, s[2:3]
 ; GFX1250-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-SDAG-NEXT:    ; return to shader part epilog
 ;
 ; GFX1250-GISEL-LABEL: flat_load_saddr_i16:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    flat_load_u16 v0, v0, s[2:3]
 ; GFX1250-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-GISEL-NEXT:    ; return to shader part epilog
 ;
 ; GFX1250-NOECC-SDAG-TRUE16-LABEL: flat_load_saddr_i16:
 ; GFX1250-NOECC-SDAG-TRUE16:       ; %bb.0:
+; GFX1250-NOECC-SDAG-TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NOECC-SDAG-TRUE16-NEXT:    flat_load_d16_b16 v0, v0, s[2:3]
 ; GFX1250-NOECC-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NOECC-SDAG-TRUE16-NEXT:    ; return to shader part epilog
 ;
 ; GFX1250-NOECC-SDAG-FAKE16-LABEL: flat_load_saddr_i16:
 ; GFX1250-NOECC-SDAG-FAKE16:       ; %bb.0:
+; GFX1250-NOECC-SDAG-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NOECC-SDAG-FAKE16-NEXT:    flat_load_u16 v0, v0, s[2:3]
 ; GFX1250-NOECC-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NOECC-SDAG-FAKE16-NEXT:    ; return to shader part epilog
@@ -1013,24 +1089,28 @@ define amdgpu_ps half @flat_load_saddr_i16(ptr inreg %sbase, i32 %voffset) {
 define amdgpu_ps half @flat_load_saddr_i16_immneg128(ptr inreg %sbase, i32 %voffset) {
 ; GFX1250-SDAG-LABEL: flat_load_saddr_i16_immneg128:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    flat_load_u16 v0, v0, s[2:3] offset:-128
 ; GFX1250-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-SDAG-NEXT:    ; return to shader part epilog
 ;
 ; GFX1250-GISEL-LABEL: flat_load_saddr_i16_immneg128:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    flat_load_u16 v0, v0, s[2:3] offset:-128
 ; GFX1250-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-GISEL-NEXT:    ; return to shader part epilog
 ;
 ; GFX1250-NOECC-SDAG-TRUE16-LABEL: flat_load_saddr_i16_immneg128:
 ; GFX1250-NOECC-SDAG-TRUE16:       ; %bb.0:
+; GFX1250-NOECC-SDAG-TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NOECC-SDAG-TRUE16-NEXT:    flat_load_d16_b16 v0, v0, s[2:3] offset:-128
 ; GFX1250-NOECC-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NOECC-SDAG-TRUE16-NEXT:    ; return to shader part epilog
 ;
 ; GFX1250-NOECC-SDAG-FAKE16-LABEL: flat_load_saddr_i16_immneg128:
 ; GFX1250-NOECC-SDAG-FAKE16:       ; %bb.0:
+; GFX1250-NOECC-SDAG-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NOECC-SDAG-FAKE16-NEXT:    flat_load_u16 v0, v0, s[2:3] offset:-128
 ; GFX1250-NOECC-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NOECC-SDAG-FAKE16-NEXT:    ; return to shader part epilog
@@ -1045,24 +1125,28 @@ define amdgpu_ps half @flat_load_saddr_i16_immneg128(ptr inreg %sbase, i32 %voff
 define amdgpu_ps half @flat_load_saddr_f16(ptr inreg %sbase, i32 %voffset) {
 ; GFX1250-SDAG-LABEL: flat_load_saddr_f16:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    flat_load_u16 v0, v0, s[2:3]
 ; GFX1250-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-SDAG-NEXT:    ; return to shader part epilog
 ;
 ; GFX1250-GISEL-LABEL: flat_load_saddr_f16:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    flat_load_u16 v0, v0, s[2:3]
 ; GFX1250-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-GISEL-NEXT:    ; return to shader part epilog
 ;
 ; GFX1250-NOECC-SDAG-TRUE16-LABEL: flat_load_saddr_f16:
 ; GFX1250-NOECC-SDAG-TRUE16:       ; %bb.0:
+; GFX1250-NOECC-SDAG-TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NOECC-SDAG-TRUE16-NEXT:    flat_load_d16_b16 v0, v0, s[2:3]
 ; GFX1250-NOECC-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NOECC-SDAG-TRUE16-NEXT:    ; return to shader part epilog
 ;
 ; GFX1250-NOECC-SDAG-FAKE16-LABEL: flat_load_saddr_f16:
 ; GFX1250-NOECC-SDAG-FAKE16:       ; %bb.0:
+; GFX1250-NOECC-SDAG-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NOECC-SDAG-FAKE16-NEXT:    flat_load_u16 v0, v0, s[2:3]
 ; GFX1250-NOECC-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NOECC-SDAG-FAKE16-NEXT:    ; return to shader part epilog
@@ -1075,24 +1159,28 @@ define amdgpu_ps half @flat_load_saddr_f16(ptr inreg %sbase, i32 %voffset) {
 define amdgpu_ps half @flat_load_saddr_f16_immneg128(ptr inreg %sbase, i32 %voffset) {
 ; GFX1250-SDAG-LABEL: flat_load_saddr_f16_immneg128:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    flat_load_u16 v0, v0, s[2:3] offset:-128
 ; GFX1250-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-SDAG-NEXT:    ; return to shader part epilog
 ;
 ; GFX1250-GISEL-LABEL: flat_load_saddr_f16_immneg128:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    flat_load_u16 v0, v0, s[2:3] offset:-128
 ; GFX1250-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-GISEL-NEXT:    ; return to shader part epilog
 ;
 ; GFX1250-NOECC-SDAG-TRUE16-LABEL: flat_load_saddr_f16_immneg128:
 ; GFX1250-NOECC-SDAG-TRUE16:       ; %bb.0:
+; GFX1250-NOECC-SDAG-TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NOECC-SDAG-TRUE16-NEXT:    flat_load_d16_b16 v0, v0, s[2:3] offset:-128
 ; GFX1250-NOECC-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NOECC-SDAG-TRUE16-NEXT:    ; return to shader part epilog
 ;
 ; GFX1250-NOECC-SDAG-FAKE16-LABEL: flat_load_saddr_f16_immneg128:
 ; GFX1250-NOECC-SDAG-FAKE16:       ; %bb.0:
+; GFX1250-NOECC-SDAG-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NOECC-SDAG-FAKE16-NEXT:    flat_load_u16 v0, v0, s[2:3] offset:-128
 ; GFX1250-NOECC-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NOECC-SDAG-FAKE16-NEXT:    ; return to shader part epilog
@@ -1106,6 +1194,7 @@ define amdgpu_ps half @flat_load_saddr_f16_immneg128(ptr inreg %sbase, i32 %voff
 define amdgpu_ps float @flat_load_saddr_i32(ptr inreg %sbase, i32 %voffset) {
 ; GFX1250-LABEL: flat_load_saddr_i32:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    flat_load_b32 v0, v0, s[2:3]
 ; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    ; return to shader part epilog
@@ -1119,6 +1208,7 @@ define amdgpu_ps float @flat_load_saddr_i32(ptr inreg %sbase, i32 %voffset) {
 define amdgpu_ps float @flat_load_saddr_i32_immneg128(ptr inreg %sbase, i32 %voffset) {
 ; GFX1250-LABEL: flat_load_saddr_i32_immneg128:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    flat_load_b32 v0, v0, s[2:3] offset:-128
 ; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    ; return to shader part epilog
@@ -1133,6 +1223,7 @@ define amdgpu_ps float @flat_load_saddr_i32_immneg128(ptr inreg %sbase, i32 %vof
 define amdgpu_ps float @flat_load_saddr_f32(ptr inreg %sbase, i32 %voffset) {
 ; GFX1250-LABEL: flat_load_saddr_f32:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    flat_load_b32 v0, v0, s[2:3]
 ; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    ; return to shader part epilog
@@ -1145,6 +1236,7 @@ define amdgpu_ps float @flat_load_saddr_f32(ptr inreg %sbase, i32 %voffset) {
 define amdgpu_ps float @flat_load_saddr_f32_immneg128(ptr inreg %sbase, i32 %voffset) {
 ; GFX1250-LABEL: flat_load_saddr_f32_immneg128:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    flat_load_b32 v0, v0, s[2:3] offset:-128
 ; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    ; return to shader part epilog
@@ -1158,6 +1250,7 @@ define amdgpu_ps float @flat_load_saddr_f32_immneg128(ptr inreg %sbase, i32 %vof
 define amdgpu_ps <2 x half> @flat_load_saddr_v2i16(ptr inreg %sbase, i32 %voffset) {
 ; GFX1250-LABEL: flat_load_saddr_v2i16:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    flat_load_b32 v0, v0, s[2:3]
 ; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    ; return to shader part epilog
@@ -1171,6 +1264,7 @@ define amdgpu_ps <2 x half> @flat_load_saddr_v2i16(ptr inreg %sbase, i32 %voffse
 define amdgpu_ps <2 x half> @flat_load_saddr_v2i16_immneg128(ptr inreg %sbase, i32 %voffset) {
 ; GFX1250-LABEL: flat_load_saddr_v2i16_immneg128:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    flat_load_b32 v0, v0, s[2:3] offset:-128
 ; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    ; return to shader part epilog
@@ -1185,6 +1279,7 @@ define amdgpu_ps <2 x half> @flat_load_saddr_v2i16_immneg128(ptr inreg %sbase, i
 define amdgpu_ps <2 x half> @flat_load_saddr_v2f16(ptr inreg %sbase, i32 %voffset) {
 ; GFX1250-LABEL: flat_load_saddr_v2f16:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    flat_load_b32 v0, v0, s[2:3]
 ; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    ; return to shader part epilog
@@ -1197,6 +1292,7 @@ define amdgpu_ps <2 x half> @flat_load_saddr_v2f16(ptr inreg %sbase, i32 %voffse
 define amdgpu_ps <2 x half> @flat_load_saddr_v2f16_immneg128(ptr inreg %sbase, i32 %voffset) {
 ; GFX1250-LABEL: flat_load_saddr_v2f16_immneg128:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    flat_load_b32 v0, v0, s[2:3] offset:-128
 ; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    ; return to shader part epilog
@@ -1210,6 +1306,7 @@ define amdgpu_ps <2 x half> @flat_load_saddr_v2f16_immneg128(ptr inreg %sbase, i
 define amdgpu_ps <2 x half> @flat_load_saddr_p3(ptr inreg %sbase, i32 %voffset) {
 ; GFX1250-LABEL: flat_load_saddr_p3:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    flat_load_b32 v0, v0, s[2:3]
 ; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    ; return to shader part epilog
@@ -1224,6 +1321,7 @@ define amdgpu_ps <2 x half> @flat_load_saddr_p3(ptr inreg %sbase, i32 %voffset)
 define amdgpu_ps <2 x half> @flat_load_saddr_p3_immneg128(ptr inreg %sbase, i32 %voffset) {
 ; GFX1250-LABEL: flat_load_saddr_p3_immneg128:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    flat_load_b32 v0, v0, s[2:3] offset:-128
 ; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    ; return to shader part epilog
@@ -1239,6 +1337,7 @@ define amdgpu_ps <2 x half> @flat_load_saddr_p3_immneg128(ptr inreg %sbase, i32
 define amdgpu_ps <2 x float> @flat_load_saddr_f64(ptr inreg %sbase, i32 %voffset) {
 ; GFX1250-LABEL: flat_load_saddr_f64:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    flat_load_b64 v[0:1], v0, s[2:3]
 ; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    ; return to shader part epilog
@@ -1252,6 +1351,7 @@ define amdgpu_ps <2 x float> @flat_load_saddr_f64(ptr inreg %sbase, i32 %voffset
 define amdgpu_ps <2 x float> @flat_load_saddr_f64_immneg128(ptr inreg %sbase, i32 %voffset) {
 ; GFX1250-LABEL: flat_load_saddr_f64_immneg128:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    flat_load_b64 v[0:1], v0, s[2:3] offset:-128
 ; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    ; return to shader part epilog
@@ -1266,6 +1366,7 @@ define amdgpu_ps <2 x float> @flat_load_saddr_f64_immneg128(ptr inreg %sbase, i3
 define amdgpu_ps <2 x float> @flat_load_saddr_i64(ptr inreg %sbase, i32 %voffset) {
 ; GFX1250-LABEL: flat_load_saddr_i64:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    flat_load_b64 v[0:1], v0, s[2:3]
 ; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    ; return to shader part epilog
@@ -1279,6 +1380,7 @@ define amdgpu_ps <2 x float> @flat_load_saddr_i64(ptr inreg %sbase, i32 %voffset
 define amdgpu_ps <2 x float> @flat_load_saddr_i64_immneg128(ptr inreg %sbase, i32 %voffset) {
 ; GFX1250-LABEL: flat_load_saddr_i64_immneg128:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    flat_load_b64 v[0:1], v0, s[2:3] offset:-128
 ; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    ; return to shader part epilog
@@ -1293,6 +1395,7 @@ define amdgpu_ps <2 x float> @flat_load_saddr_i64_immneg128(ptr inreg %sbase, i3
 define amdgpu_ps <2 x float> @flat_load_saddr_v2f32(ptr inreg %sbase, i32 %voffset) {
 ; GFX1250-LABEL: flat_load_saddr_v2f32:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    flat_load_b64 v[0:1], v0, s[2:3]
 ; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    ; return to shader part epilog
@@ -1305,6 +1408,7 @@ define amdgpu_ps <2 x float> @flat_load_saddr_v2f32(ptr inreg %sbase, i32 %voffs
 define amdgpu_ps <2 x float> @flat_load_saddr_v2f32_immneg128(ptr inreg %sbase, i32 %voffset) {
 ; GFX1250-LABEL: flat_load_saddr_v2f32_immneg128:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    flat_load_b64 v[0:1], v0, s[2:3] offset:-128
 ; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    ; return to shader part epilog
@@ -1318,6 +1422,7 @@ define amdgpu_ps <2 x float> @flat_load_saddr_v2f32_immneg128(ptr inreg %sbase,
 define amdgpu_ps <2 x float> @flat_load_saddr_v2i32(ptr inreg %sbase, i32 %voffset) {
 ; GFX1250-LABEL: flat_load_saddr_v2i32:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    flat_load_b64 v[0:1], v0, s[2:3]
 ; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    ; return to shader part epilog
@@ -1331,6 +1436,7 @@ define amdgpu_ps <2 x float> @flat_load_saddr_v2i32(ptr inreg %sbase, i32 %voffs
 define amdgpu_ps <2 x float> @flat_load_saddr_v2i32_immneg128(ptr inreg %sbase, i32 %voffset) {
 ; GFX1250-LABEL: flat_load_saddr_v2i32_immneg128:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    flat_load_b64 v[0:1], v0, s[2:3] offset:-128
 ; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    ; return to shader part epilog
@@ -1345,6 +1451,7 @@ define amdgpu_ps <2 x float> @flat_load_saddr_v2i32_immneg128(ptr inreg %sbase,
 define amdgpu_ps <2 x float> @flat_load_saddr_v4i16(ptr inreg %sbase, i32 %voffset) {
 ; GFX1250-LABEL: flat_load_saddr_v4i16:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    flat_load_b64 v[0:1], v0, s[2:3]
 ; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    ; return to shader part epilog
@@ -1358,6 +1465,7 @@ define amdgpu_ps <2 x float> @flat_load_saddr_v4i16(ptr inreg %sbase, i32 %voffs
 define amdgpu_ps <2 x float> @flat_load_saddr_v4i16_immneg128(ptr inreg %sbase, i32 %voffset) {
 ; GFX1250-LABEL: flat_load_saddr_v4i16_immneg128:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    flat_load_b64 v[0:1], v0, s[2:3] offset:-128
 ; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    ; return to shader part epilog
@@ -1372,6 +1480,7 @@ define amdgpu_ps <2 x float> @flat_load_saddr_v4i16_immneg128(ptr inreg %sbase,
 define amdgpu_ps <2 x float> @flat_load_saddr_v4f16(ptr inreg %sbase, i32 %voffset) {
 ; GFX1250-LABEL: flat_load_saddr_v4f16:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    flat_load_b64 v[0:1], v0, s[2:3]
 ; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    ; return to shader part epilog
@@ -1385,6 +1494,7 @@ define amdgpu_ps <2 x float> @flat_load_saddr_v4f16(ptr inreg %sbase, i32 %voffs
 define amdgpu_ps <2 x float> @flat_load_saddr_v4f16_immneg128(ptr inreg %sbase, i32 %voffset) {
 ; GFX1250-LABEL: flat_load_saddr_v4f16_immneg128:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    flat_load_b64 v[0:1], v0, s[2:3] offset:-128
 ; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    ; return to shader part epilog
@@ -1399,6 +1509,7 @@ define amdgpu_ps <2 x float> @flat_load_saddr_v4f16_immneg128(ptr inreg %sbase,
 define amdgpu_ps <2 x float> @flat_load_saddr_p1(ptr inreg %sbase, i32 %voffset) {
 ; GFX1250-LABEL: flat_load_saddr_p1:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    flat_load_b64 v[0:1], v0, s[2:3]
 ; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    ; return to shader part epilog
@@ -1413,6 +1524,7 @@ define amdgpu_ps <2 x float> @flat_load_saddr_p1(ptr inreg %sbase, i32 %voffset)
 define amdgpu_ps <2 x float> @flat_load_saddr_p1_immneg128(ptr inreg %sbase, i32 %voffset) {
 ; GFX1250-LABEL: flat_load_saddr_p1_immneg128:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    flat_load_b64 v[0:1], v0, s[2:3] offset:-128
 ; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    ; return to shader part epilog
@@ -1428,6 +1540,7 @@ define amdgpu_ps <2 x float> @flat_load_saddr_p1_immneg128(ptr inreg %sbase, i32
 define amdgpu_ps <3 x float> @flat_load_saddr_v3f32(ptr inreg %sbase, i32 %voffset) {
 ; GFX1250-LABEL: flat_load_saddr_v3f32:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    flat_load_b96 v[0:2], v0, s[2:3]
 ; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    ; return to shader part epilog
@@ -1440,6 +1553,7 @@ define amdgpu_ps <3 x float> @flat_load_saddr_v3f32(ptr inreg %sbase, i32 %voffs
 define amdgpu_ps <3 x float> @flat_load_saddr_v3f32_immneg128(ptr inreg %sbase, i32 %voffset) {
 ; GFX1250-LABEL: flat_load_saddr_v3f32_immneg128:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    flat_load_b96 v[0:2], v0, s[2:3] offset:-128
 ; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    ; return to shader part epilog
@@ -1453,6 +1567,7 @@ define amdgpu_ps <3 x float> @flat_load_saddr_v3f32_immneg128(ptr inreg %sbase,
 define amdgpu_ps <3 x float> @flat_load_saddr_v3i32(ptr inreg %sbase, i32 %voffset) {
 ; GFX1250-LABEL: flat_load_saddr_v3i32:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    flat_load_b96 v[0:2], v0, s[2:3]
 ; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    ; return to shader part epilog
@@ -1466,6 +1581,7 @@ define amdgpu_ps <3 x float> @flat_load_saddr_v3i32(ptr inreg %sbase, i32 %voffs
 define amdgpu_ps <3 x float> @flat_load_saddr_v3i32_immneg128(ptr inreg %sbase, i32 %voffset) {
 ; GFX1250-LABEL: flat_load_saddr_v3i32_immneg128:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    flat_load_b96 v[0:2], v0, s[2:3] offset:-128
 ; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    ; return to shader part epilog
@@ -1480,6 +1596,7 @@ define amdgpu_ps <3 x float> @flat_load_saddr_v3i32_immneg128(ptr inreg %sbase,
 define amdgpu_ps <6 x half> @flat_load_saddr_v6f16(ptr inreg %sbase, i32 %voffset) {
 ; GFX1250-LABEL: flat_load_saddr_v6f16:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    flat_load_b96 v[0:2], v0, s[2:3]
 ; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    ; return to shader part epilog
@@ -1492,6 +1609,7 @@ define amdgpu_ps <6 x half> @flat_load_saddr_v6f16(ptr inreg %sbase, i32 %voffse
 define amdgpu_ps <6 x half> @flat_load_saddr_v6f16_immneg128(ptr inreg %sbase, i32 %voffset) {
 ; GFX1250-LABEL: flat_load_saddr_v6f16_immneg128:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    flat_load_b96 v[0:2], v0, s[2:3] offset:-128
 ; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    ; return to shader part epilog
@@ -1505,6 +1623,7 @@ define amdgpu_ps <6 x half> @flat_load_saddr_v6f16_immneg128(ptr inreg %sbase, i
 define amdgpu_ps <4 x float> @flat_load_saddr_v4f32(ptr inreg %sbase, i32 %voffset) {
 ; GFX1250-LABEL: flat_load_saddr_v4f32:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    flat_load_b128 v[0:3], v0, s[2:3]
 ; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    ; return to shader part epilog
@@ -1517,6 +1636,7 @@ define amdgpu_ps <4 x float> @flat_load_saddr_v4f32(ptr inreg %sbase, i32 %voffs
 define amdgpu_ps <4 x float> @flat_load_saddr_v4f32_immneg128(ptr inreg %sbase, i32 %voffset) {
 ; GFX1250-LABEL: flat_load_saddr_v4f32_immneg128:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    flat_load_b128 v[0:3], v0, s[2:3] offset:-128
 ; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    ; return to shader part epilog
@@ -1530,6 +1650,7 @@ define amdgpu_ps <4 x float> @flat_load_saddr_v4f32_immneg128(ptr inreg %sbase,
 define amdgpu_ps <4 x float> @flat_load_saddr_v4i32(ptr inreg %sbase, i32 %voffset) {
 ; GFX1250-LABEL: flat_load_saddr_v4i32:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    flat_load_b128 v[0:3], v0, s[2:3]
 ; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    ; return to shader part epilog
@@ -1543,6 +1664,7 @@ define amdgpu_ps <4 x float> @flat_load_saddr_v4i32(ptr inreg %sbase, i32 %voffs
 define amdgpu_ps <4 x float> @flat_load_saddr_v4i32_immneg128(ptr inreg %sbase, i32 %voffset) {
 ; GFX1250-LABEL: flat_load_saddr_v4i32_immneg128:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    flat_load_b128 v[0:3], v0, s[2:3] offset:-128
 ; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    ; return to shader part epilog
@@ -1557,6 +1679,7 @@ define amdgpu_ps <4 x float> @flat_load_saddr_v4i32_immneg128(ptr inreg %sbase,
 define amdgpu_ps <4 x float> @flat_load_saddr_v2i64(ptr inreg %sbase, i32 %voffset) {
 ; GFX1250-LABEL: flat_load_saddr_v2i64:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    flat_load_b128 v[0:3], v0, s[2:3]
 ; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    ; return to shader part epilog
@@ -1570,6 +1693,7 @@ define amdgpu_ps <4 x float> @flat_load_saddr_v2i64(ptr inreg %sbase, i32 %voffs
 define amdgpu_ps <4 x float> @flat_load_saddr_v2i64_immneg128(ptr inreg %sbase, i32 %voffset) {
 ; GFX1250-LABEL: flat_load_saddr_v2i64_immneg128:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    flat_load_b128 v[0:3], v0, s[2:3] offset:-128
 ; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    ; return to shader part epilog
@@ -1584,6 +1708,7 @@ define amdgpu_ps <4 x float> @flat_load_saddr_v2i64_immneg128(ptr inreg %sbase,
 define amdgpu_ps <4 x float> @flat_load_saddr_i128(ptr inreg %sbase, i32 %voffset) {
 ; GFX1250-LABEL: flat_load_saddr_i128:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    flat_load_b128 v[0:3], v0, s[2:3]
 ; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    ; return to shader part epilog
@@ -1597,6 +1722,7 @@ define amdgpu_ps <4 x float> @flat_load_saddr_i128(ptr inreg %sbase, i32 %voffse
 define amdgpu_ps <4 x float> @flat_load_saddr_i128_immneg128(ptr inreg %sbase, i32 %voffset) {
 ; GFX1250-LABEL: flat_load_saddr_i128_immneg128:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    flat_load_b128 v[0:3], v0, s[2:3] offset:-128
 ; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    ; return to shader part epilog
@@ -1611,6 +1737,7 @@ define amdgpu_ps <4 x float> @flat_load_saddr_i128_immneg128(ptr inreg %sbase, i
 define amdgpu_ps <4 x float> @flat_load_saddr_v2p1(ptr inreg %sbase, i32 %voffset) {
 ; GFX1250-LABEL: flat_load_saddr_v2p1:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    flat_load_b128 v[0:3], v0, s[2:3]
 ; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    ; return to shader part epilog
@@ -1625,6 +1752,7 @@ define amdgpu_ps <4 x float> @flat_load_saddr_v2p1(ptr inreg %sbase, i32 %voffse
 define amdgpu_ps <4 x float> @flat_load_saddr_v2p1_immneg128(ptr inreg %sbase, i32 %voffset) {
 ; GFX1250-LABEL: flat_load_saddr_v2p1_immneg128:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    flat_load_b128 v[0:3], v0, s[2:3] offset:-128
 ; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    ; return to shader part epilog
@@ -1640,6 +1768,7 @@ define amdgpu_ps <4 x float> @flat_load_saddr_v2p1_immneg128(ptr inreg %sbase, i
 define amdgpu_ps <4 x float> @flat_load_saddr_v4p3(ptr inreg %sbase, i32 %voffset) {
 ; GFX1250-LABEL: flat_load_saddr_v4p3:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    flat_load_b128 v[0:3], v0, s[2:3]
 ; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    ; return to shader part epilog
@@ -1654,6 +1783,7 @@ define amdgpu_ps <4 x float> @flat_load_saddr_v4p3(ptr inreg %sbase, i32 %voffse
 define amdgpu_ps <4 x float> @flat_load_saddr_v4p3_immneg128(ptr inreg %sbase, i32 %voffset) {
 ; GFX1250-LABEL: flat_load_saddr_v4p3_immneg128:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    flat_load_b128 v[0:3], v0, s[2:3] offset:-128
 ; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    ; return to shader part epilog
@@ -1673,6 +1803,7 @@ define amdgpu_ps <4 x float> @flat_load_saddr_v4p3_immneg128(ptr inreg %sbase, i
 define amdgpu_ps float @flat_sextload_saddr_i8(ptr inreg %sbase, i32 %voffset) {
 ; GFX1250-LABEL: flat_sextload_saddr_i8:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    flat_load_i8 v0, v0, s[2:3]
 ; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    ; return to shader part epilog
@@ -1687,6 +1818,7 @@ define amdgpu_ps float @flat_sextload_saddr_i8(ptr inreg %sbase, i32 %voffset) {
 define amdgpu_ps float @flat_sextload_saddr_i8_immneg128(ptr inreg %sbase, i32 %voffset) {
 ; GFX1250-LABEL: flat_sextload_saddr_i8_immneg128:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    flat_load_i8 v0, v0, s[2:3] offset:-128
 ; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    ; return to shader part epilog
@@ -1702,6 +1834,7 @@ define amdgpu_ps float @flat_sextload_saddr_i8_immneg128(ptr inreg %sbase, i32 %
 define amdgpu_ps float @flat_sextload_saddr_i16(ptr inreg %sbase, i32 %voffset) {
 ; GFX1250-LABEL: flat_sextload_saddr_i16:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    flat_load_i16 v0, v0, s[2:3]
 ; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    ; return to shader part epilog
@@ -1716,6 +1849,7 @@ define amdgpu_ps float @flat_sextload_saddr_i16(ptr inreg %sbase, i32 %voffset)
 define amdgpu_ps float @flat_sextload_saddr_i16_immneg128(ptr inreg %sbase, i32 %voffset) {
 ; GFX1250-LABEL: flat_sextload_saddr_i16_immneg128:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    flat_load_i16 v0, v0, s[2:3] offset:-128
 ; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    ; return to shader part epilog
@@ -1731,6 +1865,7 @@ define amdgpu_ps float @flat_sextload_saddr_i16_immneg128(ptr inreg %sbase, i32
 define amdgpu_ps float @flat_zextload_saddr_i8(ptr inreg %sbase, i32 %voffset) {
 ; GFX1250-LABEL: flat_zextload_saddr_i8:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    flat_load_u8 v0, v0, s[2:3]
 ; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    ; return to shader part epilog
@@ -1745,6 +1880,7 @@ define amdgpu_ps float @flat_zextload_saddr_i8(ptr inreg %sbase, i32 %voffset) {
 define amdgpu_ps float @flat_zextload_saddr_i8_immneg128(ptr inreg %sbase, i32 %voffset) {
 ; GFX1250-LABEL: flat_zextload_saddr_i8_immneg128:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    flat_load_u8 v0, v0, s[2:3] offset:-128
 ; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    ; return to shader part epilog
@@ -1760,6 +1896,7 @@ define amdgpu_ps float @flat_zextload_saddr_i8_immneg128(ptr inreg %sbase, i32 %
 define amdgpu_ps float @flat_zextload_saddr_i16(ptr inreg %sbase, i32 %voffset) {
 ; GFX1250-LABEL: flat_zextload_saddr_i16:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    flat_load_u16 v0, v0, s[2:3]
 ; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    ; return to shader part epilog
@@ -1774,6 +1911,7 @@ define amdgpu_ps float @flat_zextload_saddr_i16(ptr inreg %sbase, i32 %voffset)
 define amdgpu_ps float @flat_zextload_saddr_i16_immneg128(ptr inreg %sbase, i32 %voffset) {
 ; GFX1250-LABEL: flat_zextload_saddr_i16_immneg128:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    flat_load_u16 v0, v0, s[2:3] offset:-128
 ; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    ; return to shader part epilog
@@ -1793,6 +1931,7 @@ define amdgpu_ps float @flat_zextload_saddr_i16_immneg128(ptr inreg %sbase, i32
 define amdgpu_ps float @atomic_flat_load_saddr_i32(ptr inreg %sbase, i32 %voffset) {
 ; GFX1250-LABEL: atomic_flat_load_saddr_i32:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    flat_load_b32 v0, v0, s[2:3] scope:SCOPE_SYS
 ; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    global_inv scope:SCOPE_SYS
@@ -1808,6 +1947,7 @@ define amdgpu_ps float @atomic_flat_load_saddr_i32(ptr inreg %sbase, i32 %voffse
 define amdgpu_ps float @atomic_flat_load_saddr_i32_immneg128(ptr inreg %sbase, i32 %voffset) {
 ; GFX1250-LABEL: atomic_flat_load_saddr_i32_immneg128:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    flat_load_b32 v0, v0, s[2:3] offset:-128 scope:SCOPE_SYS
 ; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    global_inv scope:SCOPE_SYS
@@ -1824,6 +1964,7 @@ define amdgpu_ps float @atomic_flat_load_saddr_i32_immneg128(ptr inreg %sbase, i
 define amdgpu_ps <2 x float> @atomic_flat_load_saddr_i64(ptr inreg %sbase, i32 %voffset) {
 ; GFX1250-LABEL: atomic_flat_load_saddr_i64:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    flat_load_b64 v[0:1], v0, s[2:3] scope:SCOPE_SYS
 ; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    global_inv scope:SCOPE_SYS
@@ -1839,6 +1980,7 @@ define amdgpu_ps <2 x float> @atomic_flat_load_saddr_i64(ptr inreg %sbase, i32 %
 define amdgpu_ps <2 x float> @atomic_flat_load_saddr_i64_immneg128(ptr inreg %sbase, i32 %voffset) {
 ; GFX1250-LABEL: atomic_flat_load_saddr_i64_immneg128:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    flat_load_b64 v[0:1], v0, s[2:3] offset:-128 scope:SCOPE_SYS
 ; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    global_inv scope:SCOPE_SYS
@@ -1859,18 +2001,21 @@ define amdgpu_ps <2 x float> @atomic_flat_load_saddr_i64_immneg128(ptr inreg %sb
 define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16lo_undef_hi(ptr inreg %sbase, i32 %voffset) {
 ; GFX1250-SDAG-LABEL: flat_load_saddr_i16_d16lo_undef_hi:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    flat_load_u16 v0, v0, s[2:3]
 ; GFX1250-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-SDAG-NEXT:    ; return to shader part epilog
 ;
 ; GFX1250-GISEL-LABEL: flat_load_saddr_i16_d16lo_undef_hi:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    flat_load_u16 v0, v0, s[2:3]
 ; GFX1250-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-GISEL-NEXT:    ; return to shader part epilog
 ;
 ; GFX1250-NOECC-LABEL: flat_load_saddr_i16_d16lo_undef_hi:
 ; GFX1250-NOECC:       ; %bb.0:
+; GFX1250-NOECC-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NOECC-NEXT:    flat_load_d16_b16 v0, v0, s[2:3]
 ; GFX1250-NOECC-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NOECC-NEXT:    ; return to shader part epilog
@@ -1885,18 +2030,21 @@ define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16lo_undef_hi(ptr inreg %sbase
 define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16lo_undef_hi_immneg128(ptr inreg %sbase, i32 %voffset) {
 ; GFX1250-SDAG-LABEL: flat_load_saddr_i16_d16lo_undef_hi_immneg128:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    flat_load_u16 v0, v0, s[2:3] offset:-128
 ; GFX1250-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-SDAG-NEXT:    ; return to shader part epilog
 ;
 ; GFX1250-GISEL-LABEL: flat_load_saddr_i16_d16lo_undef_hi_immneg128:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    flat_load_u16 v0, v0, s[2:3] offset:-128
 ; GFX1250-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-GISEL-NEXT:    ; return to shader part epilog
 ;
 ; GFX1250-NOECC-LABEL: flat_load_saddr_i16_d16lo_undef_hi_immneg128:
 ; GFX1250-NOECC:       ; %bb.0:
+; GFX1250-NOECC-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NOECC-NEXT:    flat_load_d16_b16 v0, v0, s[2:3] offset:-128
 ; GFX1250-NOECC-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NOECC-NEXT:    ; return to shader part epilog
@@ -1912,6 +2060,7 @@ define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16lo_undef_hi_immneg128(ptr in
 define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16lo_zero_hi(ptr inreg %sbase, i32 %voffset) {
 ; GFX1250-SDAG-LABEL: flat_load_saddr_i16_d16lo_zero_hi:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    flat_load_u16 v0, v0, s[2:3]
 ; GFX1250-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-SDAG-NEXT:    v_and_b32_e32 v0, 0xffff, v0
@@ -1919,6 +2068,7 @@ define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16lo_zero_hi(ptr inreg %sbase,
 ;
 ; GFX1250-GISEL-LABEL: flat_load_saddr_i16_d16lo_zero_hi:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    flat_load_u16 v0, v0, s[2:3]
 ; GFX1250-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
@@ -1926,6 +2076,7 @@ define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16lo_zero_hi(ptr inreg %sbase,
 ;
 ; GFX1250-NOECC-LABEL: flat_load_saddr_i16_d16lo_zero_hi:
 ; GFX1250-NOECC:       ; %bb.0:
+; GFX1250-NOECC-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NOECC-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1250-NOECC-NEXT:    flat_load_d16_b16 v1, v0, s[2:3]
 ; GFX1250-NOECC-NEXT:    s_wait_loadcnt_dscnt 0x0
@@ -1942,6 +2093,7 @@ define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16lo_zero_hi(ptr inreg %sbase,
 define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16lo_zero_hi_immneg128(ptr inreg %sbase, i32 %voffset) {
 ; GFX1250-SDAG-LABEL: flat_load_saddr_i16_d16lo_zero_hi_immneg128:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    flat_load_u16 v0, v0, s[2:3] offset:-128
 ; GFX1250-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-SDAG-NEXT:    v_and_b32_e32 v0, 0xffff, v0
@@ -1949,6 +2101,7 @@ define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16lo_zero_hi_immneg128(ptr inr
 ;
 ; GFX1250-GISEL-LABEL: flat_load_saddr_i16_d16lo_zero_hi_immneg128:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    flat_load_u16 v0, v0, s[2:3] offset:-128
 ; GFX1250-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
@@ -1956,6 +2109,7 @@ define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16lo_zero_hi_immneg128(ptr inr
 ;
 ; GFX1250-NOECC-LABEL: flat_load_saddr_i16_d16lo_zero_hi_immneg128:
 ; GFX1250-NOECC:       ; %bb.0:
+; GFX1250-NOECC-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NOECC-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1250-NOECC-NEXT:    flat_load_d16_b16 v1, v0, s[2:3] offset:-128
 ; GFX1250-NOECC-NEXT:    s_wait_loadcnt_dscnt 0x0
@@ -1973,6 +2127,7 @@ define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16lo_zero_hi_immneg128(ptr inr
 define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16lo_reg_hi(ptr inreg %sbase, i32 %voffset, <2 x i16> %reg) {
 ; GFX1250-SDAG-LABEL: flat_load_saddr_i16_d16lo_reg_hi:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    flat_load_u16 v0, v0, s[2:3]
 ; GFX1250-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-SDAG-NEXT:    v_bfi_b32 v0, 0xffff, v0, v1
@@ -1980,6 +2135,7 @@ define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16lo_reg_hi(ptr inreg %sbase,
 ;
 ; GFX1250-GISEL-LABEL: flat_load_saddr_i16_d16lo_reg_hi:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    flat_load_u16 v0, v0, s[2:3]
 ; GFX1250-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-GISEL-NEXT:    v_and_or_b32 v0, 0xffff0000, v1, v0
@@ -1987,6 +2143,7 @@ define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16lo_reg_hi(ptr inreg %sbase,
 ;
 ; GFX1250-NOECC-LABEL: flat_load_saddr_i16_d16lo_reg_hi:
 ; GFX1250-NOECC:       ; %bb.0:
+; GFX1250-NOECC-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NOECC-NEXT:    flat_load_d16_b16 v1, v0, s[2:3]
 ; GFX1250-NOECC-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NOECC-NEXT:    v_mov_b32_e32 v0, v1
@@ -2002,6 +2159,7 @@ define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16lo_reg_hi(ptr inreg %sbase,
 define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16lo_reg_hi_immneg128(ptr inreg %sbase, i32 %voffset, <2 x i16> %reg) {
 ; GFX1250-SDAG-LABEL: flat_load_saddr_i16_d16lo_reg_hi_immneg128:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    flat_load_u16 v0, v0, s[2:3] offset:-128
 ; GFX1250-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-SDAG-NEXT:    v_bfi_b32 v0, 0xffff, v0, v1
@@ -2009,6 +2167,7 @@ define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16lo_reg_hi_immneg128(ptr inre
 ;
 ; GFX1250-GISEL-LABEL: flat_load_saddr_i16_d16lo_reg_hi_immneg128:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    flat_load_u16 v0, v0, s[2:3] offset:-128
 ; GFX1250-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-GISEL-NEXT:    v_and_or_b32 v0, 0xffff0000, v1, v0
@@ -2016,6 +2175,7 @@ define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16lo_reg_hi_immneg128(ptr inre
 ;
 ; GFX1250-NOECC-LABEL: flat_load_saddr_i16_d16lo_reg_hi_immneg128:
 ; GFX1250-NOECC:       ; %bb.0:
+; GFX1250-NOECC-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NOECC-NEXT:    flat_load_d16_b16 v1, v0, s[2:3] offset:-128
 ; GFX1250-NOECC-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NOECC-NEXT:    v_mov_b32_e32 v0, v1
@@ -2032,6 +2192,7 @@ define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16lo_reg_hi_immneg128(ptr inre
 define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16lo_zexti8_reg_hi(ptr inreg %sbase, i32 %voffset, <2 x i16> %reg) {
 ; GFX1250-SDAG-LABEL: flat_load_saddr_i16_d16lo_zexti8_reg_hi:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    flat_load_u8 v0, v0, s[2:3]
 ; GFX1250-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-SDAG-NEXT:    v_bfi_b32 v0, 0xffff, v0, v1
@@ -2039,6 +2200,7 @@ define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16lo_zexti8_reg_hi(ptr inreg %
 ;
 ; GFX1250-GISEL-LABEL: flat_load_saddr_i16_d16lo_zexti8_reg_hi:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    flat_load_u8 v0, v0, s[2:3]
 ; GFX1250-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-GISEL-NEXT:    v_and_or_b32 v0, 0xffff0000, v1, v0
@@ -2046,6 +2208,7 @@ define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16lo_zexti8_reg_hi(ptr inreg %
 ;
 ; GFX1250-NOECC-LABEL: flat_load_saddr_i16_d16lo_zexti8_reg_hi:
 ; GFX1250-NOECC:       ; %bb.0:
+; GFX1250-NOECC-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NOECC-NEXT:    flat_load_d16_u8 v1, v0, s[2:3]
 ; GFX1250-NOECC-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NOECC-NEXT:    v_mov_b32_e32 v0, v1
@@ -2062,6 +2225,7 @@ define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16lo_zexti8_reg_hi(ptr inreg %
 define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16lo_zexti8_reg_hi_immneg128(ptr inreg %sbase, i32 %voffset, <2 x i16> %reg) {
 ; GFX1250-SDAG-LABEL: flat_load_saddr_i16_d16lo_zexti8_reg_hi_immneg128:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    flat_load_u8 v0, v0, s[2:3] offset:-128
 ; GFX1250-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-SDAG-NEXT:    v_bfi_b32 v0, 0xffff, v0, v1
@@ -2069,6 +2233,7 @@ define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16lo_zexti8_reg_hi_immneg128(p
 ;
 ; GFX1250-GISEL-LABEL: flat_load_saddr_i16_d16lo_zexti8_reg_hi_immneg128:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    flat_load_u8 v0, v0, s[2:3] offset:-128
 ; GFX1250-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-GISEL-NEXT:    v_and_or_b32 v0, 0xffff0000, v1, v0
@@ -2076,6 +2241,7 @@ define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16lo_zexti8_reg_hi_immneg128(p
 ;
 ; GFX1250-NOECC-LABEL: flat_load_saddr_i16_d16lo_zexti8_reg_hi_immneg128:
 ; GFX1250-NOECC:       ; %bb.0:
+; GFX1250-NOECC-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NOECC-NEXT:    flat_load_d16_u8 v1, v0, s[2:3] offset:-128
 ; GFX1250-NOECC-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NOECC-NEXT:    v_mov_b32_e32 v0, v1
@@ -2093,6 +2259,7 @@ define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16lo_zexti8_reg_hi_immneg128(p
 define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16lo_sexti8_reg_hi(ptr inreg %sbase, i32 %voffset, <2 x i16> %reg) {
 ; GFX1250-SDAG-LABEL: flat_load_saddr_i16_d16lo_sexti8_reg_hi:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    flat_load_i8 v0, v0, s[2:3]
 ; GFX1250-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-SDAG-NEXT:    v_bfi_b32 v0, 0xffff, v0, v1
@@ -2100,6 +2267,7 @@ define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16lo_sexti8_reg_hi(ptr inreg %
 ;
 ; GFX1250-GISEL-LABEL: flat_load_saddr_i16_d16lo_sexti8_reg_hi:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    flat_load_i8 v0, v0, s[2:3]
 ; GFX1250-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
@@ -2109,6 +2277,7 @@ define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16lo_sexti8_reg_hi(ptr inreg %
 ;
 ; GFX1250-NOECC-LABEL: flat_load_saddr_i16_d16lo_sexti8_reg_hi:
 ; GFX1250-NOECC:       ; %bb.0:
+; GFX1250-NOECC-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NOECC-NEXT:    flat_load_d16_i8 v1, v0, s[2:3]
 ; GFX1250-NOECC-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NOECC-NEXT:    v_mov_b32_e32 v0, v1
@@ -2125,6 +2294,7 @@ define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16lo_sexti8_reg_hi(ptr inreg %
 define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16lo_sexti8_reg_hi_immneg128(ptr inreg %sbase, i32 %voffset, <2 x i16> %reg) {
 ; GFX1250-SDAG-LABEL: flat_load_saddr_i16_d16lo_sexti8_reg_hi_immneg128:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    flat_load_i8 v0, v0, s[2:3] offset:-128
 ; GFX1250-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-SDAG-NEXT:    v_bfi_b32 v0, 0xffff, v0, v1
@@ -2132,6 +2302,7 @@ define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16lo_sexti8_reg_hi_immneg128(p
 ;
 ; GFX1250-GISEL-LABEL: flat_load_saddr_i16_d16lo_sexti8_reg_hi_immneg128:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    flat_load_i8 v0, v0, s[2:3] offset:-128
 ; GFX1250-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
@@ -2141,6 +2312,7 @@ define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16lo_sexti8_reg_hi_immneg128(p
 ;
 ; GFX1250-NOECC-LABEL: flat_load_saddr_i16_d16lo_sexti8_reg_hi_immneg128:
 ; GFX1250-NOECC:       ; %bb.0:
+; GFX1250-NOECC-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NOECC-NEXT:    flat_load_d16_i8 v1, v0, s[2:3] offset:-128
 ; GFX1250-NOECC-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NOECC-NEXT:    v_mov_b32_e32 v0, v1
@@ -2162,6 +2334,7 @@ define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16lo_sexti8_reg_hi_immneg128(p
 define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16hi_undef_hi(ptr inreg %sbase, i32 %voffset) {
 ; GFX1250-SDAG-LABEL: flat_load_saddr_i16_d16hi_undef_hi:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    flat_load_u16 v0, v0, s[2:3]
 ; GFX1250-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
@@ -2169,6 +2342,7 @@ define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16hi_undef_hi(ptr inreg %sbase
 ;
 ; GFX1250-GISEL-LABEL: flat_load_saddr_i16_d16hi_undef_hi:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    flat_load_u16 v0, v0, s[2:3]
 ; GFX1250-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
@@ -2176,6 +2350,7 @@ define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16hi_undef_hi(ptr inreg %sbase
 ;
 ; GFX1250-NOECC-LABEL: flat_load_saddr_i16_d16hi_undef_hi:
 ; GFX1250-NOECC:       ; %bb.0:
+; GFX1250-NOECC-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NOECC-NEXT:    flat_load_d16_hi_b16 v0, v0, s[2:3]
 ; GFX1250-NOECC-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NOECC-NEXT:    ; return to shader part epilog
@@ -2190,6 +2365,7 @@ define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16hi_undef_hi(ptr inreg %sbase
 define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16hi_undef_hi_immneg128(ptr inreg %sbase, i32 %voffset) {
 ; GFX1250-SDAG-LABEL: flat_load_saddr_i16_d16hi_undef_hi_immneg128:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    flat_load_u16 v0, v0, s[2:3] offset:-128
 ; GFX1250-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-SDAG-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
@@ -2197,6 +2373,7 @@ define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16hi_undef_hi_immneg128(ptr in
 ;
 ; GFX1250-GISEL-LABEL: flat_load_saddr_i16_d16hi_undef_hi_immneg128:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    flat_load_u16 v0, v0, s[2:3] offset:-128
 ; GFX1250-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
@@ -2204,6 +2381,7 @@ define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16hi_undef_hi_immneg128(ptr in
 ;
 ; GFX1250-NOECC-LABEL: flat_load_saddr_i16_d16hi_undef_hi_immneg128:
 ; GFX1250-NOECC:       ; %bb.0:
+; GFX1250-NOECC-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NOECC-NEXT:    flat_load_d16_hi_b16 v0, v0, s[2:3] offset:-128
 ; GFX1250-NOECC-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NOECC-NEXT:    ; return to shader part epilog
@@ -2219,6 +2397,7 @@ define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16hi_undef_hi_immneg128(ptr in
 define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16hi_zero_hi(ptr inreg %sbase, i32 %voffset) {
 ; GFX1250-SDAG-LABEL: flat_load_saddr_i16_d16hi_zero_hi:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    flat_load_u16 v0, v0, s[2:3]
 ; GFX1250-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-SDAG-NEXT:    v_perm_b32 v0, v0, 0, 0x5040100
@@ -2226,6 +2405,7 @@ define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16hi_zero_hi(ptr inreg %sbase,
 ;
 ; GFX1250-GISEL-LABEL: flat_load_saddr_i16_d16hi_zero_hi:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    flat_load_u16 v0, v0, s[2:3]
 ; GFX1250-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
@@ -2233,6 +2413,7 @@ define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16hi_zero_hi(ptr inreg %sbase,
 ;
 ; GFX1250-NOECC-SDAG-TRUE16-LABEL: flat_load_saddr_i16_d16hi_zero_hi:
 ; GFX1250-NOECC-SDAG-TRUE16:       ; %bb.0:
+; GFX1250-NOECC-SDAG-TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NOECC-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.l, 0
 ; GFX1250-NOECC-SDAG-TRUE16-NEXT:    flat_load_d16_hi_b16 v1, v0, s[2:3]
 ; GFX1250-NOECC-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
@@ -2241,6 +2422,7 @@ define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16hi_zero_hi(ptr inreg %sbase,
 ;
 ; GFX1250-NOECC-SDAG-FAKE16-LABEL: flat_load_saddr_i16_d16hi_zero_hi:
 ; GFX1250-NOECC-SDAG-FAKE16:       ; %bb.0:
+; GFX1250-NOECC-SDAG-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NOECC-SDAG-FAKE16-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1250-NOECC-SDAG-FAKE16-NEXT:    flat_load_d16_hi_b16 v1, v0, s[2:3]
 ; GFX1250-NOECC-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
@@ -2257,6 +2439,7 @@ define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16hi_zero_hi(ptr inreg %sbase,
 define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16hi_zero_hi_immneg128(ptr inreg %sbase, i32 %voffset) {
 ; GFX1250-SDAG-LABEL: flat_load_saddr_i16_d16hi_zero_hi_immneg128:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    flat_load_u16 v0, v0, s[2:3] offset:-128
 ; GFX1250-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-SDAG-NEXT:    v_perm_b32 v0, v0, 0, 0x5040100
@@ -2264,6 +2447,7 @@ define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16hi_zero_hi_immneg128(ptr inr
 ;
 ; GFX1250-GISEL-LABEL: flat_load_saddr_i16_d16hi_zero_hi_immneg128:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    flat_load_u16 v0, v0, s[2:3] offset:-128
 ; GFX1250-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
@@ -2271,6 +2455,7 @@ define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16hi_zero_hi_immneg128(ptr inr
 ;
 ; GFX1250-NOECC-SDAG-TRUE16-LABEL: flat_load_saddr_i16_d16hi_zero_hi_immneg128:
 ; GFX1250-NOECC-SDAG-TRUE16:       ; %bb.0:
+; GFX1250-NOECC-SDAG-TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NOECC-SDAG-TRUE16-NEXT:    v_mov_b16_e32 v1.l, 0
 ; GFX1250-NOECC-SDAG-TRUE16-NEXT:    flat_load_d16_hi_b16 v1, v0, s[2:3] offset:-128
 ; GFX1250-NOECC-SDAG-TRUE16-NEXT:    s_wait_loadcnt_dscnt 0x0
@@ -2279,6 +2464,7 @@ define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16hi_zero_hi_immneg128(ptr inr
 ;
 ; GFX1250-NOECC-SDAG-FAKE16-LABEL: flat_load_saddr_i16_d16hi_zero_hi_immneg128:
 ; GFX1250-NOECC-SDAG-FAKE16:       ; %bb.0:
+; GFX1250-NOECC-SDAG-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NOECC-SDAG-FAKE16-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1250-NOECC-SDAG-FAKE16-NEXT:    flat_load_d16_hi_b16 v1, v0, s[2:3] offset:-128
 ; GFX1250-NOECC-SDAG-FAKE16-NEXT:    s_wait_loadcnt_dscnt 0x0
@@ -2296,6 +2482,7 @@ define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16hi_zero_hi_immneg128(ptr inr
 define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16hi_reg_hi(ptr inreg %sbase, i32 %voffset, <2 x i16> %reg) {
 ; GFX1250-SDAG-LABEL: flat_load_saddr_i16_d16hi_reg_hi:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    flat_load_u16 v0, v0, s[2:3]
 ; GFX1250-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-SDAG-NEXT:    v_perm_b32 v0, v0, v1, 0x5040100
@@ -2303,6 +2490,7 @@ define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16hi_reg_hi(ptr inreg %sbase,
 ;
 ; GFX1250-GISEL-LABEL: flat_load_saddr_i16_d16hi_reg_hi:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    flat_load_u16 v0, v0, s[2:3]
 ; GFX1250-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
@@ -2312,6 +2500,7 @@ define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16hi_reg_hi(ptr inreg %sbase,
 ;
 ; GFX1250-NOECC-LABEL: flat_load_saddr_i16_d16hi_reg_hi:
 ; GFX1250-NOECC:       ; %bb.0:
+; GFX1250-NOECC-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NOECC-NEXT:    flat_load_d16_hi_b16 v1, v0, s[2:3]
 ; GFX1250-NOECC-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NOECC-NEXT:    v_mov_b32_e32 v0, v1
@@ -2327,6 +2516,7 @@ define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16hi_reg_hi(ptr inreg %sbase,
 define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16hi_reg_hi_immneg128(ptr inreg %sbase, i32 %voffset, <2 x i16> %reg) {
 ; GFX1250-SDAG-LABEL: flat_load_saddr_i16_d16hi_reg_hi_immneg128:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    flat_load_u16 v0, v0, s[2:3] offset:-128
 ; GFX1250-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-SDAG-NEXT:    v_perm_b32 v0, v0, v1, 0x5040100
@@ -2334,6 +2524,7 @@ define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16hi_reg_hi_immneg128(ptr inre
 ;
 ; GFX1250-GISEL-LABEL: flat_load_saddr_i16_d16hi_reg_hi_immneg128:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    flat_load_u16 v0, v0, s[2:3] offset:-128
 ; GFX1250-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
@@ -2343,6 +2534,7 @@ define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16hi_reg_hi_immneg128(ptr inre
 ;
 ; GFX1250-NOECC-LABEL: flat_load_saddr_i16_d16hi_reg_hi_immneg128:
 ; GFX1250-NOECC:       ; %bb.0:
+; GFX1250-NOECC-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NOECC-NEXT:    flat_load_d16_hi_b16 v1, v0, s[2:3] offset:-128
 ; GFX1250-NOECC-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NOECC-NEXT:    v_mov_b32_e32 v0, v1
@@ -2359,6 +2551,7 @@ define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16hi_reg_hi_immneg128(ptr inre
 define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16hi_zexti8_reg_hi(ptr inreg %sbase, i32 %voffset, <2 x i16> %reg) {
 ; GFX1250-SDAG-LABEL: flat_load_saddr_i16_d16hi_zexti8_reg_hi:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    flat_load_u8 v0, v0, s[2:3]
 ; GFX1250-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-SDAG-NEXT:    v_perm_b32 v0, v0, v1, 0x5040100
@@ -2366,6 +2559,7 @@ define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16hi_zexti8_reg_hi(ptr inreg %
 ;
 ; GFX1250-GISEL-LABEL: flat_load_saddr_i16_d16hi_zexti8_reg_hi:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    flat_load_u8 v0, v0, s[2:3]
 ; GFX1250-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
@@ -2375,6 +2569,7 @@ define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16hi_zexti8_reg_hi(ptr inreg %
 ;
 ; GFX1250-NOECC-LABEL: flat_load_saddr_i16_d16hi_zexti8_reg_hi:
 ; GFX1250-NOECC:       ; %bb.0:
+; GFX1250-NOECC-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NOECC-NEXT:    flat_load_d16_hi_u8 v1, v0, s[2:3]
 ; GFX1250-NOECC-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NOECC-NEXT:    v_mov_b32_e32 v0, v1
@@ -2391,6 +2586,7 @@ define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16hi_zexti8_reg_hi(ptr inreg %
 define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16hi_zexti8_reg_hi_immneg128(ptr inreg %sbase, i32 %voffset, <2 x i16> %reg) {
 ; GFX1250-SDAG-LABEL: flat_load_saddr_i16_d16hi_zexti8_reg_hi_immneg128:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    flat_load_u8 v0, v0, s[2:3] offset:-128
 ; GFX1250-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-SDAG-NEXT:    v_perm_b32 v0, v0, v1, 0x5040100
@@ -2398,6 +2594,7 @@ define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16hi_zexti8_reg_hi_immneg128(p
 ;
 ; GFX1250-GISEL-LABEL: flat_load_saddr_i16_d16hi_zexti8_reg_hi_immneg128:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    flat_load_u8 v0, v0, s[2:3] offset:-128
 ; GFX1250-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
@@ -2407,6 +2604,7 @@ define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16hi_zexti8_reg_hi_immneg128(p
 ;
 ; GFX1250-NOECC-LABEL: flat_load_saddr_i16_d16hi_zexti8_reg_hi_immneg128:
 ; GFX1250-NOECC:       ; %bb.0:
+; GFX1250-NOECC-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NOECC-NEXT:    flat_load_d16_hi_u8 v1, v0, s[2:3] offset:-128
 ; GFX1250-NOECC-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NOECC-NEXT:    v_mov_b32_e32 v0, v1
@@ -2424,6 +2622,7 @@ define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16hi_zexti8_reg_hi_immneg128(p
 define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16hi_sexti8_reg_hi(ptr inreg %sbase, i32 %voffset, <2 x i16> %reg) {
 ; GFX1250-SDAG-LABEL: flat_load_saddr_i16_d16hi_sexti8_reg_hi:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    flat_load_i8 v0, v0, s[2:3]
 ; GFX1250-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-SDAG-NEXT:    v_perm_b32 v0, v0, v1, 0x5040100
@@ -2431,6 +2630,7 @@ define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16hi_sexti8_reg_hi(ptr inreg %
 ;
 ; GFX1250-GISEL-LABEL: flat_load_saddr_i16_d16hi_sexti8_reg_hi:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    flat_load_i8 v0, v0, s[2:3]
 ; GFX1250-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
@@ -2441,6 +2641,7 @@ define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16hi_sexti8_reg_hi(ptr inreg %
 ;
 ; GFX1250-NOECC-LABEL: flat_load_saddr_i16_d16hi_sexti8_reg_hi:
 ; GFX1250-NOECC:       ; %bb.0:
+; GFX1250-NOECC-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NOECC-NEXT:    flat_load_d16_hi_i8 v1, v0, s[2:3]
 ; GFX1250-NOECC-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NOECC-NEXT:    v_mov_b32_e32 v0, v1
@@ -2457,6 +2658,7 @@ define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16hi_sexti8_reg_hi(ptr inreg %
 define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16hi_sexti8_reg_hi_immneg128(ptr inreg %sbase, i32 %voffset, <2 x i16> %reg) {
 ; GFX1250-SDAG-LABEL: flat_load_saddr_i16_d16hi_sexti8_reg_hi_immneg128:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    flat_load_i8 v0, v0, s[2:3] offset:-128
 ; GFX1250-SDAG-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-SDAG-NEXT:    v_perm_b32 v0, v0, v1, 0x5040100
@@ -2464,6 +2666,7 @@ define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16hi_sexti8_reg_hi_immneg128(p
 ;
 ; GFX1250-GISEL-LABEL: flat_load_saddr_i16_d16hi_sexti8_reg_hi_immneg128:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    flat_load_i8 v0, v0, s[2:3] offset:-128
 ; GFX1250-GISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
@@ -2474,6 +2677,7 @@ define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16hi_sexti8_reg_hi_immneg128(p
 ;
 ; GFX1250-NOECC-LABEL: flat_load_saddr_i16_d16hi_sexti8_reg_hi_immneg128:
 ; GFX1250-NOECC:       ; %bb.0:
+; GFX1250-NOECC-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NOECC-NEXT:    flat_load_d16_hi_i8 v1, v0, s[2:3] offset:-128
 ; GFX1250-NOECC-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NOECC-NEXT:    v_mov_b32_e32 v0, v1
@@ -2496,6 +2700,7 @@ define amdgpu_ps <2 x half> @flat_load_saddr_i16_d16hi_sexti8_reg_hi_immneg128(p
 define amdgpu_ps float @flat_load_saddr_i8_offset_or_i64_imm_offset_16(ptr addrspace(6) inreg %sbase, i32 %idx) {
 ; GFX1250-LABEL: flat_load_saddr_i8_offset_or_i64_imm_offset_16:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_bitop2_b32 v0, 16, v0 bitop3:0x54
 ; GFX1250-NEXT:    flat_load_u8 v0, v[0:1]
 ; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
@@ -2512,6 +2717,7 @@ define amdgpu_ps float @flat_load_saddr_i8_offset_or_i64_imm_offset_16(ptr addrs
 define amdgpu_ps float @flat_load_saddr_i8_offset_or_i64_imm_offset_4160(ptr addrspace(6) inreg %sbase, i32 %idx) {
 ; GFX1250-LABEL: flat_load_saddr_i8_offset_or_i64_imm_offset_4160:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_or_b32_e32 v0, 0x1040, v0
 ; GFX1250-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1250-NEXT:    flat_load_u8 v0, v[0:1]
@@ -2533,6 +2739,7 @@ define amdgpu_ps float @flat_load_saddr_i8_offset_or_i64_imm_offset_4160(ptr add
 define amdgpu_ps void @flat_addr_64bit_lsr_iv(ptr inreg %arg) {
 ; GFX1250-SDAG-LABEL: flat_addr_64bit_lsr_iv:
 ; GFX1250-SDAG:       ; %bb.0: ; %bb
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-SDAG-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX1250-SDAG-NEXT:  .LBB116_1: ; %bb3
@@ -2550,6 +2757,7 @@ define amdgpu_ps void @flat_addr_64bit_lsr_iv(ptr inreg %arg) {
 ;
 ; GFX1250-GISEL-LABEL: flat_addr_64bit_lsr_iv:
 ; GFX1250-GISEL:       ; %bb.0: ; %bb
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[0:1]
@@ -2569,6 +2777,7 @@ define amdgpu_ps void @flat_addr_64bit_lsr_iv(ptr inreg %arg) {
 ;
 ; GFX1250-NOECC-LABEL: flat_addr_64bit_lsr_iv:
 ; GFX1250-NOECC:       ; %bb.0: ; %bb
+; GFX1250-NOECC-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NOECC-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NOECC-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX1250-NOECC-NEXT:  .LBB116_1: ; %bb3
@@ -2604,6 +2813,7 @@ bb3:                                              ; preds = %bb3, %bb
 define amdgpu_ps void @flat_addr_64bit_lsr_iv_multiload(ptr inreg %arg, ptr inreg %arg.1) {
 ; GFX1250-SDAG-LABEL: flat_addr_64bit_lsr_iv_multiload:
 ; GFX1250-SDAG:       ; %bb.0: ; %bb
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-SDAG-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX1250-SDAG-NEXT:  .LBB117_1: ; %bb3
@@ -2624,6 +2834,7 @@ define amdgpu_ps void @flat_addr_64bit_lsr_iv_multiload(ptr inreg %arg, ptr inre
 ;
 ; GFX1250-GISEL-LABEL: flat_addr_64bit_lsr_iv_multiload:
 ; GFX1250-GISEL:       ; %bb.0: ; %bb
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[2:3]
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[0:1]
@@ -2646,6 +2857,7 @@ define amdgpu_ps void @flat_addr_64bit_lsr_iv_multiload(ptr inreg %arg, ptr inre
 ;
 ; GFX1250-NOECC-LABEL: flat_addr_64bit_lsr_iv_multiload:
 ; GFX1250-NOECC:       ; %bb.0: ; %bb
+; GFX1250-NOECC-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NOECC-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NOECC-NEXT:    s_mov_b64 s[0:1], 0
 ; GFX1250-NOECC-NEXT:  .LBB117_1: ; %bb3

diff  --git a/llvm/test/CodeGen/AMDGPU/flat-saddr-store.ll b/llvm/test/CodeGen/AMDGPU/flat-saddr-store.ll
index 3d0e2875e91a2..78ed084493c4b 100644
--- a/llvm/test/CodeGen/AMDGPU/flat-saddr-store.ll
+++ b/llvm/test/CodeGen/AMDGPU/flat-saddr-store.ll
@@ -7,6 +7,7 @@
 define amdgpu_ps void @flat_store_saddr_i8_zext_vgpr(ptr inreg %sbase, ptr %voffset.ptr, i8 %data) {
 ; GFX1250-LABEL: flat_store_saddr_i8_zext_vgpr:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    flat_load_b32 v0, v[0:1]
 ; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    flat_store_b8 v0, v2, s[2:3]
@@ -22,6 +23,7 @@ define amdgpu_ps void @flat_store_saddr_i8_zext_vgpr(ptr inreg %sbase, ptr %voff
 define amdgpu_ps void @flat_store_saddr_i8_zext_vgpr_offset_2047(ptr inreg %sbase, ptr %voffset.ptr, i8 %data) {
 ; GFX1250-LABEL: flat_store_saddr_i8_zext_vgpr_offset_2047:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    flat_load_b32 v0, v[0:1]
 ; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    flat_store_b8 v0, v2, s[2:3] offset:2047
@@ -38,6 +40,7 @@ define amdgpu_ps void @flat_store_saddr_i8_zext_vgpr_offset_2047(ptr inreg %sbas
 define amdgpu_ps void @flat_store_saddr_i8_zext_vgpr_offset_neg2048(ptr inreg %sbase, ptr %voffset.ptr, i8 %data) {
 ; GFX1250-LABEL: flat_store_saddr_i8_zext_vgpr_offset_neg2048:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    flat_load_b32 v0, v[0:1]
 ; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    flat_store_b8 v0, v2, s[2:3] offset:-2048
@@ -60,6 +63,7 @@ define amdgpu_ps void @flat_store_saddr_i8_zext_vgpr_offset_neg2048(ptr inreg %s
 define amdgpu_ps void @flat_store_saddr_uniform_ptr_in_vgprs(i32 %voffset, i8 %data) {
 ; GFX1250-SDAG-LABEL: flat_store_saddr_uniform_ptr_in_vgprs:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1250-SDAG-NEXT:    ds_load_b64 v[2:3], v2
 ; GFX1250-SDAG-NEXT:    s_wait_dscnt 0x0
@@ -70,6 +74,7 @@ define amdgpu_ps void @flat_store_saddr_uniform_ptr_in_vgprs(i32 %voffset, i8 %d
 ;
 ; GFX1250-GISEL-LABEL: flat_store_saddr_uniform_ptr_in_vgprs:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1250-GISEL-NEXT:    ds_load_b64 v[2:3], v2
 ; GFX1250-GISEL-NEXT:    s_wait_dscnt 0x0
@@ -89,6 +94,7 @@ define amdgpu_ps void @flat_store_saddr_uniform_ptr_in_vgprs(i32 %voffset, i8 %d
 define amdgpu_ps void @flat_store_saddr_uniform_ptr_in_vgprs_immoffset(i32 %voffset, i8 %data) {
 ; GFX1250-SDAG-LABEL: flat_store_saddr_uniform_ptr_in_vgprs_immoffset:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1250-SDAG-NEXT:    ds_load_b64 v[2:3], v2
 ; GFX1250-SDAG-NEXT:    s_wait_dscnt 0x0
@@ -99,6 +105,7 @@ define amdgpu_ps void @flat_store_saddr_uniform_ptr_in_vgprs_immoffset(i32 %voff
 ;
 ; GFX1250-GISEL-LABEL: flat_store_saddr_uniform_ptr_in_vgprs_immoffset:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1250-GISEL-NEXT:    ds_load_b64 v[2:3], v2
 ; GFX1250-GISEL-NEXT:    s_wait_dscnt 0x0
@@ -122,6 +129,7 @@ define amdgpu_ps void @flat_store_saddr_uniform_ptr_in_vgprs_immoffset(i32 %voff
 define amdgpu_ps void @flat_store_saddr_i16_zext_vgpr(ptr inreg %sbase, i32 %voffset, i16 %data) {
 ; GFX1250-LABEL: flat_store_saddr_i16_zext_vgpr:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    flat_store_b16 v0, v1, s[2:3]
 ; GFX1250-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
@@ -133,6 +141,7 @@ define amdgpu_ps void @flat_store_saddr_i16_zext_vgpr(ptr inreg %sbase, i32 %vof
 define amdgpu_ps void @flat_store_saddr_i16_zext_vgpr_offset_neg128(ptr inreg %sbase, i32 %voffset, i16 %data) {
 ; GFX1250-LABEL: flat_store_saddr_i16_zext_vgpr_offset_neg128:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    flat_store_b16 v0, v1, s[2:3] offset:-128
 ; GFX1250-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
@@ -145,6 +154,7 @@ define amdgpu_ps void @flat_store_saddr_i16_zext_vgpr_offset_neg128(ptr inreg %s
 define amdgpu_ps void @flat_store_saddr_f16_zext_vgpr(ptr inreg %sbase, i32 %voffset, half %data) {
 ; GFX1250-LABEL: flat_store_saddr_f16_zext_vgpr:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    flat_store_b16 v0, v1, s[2:3]
 ; GFX1250-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
@@ -156,6 +166,7 @@ define amdgpu_ps void @flat_store_saddr_f16_zext_vgpr(ptr inreg %sbase, i32 %vof
 define amdgpu_ps void @flat_store_saddr_f16_zext_vgpr_offset_neg128(ptr inreg %sbase, i32 %voffset, half %data) {
 ; GFX1250-LABEL: flat_store_saddr_f16_zext_vgpr_offset_neg128:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    flat_store_b16 v0, v1, s[2:3] offset:-128
 ; GFX1250-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
@@ -168,6 +179,7 @@ define amdgpu_ps void @flat_store_saddr_f16_zext_vgpr_offset_neg128(ptr inreg %s
 define amdgpu_ps void @flat_store_saddr_i32_zext_vgpr(ptr inreg %sbase, i32 %voffset, i32 %data) {
 ; GFX1250-LABEL: flat_store_saddr_i32_zext_vgpr:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    flat_store_b32 v0, v1, s[2:3]
 ; GFX1250-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
@@ -179,6 +191,7 @@ define amdgpu_ps void @flat_store_saddr_i32_zext_vgpr(ptr inreg %sbase, i32 %vof
 define amdgpu_ps void @flat_store_saddr_i32_zext_vgpr_offset_neg128(ptr inreg %sbase, i32 %voffset, i32 %data) {
 ; GFX1250-LABEL: flat_store_saddr_i32_zext_vgpr_offset_neg128:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    flat_store_b32 v0, v1, s[2:3] offset:-128
 ; GFX1250-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
@@ -191,6 +204,7 @@ define amdgpu_ps void @flat_store_saddr_i32_zext_vgpr_offset_neg128(ptr inreg %s
 define amdgpu_ps void @flat_store_saddr_f32_zext_vgpr(ptr inreg %sbase, i32 %voffset, float %data) {
 ; GFX1250-LABEL: flat_store_saddr_f32_zext_vgpr:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    flat_store_b32 v0, v1, s[2:3]
 ; GFX1250-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
@@ -202,6 +216,7 @@ define amdgpu_ps void @flat_store_saddr_f32_zext_vgpr(ptr inreg %sbase, i32 %vof
 define amdgpu_ps void @flat_store_saddr_f32_zext_vgpr_offset_neg128(ptr inreg %sbase, i32 %voffset, float %data) {
 ; GFX1250-LABEL: flat_store_saddr_f32_zext_vgpr_offset_neg128:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    flat_store_b32 v0, v1, s[2:3] offset:-128
 ; GFX1250-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
@@ -214,6 +229,7 @@ define amdgpu_ps void @flat_store_saddr_f32_zext_vgpr_offset_neg128(ptr inreg %s
 define amdgpu_ps void @flat_store_saddr_p3_zext_vgpr(ptr inreg %sbase, i32 %voffset, ptr addrspace(3) %data) {
 ; GFX1250-LABEL: flat_store_saddr_p3_zext_vgpr:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    flat_store_b32 v0, v1, s[2:3]
 ; GFX1250-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
@@ -225,6 +241,7 @@ define amdgpu_ps void @flat_store_saddr_p3_zext_vgpr(ptr inreg %sbase, i32 %voff
 define amdgpu_ps void @flat_store_saddr_p3_zext_vgpr_offset_neg128(ptr inreg %sbase, i32 %voffset, ptr addrspace(3) %data) {
 ; GFX1250-LABEL: flat_store_saddr_p3_zext_vgpr_offset_neg128:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    flat_store_b32 v0, v1, s[2:3] offset:-128
 ; GFX1250-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
@@ -237,12 +254,14 @@ define amdgpu_ps void @flat_store_saddr_p3_zext_vgpr_offset_neg128(ptr inreg %sb
 define amdgpu_ps void @flat_store_saddr_i64_zext_vgpr(ptr inreg %sbase, i32 %voffset, i64 %data) {
 ; GFX1250-SDAG-LABEL: flat_store_saddr_i64_zext_vgpr:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
 ; GFX1250-SDAG-NEXT:    flat_store_b64 v0, v[2:3], s[2:3]
 ; GFX1250-SDAG-NEXT:    s_endpgm
 ;
 ; GFX1250-GISEL-LABEL: flat_store_saddr_i64_zext_vgpr:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
 ; GFX1250-GISEL-NEXT:    flat_store_b64 v0, v[4:5], s[2:3]
 ; GFX1250-GISEL-NEXT:    s_endpgm
@@ -255,12 +274,14 @@ define amdgpu_ps void @flat_store_saddr_i64_zext_vgpr(ptr inreg %sbase, i32 %vof
 define amdgpu_ps void @flat_store_saddr_i64_zext_vgpr_offset_neg128(ptr inreg %sbase, i32 %voffset, i64 %data) {
 ; GFX1250-SDAG-LABEL: flat_store_saddr_i64_zext_vgpr_offset_neg128:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
 ; GFX1250-SDAG-NEXT:    flat_store_b64 v0, v[2:3], s[2:3] offset:-128
 ; GFX1250-SDAG-NEXT:    s_endpgm
 ;
 ; GFX1250-GISEL-LABEL: flat_store_saddr_i64_zext_vgpr_offset_neg128:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
 ; GFX1250-GISEL-NEXT:    flat_store_b64 v0, v[4:5], s[2:3] offset:-128
 ; GFX1250-GISEL-NEXT:    s_endpgm
@@ -274,12 +295,14 @@ define amdgpu_ps void @flat_store_saddr_i64_zext_vgpr_offset_neg128(ptr inreg %s
 define amdgpu_ps void @flat_store_saddr_f64_zext_vgpr(ptr inreg %sbase, i32 %voffset, double %data) {
 ; GFX1250-SDAG-LABEL: flat_store_saddr_f64_zext_vgpr:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
 ; GFX1250-SDAG-NEXT:    flat_store_b64 v0, v[2:3], s[2:3]
 ; GFX1250-SDAG-NEXT:    s_endpgm
 ;
 ; GFX1250-GISEL-LABEL: flat_store_saddr_f64_zext_vgpr:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
 ; GFX1250-GISEL-NEXT:    flat_store_b64 v0, v[4:5], s[2:3]
 ; GFX1250-GISEL-NEXT:    s_endpgm
@@ -292,12 +315,14 @@ define amdgpu_ps void @flat_store_saddr_f64_zext_vgpr(ptr inreg %sbase, i32 %vof
 define amdgpu_ps void @flat_store_saddr_f64_zext_vgpr_offset_neg128(ptr inreg %sbase, i32 %voffset, double %data) {
 ; GFX1250-SDAG-LABEL: flat_store_saddr_f64_zext_vgpr_offset_neg128:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
 ; GFX1250-SDAG-NEXT:    flat_store_b64 v0, v[2:3], s[2:3] offset:-128
 ; GFX1250-SDAG-NEXT:    s_endpgm
 ;
 ; GFX1250-GISEL-LABEL: flat_store_saddr_f64_zext_vgpr_offset_neg128:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
 ; GFX1250-GISEL-NEXT:    flat_store_b64 v0, v[4:5], s[2:3] offset:-128
 ; GFX1250-GISEL-NEXT:    s_endpgm
@@ -311,12 +336,14 @@ define amdgpu_ps void @flat_store_saddr_f64_zext_vgpr_offset_neg128(ptr inreg %s
 define amdgpu_ps void @flat_store_saddr_v2i32_zext_vgpr(ptr inreg %sbase, i32 %voffset, <2 x i32> %data) {
 ; GFX1250-SDAG-LABEL: flat_store_saddr_v2i32_zext_vgpr:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
 ; GFX1250-SDAG-NEXT:    flat_store_b64 v0, v[2:3], s[2:3]
 ; GFX1250-SDAG-NEXT:    s_endpgm
 ;
 ; GFX1250-GISEL-LABEL: flat_store_saddr_v2i32_zext_vgpr:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
 ; GFX1250-GISEL-NEXT:    flat_store_b64 v0, v[4:5], s[2:3]
 ; GFX1250-GISEL-NEXT:    s_endpgm
@@ -329,12 +356,14 @@ define amdgpu_ps void @flat_store_saddr_v2i32_zext_vgpr(ptr inreg %sbase, i32 %v
 define amdgpu_ps void @flat_store_saddr_v2i32_zext_vgpr_offset_neg128(ptr inreg %sbase, i32 %voffset, <2 x i32> %data) {
 ; GFX1250-SDAG-LABEL: flat_store_saddr_v2i32_zext_vgpr_offset_neg128:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
 ; GFX1250-SDAG-NEXT:    flat_store_b64 v0, v[2:3], s[2:3] offset:-128
 ; GFX1250-SDAG-NEXT:    s_endpgm
 ;
 ; GFX1250-GISEL-LABEL: flat_store_saddr_v2i32_zext_vgpr_offset_neg128:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
 ; GFX1250-GISEL-NEXT:    flat_store_b64 v0, v[4:5], s[2:3] offset:-128
 ; GFX1250-GISEL-NEXT:    s_endpgm
@@ -348,12 +377,14 @@ define amdgpu_ps void @flat_store_saddr_v2i32_zext_vgpr_offset_neg128(ptr inreg
 define amdgpu_ps void @flat_store_saddr_v2f32_zext_vgpr(ptr inreg %sbase, i32 %voffset, <2 x float> %data) {
 ; GFX1250-SDAG-LABEL: flat_store_saddr_v2f32_zext_vgpr:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
 ; GFX1250-SDAG-NEXT:    flat_store_b64 v0, v[2:3], s[2:3]
 ; GFX1250-SDAG-NEXT:    s_endpgm
 ;
 ; GFX1250-GISEL-LABEL: flat_store_saddr_v2f32_zext_vgpr:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
 ; GFX1250-GISEL-NEXT:    flat_store_b64 v0, v[4:5], s[2:3]
 ; GFX1250-GISEL-NEXT:    s_endpgm
@@ -366,12 +397,14 @@ define amdgpu_ps void @flat_store_saddr_v2f32_zext_vgpr(ptr inreg %sbase, i32 %v
 define amdgpu_ps void @flat_store_saddr_v2f32_zext_vgpr_offset_neg128(ptr inreg %sbase, i32 %voffset, <2 x float> %data) {
 ; GFX1250-SDAG-LABEL: flat_store_saddr_v2f32_zext_vgpr_offset_neg128:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
 ; GFX1250-SDAG-NEXT:    flat_store_b64 v0, v[2:3], s[2:3] offset:-128
 ; GFX1250-SDAG-NEXT:    s_endpgm
 ;
 ; GFX1250-GISEL-LABEL: flat_store_saddr_v2f32_zext_vgpr_offset_neg128:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
 ; GFX1250-GISEL-NEXT:    flat_store_b64 v0, v[4:5], s[2:3] offset:-128
 ; GFX1250-GISEL-NEXT:    s_endpgm
@@ -385,12 +418,14 @@ define amdgpu_ps void @flat_store_saddr_v2f32_zext_vgpr_offset_neg128(ptr inreg
 define amdgpu_ps void @flat_store_saddr_v4i16_zext_vgpr(ptr inreg %sbase, i32 %voffset, <4 x i16> %data) {
 ; GFX1250-SDAG-LABEL: flat_store_saddr_v4i16_zext_vgpr:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
 ; GFX1250-SDAG-NEXT:    flat_store_b64 v0, v[2:3], s[2:3]
 ; GFX1250-SDAG-NEXT:    s_endpgm
 ;
 ; GFX1250-GISEL-LABEL: flat_store_saddr_v4i16_zext_vgpr:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
 ; GFX1250-GISEL-NEXT:    flat_store_b64 v0, v[4:5], s[2:3]
 ; GFX1250-GISEL-NEXT:    s_endpgm
@@ -403,12 +438,14 @@ define amdgpu_ps void @flat_store_saddr_v4i16_zext_vgpr(ptr inreg %sbase, i32 %v
 define amdgpu_ps void @flat_store_saddr_v4i16_zext_vgpr_offset_neg128(ptr inreg %sbase, i32 %voffset, <4 x i16> %data) {
 ; GFX1250-SDAG-LABEL: flat_store_saddr_v4i16_zext_vgpr_offset_neg128:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
 ; GFX1250-SDAG-NEXT:    flat_store_b64 v0, v[2:3], s[2:3] offset:-128
 ; GFX1250-SDAG-NEXT:    s_endpgm
 ;
 ; GFX1250-GISEL-LABEL: flat_store_saddr_v4i16_zext_vgpr_offset_neg128:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
 ; GFX1250-GISEL-NEXT:    flat_store_b64 v0, v[4:5], s[2:3] offset:-128
 ; GFX1250-GISEL-NEXT:    s_endpgm
@@ -422,12 +459,14 @@ define amdgpu_ps void @flat_store_saddr_v4i16_zext_vgpr_offset_neg128(ptr inreg
 define amdgpu_ps void @flat_store_saddr_v4f16_zext_vgpr(ptr inreg %sbase, i32 %voffset, <4 x half> %data) {
 ; GFX1250-SDAG-LABEL: flat_store_saddr_v4f16_zext_vgpr:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
 ; GFX1250-SDAG-NEXT:    flat_store_b64 v0, v[2:3], s[2:3]
 ; GFX1250-SDAG-NEXT:    s_endpgm
 ;
 ; GFX1250-GISEL-LABEL: flat_store_saddr_v4f16_zext_vgpr:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
 ; GFX1250-GISEL-NEXT:    flat_store_b64 v0, v[4:5], s[2:3]
 ; GFX1250-GISEL-NEXT:    s_endpgm
@@ -440,12 +479,14 @@ define amdgpu_ps void @flat_store_saddr_v4f16_zext_vgpr(ptr inreg %sbase, i32 %v
 define amdgpu_ps void @flat_store_saddr_v4f16_zext_vgpr_offset_neg128(ptr inreg %sbase, i32 %voffset, <4 x half> %data) {
 ; GFX1250-SDAG-LABEL: flat_store_saddr_v4f16_zext_vgpr_offset_neg128:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
 ; GFX1250-SDAG-NEXT:    flat_store_b64 v0, v[2:3], s[2:3] offset:-128
 ; GFX1250-SDAG-NEXT:    s_endpgm
 ;
 ; GFX1250-GISEL-LABEL: flat_store_saddr_v4f16_zext_vgpr_offset_neg128:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
 ; GFX1250-GISEL-NEXT:    flat_store_b64 v0, v[4:5], s[2:3] offset:-128
 ; GFX1250-GISEL-NEXT:    s_endpgm
@@ -459,12 +500,14 @@ define amdgpu_ps void @flat_store_saddr_v4f16_zext_vgpr_offset_neg128(ptr inreg
 define amdgpu_ps void @flat_store_saddr_p1_zext_vgpr(ptr inreg %sbase, i32 %voffset, ptr %data) {
 ; GFX1250-SDAG-LABEL: flat_store_saddr_p1_zext_vgpr:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
 ; GFX1250-SDAG-NEXT:    flat_store_b64 v0, v[2:3], s[2:3]
 ; GFX1250-SDAG-NEXT:    s_endpgm
 ;
 ; GFX1250-GISEL-LABEL: flat_store_saddr_p1_zext_vgpr:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
 ; GFX1250-GISEL-NEXT:    flat_store_b64 v0, v[4:5], s[2:3]
 ; GFX1250-GISEL-NEXT:    s_endpgm
@@ -477,12 +520,14 @@ define amdgpu_ps void @flat_store_saddr_p1_zext_vgpr(ptr inreg %sbase, i32 %voff
 define amdgpu_ps void @flat_store_saddr_p1_zext_vgpr_offset_neg128(ptr inreg %sbase, i32 %voffset, ptr %data) {
 ; GFX1250-SDAG-LABEL: flat_store_saddr_p1_zext_vgpr_offset_neg128:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
 ; GFX1250-SDAG-NEXT:    flat_store_b64 v0, v[2:3], s[2:3] offset:-128
 ; GFX1250-SDAG-NEXT:    s_endpgm
 ;
 ; GFX1250-GISEL-LABEL: flat_store_saddr_p1_zext_vgpr_offset_neg128:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
 ; GFX1250-GISEL-NEXT:    flat_store_b64 v0, v[4:5], s[2:3] offset:-128
 ; GFX1250-GISEL-NEXT:    s_endpgm
@@ -496,6 +541,7 @@ define amdgpu_ps void @flat_store_saddr_p1_zext_vgpr_offset_neg128(ptr inreg %sb
 define amdgpu_ps void @flat_store_saddr_v3i32_zext_vgpr(ptr inreg %sbase, i32 %voffset, <3 x i32> %data) {
 ; GFX1250-SDAG-LABEL: flat_store_saddr_v3i32_zext_vgpr:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v4, v3 :: v_dual_mov_b32 v3, v2
 ; GFX1250-SDAG-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX1250-SDAG-NEXT:    flat_store_b96 v0, v[2:4], s[2:3]
@@ -503,6 +549,7 @@ define amdgpu_ps void @flat_store_saddr_v3i32_zext_vgpr(ptr inreg %sbase, i32 %v
 ;
 ; GFX1250-GISEL-LABEL: flat_store_saddr_v3i32_zext_vgpr:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
 ; GFX1250-GISEL-NEXT:    v_mov_b32_e32 v6, v3
 ; GFX1250-GISEL-NEXT:    flat_store_b96 v0, v[4:6], s[2:3]
@@ -516,6 +563,7 @@ define amdgpu_ps void @flat_store_saddr_v3i32_zext_vgpr(ptr inreg %sbase, i32 %v
 define amdgpu_ps void @flat_store_saddr_v3i32_zext_vgpr_offset_neg128(ptr inreg %sbase, i32 %voffset, <3 x i32> %data) {
 ; GFX1250-SDAG-LABEL: flat_store_saddr_v3i32_zext_vgpr_offset_neg128:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v4, v3 :: v_dual_mov_b32 v3, v2
 ; GFX1250-SDAG-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX1250-SDAG-NEXT:    flat_store_b96 v0, v[2:4], s[2:3] offset:-128
@@ -523,6 +571,7 @@ define amdgpu_ps void @flat_store_saddr_v3i32_zext_vgpr_offset_neg128(ptr inreg
 ;
 ; GFX1250-GISEL-LABEL: flat_store_saddr_v3i32_zext_vgpr_offset_neg128:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
 ; GFX1250-GISEL-NEXT:    v_mov_b32_e32 v6, v3
 ; GFX1250-GISEL-NEXT:    flat_store_b96 v0, v[4:6], s[2:3] offset:-128
@@ -537,6 +586,7 @@ define amdgpu_ps void @flat_store_saddr_v3i32_zext_vgpr_offset_neg128(ptr inreg
 define amdgpu_ps void @flat_store_saddr_v3f32_zext_vgpr(ptr inreg %sbase, i32 %voffset, <3 x float> %data) {
 ; GFX1250-SDAG-LABEL: flat_store_saddr_v3f32_zext_vgpr:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v4, v3 :: v_dual_mov_b32 v3, v2
 ; GFX1250-SDAG-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX1250-SDAG-NEXT:    flat_store_b96 v0, v[2:4], s[2:3]
@@ -544,6 +594,7 @@ define amdgpu_ps void @flat_store_saddr_v3f32_zext_vgpr(ptr inreg %sbase, i32 %v
 ;
 ; GFX1250-GISEL-LABEL: flat_store_saddr_v3f32_zext_vgpr:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
 ; GFX1250-GISEL-NEXT:    v_mov_b32_e32 v6, v3
 ; GFX1250-GISEL-NEXT:    flat_store_b96 v0, v[4:6], s[2:3]
@@ -557,6 +608,7 @@ define amdgpu_ps void @flat_store_saddr_v3f32_zext_vgpr(ptr inreg %sbase, i32 %v
 define amdgpu_ps void @flat_store_saddr_v3f32_zext_vgpr_offset_neg128(ptr inreg %sbase, i32 %voffset, <3 x float> %data) {
 ; GFX1250-SDAG-LABEL: flat_store_saddr_v3f32_zext_vgpr_offset_neg128:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v4, v3 :: v_dual_mov_b32 v3, v2
 ; GFX1250-SDAG-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX1250-SDAG-NEXT:    flat_store_b96 v0, v[2:4], s[2:3] offset:-128
@@ -564,6 +616,7 @@ define amdgpu_ps void @flat_store_saddr_v3f32_zext_vgpr_offset_neg128(ptr inreg
 ;
 ; GFX1250-GISEL-LABEL: flat_store_saddr_v3f32_zext_vgpr_offset_neg128:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
 ; GFX1250-GISEL-NEXT:    v_mov_b32_e32 v6, v3
 ; GFX1250-GISEL-NEXT:    flat_store_b96 v0, v[4:6], s[2:3] offset:-128
@@ -578,6 +631,7 @@ define amdgpu_ps void @flat_store_saddr_v3f32_zext_vgpr_offset_neg128(ptr inreg
 define amdgpu_ps void @flat_store_saddr_v6i16_zext_vgpr(ptr inreg %sbase, i32 %voffset, <6 x i16> %data) {
 ; GFX1250-SDAG-LABEL: flat_store_saddr_v6i16_zext_vgpr:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v4, v3 :: v_dual_mov_b32 v3, v2
 ; GFX1250-SDAG-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX1250-SDAG-NEXT:    flat_store_b96 v0, v[2:4], s[2:3]
@@ -585,6 +639,7 @@ define amdgpu_ps void @flat_store_saddr_v6i16_zext_vgpr(ptr inreg %sbase, i32 %v
 ;
 ; GFX1250-GISEL-LABEL: flat_store_saddr_v6i16_zext_vgpr:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
 ; GFX1250-GISEL-NEXT:    v_mov_b32_e32 v6, v3
 ; GFX1250-GISEL-NEXT:    flat_store_b96 v0, v[4:6], s[2:3]
@@ -598,6 +653,7 @@ define amdgpu_ps void @flat_store_saddr_v6i16_zext_vgpr(ptr inreg %sbase, i32 %v
 define amdgpu_ps void @flat_store_saddr_v6i16_zext_vgpr_offset_neg128(ptr inreg %sbase, i32 %voffset, <6 x i16> %data) {
 ; GFX1250-SDAG-LABEL: flat_store_saddr_v6i16_zext_vgpr_offset_neg128:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v4, v3 :: v_dual_mov_b32 v3, v2
 ; GFX1250-SDAG-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX1250-SDAG-NEXT:    flat_store_b96 v0, v[2:4], s[2:3] offset:-128
@@ -605,6 +661,7 @@ define amdgpu_ps void @flat_store_saddr_v6i16_zext_vgpr_offset_neg128(ptr inreg
 ;
 ; GFX1250-GISEL-LABEL: flat_store_saddr_v6i16_zext_vgpr_offset_neg128:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
 ; GFX1250-GISEL-NEXT:    v_mov_b32_e32 v6, v3
 ; GFX1250-GISEL-NEXT:    flat_store_b96 v0, v[4:6], s[2:3] offset:-128
@@ -619,6 +676,7 @@ define amdgpu_ps void @flat_store_saddr_v6i16_zext_vgpr_offset_neg128(ptr inreg
 define amdgpu_ps void @flat_store_saddr_v6f16_zext_vgpr(ptr inreg %sbase, i32 %voffset, <6 x half> %data) {
 ; GFX1250-SDAG-LABEL: flat_store_saddr_v6f16_zext_vgpr:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v4, v3 :: v_dual_mov_b32 v3, v2
 ; GFX1250-SDAG-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX1250-SDAG-NEXT:    flat_store_b96 v0, v[2:4], s[2:3]
@@ -626,6 +684,7 @@ define amdgpu_ps void @flat_store_saddr_v6f16_zext_vgpr(ptr inreg %sbase, i32 %v
 ;
 ; GFX1250-GISEL-LABEL: flat_store_saddr_v6f16_zext_vgpr:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
 ; GFX1250-GISEL-NEXT:    v_mov_b32_e32 v6, v3
 ; GFX1250-GISEL-NEXT:    flat_store_b96 v0, v[4:6], s[2:3]
@@ -639,6 +698,7 @@ define amdgpu_ps void @flat_store_saddr_v6f16_zext_vgpr(ptr inreg %sbase, i32 %v
 define amdgpu_ps void @flat_store_saddr_v6f16_zext_vgpr_offset_neg128(ptr inreg %sbase, i32 %voffset, <6 x half> %data) {
 ; GFX1250-SDAG-LABEL: flat_store_saddr_v6f16_zext_vgpr_offset_neg128:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v4, v3 :: v_dual_mov_b32 v3, v2
 ; GFX1250-SDAG-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX1250-SDAG-NEXT:    flat_store_b96 v0, v[2:4], s[2:3] offset:-128
@@ -646,6 +706,7 @@ define amdgpu_ps void @flat_store_saddr_v6f16_zext_vgpr_offset_neg128(ptr inreg
 ;
 ; GFX1250-GISEL-LABEL: flat_store_saddr_v6f16_zext_vgpr_offset_neg128:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
 ; GFX1250-GISEL-NEXT:    v_mov_b32_e32 v6, v3
 ; GFX1250-GISEL-NEXT:    flat_store_b96 v0, v[4:6], s[2:3] offset:-128
@@ -660,6 +721,7 @@ define amdgpu_ps void @flat_store_saddr_v6f16_zext_vgpr_offset_neg128(ptr inreg
 define amdgpu_ps void @flat_store_saddr_v4i32_zext_vgpr(ptr inreg %sbase, i32 %voffset, <4 x i32> %data) {
 ; GFX1250-SDAG-LABEL: flat_store_saddr_v4i32_zext_vgpr:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
 ; GFX1250-SDAG-NEXT:    flat_store_b128 v0, v[2:5], s[2:3]
@@ -667,6 +729,7 @@ define amdgpu_ps void @flat_store_saddr_v4i32_zext_vgpr(ptr inreg %sbase, i32 %v
 ;
 ; GFX1250-GISEL-LABEL: flat_store_saddr_v4i32_zext_vgpr:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v7, v2
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v9, v4
 ; GFX1250-GISEL-NEXT:    flat_store_b128 v0, v[6:9], s[2:3]
@@ -680,6 +743,7 @@ define amdgpu_ps void @flat_store_saddr_v4i32_zext_vgpr(ptr inreg %sbase, i32 %v
 define amdgpu_ps void @flat_store_saddr_v4i32_zext_vgpr_offset_neg128(ptr inreg %sbase, i32 %voffset, <4 x i32> %data) {
 ; GFX1250-SDAG-LABEL: flat_store_saddr_v4i32_zext_vgpr_offset_neg128:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
 ; GFX1250-SDAG-NEXT:    flat_store_b128 v0, v[2:5], s[2:3] offset:-128
@@ -687,6 +751,7 @@ define amdgpu_ps void @flat_store_saddr_v4i32_zext_vgpr_offset_neg128(ptr inreg
 ;
 ; GFX1250-GISEL-LABEL: flat_store_saddr_v4i32_zext_vgpr_offset_neg128:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v7, v2
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v9, v4
 ; GFX1250-GISEL-NEXT:    flat_store_b128 v0, v[6:9], s[2:3] offset:-128
@@ -701,6 +766,7 @@ define amdgpu_ps void @flat_store_saddr_v4i32_zext_vgpr_offset_neg128(ptr inreg
 define amdgpu_ps void @flat_store_saddr_v4f32_zext_vgpr(ptr inreg %sbase, i32 %voffset, <4 x float> %data) {
 ; GFX1250-SDAG-LABEL: flat_store_saddr_v4f32_zext_vgpr:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
 ; GFX1250-SDAG-NEXT:    flat_store_b128 v0, v[2:5], s[2:3]
@@ -708,6 +774,7 @@ define amdgpu_ps void @flat_store_saddr_v4f32_zext_vgpr(ptr inreg %sbase, i32 %v
 ;
 ; GFX1250-GISEL-LABEL: flat_store_saddr_v4f32_zext_vgpr:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v7, v2
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v9, v4
 ; GFX1250-GISEL-NEXT:    flat_store_b128 v0, v[6:9], s[2:3]
@@ -721,6 +788,7 @@ define amdgpu_ps void @flat_store_saddr_v4f32_zext_vgpr(ptr inreg %sbase, i32 %v
 define amdgpu_ps void @flat_store_saddr_v4f32_zext_vgpr_offset_neg128(ptr inreg %sbase, i32 %voffset, <4 x float> %data) {
 ; GFX1250-SDAG-LABEL: flat_store_saddr_v4f32_zext_vgpr_offset_neg128:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
 ; GFX1250-SDAG-NEXT:    flat_store_b128 v0, v[2:5], s[2:3] offset:-128
@@ -728,6 +796,7 @@ define amdgpu_ps void @flat_store_saddr_v4f32_zext_vgpr_offset_neg128(ptr inreg
 ;
 ; GFX1250-GISEL-LABEL: flat_store_saddr_v4f32_zext_vgpr_offset_neg128:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v7, v2
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v9, v4
 ; GFX1250-GISEL-NEXT:    flat_store_b128 v0, v[6:9], s[2:3] offset:-128
@@ -742,6 +811,7 @@ define amdgpu_ps void @flat_store_saddr_v4f32_zext_vgpr_offset_neg128(ptr inreg
 define amdgpu_ps void @flat_store_saddr_v2i64_zext_vgpr(ptr inreg %sbase, i32 %voffset, <2 x i64> %data) {
 ; GFX1250-SDAG-LABEL: flat_store_saddr_v2i64_zext_vgpr:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
 ; GFX1250-SDAG-NEXT:    flat_store_b128 v0, v[2:5], s[2:3]
@@ -749,6 +819,7 @@ define amdgpu_ps void @flat_store_saddr_v2i64_zext_vgpr(ptr inreg %sbase, i32 %v
 ;
 ; GFX1250-GISEL-LABEL: flat_store_saddr_v2i64_zext_vgpr:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v7, v2
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v9, v4
 ; GFX1250-GISEL-NEXT:    flat_store_b128 v0, v[6:9], s[2:3]
@@ -762,6 +833,7 @@ define amdgpu_ps void @flat_store_saddr_v2i64_zext_vgpr(ptr inreg %sbase, i32 %v
 define amdgpu_ps void @flat_store_saddr_v2i64_zext_vgpr_offset_neg128(ptr inreg %sbase, i32 %voffset, <2 x i64> %data) {
 ; GFX1250-SDAG-LABEL: flat_store_saddr_v2i64_zext_vgpr_offset_neg128:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
 ; GFX1250-SDAG-NEXT:    flat_store_b128 v0, v[2:5], s[2:3] offset:-128
@@ -769,6 +841,7 @@ define amdgpu_ps void @flat_store_saddr_v2i64_zext_vgpr_offset_neg128(ptr inreg
 ;
 ; GFX1250-GISEL-LABEL: flat_store_saddr_v2i64_zext_vgpr_offset_neg128:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v7, v2
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v9, v4
 ; GFX1250-GISEL-NEXT:    flat_store_b128 v0, v[6:9], s[2:3] offset:-128
@@ -783,6 +856,7 @@ define amdgpu_ps void @flat_store_saddr_v2i64_zext_vgpr_offset_neg128(ptr inreg
 define amdgpu_ps void @flat_store_saddr_v2f64_zext_vgpr(ptr inreg %sbase, i32 %voffset, <2 x double> %data) {
 ; GFX1250-SDAG-LABEL: flat_store_saddr_v2f64_zext_vgpr:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
 ; GFX1250-SDAG-NEXT:    flat_store_b128 v0, v[2:5], s[2:3]
@@ -790,6 +864,7 @@ define amdgpu_ps void @flat_store_saddr_v2f64_zext_vgpr(ptr inreg %sbase, i32 %v
 ;
 ; GFX1250-GISEL-LABEL: flat_store_saddr_v2f64_zext_vgpr:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v7, v2
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v9, v4
 ; GFX1250-GISEL-NEXT:    flat_store_b128 v0, v[6:9], s[2:3]
@@ -803,6 +878,7 @@ define amdgpu_ps void @flat_store_saddr_v2f64_zext_vgpr(ptr inreg %sbase, i32 %v
 define amdgpu_ps void @flat_store_saddr_v2f64_zext_vgpr_offset_neg128(ptr inreg %sbase, i32 %voffset, <2 x double> %data) {
 ; GFX1250-SDAG-LABEL: flat_store_saddr_v2f64_zext_vgpr_offset_neg128:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
 ; GFX1250-SDAG-NEXT:    flat_store_b128 v0, v[2:5], s[2:3] offset:-128
@@ -810,6 +886,7 @@ define amdgpu_ps void @flat_store_saddr_v2f64_zext_vgpr_offset_neg128(ptr inreg
 ;
 ; GFX1250-GISEL-LABEL: flat_store_saddr_v2f64_zext_vgpr_offset_neg128:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v7, v2
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v9, v4
 ; GFX1250-GISEL-NEXT:    flat_store_b128 v0, v[6:9], s[2:3] offset:-128
@@ -824,6 +901,7 @@ define amdgpu_ps void @flat_store_saddr_v2f64_zext_vgpr_offset_neg128(ptr inreg
 define amdgpu_ps void @flat_store_saddr_v8i16_zext_vgpr(ptr inreg %sbase, i32 %voffset, <8 x i16> %data) {
 ; GFX1250-SDAG-LABEL: flat_store_saddr_v8i16_zext_vgpr:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
 ; GFX1250-SDAG-NEXT:    flat_store_b128 v0, v[2:5], s[2:3]
@@ -831,6 +909,7 @@ define amdgpu_ps void @flat_store_saddr_v8i16_zext_vgpr(ptr inreg %sbase, i32 %v
 ;
 ; GFX1250-GISEL-LABEL: flat_store_saddr_v8i16_zext_vgpr:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v7, v2
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v9, v4
 ; GFX1250-GISEL-NEXT:    flat_store_b128 v0, v[6:9], s[2:3]
@@ -844,6 +923,7 @@ define amdgpu_ps void @flat_store_saddr_v8i16_zext_vgpr(ptr inreg %sbase, i32 %v
 define amdgpu_ps void @flat_store_saddr_v8i16_zext_vgpr_offset_neg128(ptr inreg %sbase, i32 %voffset, <8 x i16> %data) {
 ; GFX1250-SDAG-LABEL: flat_store_saddr_v8i16_zext_vgpr_offset_neg128:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
 ; GFX1250-SDAG-NEXT:    flat_store_b128 v0, v[2:5], s[2:3] offset:-128
@@ -851,6 +931,7 @@ define amdgpu_ps void @flat_store_saddr_v8i16_zext_vgpr_offset_neg128(ptr inreg
 ;
 ; GFX1250-GISEL-LABEL: flat_store_saddr_v8i16_zext_vgpr_offset_neg128:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v7, v2
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v9, v4
 ; GFX1250-GISEL-NEXT:    flat_store_b128 v0, v[6:9], s[2:3] offset:-128
@@ -865,6 +946,7 @@ define amdgpu_ps void @flat_store_saddr_v8i16_zext_vgpr_offset_neg128(ptr inreg
 define amdgpu_ps void @flat_store_saddr_v8f16_zext_vgpr(ptr inreg %sbase, i32 %voffset, <8 x half> %data) {
 ; GFX1250-SDAG-LABEL: flat_store_saddr_v8f16_zext_vgpr:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
 ; GFX1250-SDAG-NEXT:    flat_store_b128 v0, v[2:5], s[2:3]
@@ -872,6 +954,7 @@ define amdgpu_ps void @flat_store_saddr_v8f16_zext_vgpr(ptr inreg %sbase, i32 %v
 ;
 ; GFX1250-GISEL-LABEL: flat_store_saddr_v8f16_zext_vgpr:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v7, v2
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v9, v4
 ; GFX1250-GISEL-NEXT:    flat_store_b128 v0, v[6:9], s[2:3]
@@ -885,6 +968,7 @@ define amdgpu_ps void @flat_store_saddr_v8f16_zext_vgpr(ptr inreg %sbase, i32 %v
 define amdgpu_ps void @flat_store_saddr_v8f16_zext_vgpr_offset_neg128(ptr inreg %sbase, i32 %voffset, <8 x half> %data) {
 ; GFX1250-SDAG-LABEL: flat_store_saddr_v8f16_zext_vgpr_offset_neg128:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
 ; GFX1250-SDAG-NEXT:    flat_store_b128 v0, v[2:5], s[2:3] offset:-128
@@ -892,6 +976,7 @@ define amdgpu_ps void @flat_store_saddr_v8f16_zext_vgpr_offset_neg128(ptr inreg
 ;
 ; GFX1250-GISEL-LABEL: flat_store_saddr_v8f16_zext_vgpr_offset_neg128:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v7, v2
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v9, v4
 ; GFX1250-GISEL-NEXT:    flat_store_b128 v0, v[6:9], s[2:3] offset:-128
@@ -906,6 +991,7 @@ define amdgpu_ps void @flat_store_saddr_v8f16_zext_vgpr_offset_neg128(ptr inreg
 define amdgpu_ps void @flat_store_saddr_v2p1_zext_vgpr(ptr inreg %sbase, i32 %voffset, <2 x ptr> %data) {
 ; GFX1250-SDAG-LABEL: flat_store_saddr_v2p1_zext_vgpr:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
 ; GFX1250-SDAG-NEXT:    flat_store_b128 v0, v[2:5], s[2:3]
@@ -913,6 +999,7 @@ define amdgpu_ps void @flat_store_saddr_v2p1_zext_vgpr(ptr inreg %sbase, i32 %vo
 ;
 ; GFX1250-GISEL-LABEL: flat_store_saddr_v2p1_zext_vgpr:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v7, v2
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v9, v4
 ; GFX1250-GISEL-NEXT:    flat_store_b128 v0, v[6:9], s[2:3]
@@ -926,6 +1013,7 @@ define amdgpu_ps void @flat_store_saddr_v2p1_zext_vgpr(ptr inreg %sbase, i32 %vo
 define amdgpu_ps void @flat_store_saddr_v2p1_zext_vgpr_offset_neg128(ptr inreg %sbase, i32 %voffset, <2 x ptr> %data) {
 ; GFX1250-SDAG-LABEL: flat_store_saddr_v2p1_zext_vgpr_offset_neg128:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
 ; GFX1250-SDAG-NEXT:    flat_store_b128 v0, v[2:5], s[2:3] offset:-128
@@ -933,6 +1021,7 @@ define amdgpu_ps void @flat_store_saddr_v2p1_zext_vgpr_offset_neg128(ptr inreg %
 ;
 ; GFX1250-GISEL-LABEL: flat_store_saddr_v2p1_zext_vgpr_offset_neg128:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v7, v2
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v9, v4
 ; GFX1250-GISEL-NEXT:    flat_store_b128 v0, v[6:9], s[2:3] offset:-128
@@ -947,6 +1036,7 @@ define amdgpu_ps void @flat_store_saddr_v2p1_zext_vgpr_offset_neg128(ptr inreg %
 define amdgpu_ps void @flat_store_saddr_v4p3_zext_vgpr(ptr inreg %sbase, i32 %voffset, <4 x ptr addrspace(3)> %data) {
 ; GFX1250-SDAG-LABEL: flat_store_saddr_v4p3_zext_vgpr:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
 ; GFX1250-SDAG-NEXT:    flat_store_b128 v0, v[2:5], s[2:3]
@@ -954,6 +1044,7 @@ define amdgpu_ps void @flat_store_saddr_v4p3_zext_vgpr(ptr inreg %sbase, i32 %vo
 ;
 ; GFX1250-GISEL-LABEL: flat_store_saddr_v4p3_zext_vgpr:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v7, v2
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v9, v4
 ; GFX1250-GISEL-NEXT:    flat_store_b128 v0, v[6:9], s[2:3]
@@ -967,6 +1058,7 @@ define amdgpu_ps void @flat_store_saddr_v4p3_zext_vgpr(ptr inreg %sbase, i32 %vo
 define amdgpu_ps void @flat_store_saddr_v4p3_zext_vgpr_offset_neg128(ptr inreg %sbase, i32 %voffset, <4 x ptr addrspace(3)> %data) {
 ; GFX1250-SDAG-LABEL: flat_store_saddr_v4p3_zext_vgpr_offset_neg128:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
 ; GFX1250-SDAG-NEXT:    flat_store_b128 v0, v[2:5], s[2:3] offset:-128
@@ -974,6 +1066,7 @@ define amdgpu_ps void @flat_store_saddr_v4p3_zext_vgpr_offset_neg128(ptr inreg %
 ;
 ; GFX1250-GISEL-LABEL: flat_store_saddr_v4p3_zext_vgpr_offset_neg128:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v7, v2
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v9, v4
 ; GFX1250-GISEL-NEXT:    flat_store_b128 v0, v[6:9], s[2:3] offset:-128
@@ -992,6 +1085,7 @@ define amdgpu_ps void @flat_store_saddr_v4p3_zext_vgpr_offset_neg128(ptr inreg %
 define amdgpu_ps void @atomic_flat_store_saddr_i32_zext_vgpr(ptr inreg %sbase, i32 %voffset, i32 %data) {
 ; GFX1250-LABEL: atomic_flat_store_saddr_i32_zext_vgpr:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    global_wb scope:SCOPE_SYS
 ; GFX1250-NEXT:    s_wait_storecnt 0x0
 ; GFX1250-NEXT:    flat_store_b32 v0, v1, s[2:3] scope:SCOPE_SYS
@@ -1005,6 +1099,7 @@ define amdgpu_ps void @atomic_flat_store_saddr_i32_zext_vgpr(ptr inreg %sbase, i
 define amdgpu_ps void @atomic_flat_store_saddr_i32_zext_vgpr_offset_neg128(ptr inreg %sbase, i32 %voffset, i32 %data) {
 ; GFX1250-LABEL: atomic_flat_store_saddr_i32_zext_vgpr_offset_neg128:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    global_wb scope:SCOPE_SYS
 ; GFX1250-NEXT:    s_wait_storecnt 0x0
 ; GFX1250-NEXT:    flat_store_b32 v0, v1, s[2:3] offset:-128 scope:SCOPE_SYS
@@ -1019,6 +1114,7 @@ define amdgpu_ps void @atomic_flat_store_saddr_i32_zext_vgpr_offset_neg128(ptr i
 define amdgpu_ps void @atomic_flat_store_saddr_i64_zext_vgpr(ptr inreg %sbase, i32 %voffset, i64 %data) {
 ; GFX1250-SDAG-LABEL: atomic_flat_store_saddr_i64_zext_vgpr:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
 ; GFX1250-SDAG-NEXT:    global_wb scope:SCOPE_SYS
 ; GFX1250-SDAG-NEXT:    s_wait_storecnt 0x0
@@ -1027,6 +1123,7 @@ define amdgpu_ps void @atomic_flat_store_saddr_i64_zext_vgpr(ptr inreg %sbase, i
 ;
 ; GFX1250-GISEL-LABEL: atomic_flat_store_saddr_i64_zext_vgpr:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
 ; GFX1250-GISEL-NEXT:    global_wb scope:SCOPE_SYS
 ; GFX1250-GISEL-NEXT:    s_wait_storecnt 0x0
@@ -1041,6 +1138,7 @@ define amdgpu_ps void @atomic_flat_store_saddr_i64_zext_vgpr(ptr inreg %sbase, i
 define amdgpu_ps void @atomic_flat_store_saddr_i64_zext_vgpr_offset_neg128(ptr inreg %sbase, i32 %voffset, i64 %data) {
 ; GFX1250-SDAG-LABEL: atomic_flat_store_saddr_i64_zext_vgpr_offset_neg128:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
 ; GFX1250-SDAG-NEXT:    global_wb scope:SCOPE_SYS
 ; GFX1250-SDAG-NEXT:    s_wait_storecnt 0x0
@@ -1049,6 +1147,7 @@ define amdgpu_ps void @atomic_flat_store_saddr_i64_zext_vgpr_offset_neg128(ptr i
 ;
 ; GFX1250-GISEL-LABEL: atomic_flat_store_saddr_i64_zext_vgpr_offset_neg128:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
 ; GFX1250-GISEL-NEXT:    global_wb scope:SCOPE_SYS
 ; GFX1250-GISEL-NEXT:    s_wait_storecnt 0x0
@@ -1068,6 +1167,7 @@ define amdgpu_ps void @atomic_flat_store_saddr_i64_zext_vgpr_offset_neg128(ptr i
 define amdgpu_ps void @flat_store_saddr_i16_d16hi_zext_vgpr(ptr inreg %sbase, i32 %voffset, <2 x i16> %data) {
 ; GFX1250-LABEL: flat_store_saddr_i16_d16hi_zext_vgpr:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    flat_store_d16_hi_b16 v0, v1, s[2:3]
 ; GFX1250-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
@@ -1080,6 +1180,7 @@ define amdgpu_ps void @flat_store_saddr_i16_d16hi_zext_vgpr(ptr inreg %sbase, i3
 define amdgpu_ps void @flat_store_saddr_i16_d16hi_zext_vgpr_offset_neg128(ptr inreg %sbase, i32 %voffset, <2 x i16> %data) {
 ; GFX1250-LABEL: flat_store_saddr_i16_d16hi_zext_vgpr_offset_neg128:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    flat_store_d16_hi_b16 v0, v1, s[2:3] offset:-128
 ; GFX1250-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
@@ -1093,6 +1194,7 @@ define amdgpu_ps void @flat_store_saddr_i16_d16hi_zext_vgpr_offset_neg128(ptr in
 define amdgpu_ps void @flat_store_saddr_i16_d16hi_trunci8_zext_vgpr(ptr inreg %sbase, i32 %voffset, <2 x i16> %data) {
 ; GFX1250-LABEL: flat_store_saddr_i16_d16hi_trunci8_zext_vgpr:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    flat_store_d16_hi_b8 v0, v1, s[2:3]
 ; GFX1250-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64
@@ -1106,6 +1208,7 @@ define amdgpu_ps void @flat_store_saddr_i16_d16hi_trunci8_zext_vgpr(ptr inreg %s
 define amdgpu_ps void @flat_store_saddr_i16_d16hi_trunci8_zext_vgpr_offset_neg128(ptr inreg %sbase, i32 %voffset, <2 x i16> %data) {
 ; GFX1250-LABEL: flat_store_saddr_i16_d16hi_trunci8_zext_vgpr_offset_neg128:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    flat_store_d16_hi_b8 v0, v1, s[2:3] offset:-128
 ; GFX1250-NEXT:    s_endpgm
   %zext.offset = zext i32 %voffset to i64

diff  --git a/llvm/test/CodeGen/AMDGPU/fmax3.ll b/llvm/test/CodeGen/AMDGPU/fmax3.ll
index 5e6de6d66ccc1..ab9c7f16d54ac 100644
--- a/llvm/test/CodeGen/AMDGPU/fmax3.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmax3.ll
@@ -162,6 +162,7 @@ define amdgpu_kernel void @test_fmax3_olt_0_f32(ptr addrspace(1) %out, ptr addrs
 ;
 ; GFX1250-LABEL: test_fmax3_olt_0_f32:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b256 s[8:15], s[4:5], 0x24
 ; GFX1250-NEXT:    s_mov_b32 s2, -1
 ; GFX1250-NEXT:    s_mov_b32 s3, 0x31016000
@@ -352,6 +353,7 @@ define amdgpu_kernel void @test_fmax3_olt_1_f32(ptr addrspace(1) %out, ptr addrs
 ;
 ; GFX1250-LABEL: test_fmax3_olt_1_f32:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b256 s[8:15], s[4:5], 0x24
 ; GFX1250-NEXT:    s_mov_b32 s2, -1
 ; GFX1250-NEXT:    s_mov_b32 s3, 0x31016000
@@ -609,6 +611,7 @@ define amdgpu_kernel void @test_fmax3_olt_0_f16(ptr addrspace(1) %out, ptr addrs
 ;
 ; GFX1250-TRUE16-LABEL: test_fmax3_olt_0_f16:
 ; GFX1250-TRUE16:       ; %bb.0:
+; GFX1250-TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-TRUE16-NEXT:    s_load_b256 s[8:15], s[4:5], 0x24
 ; GFX1250-TRUE16-NEXT:    s_mov_b32 s2, -1
 ; GFX1250-TRUE16-NEXT:    s_mov_b32 s3, 0x31016000
@@ -639,6 +642,7 @@ define amdgpu_kernel void @test_fmax3_olt_0_f16(ptr addrspace(1) %out, ptr addrs
 ;
 ; GFX1250-FAKE16-LABEL: test_fmax3_olt_0_f16:
 ; GFX1250-FAKE16:       ; %bb.0:
+; GFX1250-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-FAKE16-NEXT:    s_load_b256 s[8:15], s[4:5], 0x24
 ; GFX1250-FAKE16-NEXT:    s_mov_b32 s2, -1
 ; GFX1250-FAKE16-NEXT:    s_mov_b32 s3, 0x31016000
@@ -897,6 +901,7 @@ define amdgpu_kernel void @test_fmax3_olt_1_f16(ptr addrspace(1) %out, ptr addrs
 ;
 ; GFX1250-TRUE16-LABEL: test_fmax3_olt_1_f16:
 ; GFX1250-TRUE16:       ; %bb.0:
+; GFX1250-TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-TRUE16-NEXT:    s_load_b256 s[8:15], s[4:5], 0x24
 ; GFX1250-TRUE16-NEXT:    s_mov_b32 s2, -1
 ; GFX1250-TRUE16-NEXT:    s_mov_b32 s3, 0x31016000
@@ -927,6 +932,7 @@ define amdgpu_kernel void @test_fmax3_olt_1_f16(ptr addrspace(1) %out, ptr addrs
 ;
 ; GFX1250-FAKE16-LABEL: test_fmax3_olt_1_f16:
 ; GFX1250-FAKE16:       ; %bb.0:
+; GFX1250-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-FAKE16-NEXT:    s_load_b256 s[8:15], s[4:5], 0x24
 ; GFX1250-FAKE16-NEXT:    s_mov_b32 s2, -1
 ; GFX1250-FAKE16-NEXT:    s_mov_b32 s3, 0x31016000

diff  --git a/llvm/test/CodeGen/AMDGPU/fmin3.ll b/llvm/test/CodeGen/AMDGPU/fmin3.ll
index 2756472652bc9..7c2aeeb90bd9e 100644
--- a/llvm/test/CodeGen/AMDGPU/fmin3.ll
+++ b/llvm/test/CodeGen/AMDGPU/fmin3.ll
@@ -162,6 +162,7 @@ define amdgpu_kernel void @test_fmin3_olt_0_f32(ptr addrspace(1) %out, ptr addrs
 ;
 ; GFX1250-LABEL: test_fmin3_olt_0_f32:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b256 s[8:15], s[4:5], 0x24
 ; GFX1250-NEXT:    s_mov_b32 s2, -1
 ; GFX1250-NEXT:    s_mov_b32 s3, 0x31016000
@@ -352,6 +353,7 @@ define amdgpu_kernel void @test_fmin3_olt_1_f32(ptr addrspace(1) %out, ptr addrs
 ;
 ; GFX1250-LABEL: test_fmin3_olt_1_f32:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b256 s[8:15], s[4:5], 0x24
 ; GFX1250-NEXT:    s_mov_b32 s2, -1
 ; GFX1250-NEXT:    s_mov_b32 s3, 0x31016000
@@ -609,6 +611,7 @@ define amdgpu_kernel void @test_fmin3_olt_0_f16(ptr addrspace(1) %out, ptr addrs
 ;
 ; GFX1250-TRUE16-LABEL: test_fmin3_olt_0_f16:
 ; GFX1250-TRUE16:       ; %bb.0:
+; GFX1250-TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-TRUE16-NEXT:    s_load_b256 s[8:15], s[4:5], 0x24
 ; GFX1250-TRUE16-NEXT:    s_mov_b32 s2, -1
 ; GFX1250-TRUE16-NEXT:    s_mov_b32 s3, 0x31016000
@@ -639,6 +642,7 @@ define amdgpu_kernel void @test_fmin3_olt_0_f16(ptr addrspace(1) %out, ptr addrs
 ;
 ; GFX1250-FAKE16-LABEL: test_fmin3_olt_0_f16:
 ; GFX1250-FAKE16:       ; %bb.0:
+; GFX1250-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-FAKE16-NEXT:    s_load_b256 s[8:15], s[4:5], 0x24
 ; GFX1250-FAKE16-NEXT:    s_mov_b32 s2, -1
 ; GFX1250-FAKE16-NEXT:    s_mov_b32 s3, 0x31016000
@@ -897,6 +901,7 @@ define amdgpu_kernel void @test_fmin3_olt_1_f16(ptr addrspace(1) %out, ptr addrs
 ;
 ; GFX1250-TRUE16-LABEL: test_fmin3_olt_1_f16:
 ; GFX1250-TRUE16:       ; %bb.0:
+; GFX1250-TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-TRUE16-NEXT:    s_load_b256 s[8:15], s[4:5], 0x24
 ; GFX1250-TRUE16-NEXT:    s_mov_b32 s2, -1
 ; GFX1250-TRUE16-NEXT:    s_mov_b32 s3, 0x31016000
@@ -927,6 +932,7 @@ define amdgpu_kernel void @test_fmin3_olt_1_f16(ptr addrspace(1) %out, ptr addrs
 ;
 ; GFX1250-FAKE16-LABEL: test_fmin3_olt_1_f16:
 ; GFX1250-FAKE16:       ; %bb.0:
+; GFX1250-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-FAKE16-NEXT:    s_load_b256 s[8:15], s[4:5], 0x24
 ; GFX1250-FAKE16-NEXT:    s_mov_b32 s2, -1
 ; GFX1250-FAKE16-NEXT:    s_mov_b32 s3, 0x31016000
@@ -1217,6 +1223,7 @@ define amdgpu_kernel void @test_fmin3_olt_0_f64(ptr addrspace(1) %out, ptr addrs
 ;
 ; GFX1250-LABEL: test_fmin3_olt_0_f64:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b256 s[8:15], s[4:5], 0x24
 ; GFX1250-NEXT:    s_mov_b32 s2, -1
 ; GFX1250-NEXT:    s_mov_b32 s3, 0x31016000
@@ -1426,6 +1433,7 @@ define amdgpu_kernel void @test_fmin3_olt_1_f64(ptr addrspace(1) %out, ptr addrs
 ;
 ; GFX1250-LABEL: test_fmin3_olt_1_f64:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b256 s[8:15], s[4:5], 0x24
 ; GFX1250-NEXT:    s_mov_b32 s2, -1
 ; GFX1250-NEXT:    s_mov_b32 s3, 0x31016000

diff  --git a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx942.ll b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx942.ll
index 9e5a4428b011f..66d859fbd66ee 100644
--- a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx942.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx942.ll
@@ -34,6 +34,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat(ptr %ptr) {
 ;
 ; GFX1250-LABEL: flat_atomic_fadd_f32_noret_pat:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1250-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0
 ; GFX1250-NEXT:    global_wb scope:SCOPE_SYS
@@ -75,6 +76,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f32_noret_pat_ieee(ptr %ptr) #0 {
 ;
 ; GFX1250-LABEL: flat_atomic_fadd_f32_noret_pat_ieee:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1250-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4.0
 ; GFX1250-NEXT:    global_wb scope:SCOPE_SYS
@@ -153,6 +155,7 @@ define amdgpu_kernel void @local_atomic_fadd_v2f16_noret(ptr addrspace(3) %ptr,
 ;
 ; GFX1250-LABEL: local_atomic_fadd_v2f16_noret:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
@@ -220,6 +223,7 @@ define amdgpu_kernel void @local_atomic_fadd_v2bf16_noret(ptr addrspace(3) %ptr,
 ;
 ; GFX1250-LABEL: local_atomic_fadd_v2bf16_noret:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1

diff  --git a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll
index 831af7b6c10ba..d1db407cdbe07 100644
--- a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll
@@ -42,6 +42,7 @@ define amdgpu_kernel void @raw_buffer_atomic_add_noret_f64(<4 x i32> %rsrc, doub
 ;
 ; GFX1250-LABEL: raw_buffer_atomic_add_noret_f64:
 ; GFX1250:       ; %bb.0: ; %main_body
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    s_load_b96 s[8:10], s[4:5], 0x34
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
@@ -72,6 +73,7 @@ define amdgpu_ps void @raw_buffer_atomic_add_rtn_f64(<4 x i32> inreg %rsrc, doub
 ;
 ; GFX1250-LABEL: raw_buffer_atomic_add_rtn_f64:
 ; GFX1250:       ; %bb.0: ; %main_body
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    buffer_atomic_add_f64 v[0:1], v2, s[0:3], null offen th:TH_ATOMIC_RETURN
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
 ; GFX1250-NEXT:    flat_store_b64 v[0:1], v[0:1]
@@ -115,6 +117,7 @@ define amdgpu_kernel void @raw_buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> %rsr
 ;
 ; GFX1250-LABEL: raw_buffer_atomic_add_rtn_f64_off4_slc:
 ; GFX1250:       ; %bb.0: ; %main_body
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    s_load_b96 s[8:10], s[4:5], 0x34
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
@@ -161,6 +164,7 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_add_noret_f64(ptr addrspace(8)
 ;
 ; GFX1250-LABEL: raw_ptr_buffer_atomic_add_noret_f64:
 ; GFX1250:       ; %bb.0: ; %main_body
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    s_load_b96 s[8:10], s[4:5], 0x34
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
@@ -191,6 +195,7 @@ define amdgpu_ps void @raw_ptr_buffer_atomic_add_rtn_f64(ptr addrspace(8) inreg
 ;
 ; GFX1250-LABEL: raw_ptr_buffer_atomic_add_rtn_f64:
 ; GFX1250:       ; %bb.0: ; %main_body
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    buffer_atomic_add_f64 v[0:1], v2, s[0:3], null offen th:TH_ATOMIC_RETURN
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
 ; GFX1250-NEXT:    flat_store_b64 v[0:1], v[0:1]
@@ -234,6 +239,7 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_add_rtn_f64_off4_slc(ptr addrsp
 ;
 ; GFX1250-LABEL: raw_ptr_buffer_atomic_add_rtn_f64_off4_slc:
 ; GFX1250:       ; %bb.0: ; %main_body
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    s_load_b96 s[8:10], s[4:5], 0x34
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
@@ -280,6 +286,7 @@ define amdgpu_kernel void @struct_buffer_atomic_add_noret_f64(<4 x i32> %rsrc, d
 ;
 ; GFX1250-LABEL: struct_buffer_atomic_add_noret_f64:
 ; GFX1250:       ; %bb.0: ; %main_body
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    s_load_b96 s[8:10], s[4:5], 0x34
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
@@ -310,6 +317,7 @@ define amdgpu_ps void @struct_buffer_atomic_add_rtn_f64(<4 x i32> inreg %rsrc, d
 ;
 ; GFX1250-LABEL: struct_buffer_atomic_add_rtn_f64:
 ; GFX1250:       ; %bb.0: ; %main_body
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    buffer_atomic_add_f64 v[0:1], v2, s[0:3], null idxen th:TH_ATOMIC_RETURN
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
 ; GFX1250-NEXT:    flat_store_b64 v[0:1], v[0:1]
@@ -353,6 +361,7 @@ define amdgpu_kernel void @struct_buffer_atomic_add_rtn_f64_off4_slc(<4 x i32> %
 ;
 ; GFX1250-LABEL: struct_buffer_atomic_add_rtn_f64_off4_slc:
 ; GFX1250:       ; %bb.0: ; %main_body
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    s_load_b96 s[8:10], s[4:5], 0x34
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
@@ -398,6 +407,7 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_add_noret_f64(ptr addrspace(
 ;
 ; GFX1250-LABEL: struct_ptr_buffer_atomic_add_noret_f64:
 ; GFX1250:       ; %bb.0: ; %main_body
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    s_load_b96 s[8:10], s[4:5], 0x34
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
@@ -428,6 +438,7 @@ define amdgpu_ps void @struct_ptr_buffer_atomic_add_rtn_f64(ptr addrspace(8) inr
 ;
 ; GFX1250-LABEL: struct_ptr_buffer_atomic_add_rtn_f64:
 ; GFX1250:       ; %bb.0: ; %main_body
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    buffer_atomic_add_f64 v[0:1], v2, s[0:3], null idxen th:TH_ATOMIC_RETURN
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
 ; GFX1250-NEXT:    flat_store_b64 v[0:1], v[0:1]
@@ -471,6 +482,7 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_add_rtn_f64_off4_slc(ptr add
 ;
 ; GFX1250-LABEL: struct_ptr_buffer_atomic_add_rtn_f64_off4_slc:
 ; GFX1250:       ; %bb.0: ; %main_body
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    s_load_b96 s[8:10], s[4:5], 0x34
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
@@ -516,6 +528,7 @@ define amdgpu_kernel void @raw_buffer_atomic_min_noret_f64(<4 x i32> %rsrc, doub
 ;
 ; GFX1250-LABEL: raw_buffer_atomic_min_noret_f64:
 ; GFX1250:       ; %bb.0: ; %main_body
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    s_load_b96 s[8:10], s[4:5], 0x34
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
@@ -546,6 +559,7 @@ define amdgpu_ps void @raw_buffer_atomic_min_rtn_f64(<4 x i32> inreg %rsrc, doub
 ;
 ; GFX1250-LABEL: raw_buffer_atomic_min_rtn_f64:
 ; GFX1250:       ; %bb.0: ; %main_body
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    buffer_atomic_min_num_f64 v[0:1], v2, s[0:3], null offen th:TH_ATOMIC_RETURN
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
 ; GFX1250-NEXT:    flat_store_b64 v[0:1], v[0:1]
@@ -589,6 +603,7 @@ define amdgpu_kernel void @raw_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> %rsr
 ;
 ; GFX1250-LABEL: raw_buffer_atomic_min_rtn_f64_off4_slc:
 ; GFX1250:       ; %bb.0: ; %main_body
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    s_load_b96 s[8:10], s[4:5], 0x34
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
@@ -635,6 +650,7 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_noret_f64(ptr addrspace(8)
 ;
 ; GFX1250-LABEL: raw_ptr_buffer_atomic_min_noret_f64:
 ; GFX1250:       ; %bb.0: ; %main_body
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    s_load_b96 s[8:10], s[4:5], 0x34
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
@@ -665,6 +681,7 @@ define amdgpu_ps void @raw_ptr_buffer_atomic_min_rtn_f64(ptr addrspace(8) inreg
 ;
 ; GFX1250-LABEL: raw_ptr_buffer_atomic_min_rtn_f64:
 ; GFX1250:       ; %bb.0: ; %main_body
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    buffer_atomic_min_num_f64 v[0:1], v2, s[0:3], null offen th:TH_ATOMIC_RETURN
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
 ; GFX1250-NEXT:    flat_store_b64 v[0:1], v[0:1]
@@ -708,6 +725,7 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_min_rtn_f64_off4_slc(ptr addrsp
 ;
 ; GFX1250-LABEL: raw_ptr_buffer_atomic_min_rtn_f64_off4_slc:
 ; GFX1250:       ; %bb.0: ; %main_body
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    s_load_b96 s[8:10], s[4:5], 0x34
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
@@ -754,6 +772,7 @@ define amdgpu_kernel void @struct_buffer_atomic_min_noret_f64(<4 x i32> %rsrc, d
 ;
 ; GFX1250-LABEL: struct_buffer_atomic_min_noret_f64:
 ; GFX1250:       ; %bb.0: ; %main_body
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    s_load_b96 s[8:10], s[4:5], 0x34
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
@@ -784,6 +803,7 @@ define amdgpu_ps void @struct_buffer_atomic_min_rtn_f64(<4 x i32> inreg %rsrc, d
 ;
 ; GFX1250-LABEL: struct_buffer_atomic_min_rtn_f64:
 ; GFX1250:       ; %bb.0: ; %main_body
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    buffer_atomic_min_num_f64 v[0:1], v2, s[0:3], null idxen th:TH_ATOMIC_RETURN
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
 ; GFX1250-NEXT:    flat_store_b64 v[0:1], v[0:1]
@@ -827,6 +847,7 @@ define amdgpu_kernel void @struct_buffer_atomic_min_rtn_f64_off4_slc(<4 x i32> %
 ;
 ; GFX1250-LABEL: struct_buffer_atomic_min_rtn_f64_off4_slc:
 ; GFX1250:       ; %bb.0: ; %main_body
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    s_load_b96 s[8:10], s[4:5], 0x34
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
@@ -872,6 +893,7 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_min_noret_f64(ptr addrspace(
 ;
 ; GFX1250-LABEL: struct_ptr_buffer_atomic_min_noret_f64:
 ; GFX1250:       ; %bb.0: ; %main_body
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    s_load_b96 s[8:10], s[4:5], 0x34
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
@@ -902,6 +924,7 @@ define amdgpu_ps void @struct_ptr_buffer_atomic_min_rtn_f64(ptr addrspace(8) inr
 ;
 ; GFX1250-LABEL: struct_ptr_buffer_atomic_min_rtn_f64:
 ; GFX1250:       ; %bb.0: ; %main_body
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    buffer_atomic_min_num_f64 v[0:1], v2, s[0:3], null idxen th:TH_ATOMIC_RETURN
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
 ; GFX1250-NEXT:    flat_store_b64 v[0:1], v[0:1]
@@ -945,6 +968,7 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_min_rtn_f64_off4_slc(ptr add
 ;
 ; GFX1250-LABEL: struct_ptr_buffer_atomic_min_rtn_f64_off4_slc:
 ; GFX1250:       ; %bb.0: ; %main_body
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    s_load_b96 s[8:10], s[4:5], 0x34
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
@@ -990,6 +1014,7 @@ define amdgpu_kernel void @raw_buffer_atomic_max_noret_f64(<4 x i32> %rsrc, doub
 ;
 ; GFX1250-LABEL: raw_buffer_atomic_max_noret_f64:
 ; GFX1250:       ; %bb.0: ; %main_body
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    s_load_b96 s[8:10], s[4:5], 0x34
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
@@ -1020,6 +1045,7 @@ define amdgpu_ps void @raw_buffer_atomic_max_rtn_f64(<4 x i32> inreg %rsrc, doub
 ;
 ; GFX1250-LABEL: raw_buffer_atomic_max_rtn_f64:
 ; GFX1250:       ; %bb.0: ; %main_body
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    buffer_atomic_max_num_f64 v[0:1], v2, s[0:3], null offen th:TH_ATOMIC_RETURN
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
 ; GFX1250-NEXT:    flat_store_b64 v[0:1], v[0:1]
@@ -1063,6 +1089,7 @@ define amdgpu_kernel void @raw_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> %rsr
 ;
 ; GFX1250-LABEL: raw_buffer_atomic_max_rtn_f64_off4_slc:
 ; GFX1250:       ; %bb.0: ; %main_body
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    s_load_b96 s[8:10], s[4:5], 0x34
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
@@ -1109,6 +1136,7 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_noret_f64(ptr addrspace(8)
 ;
 ; GFX1250-LABEL: raw_ptr_buffer_atomic_max_noret_f64:
 ; GFX1250:       ; %bb.0: ; %main_body
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    s_load_b96 s[8:10], s[4:5], 0x34
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
@@ -1139,6 +1167,7 @@ define amdgpu_ps void @raw_ptr_buffer_atomic_max_rtn_f64(ptr addrspace(8) inreg
 ;
 ; GFX1250-LABEL: raw_ptr_buffer_atomic_max_rtn_f64:
 ; GFX1250:       ; %bb.0: ; %main_body
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    buffer_atomic_max_num_f64 v[0:1], v2, s[0:3], null offen th:TH_ATOMIC_RETURN
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
 ; GFX1250-NEXT:    flat_store_b64 v[0:1], v[0:1]
@@ -1182,6 +1211,7 @@ define amdgpu_kernel void @raw_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr addrsp
 ;
 ; GFX1250-LABEL: raw_ptr_buffer_atomic_max_rtn_f64_off4_slc:
 ; GFX1250:       ; %bb.0: ; %main_body
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    s_load_b96 s[8:10], s[4:5], 0x34
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
@@ -1228,6 +1258,7 @@ define amdgpu_kernel void @struct_buffer_atomic_max_noret_f64(<4 x i32> %rsrc, d
 ;
 ; GFX1250-LABEL: struct_buffer_atomic_max_noret_f64:
 ; GFX1250:       ; %bb.0: ; %main_body
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    s_load_b96 s[8:10], s[4:5], 0x34
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
@@ -1258,6 +1289,7 @@ define amdgpu_ps void @struct_buffer_atomic_max_rtn_f64(<4 x i32> inreg %rsrc, d
 ;
 ; GFX1250-LABEL: struct_buffer_atomic_max_rtn_f64:
 ; GFX1250:       ; %bb.0: ; %main_body
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    buffer_atomic_max_num_f64 v[0:1], v2, s[0:3], null idxen th:TH_ATOMIC_RETURN
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
 ; GFX1250-NEXT:    flat_store_b64 v[0:1], v[0:1]
@@ -1301,6 +1333,7 @@ define amdgpu_kernel void @struct_buffer_atomic_max_rtn_f64_off4_slc(<4 x i32> %
 ;
 ; GFX1250-LABEL: struct_buffer_atomic_max_rtn_f64_off4_slc:
 ; GFX1250:       ; %bb.0: ; %main_body
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    s_load_b96 s[8:10], s[4:5], 0x34
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
@@ -1346,6 +1379,7 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_max_noret_f64(ptr addrspace(
 ;
 ; GFX1250-LABEL: struct_ptr_buffer_atomic_max_noret_f64:
 ; GFX1250:       ; %bb.0: ; %main_body
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    s_load_b96 s[8:10], s[4:5], 0x34
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
@@ -1376,6 +1410,7 @@ define amdgpu_ps void @struct_ptr_buffer_atomic_max_rtn_f64(ptr addrspace(8) inr
 ;
 ; GFX1250-LABEL: struct_ptr_buffer_atomic_max_rtn_f64:
 ; GFX1250:       ; %bb.0: ; %main_body
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    buffer_atomic_max_num_f64 v[0:1], v2, s[0:3], null idxen th:TH_ATOMIC_RETURN
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
 ; GFX1250-NEXT:    flat_store_b64 v[0:1], v[0:1]
@@ -1419,6 +1454,7 @@ define amdgpu_kernel void @struct_ptr_buffer_atomic_max_rtn_f64_off4_slc(ptr add
 ;
 ; GFX1250-LABEL: struct_ptr_buffer_atomic_max_rtn_f64_off4_slc:
 ; GFX1250:       ; %bb.0: ; %main_body
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    s_load_b96 s[8:10], s[4:5], 0x34
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
@@ -1468,6 +1504,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat(ptr addrspace(1) %pt
 ;
 ; GFX1250-LABEL: global_atomic_fadd_f64_noret_pat:
 ; GFX1250:       ; %bb.0: ; %main_body
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1250-NEXT:    v_mov_b64_e32 v[0:1], 4.0
 ; GFX1250-NEXT:    v_mov_b32_e32 v2, 0
@@ -1510,6 +1547,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent(ptr addrspace(
 ;
 ; GFX1250-LABEL: global_atomic_fadd_f64_noret_pat_agent:
 ; GFX1250:       ; %bb.0: ; %main_body
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1250-NEXT:    v_mov_b64_e32 v[0:1], 4.0
 ; GFX1250-NEXT:    v_mov_b32_e32 v2, 0
@@ -1554,6 +1592,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_system(ptr addrspace
 ;
 ; GFX1250-LABEL: global_atomic_fadd_f64_noret_pat_system:
 ; GFX1250:       ; %bb.0: ; %main_body
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1250-NEXT:    v_mov_b64_e32 v[0:1], 4.0
 ; GFX1250-NEXT:    v_mov_b32_e32 v2, 0
@@ -1596,6 +1635,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_flush(ptr addrspace(
 ;
 ; GFX1250-LABEL: global_atomic_fadd_f64_noret_pat_flush:
 ; GFX1250:       ; %bb.0: ; %main_body
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1250-NEXT:    v_mov_b64_e32 v[0:1], 4.0
 ; GFX1250-NEXT:    v_mov_b32_e32 v2, 0
@@ -1767,6 +1807,7 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent_safe(ptr addrs
 ;
 ; GFX1250-LABEL: global_atomic_fadd_f64_noret_pat_agent_safe:
 ; GFX1250:       ; %bb.0: ; %main_body
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1250-NEXT:    v_mov_b64_e32 v[0:1], 4.0
 ; GFX1250-NEXT:    v_mov_b32_e32 v2, 0
@@ -1811,6 +1852,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat(ptr %ptr) #1 {
 ;
 ; GFX1250-LABEL: flat_atomic_fadd_f64_noret_pat:
 ; GFX1250:       ; %bb.0: ; %main_body
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1250-NEXT:    v_mov_b64_e32 v[0:1], 4.0
 ; GFX1250-NEXT:    v_mov_b32_e32 v2, 0
@@ -1853,6 +1895,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent(ptr %ptr) #1 {
 ;
 ; GFX1250-LABEL: flat_atomic_fadd_f64_noret_pat_agent:
 ; GFX1250:       ; %bb.0: ; %main_body
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1250-NEXT:    v_mov_b64_e32 v[0:1], 4.0
 ; GFX1250-NEXT:    v_mov_b32_e32 v2, 0
@@ -1897,6 +1940,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_system(ptr %ptr) #1 {
 ;
 ; GFX1250-LABEL: flat_atomic_fadd_f64_noret_pat_system:
 ; GFX1250:       ; %bb.0: ; %main_body
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1250-NEXT:    v_mov_b64_e32 v[0:1], 4.0
 ; GFX1250-NEXT:    v_mov_b32_e32 v2, 0
@@ -2070,6 +2114,7 @@ define amdgpu_kernel void @flat_atomic_fadd_f64_noret_pat_agent_safe(ptr %ptr) {
 ;
 ; GFX1250-LABEL: flat_atomic_fadd_f64_noret_pat_agent_safe:
 ; GFX1250:       ; %bb.0: ; %main_body
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1250-NEXT:    v_mov_b64_e32 v[0:1], 4.0
 ; GFX1250-NEXT:    v_mov_b32_e32 v2, 0
@@ -2110,6 +2155,7 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret(ptr addrspace(3) %ptr, do
 ;
 ; GFX1250-LABEL: local_atomic_fadd_f64_noret:
 ; GFX1250:       ; %bb.0: ; %main_body
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x24
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x2c
@@ -2181,6 +2227,7 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat(ptr addrspace(3) %ptr
 ;
 ; GFX1250-LABEL: local_atomic_fadd_f64_noret_pat:
 ; GFX1250:       ; %bb.0: ; %main_body
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x24
 ; GFX1250-NEXT:    v_mov_b64_e32 v[0:1], 4.0
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -2217,6 +2264,7 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush(ptr addrspace(3
 ;
 ; GFX1250-LABEL: local_atomic_fadd_f64_noret_pat_flush:
 ; GFX1250:       ; %bb.0: ; %main_body
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x24
 ; GFX1250-NEXT:    v_mov_b64_e32 v[0:1], 4.0
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -2253,6 +2301,7 @@ define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush_safe(ptr addrsp
 ;
 ; GFX1250-LABEL: local_atomic_fadd_f64_noret_pat_flush_safe:
 ; GFX1250:       ; %bb.0: ; %main_body
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x24
 ; GFX1250-NEXT:    v_mov_b64_e32 v[0:1], 4.0
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0

diff  --git a/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll b/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll
index c28b25c76d241..657e35a90a9a5 100644
--- a/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fptrunc.f16.ll
@@ -199,6 +199,7 @@ define amdgpu_kernel void @fptrunc_f32_to_f16(
 ;
 ; GFX1250-SDAG-TRUE16-LABEL: fptrunc_f32_to_f16:
 ; GFX1250-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX1250-SDAG-TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1250-SDAG-TRUE16-NEXT:    s_mov_b32 s6, -1
 ; GFX1250-SDAG-TRUE16-NEXT:    s_mov_b32 s7, 0x31016000
@@ -211,12 +212,14 @@ define amdgpu_kernel void @fptrunc_f32_to_f16(
 ; GFX1250-SDAG-TRUE16-NEXT:    buffer_load_b32 v0, off, s[8:11], null
 ; GFX1250-SDAG-TRUE16-NEXT:    s_mov_b32 s5, s1
 ; GFX1250-SDAG-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-SDAG-TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 2, 2), 0
 ; GFX1250-SDAG-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.l, v0
 ; GFX1250-SDAG-TRUE16-NEXT:    buffer_store_b16 v0, off, s[4:7], null
 ; GFX1250-SDAG-TRUE16-NEXT:    s_endpgm
 ;
 ; GFX1250-SDAG-FAKE16-LABEL: fptrunc_f32_to_f16:
 ; GFX1250-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX1250-SDAG-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1250-SDAG-FAKE16-NEXT:    s_mov_b32 s6, -1
 ; GFX1250-SDAG-FAKE16-NEXT:    s_mov_b32 s7, 0x31016000
@@ -229,18 +232,21 @@ define amdgpu_kernel void @fptrunc_f32_to_f16(
 ; GFX1250-SDAG-FAKE16-NEXT:    buffer_load_b32 v0, off, s[8:11], null
 ; GFX1250-SDAG-FAKE16-NEXT:    s_mov_b32 s5, s1
 ; GFX1250-SDAG-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-SDAG-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 2, 2), 0
 ; GFX1250-SDAG-FAKE16-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX1250-SDAG-FAKE16-NEXT:    buffer_store_b16 v0, off, s[4:7], null
 ; GFX1250-SDAG-FAKE16-NEXT:    s_endpgm
 ;
 ; GFX1250-GISEL-TRUE16-LABEL: fptrunc_f32_to_f16:
 ; GFX1250-GISEL-TRUE16:       ; %bb.0: ; %entry
+; GFX1250-GISEL-TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1250-GISEL-TRUE16-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-GISEL-TRUE16-NEXT:    s_load_b32 s2, s[2:3], 0x0
 ; GFX1250-GISEL-TRUE16-NEXT:    s_wait_xcnt 0x0
 ; GFX1250-GISEL-TRUE16-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1250-GISEL-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-GISEL-TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 2, 2), 0
 ; GFX1250-GISEL-TRUE16-NEXT:    s_cvt_f16_f32 s2, s2
 ; GFX1250-GISEL-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_3)
 ; GFX1250-GISEL-TRUE16-NEXT:    v_mov_b32_e32 v0, s2
@@ -250,12 +256,14 @@ define amdgpu_kernel void @fptrunc_f32_to_f16(
 ;
 ; GFX1250-GISEL-FAKE16-LABEL: fptrunc_f32_to_f16:
 ; GFX1250-GISEL-FAKE16:       ; %bb.0: ; %entry
+; GFX1250-GISEL-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1250-GISEL-FAKE16-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-GISEL-FAKE16-NEXT:    s_load_b32 s2, s[2:3], 0x0
 ; GFX1250-GISEL-FAKE16-NEXT:    s_wait_xcnt 0x0
 ; GFX1250-GISEL-FAKE16-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1250-GISEL-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-GISEL-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 2, 2), 0
 ; GFX1250-GISEL-FAKE16-NEXT:    s_cvt_f16_f32 s2, s2
 ; GFX1250-GISEL-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_3)
 ; GFX1250-GISEL-FAKE16-NEXT:    v_mov_b32_e32 v0, s2
@@ -454,6 +462,7 @@ define amdgpu_kernel void @fptrunc_f32_to_f16_afn(ptr addrspace(1) %r,
 ;
 ; GFX1250-SDAG-TRUE16-LABEL: fptrunc_f32_to_f16_afn:
 ; GFX1250-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX1250-SDAG-TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1250-SDAG-TRUE16-NEXT:    s_mov_b32 s6, -1
 ; GFX1250-SDAG-TRUE16-NEXT:    s_mov_b32 s7, 0x31016000
@@ -466,12 +475,14 @@ define amdgpu_kernel void @fptrunc_f32_to_f16_afn(ptr addrspace(1) %r,
 ; GFX1250-SDAG-TRUE16-NEXT:    buffer_load_b32 v0, off, s[8:11], null
 ; GFX1250-SDAG-TRUE16-NEXT:    s_mov_b32 s5, s1
 ; GFX1250-SDAG-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-SDAG-TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 2, 2), 0
 ; GFX1250-SDAG-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.l, v0
 ; GFX1250-SDAG-TRUE16-NEXT:    buffer_store_b16 v0, off, s[4:7], null
 ; GFX1250-SDAG-TRUE16-NEXT:    s_endpgm
 ;
 ; GFX1250-SDAG-FAKE16-LABEL: fptrunc_f32_to_f16_afn:
 ; GFX1250-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX1250-SDAG-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1250-SDAG-FAKE16-NEXT:    s_mov_b32 s6, -1
 ; GFX1250-SDAG-FAKE16-NEXT:    s_mov_b32 s7, 0x31016000
@@ -484,18 +495,21 @@ define amdgpu_kernel void @fptrunc_f32_to_f16_afn(ptr addrspace(1) %r,
 ; GFX1250-SDAG-FAKE16-NEXT:    buffer_load_b32 v0, off, s[8:11], null
 ; GFX1250-SDAG-FAKE16-NEXT:    s_mov_b32 s5, s1
 ; GFX1250-SDAG-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-SDAG-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 2, 2), 0
 ; GFX1250-SDAG-FAKE16-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX1250-SDAG-FAKE16-NEXT:    buffer_store_b16 v0, off, s[4:7], null
 ; GFX1250-SDAG-FAKE16-NEXT:    s_endpgm
 ;
 ; GFX1250-GISEL-TRUE16-LABEL: fptrunc_f32_to_f16_afn:
 ; GFX1250-GISEL-TRUE16:       ; %bb.0: ; %entry
+; GFX1250-GISEL-TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1250-GISEL-TRUE16-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-GISEL-TRUE16-NEXT:    s_load_b32 s2, s[2:3], 0x0
 ; GFX1250-GISEL-TRUE16-NEXT:    s_wait_xcnt 0x0
 ; GFX1250-GISEL-TRUE16-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1250-GISEL-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-GISEL-TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 2, 2), 0
 ; GFX1250-GISEL-TRUE16-NEXT:    s_cvt_f16_f32 s2, s2
 ; GFX1250-GISEL-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_3)
 ; GFX1250-GISEL-TRUE16-NEXT:    v_mov_b32_e32 v0, s2
@@ -505,12 +519,14 @@ define amdgpu_kernel void @fptrunc_f32_to_f16_afn(ptr addrspace(1) %r,
 ;
 ; GFX1250-GISEL-FAKE16-LABEL: fptrunc_f32_to_f16_afn:
 ; GFX1250-GISEL-FAKE16:       ; %bb.0: ; %entry
+; GFX1250-GISEL-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1250-GISEL-FAKE16-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-GISEL-FAKE16-NEXT:    s_load_b32 s2, s[2:3], 0x0
 ; GFX1250-GISEL-FAKE16-NEXT:    s_wait_xcnt 0x0
 ; GFX1250-GISEL-FAKE16-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1250-GISEL-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-GISEL-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 2, 2), 0
 ; GFX1250-GISEL-FAKE16-NEXT:    s_cvt_f16_f32 s2, s2
 ; GFX1250-GISEL-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_3)
 ; GFX1250-GISEL-FAKE16-NEXT:    v_mov_b32_e32 v0, s2
@@ -1216,6 +1232,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(
 ;
 ; GFX1250-SDAG-TRUE16-LABEL: fptrunc_f64_to_f16:
 ; GFX1250-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX1250-SDAG-TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1250-SDAG-TRUE16-NEXT:    s_mov_b32 s6, -1
 ; GFX1250-SDAG-TRUE16-NEXT:    s_mov_b32 s7, 0x31016000
@@ -1283,6 +1300,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(
 ;
 ; GFX1250-SDAG-FAKE16-LABEL: fptrunc_f64_to_f16:
 ; GFX1250-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX1250-SDAG-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1250-SDAG-FAKE16-NEXT:    s_mov_b32 s6, -1
 ; GFX1250-SDAG-FAKE16-NEXT:    s_mov_b32 s7, 0x31016000
@@ -1350,6 +1368,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(
 ;
 ; GFX1250-GISEL-TRUE16-LABEL: fptrunc_f64_to_f16:
 ; GFX1250-GISEL-TRUE16:       ; %bb.0: ; %entry
+; GFX1250-GISEL-TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1250-GISEL-TRUE16-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-GISEL-TRUE16-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
@@ -1405,6 +1424,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f16(
 ;
 ; GFX1250-GISEL-FAKE16-LABEL: fptrunc_f64_to_f16:
 ; GFX1250-GISEL-FAKE16:       ; %bb.0: ; %entry
+; GFX1250-GISEL-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1250-GISEL-FAKE16-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-GISEL-FAKE16-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
@@ -1665,6 +1685,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f16_afn(
 ;
 ; GFX1250-SDAG-TRUE16-LABEL: fptrunc_f64_to_f16_afn:
 ; GFX1250-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX1250-SDAG-TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1250-SDAG-TRUE16-NEXT:    s_mov_b32 s6, -1
 ; GFX1250-SDAG-TRUE16-NEXT:    s_mov_b32 s7, 0x31016000
@@ -1678,6 +1699,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f16_afn(
 ; GFX1250-SDAG-TRUE16-NEXT:    s_mov_b32 s5, s1
 ; GFX1250-SDAG-TRUE16-NEXT:    s_wait_loadcnt 0x0
 ; GFX1250-SDAG-TRUE16-NEXT:    v_cvt_f32_f64_e32 v0, v[0:1]
+; GFX1250-SDAG-TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 2, 2), 0
 ; GFX1250-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-SDAG-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.l, v0
 ; GFX1250-SDAG-TRUE16-NEXT:    buffer_store_b16 v0, off, s[4:7], null
@@ -1685,6 +1707,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f16_afn(
 ;
 ; GFX1250-SDAG-FAKE16-LABEL: fptrunc_f64_to_f16_afn:
 ; GFX1250-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX1250-SDAG-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1250-SDAG-FAKE16-NEXT:    s_mov_b32 s6, -1
 ; GFX1250-SDAG-FAKE16-NEXT:    s_mov_b32 s7, 0x31016000
@@ -1698,6 +1721,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f16_afn(
 ; GFX1250-SDAG-FAKE16-NEXT:    s_mov_b32 s5, s1
 ; GFX1250-SDAG-FAKE16-NEXT:    s_wait_loadcnt 0x0
 ; GFX1250-SDAG-FAKE16-NEXT:    v_cvt_f32_f64_e32 v0, v[0:1]
+; GFX1250-SDAG-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 2, 2), 0
 ; GFX1250-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-SDAG-FAKE16-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX1250-SDAG-FAKE16-NEXT:    buffer_store_b16 v0, off, s[4:7], null
@@ -1705,6 +1729,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f16_afn(
 ;
 ; GFX1250-GISEL-TRUE16-LABEL: fptrunc_f64_to_f16_afn:
 ; GFX1250-GISEL-TRUE16:       ; %bb.0: ; %entry
+; GFX1250-GISEL-TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1250-GISEL-TRUE16-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-GISEL-TRUE16-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
@@ -1712,6 +1737,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f16_afn(
 ; GFX1250-GISEL-TRUE16-NEXT:    v_cvt_f32_f64_e32 v0, s[2:3]
 ; GFX1250-GISEL-TRUE16-NEXT:    s_mov_b32 s2, -1
 ; GFX1250-GISEL-TRUE16-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1250-GISEL-TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 2, 2), 0
 ; GFX1250-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-GISEL-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.l, v0
 ; GFX1250-GISEL-TRUE16-NEXT:    buffer_store_b16 v0, off, s[0:3], null
@@ -1719,6 +1745,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f16_afn(
 ;
 ; GFX1250-GISEL-FAKE16-LABEL: fptrunc_f64_to_f16_afn:
 ; GFX1250-GISEL-FAKE16:       ; %bb.0: ; %entry
+; GFX1250-GISEL-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1250-GISEL-FAKE16-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-GISEL-FAKE16-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
@@ -1726,6 +1753,7 @@ define amdgpu_kernel void @fptrunc_f64_to_f16_afn(
 ; GFX1250-GISEL-FAKE16-NEXT:    v_cvt_f32_f64_e32 v0, s[2:3]
 ; GFX1250-GISEL-FAKE16-NEXT:    s_mov_b32 s2, -1
 ; GFX1250-GISEL-FAKE16-NEXT:    s_mov_b32 s3, 0x31016000
+; GFX1250-GISEL-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 2, 2), 0
 ; GFX1250-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-GISEL-FAKE16-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX1250-GISEL-FAKE16-NEXT:    buffer_store_b16 v0, off, s[0:3], null
@@ -1950,6 +1978,7 @@ define amdgpu_kernel void @fptrunc_v2f32_to_v2f16(
 ;
 ; GFX1250-SDAG-TRUE16-LABEL: fptrunc_v2f32_to_v2f16:
 ; GFX1250-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX1250-SDAG-TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1250-SDAG-TRUE16-NEXT:    s_mov_b32 s6, -1
 ; GFX1250-SDAG-TRUE16-NEXT:    s_mov_b32 s7, 0x31016000
@@ -1968,6 +1997,7 @@ define amdgpu_kernel void @fptrunc_v2f32_to_v2f16(
 ;
 ; GFX1250-SDAG-FAKE16-LABEL: fptrunc_v2f32_to_v2f16:
 ; GFX1250-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX1250-SDAG-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1250-SDAG-FAKE16-NEXT:    s_mov_b32 s6, -1
 ; GFX1250-SDAG-FAKE16-NEXT:    s_mov_b32 s7, 0x31016000
@@ -1986,6 +2016,7 @@ define amdgpu_kernel void @fptrunc_v2f32_to_v2f16(
 ;
 ; GFX1250-GISEL-TRUE16-LABEL: fptrunc_v2f32_to_v2f16:
 ; GFX1250-GISEL-TRUE16:       ; %bb.0: ; %entry
+; GFX1250-GISEL-TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1250-GISEL-TRUE16-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-GISEL-TRUE16-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
@@ -2000,6 +2031,7 @@ define amdgpu_kernel void @fptrunc_v2f32_to_v2f16(
 ;
 ; GFX1250-GISEL-FAKE16-LABEL: fptrunc_v2f32_to_v2f16:
 ; GFX1250-GISEL-FAKE16:       ; %bb.0: ; %entry
+; GFX1250-GISEL-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1250-GISEL-FAKE16-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-GISEL-FAKE16-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
@@ -3235,6 +3267,7 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
 ;
 ; GFX1250-SDAG-TRUE16-LABEL: fptrunc_v2f64_to_v2f16:
 ; GFX1250-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX1250-SDAG-TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1250-SDAG-TRUE16-NEXT:    s_mov_b32 s6, -1
 ; GFX1250-SDAG-TRUE16-NEXT:    s_mov_b32 s7, 0x31016000
@@ -3351,6 +3384,7 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
 ;
 ; GFX1250-SDAG-FAKE16-LABEL: fptrunc_v2f64_to_v2f16:
 ; GFX1250-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX1250-SDAG-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1250-SDAG-FAKE16-NEXT:    s_mov_b32 s6, -1
 ; GFX1250-SDAG-FAKE16-NEXT:    s_mov_b32 s7, 0x31016000
@@ -3467,6 +3501,7 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
 ;
 ; GFX1250-GISEL-TRUE16-LABEL: fptrunc_v2f64_to_v2f16:
 ; GFX1250-GISEL-TRUE16:       ; %bb.0: ; %entry
+; GFX1250-GISEL-TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1250-GISEL-TRUE16-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-GISEL-TRUE16-NEXT:    s_load_b128 s[4:7], s[2:3], 0x0
@@ -3566,6 +3601,7 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
 ;
 ; GFX1250-GISEL-FAKE16-LABEL: fptrunc_v2f64_to_v2f16:
 ; GFX1250-GISEL-FAKE16:       ; %bb.0: ; %entry
+; GFX1250-GISEL-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1250-GISEL-FAKE16-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-GISEL-FAKE16-NEXT:    s_load_b128 s[4:7], s[2:3], 0x0
@@ -3910,6 +3946,7 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16_afn(
 ;
 ; GFX1250-SDAG-TRUE16-LABEL: fptrunc_v2f64_to_v2f16_afn:
 ; GFX1250-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX1250-SDAG-TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1250-SDAG-TRUE16-NEXT:    s_mov_b32 s6, -1
 ; GFX1250-SDAG-TRUE16-NEXT:    s_mov_b32 s7, 0x31016000
@@ -3931,6 +3968,7 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16_afn(
 ;
 ; GFX1250-SDAG-FAKE16-LABEL: fptrunc_v2f64_to_v2f16_afn:
 ; GFX1250-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX1250-SDAG-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1250-SDAG-FAKE16-NEXT:    s_mov_b32 s6, -1
 ; GFX1250-SDAG-FAKE16-NEXT:    s_mov_b32 s7, 0x31016000
@@ -3952,6 +3990,7 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16_afn(
 ;
 ; GFX1250-GISEL-TRUE16-LABEL: fptrunc_v2f64_to_v2f16_afn:
 ; GFX1250-GISEL-TRUE16:       ; %bb.0: ; %entry
+; GFX1250-GISEL-TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1250-GISEL-TRUE16-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-GISEL-TRUE16-NEXT:    s_load_b128 s[4:7], s[2:3], 0x0
@@ -3961,6 +4000,7 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16_afn(
 ; GFX1250-GISEL-TRUE16-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-GISEL-TRUE16-NEXT:    v_cvt_f32_f64_e32 v0, s[4:5]
 ; GFX1250-GISEL-TRUE16-NEXT:    v_cvt_f32_f64_e32 v1, s[6:7]
+; GFX1250-GISEL-TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 2, 2), 0
 ; GFX1250-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1250-GISEL-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.l, v0
 ; GFX1250-GISEL-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.h, v1
@@ -3971,6 +4011,7 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16_afn(
 ;
 ; GFX1250-GISEL-FAKE16-LABEL: fptrunc_v2f64_to_v2f16_afn:
 ; GFX1250-GISEL-FAKE16:       ; %bb.0: ; %entry
+; GFX1250-GISEL-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1250-GISEL-FAKE16-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-GISEL-FAKE16-NEXT:    s_load_b128 s[4:7], s[2:3], 0x0
@@ -3980,6 +4021,7 @@ define amdgpu_kernel void @fptrunc_v2f64_to_v2f16_afn(
 ; GFX1250-GISEL-FAKE16-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-GISEL-FAKE16-NEXT:    v_cvt_f32_f64_e32 v0, s[4:5]
 ; GFX1250-GISEL-FAKE16-NEXT:    v_cvt_f32_f64_e32 v1, s[6:7]
+; GFX1250-GISEL-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 2, 2), 0
 ; GFX1250-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX1250-GISEL-FAKE16-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX1250-GISEL-FAKE16-NEXT:    v_cvt_f16_f32_e32 v1, v1
@@ -4179,6 +4221,7 @@ define amdgpu_kernel void @fneg_fptrunc_f32_to_f16(
 ;
 ; GFX1250-SDAG-TRUE16-LABEL: fneg_fptrunc_f32_to_f16:
 ; GFX1250-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX1250-SDAG-TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1250-SDAG-TRUE16-NEXT:    s_mov_b32 s6, -1
 ; GFX1250-SDAG-TRUE16-NEXT:    s_mov_b32 s7, 0x31016000
@@ -4192,6 +4235,7 @@ define amdgpu_kernel void @fneg_fptrunc_f32_to_f16(
 ; GFX1250-SDAG-TRUE16-NEXT:    s_mov_b32 s5, s1
 ; GFX1250-SDAG-TRUE16-NEXT:    s_wait_loadcnt 0x0
 ; GFX1250-SDAG-TRUE16-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
+; GFX1250-SDAG-TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 2, 2), 0
 ; GFX1250-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-SDAG-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.l, v0
 ; GFX1250-SDAG-TRUE16-NEXT:    buffer_store_b16 v0, off, s[4:7], null
@@ -4199,6 +4243,7 @@ define amdgpu_kernel void @fneg_fptrunc_f32_to_f16(
 ;
 ; GFX1250-SDAG-FAKE16-LABEL: fneg_fptrunc_f32_to_f16:
 ; GFX1250-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX1250-SDAG-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1250-SDAG-FAKE16-NEXT:    s_mov_b32 s6, -1
 ; GFX1250-SDAG-FAKE16-NEXT:    s_mov_b32 s7, 0x31016000
@@ -4212,6 +4257,7 @@ define amdgpu_kernel void @fneg_fptrunc_f32_to_f16(
 ; GFX1250-SDAG-FAKE16-NEXT:    s_mov_b32 s5, s1
 ; GFX1250-SDAG-FAKE16-NEXT:    s_wait_loadcnt 0x0
 ; GFX1250-SDAG-FAKE16-NEXT:    v_xor_b32_e32 v0, 0x80000000, v0
+; GFX1250-SDAG-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 2, 2), 0
 ; GFX1250-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-SDAG-FAKE16-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX1250-SDAG-FAKE16-NEXT:    buffer_store_b16 v0, off, s[4:7], null
@@ -4219,6 +4265,7 @@ define amdgpu_kernel void @fneg_fptrunc_f32_to_f16(
 ;
 ; GFX1250-GISEL-TRUE16-LABEL: fneg_fptrunc_f32_to_f16:
 ; GFX1250-GISEL-TRUE16:       ; %bb.0: ; %entry
+; GFX1250-GISEL-TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1250-GISEL-TRUE16-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-GISEL-TRUE16-NEXT:    s_load_b32 s2, s[2:3], 0x0
@@ -4226,8 +4273,9 @@ define amdgpu_kernel void @fneg_fptrunc_f32_to_f16(
 ; GFX1250-GISEL-TRUE16-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1250-GISEL-TRUE16-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-GISEL-TRUE16-NEXT:    s_xor_b32 s2, s2, 0x80000000
-; GFX1250-GISEL-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_3)
+; GFX1250-GISEL-TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 2, 2), 0
 ; GFX1250-GISEL-TRUE16-NEXT:    s_cvt_f16_f32 s2, s2
+; GFX1250-GISEL-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_3)
 ; GFX1250-GISEL-TRUE16-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX1250-GISEL-TRUE16-NEXT:    s_mov_b32 s2, -1
 ; GFX1250-GISEL-TRUE16-NEXT:    buffer_store_b16 v0, off, s[0:3], null
@@ -4235,6 +4283,7 @@ define amdgpu_kernel void @fneg_fptrunc_f32_to_f16(
 ;
 ; GFX1250-GISEL-FAKE16-LABEL: fneg_fptrunc_f32_to_f16:
 ; GFX1250-GISEL-FAKE16:       ; %bb.0: ; %entry
+; GFX1250-GISEL-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1250-GISEL-FAKE16-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-GISEL-FAKE16-NEXT:    s_load_b32 s2, s[2:3], 0x0
@@ -4242,8 +4291,9 @@ define amdgpu_kernel void @fneg_fptrunc_f32_to_f16(
 ; GFX1250-GISEL-FAKE16-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1250-GISEL-FAKE16-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-GISEL-FAKE16-NEXT:    s_xor_b32 s2, s2, 0x80000000
-; GFX1250-GISEL-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_3)
+; GFX1250-GISEL-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 2, 2), 0
 ; GFX1250-GISEL-FAKE16-NEXT:    s_cvt_f16_f32 s2, s2
+; GFX1250-GISEL-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_3)
 ; GFX1250-GISEL-FAKE16-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX1250-GISEL-FAKE16-NEXT:    s_mov_b32 s2, -1
 ; GFX1250-GISEL-FAKE16-NEXT:    buffer_store_b16 v0, off, s[0:3], null
@@ -4441,6 +4491,7 @@ define amdgpu_kernel void @fabs_fptrunc_f32_to_f16(
 ;
 ; GFX1250-SDAG-TRUE16-LABEL: fabs_fptrunc_f32_to_f16:
 ; GFX1250-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX1250-SDAG-TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1250-SDAG-TRUE16-NEXT:    s_mov_b32 s6, -1
 ; GFX1250-SDAG-TRUE16-NEXT:    s_mov_b32 s7, 0x31016000
@@ -4454,6 +4505,7 @@ define amdgpu_kernel void @fabs_fptrunc_f32_to_f16(
 ; GFX1250-SDAG-TRUE16-NEXT:    s_mov_b32 s5, s1
 ; GFX1250-SDAG-TRUE16-NEXT:    s_wait_loadcnt 0x0
 ; GFX1250-SDAG-TRUE16-NEXT:    v_and_b32_e32 v0, 0x7fffffff, v0
+; GFX1250-SDAG-TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 2, 2), 0
 ; GFX1250-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-SDAG-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.l, v0
 ; GFX1250-SDAG-TRUE16-NEXT:    buffer_store_b16 v0, off, s[4:7], null
@@ -4461,6 +4513,7 @@ define amdgpu_kernel void @fabs_fptrunc_f32_to_f16(
 ;
 ; GFX1250-SDAG-FAKE16-LABEL: fabs_fptrunc_f32_to_f16:
 ; GFX1250-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX1250-SDAG-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1250-SDAG-FAKE16-NEXT:    s_mov_b32 s6, -1
 ; GFX1250-SDAG-FAKE16-NEXT:    s_mov_b32 s7, 0x31016000
@@ -4474,6 +4527,7 @@ define amdgpu_kernel void @fabs_fptrunc_f32_to_f16(
 ; GFX1250-SDAG-FAKE16-NEXT:    s_mov_b32 s5, s1
 ; GFX1250-SDAG-FAKE16-NEXT:    s_wait_loadcnt 0x0
 ; GFX1250-SDAG-FAKE16-NEXT:    v_and_b32_e32 v0, 0x7fffffff, v0
+; GFX1250-SDAG-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 2, 2), 0
 ; GFX1250-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-SDAG-FAKE16-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX1250-SDAG-FAKE16-NEXT:    buffer_store_b16 v0, off, s[4:7], null
@@ -4481,6 +4535,7 @@ define amdgpu_kernel void @fabs_fptrunc_f32_to_f16(
 ;
 ; GFX1250-GISEL-TRUE16-LABEL: fabs_fptrunc_f32_to_f16:
 ; GFX1250-GISEL-TRUE16:       ; %bb.0: ; %entry
+; GFX1250-GISEL-TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1250-GISEL-TRUE16-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-GISEL-TRUE16-NEXT:    s_load_b32 s2, s[2:3], 0x0
@@ -4488,8 +4543,9 @@ define amdgpu_kernel void @fabs_fptrunc_f32_to_f16(
 ; GFX1250-GISEL-TRUE16-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1250-GISEL-TRUE16-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-GISEL-TRUE16-NEXT:    s_bitset0_b32 s2, 31
-; GFX1250-GISEL-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_3)
+; GFX1250-GISEL-TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 2, 2), 0
 ; GFX1250-GISEL-TRUE16-NEXT:    s_cvt_f16_f32 s2, s2
+; GFX1250-GISEL-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_3)
 ; GFX1250-GISEL-TRUE16-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX1250-GISEL-TRUE16-NEXT:    s_mov_b32 s2, -1
 ; GFX1250-GISEL-TRUE16-NEXT:    buffer_store_b16 v0, off, s[0:3], null
@@ -4497,6 +4553,7 @@ define amdgpu_kernel void @fabs_fptrunc_f32_to_f16(
 ;
 ; GFX1250-GISEL-FAKE16-LABEL: fabs_fptrunc_f32_to_f16:
 ; GFX1250-GISEL-FAKE16:       ; %bb.0: ; %entry
+; GFX1250-GISEL-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1250-GISEL-FAKE16-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-GISEL-FAKE16-NEXT:    s_load_b32 s2, s[2:3], 0x0
@@ -4504,8 +4561,9 @@ define amdgpu_kernel void @fabs_fptrunc_f32_to_f16(
 ; GFX1250-GISEL-FAKE16-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1250-GISEL-FAKE16-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-GISEL-FAKE16-NEXT:    s_bitset0_b32 s2, 31
-; GFX1250-GISEL-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_3)
+; GFX1250-GISEL-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 2, 2), 0
 ; GFX1250-GISEL-FAKE16-NEXT:    s_cvt_f16_f32 s2, s2
+; GFX1250-GISEL-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_3)
 ; GFX1250-GISEL-FAKE16-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX1250-GISEL-FAKE16-NEXT:    s_mov_b32 s2, -1
 ; GFX1250-GISEL-FAKE16-NEXT:    buffer_store_b16 v0, off, s[0:3], null
@@ -4703,6 +4761,7 @@ define amdgpu_kernel void @fneg_fabs_fptrunc_f32_to_f16(
 ;
 ; GFX1250-SDAG-TRUE16-LABEL: fneg_fabs_fptrunc_f32_to_f16:
 ; GFX1250-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX1250-SDAG-TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1250-SDAG-TRUE16-NEXT:    s_mov_b32 s6, -1
 ; GFX1250-SDAG-TRUE16-NEXT:    s_mov_b32 s7, 0x31016000
@@ -4716,6 +4775,7 @@ define amdgpu_kernel void @fneg_fabs_fptrunc_f32_to_f16(
 ; GFX1250-SDAG-TRUE16-NEXT:    s_mov_b32 s5, s1
 ; GFX1250-SDAG-TRUE16-NEXT:    s_wait_loadcnt 0x0
 ; GFX1250-SDAG-TRUE16-NEXT:    v_or_b32_e32 v0, 0x80000000, v0
+; GFX1250-SDAG-TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 2, 2), 0
 ; GFX1250-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-SDAG-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.l, v0
 ; GFX1250-SDAG-TRUE16-NEXT:    buffer_store_b16 v0, off, s[4:7], null
@@ -4723,6 +4783,7 @@ define amdgpu_kernel void @fneg_fabs_fptrunc_f32_to_f16(
 ;
 ; GFX1250-SDAG-FAKE16-LABEL: fneg_fabs_fptrunc_f32_to_f16:
 ; GFX1250-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX1250-SDAG-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1250-SDAG-FAKE16-NEXT:    s_mov_b32 s6, -1
 ; GFX1250-SDAG-FAKE16-NEXT:    s_mov_b32 s7, 0x31016000
@@ -4736,6 +4797,7 @@ define amdgpu_kernel void @fneg_fabs_fptrunc_f32_to_f16(
 ; GFX1250-SDAG-FAKE16-NEXT:    s_mov_b32 s5, s1
 ; GFX1250-SDAG-FAKE16-NEXT:    s_wait_loadcnt 0x0
 ; GFX1250-SDAG-FAKE16-NEXT:    v_or_b32_e32 v0, 0x80000000, v0
+; GFX1250-SDAG-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 2, 2), 0
 ; GFX1250-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-SDAG-FAKE16-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX1250-SDAG-FAKE16-NEXT:    buffer_store_b16 v0, off, s[4:7], null
@@ -4743,6 +4805,7 @@ define amdgpu_kernel void @fneg_fabs_fptrunc_f32_to_f16(
 ;
 ; GFX1250-GISEL-TRUE16-LABEL: fneg_fabs_fptrunc_f32_to_f16:
 ; GFX1250-GISEL-TRUE16:       ; %bb.0: ; %entry
+; GFX1250-GISEL-TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1250-GISEL-TRUE16-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-GISEL-TRUE16-NEXT:    s_load_b32 s2, s[2:3], 0x0
@@ -4750,8 +4813,9 @@ define amdgpu_kernel void @fneg_fabs_fptrunc_f32_to_f16(
 ; GFX1250-GISEL-TRUE16-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1250-GISEL-TRUE16-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-GISEL-TRUE16-NEXT:    s_bitset1_b32 s2, 31
-; GFX1250-GISEL-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_3)
+; GFX1250-GISEL-TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 2, 2), 0
 ; GFX1250-GISEL-TRUE16-NEXT:    s_cvt_f16_f32 s2, s2
+; GFX1250-GISEL-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_3)
 ; GFX1250-GISEL-TRUE16-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX1250-GISEL-TRUE16-NEXT:    s_mov_b32 s2, -1
 ; GFX1250-GISEL-TRUE16-NEXT:    buffer_store_b16 v0, off, s[0:3], null
@@ -4759,6 +4823,7 @@ define amdgpu_kernel void @fneg_fabs_fptrunc_f32_to_f16(
 ;
 ; GFX1250-GISEL-FAKE16-LABEL: fneg_fabs_fptrunc_f32_to_f16:
 ; GFX1250-GISEL-FAKE16:       ; %bb.0: ; %entry
+; GFX1250-GISEL-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1250-GISEL-FAKE16-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-GISEL-FAKE16-NEXT:    s_load_b32 s2, s[2:3], 0x0
@@ -4766,8 +4831,9 @@ define amdgpu_kernel void @fneg_fabs_fptrunc_f32_to_f16(
 ; GFX1250-GISEL-FAKE16-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1250-GISEL-FAKE16-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-GISEL-FAKE16-NEXT:    s_bitset1_b32 s2, 31
-; GFX1250-GISEL-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_3)
+; GFX1250-GISEL-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 2, 2), 0
 ; GFX1250-GISEL-FAKE16-NEXT:    s_cvt_f16_f32 s2, s2
+; GFX1250-GISEL-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_3)
 ; GFX1250-GISEL-FAKE16-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX1250-GISEL-FAKE16-NEXT:    s_mov_b32 s2, -1
 ; GFX1250-GISEL-FAKE16-NEXT:    buffer_store_b16 v0, off, s[0:3], null
@@ -4973,6 +5039,7 @@ define amdgpu_kernel void @fptrunc_f32_to_f16_zext_i32(
 ;
 ; GFX1250-SDAG-TRUE16-LABEL: fptrunc_f32_to_f16_zext_i32:
 ; GFX1250-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX1250-SDAG-TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1250-SDAG-TRUE16-NEXT:    s_mov_b32 s6, -1
 ; GFX1250-SDAG-TRUE16-NEXT:    s_mov_b32 s7, 0x31016000
@@ -4985,6 +5052,7 @@ define amdgpu_kernel void @fptrunc_f32_to_f16_zext_i32(
 ; GFX1250-SDAG-TRUE16-NEXT:    buffer_load_b32 v0, off, s[8:11], null
 ; GFX1250-SDAG-TRUE16-NEXT:    s_mov_b32 s5, s1
 ; GFX1250-SDAG-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-SDAG-TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 2, 2), 0
 ; GFX1250-SDAG-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.l, v0
 ; GFX1250-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-SDAG-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
@@ -4993,6 +5061,7 @@ define amdgpu_kernel void @fptrunc_f32_to_f16_zext_i32(
 ;
 ; GFX1250-SDAG-FAKE16-LABEL: fptrunc_f32_to_f16_zext_i32:
 ; GFX1250-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX1250-SDAG-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1250-SDAG-FAKE16-NEXT:    s_mov_b32 s6, -1
 ; GFX1250-SDAG-FAKE16-NEXT:    s_mov_b32 s7, 0x31016000
@@ -5005,6 +5074,7 @@ define amdgpu_kernel void @fptrunc_f32_to_f16_zext_i32(
 ; GFX1250-SDAG-FAKE16-NEXT:    buffer_load_b32 v0, off, s[8:11], null
 ; GFX1250-SDAG-FAKE16-NEXT:    s_mov_b32 s5, s1
 ; GFX1250-SDAG-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-SDAG-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 2, 2), 0
 ; GFX1250-SDAG-FAKE16-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX1250-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-SDAG-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
@@ -5013,12 +5083,14 @@ define amdgpu_kernel void @fptrunc_f32_to_f16_zext_i32(
 ;
 ; GFX1250-GISEL-TRUE16-LABEL: fptrunc_f32_to_f16_zext_i32:
 ; GFX1250-GISEL-TRUE16:       ; %bb.0: ; %entry
+; GFX1250-GISEL-TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1250-GISEL-TRUE16-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-GISEL-TRUE16-NEXT:    s_load_b32 s2, s[2:3], 0x0
 ; GFX1250-GISEL-TRUE16-NEXT:    s_wait_xcnt 0x0
 ; GFX1250-GISEL-TRUE16-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1250-GISEL-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-GISEL-TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 2, 2), 0
 ; GFX1250-GISEL-TRUE16-NEXT:    s_cvt_f16_f32 s2, s2
 ; GFX1250-GISEL-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_3) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX1250-GISEL-TRUE16-NEXT:    s_and_b32 s2, 0xffff, s2
@@ -5029,12 +5101,14 @@ define amdgpu_kernel void @fptrunc_f32_to_f16_zext_i32(
 ;
 ; GFX1250-GISEL-FAKE16-LABEL: fptrunc_f32_to_f16_zext_i32:
 ; GFX1250-GISEL-FAKE16:       ; %bb.0: ; %entry
+; GFX1250-GISEL-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1250-GISEL-FAKE16-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-GISEL-FAKE16-NEXT:    s_load_b32 s2, s[2:3], 0x0
 ; GFX1250-GISEL-FAKE16-NEXT:    s_wait_xcnt 0x0
 ; GFX1250-GISEL-FAKE16-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1250-GISEL-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-GISEL-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 2, 2), 0
 ; GFX1250-GISEL-FAKE16-NEXT:    s_cvt_f16_f32 s2, s2
 ; GFX1250-GISEL-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_3) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX1250-GISEL-FAKE16-NEXT:    s_and_b32 s2, 0xffff, s2
@@ -5243,6 +5317,7 @@ define amdgpu_kernel void @fptrunc_fabs_f32_to_f16_zext_i32(
 ;
 ; GFX1250-SDAG-TRUE16-LABEL: fptrunc_fabs_f32_to_f16_zext_i32:
 ; GFX1250-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX1250-SDAG-TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1250-SDAG-TRUE16-NEXT:    s_mov_b32 s6, -1
 ; GFX1250-SDAG-TRUE16-NEXT:    s_mov_b32 s7, 0x31016000
@@ -5256,6 +5331,7 @@ define amdgpu_kernel void @fptrunc_fabs_f32_to_f16_zext_i32(
 ; GFX1250-SDAG-TRUE16-NEXT:    s_mov_b32 s5, s1
 ; GFX1250-SDAG-TRUE16-NEXT:    s_wait_loadcnt 0x0
 ; GFX1250-SDAG-TRUE16-NEXT:    v_and_b32_e32 v0, 0x7fffffff, v0
+; GFX1250-SDAG-TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 2, 2), 0
 ; GFX1250-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1250-SDAG-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.l, v0
 ; GFX1250-SDAG-TRUE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
@@ -5264,6 +5340,7 @@ define amdgpu_kernel void @fptrunc_fabs_f32_to_f16_zext_i32(
 ;
 ; GFX1250-SDAG-FAKE16-LABEL: fptrunc_fabs_f32_to_f16_zext_i32:
 ; GFX1250-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX1250-SDAG-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1250-SDAG-FAKE16-NEXT:    s_mov_b32 s6, -1
 ; GFX1250-SDAG-FAKE16-NEXT:    s_mov_b32 s7, 0x31016000
@@ -5277,6 +5354,7 @@ define amdgpu_kernel void @fptrunc_fabs_f32_to_f16_zext_i32(
 ; GFX1250-SDAG-FAKE16-NEXT:    s_mov_b32 s5, s1
 ; GFX1250-SDAG-FAKE16-NEXT:    s_wait_loadcnt 0x0
 ; GFX1250-SDAG-FAKE16-NEXT:    v_and_b32_e32 v0, 0x7fffffff, v0
+; GFX1250-SDAG-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 2, 2), 0
 ; GFX1250-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1250-SDAG-FAKE16-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX1250-SDAG-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
@@ -5285,6 +5363,7 @@ define amdgpu_kernel void @fptrunc_fabs_f32_to_f16_zext_i32(
 ;
 ; GFX1250-GISEL-TRUE16-LABEL: fptrunc_fabs_f32_to_f16_zext_i32:
 ; GFX1250-GISEL-TRUE16:       ; %bb.0: ; %entry
+; GFX1250-GISEL-TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1250-GISEL-TRUE16-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-GISEL-TRUE16-NEXT:    s_load_b32 s2, s[2:3], 0x0
@@ -5292,10 +5371,10 @@ define amdgpu_kernel void @fptrunc_fabs_f32_to_f16_zext_i32(
 ; GFX1250-GISEL-TRUE16-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1250-GISEL-TRUE16-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-GISEL-TRUE16-NEXT:    s_bitset0_b32 s2, 31
-; GFX1250-GISEL-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_3)
+; GFX1250-GISEL-TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 2, 2), 0
 ; GFX1250-GISEL-TRUE16-NEXT:    s_cvt_f16_f32 s2, s2
+; GFX1250-GISEL-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_3) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX1250-GISEL-TRUE16-NEXT:    s_and_b32 s2, 0xffff, s2
-; GFX1250-GISEL-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1250-GISEL-TRUE16-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX1250-GISEL-TRUE16-NEXT:    s_mov_b32 s2, -1
 ; GFX1250-GISEL-TRUE16-NEXT:    buffer_store_b32 v0, off, s[0:3], null
@@ -5303,6 +5382,7 @@ define amdgpu_kernel void @fptrunc_fabs_f32_to_f16_zext_i32(
 ;
 ; GFX1250-GISEL-FAKE16-LABEL: fptrunc_fabs_f32_to_f16_zext_i32:
 ; GFX1250-GISEL-FAKE16:       ; %bb.0: ; %entry
+; GFX1250-GISEL-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1250-GISEL-FAKE16-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-GISEL-FAKE16-NEXT:    s_load_b32 s2, s[2:3], 0x0
@@ -5310,10 +5390,10 @@ define amdgpu_kernel void @fptrunc_fabs_f32_to_f16_zext_i32(
 ; GFX1250-GISEL-FAKE16-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1250-GISEL-FAKE16-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-GISEL-FAKE16-NEXT:    s_bitset0_b32 s2, 31
-; GFX1250-GISEL-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_3)
+; GFX1250-GISEL-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 2, 2), 0
 ; GFX1250-GISEL-FAKE16-NEXT:    s_cvt_f16_f32 s2, s2
+; GFX1250-GISEL-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_3) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX1250-GISEL-FAKE16-NEXT:    s_and_b32 s2, 0xffff, s2
-; GFX1250-GISEL-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1250-GISEL-FAKE16-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX1250-GISEL-FAKE16-NEXT:    s_mov_b32 s2, -1
 ; GFX1250-GISEL-FAKE16-NEXT:    buffer_store_b32 v0, off, s[0:3], null
@@ -5529,6 +5609,7 @@ define amdgpu_kernel void @fptrunc_f32_to_f16_sext_i32(
 ;
 ; GFX1250-SDAG-TRUE16-LABEL: fptrunc_f32_to_f16_sext_i32:
 ; GFX1250-SDAG-TRUE16:       ; %bb.0: ; %entry
+; GFX1250-SDAG-TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1250-SDAG-TRUE16-NEXT:    s_mov_b32 s6, -1
 ; GFX1250-SDAG-TRUE16-NEXT:    s_mov_b32 s7, 0x31016000
@@ -5541,6 +5622,7 @@ define amdgpu_kernel void @fptrunc_f32_to_f16_sext_i32(
 ; GFX1250-SDAG-TRUE16-NEXT:    buffer_load_b32 v0, off, s[8:11], null
 ; GFX1250-SDAG-TRUE16-NEXT:    s_mov_b32 s5, s1
 ; GFX1250-SDAG-TRUE16-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-SDAG-TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 2, 2), 0
 ; GFX1250-SDAG-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.l, v0
 ; GFX1250-SDAG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-SDAG-TRUE16-NEXT:    v_bfe_i32 v0, v0, 0, 16
@@ -5549,6 +5631,7 @@ define amdgpu_kernel void @fptrunc_f32_to_f16_sext_i32(
 ;
 ; GFX1250-SDAG-FAKE16-LABEL: fptrunc_f32_to_f16_sext_i32:
 ; GFX1250-SDAG-FAKE16:       ; %bb.0: ; %entry
+; GFX1250-SDAG-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1250-SDAG-FAKE16-NEXT:    s_mov_b32 s6, -1
 ; GFX1250-SDAG-FAKE16-NEXT:    s_mov_b32 s7, 0x31016000
@@ -5561,6 +5644,7 @@ define amdgpu_kernel void @fptrunc_f32_to_f16_sext_i32(
 ; GFX1250-SDAG-FAKE16-NEXT:    buffer_load_b32 v0, off, s[8:11], null
 ; GFX1250-SDAG-FAKE16-NEXT:    s_mov_b32 s5, s1
 ; GFX1250-SDAG-FAKE16-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-SDAG-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 2, 2), 0
 ; GFX1250-SDAG-FAKE16-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX1250-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-SDAG-FAKE16-NEXT:    v_bfe_i32 v0, v0, 0, 16
@@ -5569,12 +5653,14 @@ define amdgpu_kernel void @fptrunc_f32_to_f16_sext_i32(
 ;
 ; GFX1250-GISEL-TRUE16-LABEL: fptrunc_f32_to_f16_sext_i32:
 ; GFX1250-GISEL-TRUE16:       ; %bb.0: ; %entry
+; GFX1250-GISEL-TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1250-GISEL-TRUE16-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-GISEL-TRUE16-NEXT:    s_load_b32 s2, s[2:3], 0x0
 ; GFX1250-GISEL-TRUE16-NEXT:    s_wait_xcnt 0x0
 ; GFX1250-GISEL-TRUE16-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1250-GISEL-TRUE16-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-GISEL-TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 2, 2), 0
 ; GFX1250-GISEL-TRUE16-NEXT:    s_cvt_f16_f32 s2, s2
 ; GFX1250-GISEL-TRUE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_3) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX1250-GISEL-TRUE16-NEXT:    s_sext_i32_i16 s2, s2
@@ -5585,12 +5671,14 @@ define amdgpu_kernel void @fptrunc_f32_to_f16_sext_i32(
 ;
 ; GFX1250-GISEL-FAKE16-LABEL: fptrunc_f32_to_f16_sext_i32:
 ; GFX1250-GISEL-FAKE16:       ; %bb.0: ; %entry
+; GFX1250-GISEL-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1250-GISEL-FAKE16-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-GISEL-FAKE16-NEXT:    s_load_b32 s2, s[2:3], 0x0
 ; GFX1250-GISEL-FAKE16-NEXT:    s_wait_xcnt 0x0
 ; GFX1250-GISEL-FAKE16-NEXT:    s_mov_b32 s3, 0x31016000
 ; GFX1250-GISEL-FAKE16-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-GISEL-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 2, 2), 0
 ; GFX1250-GISEL-FAKE16-NEXT:    s_cvt_f16_f32 s2, s2
 ; GFX1250-GISEL-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_3) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX1250-GISEL-FAKE16-NEXT:    s_sext_i32_i16 s2, s2

diff  --git a/llvm/test/CodeGen/AMDGPU/global-address.ll b/llvm/test/CodeGen/AMDGPU/global-address.ll
index bcded5221bf43..f3db48edbb6bb 100644
--- a/llvm/test/CodeGen/AMDGPU/global-address.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-address.ll
@@ -27,8 +27,10 @@ define amdgpu_kernel void @caller_internal() {
 ;
 ; GFX1250-PAL-LABEL: caller_internal:
 ; GFX1250-PAL:       ; %bb.0:
-; GFX1250-PAL-NEXT:    s_mov_b64 s[0:1], internal_func at abs64
 ; GFX1250-PAL-NEXT:    s_mov_b32 s32, 0
+; GFX1250-PAL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-PAL-NEXT:    s_mov_b64 s[0:1], internal_func at abs64
+; GFX1250-PAL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1250-PAL-NEXT:    s_swap_pc_i64 s[30:31], s[0:1]
 ; GFX1250-PAL-NEXT:    s_endpgm
 ;
@@ -45,9 +47,10 @@ define amdgpu_kernel void @caller_internal() {
 ;
 ; GFX1250-HSA-LABEL: caller_internal:
 ; GFX1250-HSA:       ; %bb.0:
+; GFX1250-HSA-NEXT:    s_mov_b32 s32, 0
+; GFX1250-HSA-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-HSA-NEXT:    s_get_pc_i64 s[0:1]
 ; GFX1250-HSA-NEXT:    s_add_nc_u64 s[0:1], s[0:1], internal_func at gotpcrel+4
-; GFX1250-HSA-NEXT:    s_mov_b32 s32, 0
 ; GFX1250-HSA-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
 ; GFX1250-HSA-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-HSA-NEXT:    s_swap_pc_i64 s[30:31], s[0:1]
@@ -91,13 +94,14 @@ define amdgpu_kernel void @caller_exterinal() {
 ;
 ; GFX1250-PAL-LABEL: caller_exterinal:
 ; GFX1250-PAL:       ; %bb.0:
+; GFX1250-PAL-NEXT:    s_mov_b32 s32, 0
+; GFX1250-PAL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-PAL-NEXT:    v_mov_b32_e32 v31, v0
 ; GFX1250-PAL-NEXT:    s_mov_b64 s[10:11], s[6:7]
 ; GFX1250-PAL-NEXT:    s_mov_b64 s[8:9], s[4:5]
 ; GFX1250-PAL-NEXT:    s_mov_b64 s[12:13], external_func at abs64
 ; GFX1250-PAL-NEXT:    s_mov_b64 s[4:5], s[0:1]
 ; GFX1250-PAL-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX1250-PAL-NEXT:    s_mov_b32 s32, 0
 ; GFX1250-PAL-NEXT:    s_swap_pc_i64 s[30:31], s[12:13]
 ; GFX1250-PAL-NEXT:    s_endpgm
 ;
@@ -120,6 +124,8 @@ define amdgpu_kernel void @caller_exterinal() {
 ;
 ; GFX1250-HSA-LABEL: caller_exterinal:
 ; GFX1250-HSA:       ; %bb.0:
+; GFX1250-HSA-NEXT:    s_mov_b32 s32, 0
+; GFX1250-HSA-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-HSA-NEXT:    v_mov_b32_e32 v31, v0
 ; GFX1250-HSA-NEXT:    s_mov_b64 s[10:11], s[6:7]
 ; GFX1250-HSA-NEXT:    s_mov_b64 s[8:9], s[4:5]
@@ -127,7 +133,6 @@ define amdgpu_kernel void @caller_exterinal() {
 ; GFX1250-HSA-NEXT:    s_add_nc_u64 s[12:13], s[12:13], external_func at rel64+4
 ; GFX1250-HSA-NEXT:    s_mov_b64 s[4:5], s[0:1]
 ; GFX1250-HSA-NEXT:    s_mov_b64 s[6:7], s[2:3]
-; GFX1250-HSA-NEXT:    s_mov_b32 s32, 0
 ; GFX1250-HSA-NEXT:    s_swap_pc_i64 s[30:31], s[12:13]
 ; GFX1250-HSA-NEXT:    s_endpgm
   call void @external_func()

diff  --git a/llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll b/llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll
index e59af2e82671e..4a4fbadc41e46 100644
--- a/llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-load-xcnt.ll
@@ -404,6 +404,7 @@ define i64 @test_v16i64_load_store(ptr addrspace(1) %ptr_a, ptr addrspace(1) %pt
 define amdgpu_kernel void @test_v7i16_load_store_kernel(ptr addrspace(1) %ptr1, ptr addrspace(1) %ptr2, ptr addrspace(1) %out) {
 ; GCN-SDAG-LABEL: test_v7i16_load_store_kernel:
 ; GCN-SDAG:       ; %bb.0:
+; GCN-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-SDAG-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
 ; GCN-SDAG-NEXT:    v_and_b32_e32 v8, 0x3ff, v0
 ; GCN-SDAG-NEXT:    s_wait_xcnt 0x0
@@ -431,6 +432,7 @@ define amdgpu_kernel void @test_v7i16_load_store_kernel(ptr addrspace(1) %ptr1,
 ;
 ; GCN-GISEL-LABEL: test_v7i16_load_store_kernel:
 ; GCN-GISEL:       ; %bb.0:
+; GCN-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-GISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
 ; GCN-GISEL-NEXT:    v_and_b32_e32 v8, 0x3ff, v0
 ; GCN-GISEL-NEXT:    s_wait_xcnt 0x0

diff  --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll
index b81fdd36530da..277aac55f7933 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll
@@ -65,6 +65,7 @@ define amdgpu_kernel void @s_insertelement_v2bf16_0(ptr addrspace(1) %out, ptr a
 ;
 ; GFX1250-LABEL: s_insertelement_v2bf16_0:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[2:3], 0x0
@@ -139,6 +140,7 @@ define amdgpu_kernel void @s_insertelement_v2bf16_1(ptr addrspace(1) %out, ptr a
 ;
 ; GFX1250-LABEL: s_insertelement_v2bf16_1:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[2:3], 0x0
@@ -222,6 +224,7 @@ define amdgpu_kernel void @v_insertelement_v2bf16_0(ptr addrspace(1) %out, ptr a
 ;
 ; GFX1250-LABEL: v_insertelement_v2bf16_0:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
 ; GFX1250-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -308,6 +311,7 @@ define amdgpu_kernel void @v_insertelement_v2bf16_0_inlineimm(ptr addrspace(1) %
 ;
 ; GFX1250-LABEL: v_insertelement_v2bf16_0_inlineimm:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
 ; GFX1250-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -394,6 +398,7 @@ define amdgpu_kernel void @v_insertelement_v2bf16_1(ptr addrspace(1) %out, ptr a
 ;
 ; GFX1250-LABEL: v_insertelement_v2bf16_1:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
 ; GFX1250-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -480,6 +485,7 @@ define amdgpu_kernel void @v_insertelement_v2bf16_1_inlineimm(ptr addrspace(1) %
 ;
 ; GFX1250-LABEL: v_insertelement_v2bf16_1_inlineimm:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
 ; GFX1250-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -591,6 +597,7 @@ define amdgpu_kernel void @v_insertelement_v2bf16_dynamic_vgpr(ptr addrspace(1)
 ;
 ; GFX1250-LABEL: v_insertelement_v2bf16_dynamic_vgpr:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    s_load_b64 s[6:7], s[4:5], 0x10
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
@@ -693,6 +700,7 @@ define amdgpu_kernel void @v_insertelement_v4bf16_0(ptr addrspace(1) %out, ptr a
 ;
 ; GFX1250-LABEL: v_insertelement_v4bf16_0:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
 ; GFX1250-NEXT:    v_and_b32_e32 v2, 0x3ff, v0
 ; GFX1250-NEXT:    s_wait_xcnt 0x0
@@ -787,6 +795,7 @@ define amdgpu_kernel void @v_insertelement_v4bf16_1(ptr addrspace(1) %out, ptr a
 ;
 ; GFX1250-LABEL: v_insertelement_v4bf16_1:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
 ; GFX1250-NEXT:    v_and_b32_e32 v2, 0x3ff, v0
 ; GFX1250-NEXT:    s_wait_xcnt 0x0
@@ -883,6 +892,7 @@ define amdgpu_kernel void @v_insertelement_v4bf16_2(ptr addrspace(1) %out, ptr a
 ;
 ; GFX1250-LABEL: v_insertelement_v4bf16_2:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
 ; GFX1250-NEXT:    v_and_b32_e32 v2, 0x3ff, v0
 ; GFX1250-NEXT:    s_wait_xcnt 0x0
@@ -977,6 +987,7 @@ define amdgpu_kernel void @v_insertelement_v4bf16_3(ptr addrspace(1) %out, ptr a
 ;
 ; GFX1250-LABEL: v_insertelement_v4bf16_3:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
 ; GFX1250-NEXT:    v_and_b32_e32 v2, 0x3ff, v0
 ; GFX1250-NEXT:    s_wait_xcnt 0x0
@@ -1094,6 +1105,7 @@ define amdgpu_kernel void @v_insertelement_v4bf16_dynamic_sgpr(ptr addrspace(1)
 ;
 ; GFX1250-LABEL: v_insertelement_v4bf16_dynamic_sgpr:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
 ; GFX1250-NEXT:    v_and_b32_e32 v2, 0x3ff, v0
 ; GFX1250-NEXT:    s_wait_xcnt 0x0
@@ -1194,6 +1206,7 @@ define amdgpu_kernel void @v_insertelement_v8bf16_3(ptr addrspace(1) %out, ptr a
 ;
 ; GFX1250-LABEL: v_insertelement_v8bf16_3:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
 ; GFX1250-NEXT:    v_and_b32_e32 v4, 0x3ff, v0
 ; GFX1250-NEXT:    s_wait_xcnt 0x0
@@ -1423,6 +1436,7 @@ define amdgpu_kernel void @v_insertelement_v8bf16_dynamic(ptr addrspace(1) %out,
 ;
 ; GFX1250-LABEL: v_insertelement_v8bf16_dynamic:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
 ; GFX1250-NEXT:    v_and_b32_e32 v4, 0x3ff, v0
 ; GFX1250-NEXT:    s_wait_xcnt 0x0
@@ -1565,6 +1579,7 @@ define amdgpu_kernel void @v_insertelement_v16bf16_3(ptr addrspace(1) %out, ptr
 ;
 ; GFX1250-LABEL: v_insertelement_v16bf16_3:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
 ; GFX1250-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX1250-NEXT:    s_wait_xcnt 0x0
@@ -1958,6 +1973,7 @@ define amdgpu_kernel void @v_insertelement_v16bf16_dynamic(ptr addrspace(1) %out
 ;
 ; GFX1250-LABEL: v_insertelement_v16bf16_dynamic:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
 ; GFX1250-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX1250-NEXT:    s_wait_xcnt 0x0

diff  --git a/llvm/test/CodeGen/AMDGPU/literal64.ll b/llvm/test/CodeGen/AMDGPU/literal64.ll
index 20b876836082e..3b1b0f5ac9897 100644
--- a/llvm/test/CodeGen/AMDGPU/literal64.ll
+++ b/llvm/test/CodeGen/AMDGPU/literal64.ll
@@ -5,6 +5,7 @@
 define amdgpu_ps i64 @s_add_u64(i64 inreg %a) {
 ; GCN-LABEL: s_add_u64:
 ; GCN:       ; %bb.0:
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    s_add_nc_u64 s[0:1], s[0:1], 0xf12345678
 ; GCN-NEXT:    ; return to shader part epilog
   %result = add i64 %a, 64729929336
@@ -14,6 +15,7 @@ define amdgpu_ps i64 @s_add_u64(i64 inreg %a) {
 define amdgpu_ps void @v_add_u64(i64 %a, ptr addrspace(1) %out) {
 ; GCN-LABEL: v_add_u64:
 ; GCN:       ; %bb.0:
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    v_add_nc_u64_e32 v[0:1], 0xf12345678, v[0:1]
 ; GCN-NEXT:    global_store_b64 v[2:3], v[0:1], off
 ; GCN-NEXT:    s_endpgm
@@ -25,6 +27,7 @@ define amdgpu_ps void @v_add_u64(i64 %a, ptr addrspace(1) %out) {
 define amdgpu_ps i64 @s_add_neg_u64(i64 inreg %a) {
 ; GCN-LABEL: s_add_neg_u64:
 ; GCN:       ; %bb.0:
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    s_add_nc_u64 s[0:1], s[0:1], 0xfffffff0edcba988
 ; GCN-NEXT:    ; return to shader part epilog
   %result = sub i64 %a, 64729929336
@@ -34,6 +37,7 @@ define amdgpu_ps i64 @s_add_neg_u64(i64 inreg %a) {
 define amdgpu_ps void @v_add_neg_u64(i64 %a, ptr addrspace(1) %out) {
 ; GCN-LABEL: v_add_neg_u64:
 ; GCN:       ; %bb.0:
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    v_add_nc_u64_e32 v[0:1], 0xfffffff0edcba988, v[0:1]
 ; GCN-NEXT:    global_store_b64 v[2:3], v[0:1], off
 ; GCN-NEXT:    s_endpgm
@@ -45,6 +49,7 @@ define amdgpu_ps void @v_add_neg_u64(i64 %a, ptr addrspace(1) %out) {
 define amdgpu_ps i64 @s_sub_u64(i64 inreg %a) {
 ; GCN-LABEL: s_sub_u64:
 ; GCN:       ; %bb.0:
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    s_sub_nc_u64 s[0:1], 0xf12345678, s[0:1]
 ; GCN-NEXT:    ; return to shader part epilog
   %result = sub i64 64729929336, %a
@@ -54,6 +59,7 @@ define amdgpu_ps i64 @s_sub_u64(i64 inreg %a) {
 define amdgpu_ps void @v_sub_u64(i64 %a, ptr addrspace(1) %out) {
 ; GCN-LABEL: v_sub_u64:
 ; GCN:       ; %bb.0:
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    v_sub_nc_u64_e32 v[0:1], 0xf12345678, v[0:1]
 ; GCN-NEXT:    global_store_b64 v[2:3], v[0:1], off
 ; GCN-NEXT:    s_endpgm
@@ -140,6 +146,7 @@ define double @rsq_f64() {
 define amdgpu_ps i64 @s_and_b64(i64 inreg %a) {
 ; GCN-LABEL: s_and_b64:
 ; GCN:       ; %bb.0:
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    s_and_b64 s[0:1], s[0:1], 0xf12345678
 ; GCN-NEXT:    ; return to shader part epilog
   %result = and i64 %a, 64729929336
@@ -151,6 +158,7 @@ define amdgpu_ps i64 @s_and_b64(i64 inreg %a) {
 define amdgpu_ps void @v_and_b64(i64 %a, ptr addrspace(1) %out) {
 ; GCN-SDAG-LABEL: v_and_b64:
 ; GCN-SDAG:       ; %bb.0:
+; GCN-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-SDAG-NEXT:    v_and_b32_e32 v1, 15, v1
 ; GCN-SDAG-NEXT:    v_and_b32_e32 v0, 0x12345678, v0
 ; GCN-SDAG-NEXT:    global_store_b64 v[2:3], v[0:1], off
@@ -158,6 +166,7 @@ define amdgpu_ps void @v_and_b64(i64 %a, ptr addrspace(1) %out) {
 ;
 ; GCN-GISEL-LABEL: v_and_b64:
 ; GCN-GISEL:       ; %bb.0:
+; GCN-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-GISEL-NEXT:    v_and_b32_e32 v0, 0x12345678, v0
 ; GCN-GISEL-NEXT:    v_and_b32_e32 v1, 15, v1
 ; GCN-GISEL-NEXT:    global_store_b64 v[2:3], v[0:1], off
@@ -170,6 +179,8 @@ define amdgpu_ps void @v_and_b64(i64 %a, ptr addrspace(1) %out) {
 define amdgpu_ps <2 x float> @v_add_f64_200.1(double %a) {
 ; GCN-LABEL: v_add_f64_200.1:
 ; GCN:       ; %bb.0:
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 2, 2), 0
 ; GCN-NEXT:    v_add_f64_e32 v[0:1], 0x4069033333333333, v[0:1]
 ; GCN-NEXT:    ; return to shader part epilog
   %add = fadd double %a, 200.1
@@ -182,6 +193,8 @@ define amdgpu_ps <2 x float> @v_add_f64_200.1(double %a) {
 define amdgpu_ps <2 x float> @v_add_f64_200.0(double %a) {
 ; GCN-LABEL: v_add_f64_200.0:
 ; GCN:       ; %bb.0:
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 2, 2), 0
 ; GCN-NEXT:    v_add_f64_e32 v[0:1], 0x40690000, v[0:1]
 ; GCN-NEXT:    ; return to shader part epilog
   %add = fadd double %a, 200.0
@@ -194,6 +207,7 @@ define amdgpu_ps <2 x float> @v_add_f64_200.0(double %a) {
 define amdgpu_ps <2 x float> @v_lshl_add_u64(i64 %a) {
 ; GCN-SDAG-LABEL: v_lshl_add_u64:
 ; GCN-SDAG:       ; %bb.0:
+; GCN-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-SDAG-NEXT:    s_mov_b64 s[0:1], 0xf12345678
 ; GCN-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GCN-SDAG-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 1, s[0:1]
@@ -201,6 +215,7 @@ define amdgpu_ps <2 x float> @v_lshl_add_u64(i64 %a) {
 ;
 ; GCN-GISEL-LABEL: v_lshl_add_u64:
 ; GCN-GISEL:       ; %bb.0:
+; GCN-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-GISEL-NEXT:    v_mov_b64_e32 v[2:3], 0xf12345678
 ; GCN-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GCN-GISEL-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 1, v[2:3]
@@ -216,6 +231,7 @@ define amdgpu_ps <2 x float> @v_lshl_add_u64(i64 %a) {
 define amdgpu_ps <2 x float> @v_fma_f64(double %a, double %b) {
 ; GCN-SDAG-LABEL: v_fma_f64:
 ; GCN-SDAG:       ; %bb.0:
+; GCN-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-SDAG-NEXT:    v_fmaak_f64 v[4:5], v[0:1], v[2:3], 0x4063233333333333
 ; GCN-SDAG-NEXT:    v_mov_b64_e32 v[2:3], 0x4069033333333333
 ; GCN-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
@@ -227,6 +243,7 @@ define amdgpu_ps <2 x float> @v_fma_f64(double %a, double %b) {
 ;
 ; GCN-GISEL-LABEL: v_fma_f64:
 ; GCN-GISEL:       ; %bb.0:
+; GCN-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-GISEL-NEXT:    v_mov_b64_e32 v[4:5], 0x4063233333333333
 ; GCN-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GCN-GISEL-NEXT:    v_fmac_f64_e32 v[4:5], v[0:1], v[2:3]
@@ -246,15 +263,18 @@ define amdgpu_ps <2 x float> @v_fma_f64(double %a, double %b) {
 define amdgpu_ps <2 x float> @v_add_neg_f64(double %a) {
 ; GCN-SDAG-LABEL: v_add_neg_f64:
 ; GCN-SDAG:       ; %bb.0:
+; GCN-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-SDAG-NEXT:    s_mov_b64 s[0:1], 0x4069033333333333
-; GCN-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GCN-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 2, 2), 0
 ; GCN-SDAG-NEXT:    v_add_f64_e64 v[0:1], -v[0:1], s[0:1]
 ; GCN-SDAG-NEXT:    ; return to shader part epilog
 ;
 ; GCN-GISEL-LABEL: v_add_neg_f64:
 ; GCN-GISEL:       ; %bb.0:
+; GCN-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-GISEL-NEXT:    v_max_num_f64_e32 v[0:1], v[0:1], v[0:1]
 ; GCN-GISEL-NEXT:    v_mov_b64_e32 v[2:3], 0x4069033333333333
+; GCN-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 2, 2), 0
 ; GCN-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GCN-GISEL-NEXT:    v_add_f64_e64 v[0:1], -v[0:1], v[2:3]
 ; GCN-GISEL-NEXT:    ; return to shader part epilog
@@ -267,6 +287,7 @@ define amdgpu_ps <2 x float> @v_add_neg_f64(double %a) {
 define amdgpu_ps <2 x float> @v_cndmask(double %a) {
 ; GCN-SDAG-LABEL: v_cndmask:
 ; GCN-SDAG:       ; %bb.0:
+; GCN-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-SDAG-NEXT:    v_cmp_eq_f64_e32 vcc_lo, 0, v[0:1]
 ; GCN-SDAG-NEXT:    v_mov_b32_e32 v1, 0x40632000
 ; GCN-SDAG-NEXT:    v_cndmask_b32_e64 v0, 0x33333333, 0, vcc_lo
@@ -276,6 +297,7 @@ define amdgpu_ps <2 x float> @v_cndmask(double %a) {
 ;
 ; GCN-GISEL-LABEL: v_cndmask:
 ; GCN-GISEL:       ; %bb.0:
+; GCN-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-GISEL-NEXT:    v_cmp_eq_f64_e32 vcc_lo, 0, v[0:1]
 ; GCN-GISEL-NEXT:    v_mov_b32_e32 v1, 0x40690333
 ; GCN-GISEL-NEXT:    v_cndmask_b32_e64 v0, 0x33333333, 0, vcc_lo

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.bitop3.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.bitop3.ll
index c985e76422e97..5b32e1d75cfeb 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.bitop3.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.bitop3.ll
@@ -10,20 +10,32 @@ declare i32 @llvm.amdgcn.bitop3.i32(i32, i32, i32, i32)
 declare i16 @llvm.amdgcn.bitop3.i16(i16, i16, i16, i32)
 
 define amdgpu_ps float @bitop3_b32_vvv(i32 %a, i32 %b, i32 %c) {
-; GCN-LABEL: bitop3_b32_vvv:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    v_bitop3_b32 v0, v0, v1, v2 bitop3:0xf
-; GCN-NEXT:    ; return to shader part epilog
+; GFX950-LABEL: bitop3_b32_vvv:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    v_bitop3_b32 v0, v0, v1, v2 bitop3:0xf
+; GFX950-NEXT:    ; return to shader part epilog
+;
+; GFX1250-LABEL: bitop3_b32_vvv:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-NEXT:    v_bitop3_b32 v0, v0, v1, v2 bitop3:0xf
+; GFX1250-NEXT:    ; return to shader part epilog
   %ret = call i32 @llvm.amdgcn.bitop3.i32(i32 %a, i32 %b, i32 %c, i32 15)
   %ret_cast = bitcast i32 %ret to float
   ret float %ret_cast
 }
 
 define amdgpu_ps float @bitop3_b32_svv(i32 inreg %a, i32 %b, i32 %c) {
-; GCN-LABEL: bitop3_b32_svv:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    v_bitop3_b32 v0, s0, v0, v1 bitop3:0x10
-; GCN-NEXT:    ; return to shader part epilog
+; GFX950-LABEL: bitop3_b32_svv:
+; GFX950:       ; %bb.0:
+; GFX950-NEXT:    v_bitop3_b32 v0, s0, v0, v1 bitop3:0x10
+; GFX950-NEXT:    ; return to shader part epilog
+;
+; GFX1250-LABEL: bitop3_b32_svv:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-NEXT:    v_bitop3_b32 v0, s0, v0, v1 bitop3:0x10
+; GFX1250-NEXT:    ; return to shader part epilog
   %ret = call i32 @llvm.amdgcn.bitop3.i32(i32 %a, i32 %b, i32 %c, i32 16)
   %ret_cast = bitcast i32 %ret to float
   ret float %ret_cast
@@ -38,6 +50,7 @@ define amdgpu_ps float @bitop3_b32_ssv(i32 inreg %a, i32 inreg %b, i32 %c) {
 ;
 ; GFX1250-LABEL: bitop3_b32_ssv:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_bitop3_b32 v0, s0, s1, v0 bitop3:0x11
 ; GFX1250-NEXT:    ; return to shader part epilog
   %ret = call i32 @llvm.amdgcn.bitop3.i32(i32 %a, i32 %b, i32 %c, i32 17)
@@ -55,6 +68,7 @@ define amdgpu_ps float @bitop3_b32_sss(i32 inreg %a, i32 inreg %b, i32 inreg %c)
 ;
 ; GFX1250-LABEL: bitop3_b32_sss:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-NEXT:    v_bitop3_b32 v0, s0, s1, v0 bitop3:0x12
@@ -79,6 +93,7 @@ define amdgpu_ps float @bitop3_b32_vvi(i32 %a, i32 %b) {
 ;
 ; GFX1250-LABEL: bitop3_b32_vvi:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_bitop3_b32 v0, v0, v1, 0x3e8 bitop3:0x13
 ; GFX1250-NEXT:    ; return to shader part epilog
   %ret = call i32 @llvm.amdgcn.bitop3.i32(i32 %a, i32 %b, i32 1000, i32 19)
@@ -103,6 +118,7 @@ define amdgpu_ps float @bitop3_b32_vii(i32 %a) {
 ;
 ; GFX1250-SDAG-LABEL: bitop3_b32_vii:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    s_movk_i32 s0, 0x7d0
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1250-SDAG-NEXT:    v_bitop3_b32 v0, v0, s0, 0x3e8 bitop3:0x14
@@ -110,6 +126,7 @@ define amdgpu_ps float @bitop3_b32_vii(i32 %a) {
 ;
 ; GFX1250-GISEL-LABEL: bitop3_b32_vii:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3e8
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-GISEL-NEXT:    v_bitop3_b32 v0, v0, 0x7d0, v1 bitop3:0x14
@@ -140,6 +157,7 @@ define amdgpu_ps float @bitop3_b32_iii() {
 ;
 ; GFX1250-SDAG-LABEL: bitop3_b32_iii:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_mov_b32_e32 v0, 0x3e8
 ; GFX1250-SDAG-NEXT:    s_movk_i32 s0, 0xbb8
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
@@ -148,6 +166,7 @@ define amdgpu_ps float @bitop3_b32_iii() {
 ;
 ; GFX1250-GISEL-LABEL: bitop3_b32_iii:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_mov_b32_e32 v0, 0x7d0
 ; GFX1250-GISEL-NEXT:    v_mov_b32_e32 v1, 0x3e8
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
@@ -166,11 +185,13 @@ define amdgpu_ps half @bitop3_b16_vvv(i16 %a, i16 %b, i16 %c) {
 ;
 ; GFX1250-TRUE16-LABEL: bitop3_b16_vvv:
 ; GFX1250-TRUE16:       ; %bb.0:
+; GFX1250-TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-TRUE16-NEXT:    v_bitop3_b16 v0.l, v0.l, v1.l, v2.l bitop3:0xf
 ; GFX1250-TRUE16-NEXT:    ; return to shader part epilog
 ;
 ; GFX1250-FAKE16-LABEL: bitop3_b16_vvv:
 ; GFX1250-FAKE16:       ; %bb.0:
+; GFX1250-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-FAKE16-NEXT:    v_bitop3_b16 v0, v0, v1, v2 bitop3:0xf
 ; GFX1250-FAKE16-NEXT:    ; return to shader part epilog
   %ret = call i16 @llvm.amdgcn.bitop3.i16(i16 %a, i16 %b, i16 %c, i32 15)
@@ -186,11 +207,13 @@ define amdgpu_ps half @bitop3_b16_svv(i16 inreg %a, i16 %b, i16 %c) {
 ;
 ; GFX1250-TRUE16-LABEL: bitop3_b16_svv:
 ; GFX1250-TRUE16:       ; %bb.0:
+; GFX1250-TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-TRUE16-NEXT:    v_bitop3_b16 v0.l, s0, v0.l, v1.l bitop3:0x10
 ; GFX1250-TRUE16-NEXT:    ; return to shader part epilog
 ;
 ; GFX1250-FAKE16-LABEL: bitop3_b16_svv:
 ; GFX1250-FAKE16:       ; %bb.0:
+; GFX1250-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-FAKE16-NEXT:    v_bitop3_b16 v0, s0, v0, v1 bitop3:0x10
 ; GFX1250-FAKE16-NEXT:    ; return to shader part epilog
   %ret = call i16 @llvm.amdgcn.bitop3.i16(i16 %a, i16 %b, i16 %c, i32 16)
@@ -207,11 +230,13 @@ define amdgpu_ps half @bitop3_b16_ssv(i16 inreg %a, i16 inreg %b, i16 %c) {
 ;
 ; GFX1250-TRUE16-LABEL: bitop3_b16_ssv:
 ; GFX1250-TRUE16:       ; %bb.0:
+; GFX1250-TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-TRUE16-NEXT:    v_bitop3_b16 v0.l, s0, s1, v0.l bitop3:0x11
 ; GFX1250-TRUE16-NEXT:    ; return to shader part epilog
 ;
 ; GFX1250-FAKE16-LABEL: bitop3_b16_ssv:
 ; GFX1250-FAKE16:       ; %bb.0:
+; GFX1250-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-FAKE16-NEXT:    v_bitop3_b16 v0, s0, s1, v0 bitop3:0x11
 ; GFX1250-FAKE16-NEXT:    ; return to shader part epilog
   %ret = call i16 @llvm.amdgcn.bitop3.i16(i16 %a, i16 %b, i16 %c, i32 17)
@@ -229,6 +254,7 @@ define amdgpu_ps half @bitop3_b16_sss(i16 inreg %a, i16 inreg %b, i16 inreg %c)
 ;
 ; GFX1250-TRUE16-LABEL: bitop3_b16_sss:
 ; GFX1250-TRUE16:       ; %bb.0:
+; GFX1250-TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-TRUE16-NEXT:    v_mov_b16_e32 v0.l, s2
 ; GFX1250-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-TRUE16-NEXT:    v_bitop3_b16 v0.l, s0, s1, v0.l bitop3:0x12
@@ -236,6 +262,7 @@ define amdgpu_ps half @bitop3_b16_sss(i16 inreg %a, i16 inreg %b, i16 inreg %c)
 ;
 ; GFX1250-FAKE16-LABEL: bitop3_b16_sss:
 ; GFX1250-FAKE16:       ; %bb.0:
+; GFX1250-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-FAKE16-NEXT:    v_mov_b32_e32 v0, s2
 ; GFX1250-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-FAKE16-NEXT:    v_bitop3_b16 v0, s0, s1, v0 bitop3:0x12
@@ -260,11 +287,13 @@ define amdgpu_ps half @bitop3_b16_vvi(i16 %a, i16 %b) {
 ;
 ; GFX1250-TRUE16-LABEL: bitop3_b16_vvi:
 ; GFX1250-TRUE16:       ; %bb.0:
+; GFX1250-TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-TRUE16-NEXT:    v_bitop3_b16 v0.l, v0.l, v1.l, 0x3e8 bitop3:0x13
 ; GFX1250-TRUE16-NEXT:    ; return to shader part epilog
 ;
 ; GFX1250-FAKE16-LABEL: bitop3_b16_vvi:
 ; GFX1250-FAKE16:       ; %bb.0:
+; GFX1250-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-FAKE16-NEXT:    v_bitop3_b16 v0, v0, v1, 0x3e8 bitop3:0x13
 ; GFX1250-FAKE16-NEXT:    ; return to shader part epilog
   %ret = call i16 @llvm.amdgcn.bitop3.i16(i16 %a, i16 %b, i16 1000, i32 19)
@@ -289,6 +318,7 @@ define amdgpu_ps half @bitop3_b16_vii(i16 %a) {
 ;
 ; GFX1250-SDG-TRUE16-LABEL: bitop3_b16_vii:
 ; GFX1250-SDG-TRUE16:       ; %bb.0:
+; GFX1250-SDG-TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDG-TRUE16-NEXT:    v_mov_b16_e32 v1.l, 0x7d0
 ; GFX1250-SDG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-SDG-TRUE16-NEXT:    v_bitop3_b16 v0.l, v0.l, v1.l, 0x3e8 bitop3:0x14
@@ -296,6 +326,7 @@ define amdgpu_ps half @bitop3_b16_vii(i16 %a) {
 ;
 ; GFX1250-SDG-FAKE16-LABEL: bitop3_b16_vii:
 ; GFX1250-SDG-FAKE16:       ; %bb.0:
+; GFX1250-SDG-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDG-FAKE16-NEXT:    s_movk_i32 s0, 0x7d0
 ; GFX1250-SDG-FAKE16-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1250-SDG-FAKE16-NEXT:    v_bitop3_b16 v0, v0, s0, 0x3e8 bitop3:0x14
@@ -303,6 +334,7 @@ define amdgpu_ps half @bitop3_b16_vii(i16 %a) {
 ;
 ; GFX1250-GISEL-TRUE16-LABEL: bitop3_b16_vii:
 ; GFX1250-GISEL-TRUE16:       ; %bb.0:
+; GFX1250-GISEL-TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-TRUE16-NEXT:    v_mov_b16_e32 v0.h, 0x3e8
 ; GFX1250-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-GISEL-TRUE16-NEXT:    v_bitop3_b16 v0.l, v0.l, 0x7d0, v0.h bitop3:0x14
@@ -310,6 +342,7 @@ define amdgpu_ps half @bitop3_b16_vii(i16 %a) {
 ;
 ; GFX1250-GISEL-FAKE16-LABEL: bitop3_b16_vii:
 ; GFX1250-GISEL-FAKE16:       ; %bb.0:
+; GFX1250-GISEL-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-FAKE16-NEXT:    v_mov_b32_e32 v1, 0x3e8
 ; GFX1250-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-GISEL-FAKE16-NEXT:    v_bitop3_b16 v0, v0, 0x7d0, v1 bitop3:0x14
@@ -339,6 +372,7 @@ define amdgpu_ps half @bitop3_b16_iii() {
 ;
 ; GFX1250-SDG-TRUE16-LABEL: bitop3_b16_iii:
 ; GFX1250-SDG-TRUE16:       ; %bb.0:
+; GFX1250-SDG-TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDG-TRUE16-NEXT:    v_mov_b16_e32 v0.l, 0x7d0
 ; GFX1250-SDG-TRUE16-NEXT:    v_mov_b16_e32 v0.h, 0xbb8
 ; GFX1250-SDG-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
@@ -347,6 +381,7 @@ define amdgpu_ps half @bitop3_b16_iii() {
 ;
 ; GFX1250-SDG-FAKE16-LABEL: bitop3_b16_iii:
 ; GFX1250-SDG-FAKE16:       ; %bb.0:
+; GFX1250-SDG-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDG-FAKE16-NEXT:    v_mov_b32_e32 v0, 0x3e8
 ; GFX1250-SDG-FAKE16-NEXT:    s_movk_i32 s0, 0xbb8
 ; GFX1250-SDG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
@@ -355,6 +390,7 @@ define amdgpu_ps half @bitop3_b16_iii() {
 ;
 ; GFX1250-GISEL-TRUE16-LABEL: bitop3_b16_iii:
 ; GFX1250-GISEL-TRUE16:       ; %bb.0:
+; GFX1250-GISEL-TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-TRUE16-NEXT:    v_mov_b16_e32 v0.l, 0x7d0
 ; GFX1250-GISEL-TRUE16-NEXT:    v_mov_b16_e32 v0.h, 0x3e8
 ; GFX1250-GISEL-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
@@ -363,6 +399,7 @@ define amdgpu_ps half @bitop3_b16_iii() {
 ;
 ; GFX1250-GISEL-FAKE16-LABEL: bitop3_b16_iii:
 ; GFX1250-GISEL-FAKE16:       ; %bb.0:
+; GFX1250-GISEL-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-FAKE16-NEXT:    v_mov_b32_e32 v0, 0x7d0
 ; GFX1250-GISEL-FAKE16-NEXT:    v_mov_b32_e32 v1, 0x3e8
 ; GFX1250-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
@@ -372,3 +409,5 @@ define amdgpu_ps half @bitop3_b16_iii() {
   %ret_cast = bitcast i16 %ret to half
   ret half %ret_cast
 }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GCN: {{.*}}

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cluster.id.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cluster.id.ll
index fa97380583798..aa762c8275fdc 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cluster.id.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cluster.id.ll
@@ -11,6 +11,7 @@ declare i32 @llvm.amdgcn.cluster.id.z() #0
 define amdgpu_kernel void @test_cluster_id_x(ptr addrspace(1) %out) {
 ; CHECK-UNKNOWN-LABEL: test_cluster_id_x:
 ; CHECK-UNKNOWN:       ; %bb.0:
+; CHECK-UNKNOWN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; CHECK-UNKNOWN-NEXT:    s_load_b64 s[2:3], s[0:1], 0x24
 ; CHECK-UNKNOWN-NEXT:    v_dual_mov_b32 v0, ttmp9 :: v_dual_mov_b32 v1, 0
 ; CHECK-UNKNOWN-NEXT:    s_wait_kmcnt 0x0
@@ -87,6 +88,7 @@ define amdgpu_kernel void @test_cluster_id_x(ptr addrspace(1) %out) {
 ; CHECK-MESA3D-NEXT:     runtime_loader_kernel_symbol = 0
 ; CHECK-MESA3D-NEXT:    .end_amd_kernel_code_t
 ; CHECK-MESA3D-NEXT:  ; %bb.0:
+; CHECK-MESA3D-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; CHECK-MESA3D-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
 ; CHECK-MESA3D-NEXT:    v_dual_mov_b32 v0, ttmp9 :: v_dual_mov_b32 v1, 0
 ; CHECK-MESA3D-NEXT:    s_wait_kmcnt 0x0
@@ -95,6 +97,7 @@ define amdgpu_kernel void @test_cluster_id_x(ptr addrspace(1) %out) {
 ;
 ; CHECK-G-UNKNOWN-LABEL: test_cluster_id_x:
 ; CHECK-G-UNKNOWN:       ; %bb.0:
+; CHECK-G-UNKNOWN-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; CHECK-G-UNKNOWN-NEXT:    s_load_b64 s[2:3], s[0:1], 0x24
 ; CHECK-G-UNKNOWN-NEXT:    v_dual_mov_b32 v0, ttmp9 :: v_dual_mov_b32 v1, 0
 ; CHECK-G-UNKNOWN-NEXT:    s_wait_kmcnt 0x0
@@ -171,6 +174,7 @@ define amdgpu_kernel void @test_cluster_id_x(ptr addrspace(1) %out) {
 ; CHECK-G-MESA3D-NEXT:     runtime_loader_kernel_symbol = 0
 ; CHECK-G-MESA3D-NEXT:    .end_amd_kernel_code_t
 ; CHECK-G-MESA3D-NEXT:  ; %bb.0:
+; CHECK-G-MESA3D-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; CHECK-G-MESA3D-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
 ; CHECK-G-MESA3D-NEXT:    v_dual_mov_b32 v0, ttmp9 :: v_dual_mov_b32 v1, 0
 ; CHECK-G-MESA3D-NEXT:    s_wait_kmcnt 0x0
@@ -184,6 +188,7 @@ define amdgpu_kernel void @test_cluster_id_x(ptr addrspace(1) %out) {
 define amdgpu_kernel void @test_cluster_id_y(ptr addrspace(1) %out) #1 {
 ; CHECK-UNKNOWN-LABEL: test_cluster_id_y:
 ; CHECK-UNKNOWN:       ; %bb.0:
+; CHECK-UNKNOWN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; CHECK-UNKNOWN-NEXT:    s_load_b64 s[2:3], s[0:1], 0x24
 ; CHECK-UNKNOWN-NEXT:    v_dual_mov_b32 v0, ttmp7 :: v_dual_mov_b32 v1, 0
 ; CHECK-UNKNOWN-NEXT:    s_wait_kmcnt 0x0
@@ -260,6 +265,7 @@ define amdgpu_kernel void @test_cluster_id_y(ptr addrspace(1) %out) #1 {
 ; CHECK-MESA3D-NEXT:     runtime_loader_kernel_symbol = 0
 ; CHECK-MESA3D-NEXT:    .end_amd_kernel_code_t
 ; CHECK-MESA3D-NEXT:  ; %bb.0:
+; CHECK-MESA3D-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; CHECK-MESA3D-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
 ; CHECK-MESA3D-NEXT:    v_dual_mov_b32 v0, ttmp7 :: v_dual_mov_b32 v1, 0
 ; CHECK-MESA3D-NEXT:    s_wait_kmcnt 0x0
@@ -268,6 +274,7 @@ define amdgpu_kernel void @test_cluster_id_y(ptr addrspace(1) %out) #1 {
 ;
 ; CHECK-G-UNKNOWN-LABEL: test_cluster_id_y:
 ; CHECK-G-UNKNOWN:       ; %bb.0:
+; CHECK-G-UNKNOWN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; CHECK-G-UNKNOWN-NEXT:    s_load_b64 s[2:3], s[0:1], 0x24
 ; CHECK-G-UNKNOWN-NEXT:    v_dual_mov_b32 v0, ttmp7 :: v_dual_mov_b32 v1, 0
 ; CHECK-G-UNKNOWN-NEXT:    s_wait_kmcnt 0x0
@@ -344,6 +351,7 @@ define amdgpu_kernel void @test_cluster_id_y(ptr addrspace(1) %out) #1 {
 ; CHECK-G-MESA3D-NEXT:     runtime_loader_kernel_symbol = 0
 ; CHECK-G-MESA3D-NEXT:    .end_amd_kernel_code_t
 ; CHECK-G-MESA3D-NEXT:  ; %bb.0:
+; CHECK-G-MESA3D-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; CHECK-G-MESA3D-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
 ; CHECK-G-MESA3D-NEXT:    v_dual_mov_b32 v0, ttmp7 :: v_dual_mov_b32 v1, 0
 ; CHECK-G-MESA3D-NEXT:    s_wait_kmcnt 0x0
@@ -357,6 +365,7 @@ define amdgpu_kernel void @test_cluster_id_y(ptr addrspace(1) %out) #1 {
 define amdgpu_kernel void @test_cluster_id_z(ptr addrspace(1) %out) #1 {
 ; CHECK-UNKNOWN-LABEL: test_cluster_id_z:
 ; CHECK-UNKNOWN:       ; %bb.0:
+; CHECK-UNKNOWN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; CHECK-UNKNOWN-NEXT:    s_load_b64 s[2:3], s[0:1], 0x24
 ; CHECK-UNKNOWN-NEXT:    s_wait_xcnt 0x0
 ; CHECK-UNKNOWN-NEXT:    s_lshr_b32 s0, ttmp7, 16
@@ -436,6 +445,7 @@ define amdgpu_kernel void @test_cluster_id_z(ptr addrspace(1) %out) #1 {
 ; CHECK-MESA3D-NEXT:     runtime_loader_kernel_symbol = 0
 ; CHECK-MESA3D-NEXT:    .end_amd_kernel_code_t
 ; CHECK-MESA3D-NEXT:  ; %bb.0:
+; CHECK-MESA3D-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; CHECK-MESA3D-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
 ; CHECK-MESA3D-NEXT:    s_lshr_b32 s2, ttmp7, 16
 ; CHECK-MESA3D-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
@@ -446,6 +456,7 @@ define amdgpu_kernel void @test_cluster_id_z(ptr addrspace(1) %out) #1 {
 ;
 ; CHECK-G-UNKNOWN-LABEL: test_cluster_id_z:
 ; CHECK-G-UNKNOWN:       ; %bb.0:
+; CHECK-G-UNKNOWN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; CHECK-G-UNKNOWN-NEXT:    s_load_b64 s[2:3], s[0:1], 0x24
 ; CHECK-G-UNKNOWN-NEXT:    s_wait_xcnt 0x0
 ; CHECK-G-UNKNOWN-NEXT:    s_lshr_b32 s0, ttmp7, 16
@@ -525,6 +536,7 @@ define amdgpu_kernel void @test_cluster_id_z(ptr addrspace(1) %out) #1 {
 ; CHECK-G-MESA3D-NEXT:     runtime_loader_kernel_symbol = 0
 ; CHECK-G-MESA3D-NEXT:    .end_amd_kernel_code_t
 ; CHECK-G-MESA3D-NEXT:  ; %bb.0:
+; CHECK-G-MESA3D-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; CHECK-G-MESA3D-NEXT:    s_load_b64 s[0:1], s[0:1], 0x0
 ; CHECK-G-MESA3D-NEXT:    s_lshr_b32 s2, ttmp7, 16
 ; CHECK-G-MESA3D-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cluster.load.async.to.lds.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cluster.load.async.to.lds.ll
index 3019c1d897d98..7f4d35916668d 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cluster.load.async.to.lds.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cluster.load.async.to.lds.ll
@@ -10,6 +10,7 @@ declare void @llvm.amdgcn.cluster.load.async.to.lds.b128(ptr addrspace(1) %gaddr
 define amdgpu_ps void @cluster_load_async_to_lds_b8_vaddr(ptr addrspace(1) %gaddr, ptr addrspace(3) %laddr, i32 %mask) {
 ; GFX1250-SDAG-LABEL: cluster_load_async_to_lds_b8_vaddr:
 ; GFX1250-SDAG:       ; %bb.0: ; %entry
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[0:1], 32, v[0:1]
 ; GFX1250-SDAG-NEXT:    v_readfirstlane_b32 s0, v3
 ; GFX1250-SDAG-NEXT:    s_mov_b32 m0, s0
@@ -18,6 +19,7 @@ define amdgpu_ps void @cluster_load_async_to_lds_b8_vaddr(ptr addrspace(1) %gadd
 ;
 ; GFX1250-GISEL-LABEL: cluster_load_async_to_lds_b8_vaddr:
 ; GFX1250-GISEL:       ; %bb.0: ; %entry
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_readfirstlane_b32 s0, v3
 ; GFX1250-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, 32
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
@@ -34,6 +36,7 @@ entry:
 define amdgpu_ps void @cluster_load_async_to_lds_b8_vaddr_imm_mask(ptr addrspace(1) %gaddr, ptr addrspace(3) %laddr) {
 ; GFX1250-SDAG-LABEL: cluster_load_async_to_lds_b8_vaddr_imm_mask:
 ; GFX1250-SDAG:       ; %bb.0: ; %entry
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[0:1], 32, v[0:1]
 ; GFX1250-SDAG-NEXT:    s_mov_b32 m0, 15
 ; GFX1250-SDAG-NEXT:    cluster_load_async_to_lds_b8 v2, v[0:1], off offset:16
@@ -41,6 +44,7 @@ define amdgpu_ps void @cluster_load_async_to_lds_b8_vaddr_imm_mask(ptr addrspace
 ;
 ; GFX1250-GISEL-LABEL: cluster_load_async_to_lds_b8_vaddr_imm_mask:
 ; GFX1250-GISEL:       ; %bb.0: ; %entry
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, 32
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-GISEL-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
@@ -56,6 +60,7 @@ entry:
 define amdgpu_ps void @cluster_load_async_to_lds_b8_saddr(ptr addrspace(1) inreg %gaddr, ptr addrspace(3) %laddr, i32 inreg %mask) {
 ; GFX1250-LABEL: cluster_load_async_to_lds_b8_saddr:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v1, 32
 ; GFX1250-NEXT:    s_mov_b32 m0, s2
 ; GFX1250-NEXT:    cluster_load_async_to_lds_b8 v0, v1, s[0:1] offset:16
@@ -69,6 +74,7 @@ entry:
 define amdgpu_ps void @cluster_load_async_to_lds_b32_vaddr(ptr addrspace(1) %gaddr, ptr addrspace(3) %laddr, i32 %mask) {
 ; GFX1250-SDAG-LABEL: cluster_load_async_to_lds_b32_vaddr:
 ; GFX1250-SDAG:       ; %bb.0: ; %entry
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[0:1], 32, v[0:1]
 ; GFX1250-SDAG-NEXT:    v_readfirstlane_b32 s0, v3
 ; GFX1250-SDAG-NEXT:    s_mov_b32 m0, s0
@@ -77,6 +83,7 @@ define amdgpu_ps void @cluster_load_async_to_lds_b32_vaddr(ptr addrspace(1) %gad
 ;
 ; GFX1250-GISEL-LABEL: cluster_load_async_to_lds_b32_vaddr:
 ; GFX1250-GISEL:       ; %bb.0: ; %entry
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_readfirstlane_b32 s0, v3
 ; GFX1250-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, 32
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
@@ -93,6 +100,7 @@ entry:
 define amdgpu_ps void @cluster_load_async_to_lds_b32_vaddr_imm_mask(ptr addrspace(1) %gaddr, ptr addrspace(3) %laddr) {
 ; GFX1250-SDAG-LABEL: cluster_load_async_to_lds_b32_vaddr_imm_mask:
 ; GFX1250-SDAG:       ; %bb.0: ; %entry
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[0:1], 32, v[0:1]
 ; GFX1250-SDAG-NEXT:    s_mov_b32 m0, 15
 ; GFX1250-SDAG-NEXT:    cluster_load_async_to_lds_b32 v2, v[0:1], off offset:16
@@ -100,6 +108,7 @@ define amdgpu_ps void @cluster_load_async_to_lds_b32_vaddr_imm_mask(ptr addrspac
 ;
 ; GFX1250-GISEL-LABEL: cluster_load_async_to_lds_b32_vaddr_imm_mask:
 ; GFX1250-GISEL:       ; %bb.0: ; %entry
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, 32
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-GISEL-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
@@ -115,6 +124,7 @@ entry:
 define amdgpu_ps void @cluster_load_async_to_lds_b32_saddr( ptr addrspace(1) inreg %gaddr, ptr addrspace(3) %laddr, i32 inreg %mask) {
 ; GFX1250-LABEL: cluster_load_async_to_lds_b32_saddr:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v1, 32
 ; GFX1250-NEXT:    s_mov_b32 m0, s2
 ; GFX1250-NEXT:    cluster_load_async_to_lds_b32 v0, v1, s[0:1] offset:16
@@ -128,6 +138,7 @@ entry:
 define amdgpu_ps void @cluster_load_async_to_lds_b64_vaddr(ptr addrspace(1) %gaddr, ptr addrspace(3) %laddr, i32 %mask) {
 ; GFX1250-SDAG-LABEL: cluster_load_async_to_lds_b64_vaddr:
 ; GFX1250-SDAG:       ; %bb.0: ; %entry
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[0:1], 32, v[0:1]
 ; GFX1250-SDAG-NEXT:    v_readfirstlane_b32 s0, v3
 ; GFX1250-SDAG-NEXT:    s_mov_b32 m0, s0
@@ -136,6 +147,7 @@ define amdgpu_ps void @cluster_load_async_to_lds_b64_vaddr(ptr addrspace(1) %gad
 ;
 ; GFX1250-GISEL-LABEL: cluster_load_async_to_lds_b64_vaddr:
 ; GFX1250-GISEL:       ; %bb.0: ; %entry
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_readfirstlane_b32 s0, v3
 ; GFX1250-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, 32
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
@@ -152,6 +164,7 @@ entry:
 define amdgpu_ps void @cluster_load_async_to_lds_b64_vaddr_imm_mask( ptr addrspace(1) %gaddr, ptr addrspace(3) %laddr) {
 ; GFX1250-SDAG-LABEL: cluster_load_async_to_lds_b64_vaddr_imm_mask:
 ; GFX1250-SDAG:       ; %bb.0: ; %entry
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[0:1], 32, v[0:1]
 ; GFX1250-SDAG-NEXT:    s_movk_i32 m0, 0x7f
 ; GFX1250-SDAG-NEXT:    cluster_load_async_to_lds_b64 v2, v[0:1], off offset:16
@@ -159,6 +172,7 @@ define amdgpu_ps void @cluster_load_async_to_lds_b64_vaddr_imm_mask( ptr addrspa
 ;
 ; GFX1250-GISEL-LABEL: cluster_load_async_to_lds_b64_vaddr_imm_mask:
 ; GFX1250-GISEL:       ; %bb.0: ; %entry
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, 32
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-GISEL-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
@@ -174,6 +188,7 @@ entry:
 define amdgpu_ps void @cluster_load_async_to_lds_b64_saddr(ptr addrspace(1) inreg %gaddr, ptr addrspace(3) %laddr, i32 inreg %mask) {
 ; GFX1250-LABEL: cluster_load_async_to_lds_b64_saddr:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v1, 32
 ; GFX1250-NEXT:    s_mov_b32 m0, s2
 ; GFX1250-NEXT:    cluster_load_async_to_lds_b64 v0, v1, s[0:1] offset:16
@@ -187,6 +202,7 @@ entry:
 define amdgpu_ps void @cluster_load_async_to_lds_b128_vaddr(ptr addrspace(1) %gaddr, ptr addrspace(3) %laddr, i32 %mask) {
 ; GFX1250-SDAG-LABEL: cluster_load_async_to_lds_b128_vaddr:
 ; GFX1250-SDAG:       ; %bb.0: ; %entry
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[0:1], 32, v[0:1]
 ; GFX1250-SDAG-NEXT:    v_readfirstlane_b32 s0, v3
 ; GFX1250-SDAG-NEXT:    s_mov_b32 m0, s0
@@ -195,6 +211,7 @@ define amdgpu_ps void @cluster_load_async_to_lds_b128_vaddr(ptr addrspace(1) %ga
 ;
 ; GFX1250-GISEL-LABEL: cluster_load_async_to_lds_b128_vaddr:
 ; GFX1250-GISEL:       ; %bb.0: ; %entry
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_readfirstlane_b32 s0, v3
 ; GFX1250-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, 32
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
@@ -211,6 +228,7 @@ entry:
 define amdgpu_ps void @cluster_load_async_to_lds_b128_vaddr_imm_mask(ptr addrspace(1) %gaddr, ptr addrspace(3) %laddr) {
 ; GFX1250-SDAG-LABEL: cluster_load_async_to_lds_b128_vaddr_imm_mask:
 ; GFX1250-SDAG:       ; %bb.0: ; %entry
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[0:1], 32, v[0:1]
 ; GFX1250-SDAG-NEXT:    s_movk_i32 m0, 0x7f
 ; GFX1250-SDAG-NEXT:    cluster_load_async_to_lds_b128 v2, v[0:1], off offset:16
@@ -218,6 +236,7 @@ define amdgpu_ps void @cluster_load_async_to_lds_b128_vaddr_imm_mask(ptr addrspa
 ;
 ; GFX1250-GISEL-LABEL: cluster_load_async_to_lds_b128_vaddr_imm_mask:
 ; GFX1250-GISEL:       ; %bb.0: ; %entry
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, 32
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-GISEL-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
@@ -233,6 +252,7 @@ entry:
 define amdgpu_ps void @cluster_load_async_to_lds_b128_saddr(ptr addrspace(1) inreg %gaddr, ptr addrspace(3) %laddr, i32 inreg %mask) {
 ; GFX1250-LABEL: cluster_load_async_to_lds_b128_saddr:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v1, 32
 ; GFX1250-NEXT:    s_mov_b32 m0, s2
 ; GFX1250-NEXT:    cluster_load_async_to_lds_b128 v0, v1, s[0:1] offset:16
@@ -246,6 +266,7 @@ entry:
 define amdgpu_ps void @cluster_load_async_to_lds_b32_saddr_scale_offset(ptr addrspace(1) inreg %gaddr, ptr addrspace(3) %laddr, i32 inreg %mask, i32 %idx) {
 ; GFX1250-LABEL: cluster_load_async_to_lds_b32_saddr_scale_offset:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_mov_b32 m0, s2
 ; GFX1250-NEXT:    cluster_load_async_to_lds_b32 v0, v1, s[0:1] offset:16 scale_offset th:TH_LOAD_NT
 ; GFX1250-NEXT:    s_endpgm
@@ -259,6 +280,7 @@ entry:
 define amdgpu_ps void @cluster_load_async_to_lds_b64_saddr_scale_offset(ptr addrspace(1) inreg %gaddr, ptr addrspace(3) %laddr, i32 inreg %mask, i32 %idx) {
 ; GFX1250-LABEL: cluster_load_async_to_lds_b64_saddr_scale_offset:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_mov_b32 m0, s2
 ; GFX1250-NEXT:    cluster_load_async_to_lds_b64 v0, v1, s[0:1] offset:16 scale_offset th:TH_LOAD_NT
 ; GFX1250-NEXT:    s_endpgm
@@ -272,6 +294,7 @@ entry:
 define amdgpu_ps void @cluster_load_async_to_lds_b64_saddr_no_scale_offset(ptr addrspace(1) inreg %gaddr, ptr addrspace(3) %laddr, i32 inreg %mask, i32 %idx) {
 ; GFX1250-LABEL: cluster_load_async_to_lds_b64_saddr_no_scale_offset:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX1250-NEXT:    s_mov_b32 m0, s2
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cluster.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cluster.load.ll
index 7746dc60ddfc3..27efce3d35089 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cluster.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cluster.load.ll
@@ -9,6 +9,7 @@ declare <4 x i32> @llvm.amdgcn.cluster.load.b128.v4i32.p1(ptr addrspace(1), i32
 define amdgpu_ps void @cluster_load_b32_vaddr(ptr addrspace(1) %addr, ptr addrspace(1) %use, i32 %mask) {
 ; GFX1250-LABEL: cluster_load_b32_vaddr:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_readfirstlane_b32 s0, v4
 ; GFX1250-NEXT:    s_mov_b32 m0, s0
 ; GFX1250-NEXT:    cluster_load_b32 v0, v[0:1], off offset:32 th:TH_LOAD_NT
@@ -25,6 +26,7 @@ entry:
 define amdgpu_ps void @cluster_load_b32_vaddr_imm_mask(ptr addrspace(1) %addr, ptr addrspace(1) %use) {
 ; GFX1250-LABEL: cluster_load_b32_vaddr_imm_mask:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_mov_b32 m0, 7
 ; GFX1250-NEXT:    cluster_load_b32 v0, v[0:1], off offset:32 th:TH_LOAD_HT scope:SCOPE_SE
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
@@ -40,6 +42,7 @@ entry:
 define amdgpu_ps void @cluster_load_b32_saddr(ptr addrspace(1) inreg %addr, ptr addrspace(1) %use, i32 inreg %mask) {
 ; GFX1250-LABEL: cluster_load_b32_saddr:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1250-NEXT:    s_mov_b32 m0, s2
 ; GFX1250-NEXT:    cluster_load_b32 v2, v2, s[0:1] offset:32 th:TH_LOAD_NT_HT scope:SCOPE_DEV
@@ -56,6 +59,7 @@ entry:
 define amdgpu_ps void @cluster_load_monitor_b32_saddr_scale_offset(ptr addrspace(1) inreg %addr, ptr addrspace(1) %use, i32 inreg %mask, i32 %idx) {
 ; GFX1250-LABEL: cluster_load_monitor_b32_saddr_scale_offset:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_mov_b32 m0, s2
 ; GFX1250-NEXT:    cluster_load_b32 v2, v2, s[0:1] scale_offset th:TH_LOAD_BYPASS scope:SCOPE_SYS
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
@@ -72,6 +76,7 @@ entry:
 define amdgpu_ps void @cluster_load_b64_vaddr(ptr addrspace(1) %addr, ptr addrspace(1) %use, i32 %mask) {
 ; GFX1250-LABEL: cluster_load_b64_vaddr:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_readfirstlane_b32 s0, v4
 ; GFX1250-NEXT:    s_mov_b32 m0, s0
 ; GFX1250-NEXT:    cluster_load_b64 v[0:1], v[0:1], off offset:32 th:TH_LOAD_NT
@@ -88,6 +93,7 @@ entry:
 define amdgpu_ps void @cluster_load_b64_vaddr_imm_mask(ptr addrspace(1) %addr, ptr addrspace(1) %use) {
 ; GFX1250-LABEL: cluster_load_b64_vaddr_imm_mask:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_mov_b32 m0, 0x10007
 ; GFX1250-NEXT:    cluster_load_b64 v[0:1], v[0:1], off offset:32 th:TH_LOAD_HT scope:SCOPE_SE
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
@@ -103,6 +109,7 @@ entry:
 define amdgpu_ps void @cluster_load_b64_saddr(ptr addrspace(1) inreg %addr, ptr addrspace(1) %use, i32 inreg %mask) {
 ; GFX1250-LABEL: cluster_load_b64_saddr:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1250-NEXT:    s_mov_b32 m0, s2
 ; GFX1250-NEXT:    cluster_load_b64 v[2:3], v2, s[0:1] offset:32 th:TH_LOAD_NT_HT scope:SCOPE_DEV
@@ -119,6 +126,7 @@ entry:
 define amdgpu_ps void @cluster_load_monitor_b64_saddr_scale_offset(ptr addrspace(1) inreg %addr, ptr addrspace(1) %use, i32 inreg %mask, i32 %idx) {
 ; GFX1250-LABEL: cluster_load_monitor_b64_saddr_scale_offset:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_mov_b32 m0, s2
 ; GFX1250-NEXT:    cluster_load_b64 v[2:3], v2, s[0:1] scale_offset th:TH_LOAD_BYPASS scope:SCOPE_SYS
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
@@ -135,6 +143,7 @@ entry:
 define amdgpu_ps void @cluster_load_b128_vaddr(ptr addrspace(1) %addr, ptr addrspace(1) %use, i32 %mask) {
 ; GFX1250-LABEL: cluster_load_b128_vaddr:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_readfirstlane_b32 s0, v4
 ; GFX1250-NEXT:    s_mov_b32 m0, s0
 ; GFX1250-NEXT:    cluster_load_b128 v[4:7], v[0:1], off offset:32 th:TH_LOAD_NT
@@ -151,6 +160,7 @@ entry:
 define amdgpu_ps void @cluster_load_b128_vaddr_imm_mask(ptr addrspace(1) %addr, ptr addrspace(1) %use) {
 ; GFX1250-LABEL: cluster_load_b128_vaddr_imm_mask:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_mov_b32 m0, 15
 ; GFX1250-NEXT:    cluster_load_b128 v[4:7], v[0:1], off offset:32 th:TH_LOAD_HT scope:SCOPE_SE
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
@@ -166,6 +176,7 @@ entry:
 define amdgpu_ps void @cluster_load_b128_saddr(ptr addrspace(1) inreg %addr, ptr addrspace(1) %use, i32 inreg %mask) {
 ; GFX1250-LABEL: cluster_load_b128_saddr:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1250-NEXT:    s_mov_b32 m0, s2
 ; GFX1250-NEXT:    cluster_load_b128 v[2:5], v2, s[0:1] offset:32 th:TH_LOAD_BYPASS scope:SCOPE_SYS

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cluster.workgroup.id.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cluster.workgroup.id.ll
index 3ef84a3943d14..b16fa1e5f5831 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cluster.workgroup.id.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cluster.workgroup.id.ll
@@ -11,6 +11,7 @@ declare i32 @llvm.amdgcn.cluster.workgroup.id.z() #0
 define amdgpu_kernel void @test_workgroup_id_x(ptr addrspace(1) %out) #1 {
 ; CHECK-UNKNOWN-LABEL: test_workgroup_id_x:
 ; CHECK-UNKNOWN:       ; %bb.0:
+; CHECK-UNKNOWN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; CHECK-UNKNOWN-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; CHECK-UNKNOWN-NEXT:    s_and_b32 s2, ttmp6, 15
 ; CHECK-UNKNOWN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
@@ -89,6 +90,7 @@ define amdgpu_kernel void @test_workgroup_id_x(ptr addrspace(1) %out) #1 {
 ; CHECK-MESA3D-NEXT:     runtime_loader_kernel_symbol = 0
 ; CHECK-MESA3D-NEXT:    .end_amd_kernel_code_t
 ; CHECK-MESA3D-NEXT:  ; %bb.0:
+; CHECK-MESA3D-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; CHECK-MESA3D-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; CHECK-MESA3D-NEXT:    s_and_b32 s2, ttmp6, 15
 ; CHECK-MESA3D-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
@@ -99,6 +101,7 @@ define amdgpu_kernel void @test_workgroup_id_x(ptr addrspace(1) %out) #1 {
 ;
 ; CHECK-G-UNKNOWN-LABEL: test_workgroup_id_x:
 ; CHECK-G-UNKNOWN:       ; %bb.0:
+; CHECK-G-UNKNOWN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; CHECK-G-UNKNOWN-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; CHECK-G-UNKNOWN-NEXT:    s_and_b32 s2, ttmp6, 15
 ; CHECK-G-UNKNOWN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
@@ -177,6 +180,7 @@ define amdgpu_kernel void @test_workgroup_id_x(ptr addrspace(1) %out) #1 {
 ; CHECK-G-MESA3D-NEXT:     runtime_loader_kernel_symbol = 0
 ; CHECK-G-MESA3D-NEXT:    .end_amd_kernel_code_t
 ; CHECK-G-MESA3D-NEXT:  ; %bb.0:
+; CHECK-G-MESA3D-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; CHECK-G-MESA3D-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; CHECK-G-MESA3D-NEXT:    s_and_b32 s2, ttmp6, 15
 ; CHECK-G-MESA3D-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
@@ -192,6 +196,7 @@ define amdgpu_kernel void @test_workgroup_id_x(ptr addrspace(1) %out) #1 {
 define amdgpu_kernel void @test_workgroup_id_x_optimized(ptr addrspace(1) %out) "amdgpu-cluster-dims"="1,2,2" {
 ; CHECK-UNKNOWN-LABEL: test_workgroup_id_x_optimized:
 ; CHECK-UNKNOWN:       ; %bb.0:
+; CHECK-UNKNOWN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; CHECK-UNKNOWN-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; CHECK-UNKNOWN-NEXT:    v_mov_b32_e32 v0, 0
 ; CHECK-UNKNOWN-NEXT:    s_wait_kmcnt 0x0
@@ -268,6 +273,7 @@ define amdgpu_kernel void @test_workgroup_id_x_optimized(ptr addrspace(1) %out)
 ; CHECK-MESA3D-NEXT:     runtime_loader_kernel_symbol = 0
 ; CHECK-MESA3D-NEXT:    .end_amd_kernel_code_t
 ; CHECK-MESA3D-NEXT:  ; %bb.0:
+; CHECK-MESA3D-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; CHECK-MESA3D-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; CHECK-MESA3D-NEXT:    v_mov_b32_e32 v0, 0
 ; CHECK-MESA3D-NEXT:    s_wait_kmcnt 0x0
@@ -276,6 +282,7 @@ define amdgpu_kernel void @test_workgroup_id_x_optimized(ptr addrspace(1) %out)
 ;
 ; CHECK-G-UNKNOWN-LABEL: test_workgroup_id_x_optimized:
 ; CHECK-G-UNKNOWN:       ; %bb.0:
+; CHECK-G-UNKNOWN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; CHECK-G-UNKNOWN-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; CHECK-G-UNKNOWN-NEXT:    v_mov_b32_e32 v0, 0
 ; CHECK-G-UNKNOWN-NEXT:    s_wait_kmcnt 0x0
@@ -352,6 +359,7 @@ define amdgpu_kernel void @test_workgroup_id_x_optimized(ptr addrspace(1) %out)
 ; CHECK-G-MESA3D-NEXT:     runtime_loader_kernel_symbol = 0
 ; CHECK-G-MESA3D-NEXT:    .end_amd_kernel_code_t
 ; CHECK-G-MESA3D-NEXT:  ; %bb.0:
+; CHECK-G-MESA3D-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; CHECK-G-MESA3D-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; CHECK-G-MESA3D-NEXT:    v_mov_b32_e32 v0, 0
 ; CHECK-G-MESA3D-NEXT:    s_wait_kmcnt 0x0
@@ -365,6 +373,7 @@ define amdgpu_kernel void @test_workgroup_id_x_optimized(ptr addrspace(1) %out)
 define amdgpu_kernel void @test_workgroup_id_y(ptr addrspace(1) %out) #1 {
 ; CHECK-UNKNOWN-LABEL: test_workgroup_id_y:
 ; CHECK-UNKNOWN:       ; %bb.0:
+; CHECK-UNKNOWN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; CHECK-UNKNOWN-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; CHECK-UNKNOWN-NEXT:    s_bfe_u32 s2, ttmp6, 0x40004
 ; CHECK-UNKNOWN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
@@ -443,6 +452,7 @@ define amdgpu_kernel void @test_workgroup_id_y(ptr addrspace(1) %out) #1 {
 ; CHECK-MESA3D-NEXT:     runtime_loader_kernel_symbol = 0
 ; CHECK-MESA3D-NEXT:    .end_amd_kernel_code_t
 ; CHECK-MESA3D-NEXT:  ; %bb.0:
+; CHECK-MESA3D-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; CHECK-MESA3D-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; CHECK-MESA3D-NEXT:    s_bfe_u32 s2, ttmp6, 0x40004
 ; CHECK-MESA3D-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
@@ -453,6 +463,7 @@ define amdgpu_kernel void @test_workgroup_id_y(ptr addrspace(1) %out) #1 {
 ;
 ; CHECK-G-UNKNOWN-LABEL: test_workgroup_id_y:
 ; CHECK-G-UNKNOWN:       ; %bb.0:
+; CHECK-G-UNKNOWN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; CHECK-G-UNKNOWN-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; CHECK-G-UNKNOWN-NEXT:    s_bfe_u32 s2, ttmp6, 0x40004
 ; CHECK-G-UNKNOWN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
@@ -531,6 +542,7 @@ define amdgpu_kernel void @test_workgroup_id_y(ptr addrspace(1) %out) #1 {
 ; CHECK-G-MESA3D-NEXT:     runtime_loader_kernel_symbol = 0
 ; CHECK-G-MESA3D-NEXT:    .end_amd_kernel_code_t
 ; CHECK-G-MESA3D-NEXT:  ; %bb.0:
+; CHECK-G-MESA3D-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; CHECK-G-MESA3D-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; CHECK-G-MESA3D-NEXT:    s_bfe_u32 s2, ttmp6, 0x40004
 ; CHECK-G-MESA3D-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
@@ -546,6 +558,7 @@ define amdgpu_kernel void @test_workgroup_id_y(ptr addrspace(1) %out) #1 {
 define amdgpu_kernel void @test_workgroup_id_y_optimized(ptr addrspace(1) %out) "amdgpu-cluster-dims"="2,1,2" {
 ; CHECK-UNKNOWN-LABEL: test_workgroup_id_y_optimized:
 ; CHECK-UNKNOWN:       ; %bb.0:
+; CHECK-UNKNOWN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; CHECK-UNKNOWN-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; CHECK-UNKNOWN-NEXT:    v_mov_b32_e32 v0, 0
 ; CHECK-UNKNOWN-NEXT:    s_wait_kmcnt 0x0
@@ -622,6 +635,7 @@ define amdgpu_kernel void @test_workgroup_id_y_optimized(ptr addrspace(1) %out)
 ; CHECK-MESA3D-NEXT:     runtime_loader_kernel_symbol = 0
 ; CHECK-MESA3D-NEXT:    .end_amd_kernel_code_t
 ; CHECK-MESA3D-NEXT:  ; %bb.0:
+; CHECK-MESA3D-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; CHECK-MESA3D-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; CHECK-MESA3D-NEXT:    v_mov_b32_e32 v0, 0
 ; CHECK-MESA3D-NEXT:    s_wait_kmcnt 0x0
@@ -630,6 +644,7 @@ define amdgpu_kernel void @test_workgroup_id_y_optimized(ptr addrspace(1) %out)
 ;
 ; CHECK-G-UNKNOWN-LABEL: test_workgroup_id_y_optimized:
 ; CHECK-G-UNKNOWN:       ; %bb.0:
+; CHECK-G-UNKNOWN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; CHECK-G-UNKNOWN-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; CHECK-G-UNKNOWN-NEXT:    v_mov_b32_e32 v0, 0
 ; CHECK-G-UNKNOWN-NEXT:    s_wait_kmcnt 0x0
@@ -706,6 +721,7 @@ define amdgpu_kernel void @test_workgroup_id_y_optimized(ptr addrspace(1) %out)
 ; CHECK-G-MESA3D-NEXT:     runtime_loader_kernel_symbol = 0
 ; CHECK-G-MESA3D-NEXT:    .end_amd_kernel_code_t
 ; CHECK-G-MESA3D-NEXT:  ; %bb.0:
+; CHECK-G-MESA3D-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; CHECK-G-MESA3D-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; CHECK-G-MESA3D-NEXT:    v_mov_b32_e32 v0, 0
 ; CHECK-G-MESA3D-NEXT:    s_wait_kmcnt 0x0
@@ -719,6 +735,7 @@ define amdgpu_kernel void @test_workgroup_id_y_optimized(ptr addrspace(1) %out)
 define amdgpu_kernel void @test_workgroup_id_z(ptr addrspace(1) %out) #1 {
 ; CHECK-UNKNOWN-LABEL: test_workgroup_id_z:
 ; CHECK-UNKNOWN:       ; %bb.0:
+; CHECK-UNKNOWN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; CHECK-UNKNOWN-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; CHECK-UNKNOWN-NEXT:    s_bfe_u32 s2, ttmp6, 0x40008
 ; CHECK-UNKNOWN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
@@ -797,6 +814,7 @@ define amdgpu_kernel void @test_workgroup_id_z(ptr addrspace(1) %out) #1 {
 ; CHECK-MESA3D-NEXT:     runtime_loader_kernel_symbol = 0
 ; CHECK-MESA3D-NEXT:    .end_amd_kernel_code_t
 ; CHECK-MESA3D-NEXT:  ; %bb.0:
+; CHECK-MESA3D-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; CHECK-MESA3D-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; CHECK-MESA3D-NEXT:    s_bfe_u32 s2, ttmp6, 0x40008
 ; CHECK-MESA3D-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
@@ -807,6 +825,7 @@ define amdgpu_kernel void @test_workgroup_id_z(ptr addrspace(1) %out) #1 {
 ;
 ; CHECK-G-UNKNOWN-LABEL: test_workgroup_id_z:
 ; CHECK-G-UNKNOWN:       ; %bb.0:
+; CHECK-G-UNKNOWN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; CHECK-G-UNKNOWN-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; CHECK-G-UNKNOWN-NEXT:    s_bfe_u32 s2, ttmp6, 0x40008
 ; CHECK-G-UNKNOWN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
@@ -885,6 +904,7 @@ define amdgpu_kernel void @test_workgroup_id_z(ptr addrspace(1) %out) #1 {
 ; CHECK-G-MESA3D-NEXT:     runtime_loader_kernel_symbol = 0
 ; CHECK-G-MESA3D-NEXT:    .end_amd_kernel_code_t
 ; CHECK-G-MESA3D-NEXT:  ; %bb.0:
+; CHECK-G-MESA3D-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; CHECK-G-MESA3D-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; CHECK-G-MESA3D-NEXT:    s_bfe_u32 s2, ttmp6, 0x40008
 ; CHECK-G-MESA3D-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
@@ -900,6 +920,7 @@ define amdgpu_kernel void @test_workgroup_id_z(ptr addrspace(1) %out) #1 {
 define amdgpu_kernel void @test_workgroup_flat_id(ptr addrspace(1) %out) {
 ; CHECK-UNKNOWN-LABEL: test_workgroup_flat_id:
 ; CHECK-UNKNOWN:       ; %bb.0:
+; CHECK-UNKNOWN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; CHECK-UNKNOWN-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; CHECK-UNKNOWN-NEXT:    s_getreg_b32 s2, hwreg(HW_REG_IB_STS2, 21, 4)
 ; CHECK-UNKNOWN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
@@ -978,6 +999,7 @@ define amdgpu_kernel void @test_workgroup_flat_id(ptr addrspace(1) %out) {
 ; CHECK-MESA3D-NEXT:     runtime_loader_kernel_symbol = 0
 ; CHECK-MESA3D-NEXT:    .end_amd_kernel_code_t
 ; CHECK-MESA3D-NEXT:  ; %bb.0:
+; CHECK-MESA3D-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; CHECK-MESA3D-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; CHECK-MESA3D-NEXT:    s_getreg_b32 s2, hwreg(HW_REG_IB_STS2, 21, 4)
 ; CHECK-MESA3D-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
@@ -988,6 +1010,7 @@ define amdgpu_kernel void @test_workgroup_flat_id(ptr addrspace(1) %out) {
 ;
 ; CHECK-G-UNKNOWN-LABEL: test_workgroup_flat_id:
 ; CHECK-G-UNKNOWN:       ; %bb.0:
+; CHECK-G-UNKNOWN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; CHECK-G-UNKNOWN-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; CHECK-G-UNKNOWN-NEXT:    s_getreg_b32 s2, hwreg(HW_REG_IB_STS2, 21, 4)
 ; CHECK-G-UNKNOWN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
@@ -1066,6 +1089,7 @@ define amdgpu_kernel void @test_workgroup_flat_id(ptr addrspace(1) %out) {
 ; CHECK-G-MESA3D-NEXT:     runtime_loader_kernel_symbol = 0
 ; CHECK-G-MESA3D-NEXT:    .end_amd_kernel_code_t
 ; CHECK-G-MESA3D-NEXT:  ; %bb.0:
+; CHECK-G-MESA3D-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; CHECK-G-MESA3D-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; CHECK-G-MESA3D-NEXT:    s_getreg_b32 s2, hwreg(HW_REG_IB_STS2, 21, 4)
 ; CHECK-G-MESA3D-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
@@ -1081,6 +1105,7 @@ define amdgpu_kernel void @test_workgroup_flat_id(ptr addrspace(1) %out) {
 define amdgpu_kernel void @test_workgroup_id_z_optimized(ptr addrspace(1) %out) "amdgpu-cluster-dims"="2,2,1" {
 ; CHECK-UNKNOWN-LABEL: test_workgroup_id_z_optimized:
 ; CHECK-UNKNOWN:       ; %bb.0:
+; CHECK-UNKNOWN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; CHECK-UNKNOWN-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; CHECK-UNKNOWN-NEXT:    v_mov_b32_e32 v0, 0
 ; CHECK-UNKNOWN-NEXT:    s_wait_kmcnt 0x0
@@ -1157,6 +1182,7 @@ define amdgpu_kernel void @test_workgroup_id_z_optimized(ptr addrspace(1) %out)
 ; CHECK-MESA3D-NEXT:     runtime_loader_kernel_symbol = 0
 ; CHECK-MESA3D-NEXT:    .end_amd_kernel_code_t
 ; CHECK-MESA3D-NEXT:  ; %bb.0:
+; CHECK-MESA3D-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; CHECK-MESA3D-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; CHECK-MESA3D-NEXT:    v_mov_b32_e32 v0, 0
 ; CHECK-MESA3D-NEXT:    s_wait_kmcnt 0x0
@@ -1165,6 +1191,7 @@ define amdgpu_kernel void @test_workgroup_id_z_optimized(ptr addrspace(1) %out)
 ;
 ; CHECK-G-UNKNOWN-LABEL: test_workgroup_id_z_optimized:
 ; CHECK-G-UNKNOWN:       ; %bb.0:
+; CHECK-G-UNKNOWN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; CHECK-G-UNKNOWN-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; CHECK-G-UNKNOWN-NEXT:    v_mov_b32_e32 v0, 0
 ; CHECK-G-UNKNOWN-NEXT:    s_wait_kmcnt 0x0
@@ -1241,6 +1268,7 @@ define amdgpu_kernel void @test_workgroup_id_z_optimized(ptr addrspace(1) %out)
 ; CHECK-G-MESA3D-NEXT:     runtime_loader_kernel_symbol = 0
 ; CHECK-G-MESA3D-NEXT:    .end_amd_kernel_code_t
 ; CHECK-G-MESA3D-NEXT:  ; %bb.0:
+; CHECK-G-MESA3D-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; CHECK-G-MESA3D-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; CHECK-G-MESA3D-NEXT:    v_mov_b32_e32 v0, 0
 ; CHECK-G-MESA3D-NEXT:    s_wait_kmcnt 0x0

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cluster.workgroup.max.flat.id.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cluster.workgroup.max.flat.id.ll
index b8ff9e5ae0366..25e4b9887ffe3 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cluster.workgroup.max.flat.id.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cluster.workgroup.max.flat.id.ll
@@ -9,6 +9,7 @@ declare i32 @llvm.amdgcn.cluster.workgroup.max.flat.id() #0
 define amdgpu_kernel void @test_workgroup_max_flat_id(ptr addrspace(1) %out) #1 {
 ; CHECK-UNKNOWN-LABEL: test_workgroup_max_flat_id:
 ; CHECK-UNKNOWN:       ; %bb.0:
+; CHECK-UNKNOWN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; CHECK-UNKNOWN-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; CHECK-UNKNOWN-NEXT:    s_bfe_u32 s2, ttmp6, 0x40018
 ; CHECK-UNKNOWN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
@@ -87,6 +88,7 @@ define amdgpu_kernel void @test_workgroup_max_flat_id(ptr addrspace(1) %out) #1
 ; CHECK-MESA3D-NEXT:     runtime_loader_kernel_symbol = 0
 ; CHECK-MESA3D-NEXT:    .end_amd_kernel_code_t
 ; CHECK-MESA3D-NEXT:  ; %bb.0:
+; CHECK-MESA3D-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; CHECK-MESA3D-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; CHECK-MESA3D-NEXT:    s_bfe_u32 s2, ttmp6, 0x40018
 ; CHECK-MESA3D-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
@@ -97,6 +99,7 @@ define amdgpu_kernel void @test_workgroup_max_flat_id(ptr addrspace(1) %out) #1
 ;
 ; CHECK-G-UNKNOWN-LABEL: test_workgroup_max_flat_id:
 ; CHECK-G-UNKNOWN:       ; %bb.0:
+; CHECK-G-UNKNOWN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; CHECK-G-UNKNOWN-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; CHECK-G-UNKNOWN-NEXT:    s_bfe_u32 s2, ttmp6, 0x40018
 ; CHECK-G-UNKNOWN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
@@ -175,6 +178,7 @@ define amdgpu_kernel void @test_workgroup_max_flat_id(ptr addrspace(1) %out) #1
 ; CHECK-G-MESA3D-NEXT:     runtime_loader_kernel_symbol = 0
 ; CHECK-G-MESA3D-NEXT:    .end_amd_kernel_code_t
 ; CHECK-G-MESA3D-NEXT:  ; %bb.0:
+; CHECK-G-MESA3D-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; CHECK-G-MESA3D-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; CHECK-G-MESA3D-NEXT:    s_bfe_u32 s2, ttmp6, 0x40018
 ; CHECK-G-MESA3D-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cluster.workgroup.max.id.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cluster.workgroup.max.id.ll
index 9bca696b73437..bc44168960c74 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cluster.workgroup.max.id.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cluster.workgroup.max.id.ll
@@ -11,6 +11,7 @@ declare i32 @llvm.amdgcn.cluster.workgroup.max.id.z() #0
 define amdgpu_kernel void @test_workgroup_max_id_x(ptr addrspace(1) %out) #1 {
 ; CHECK-UNKNOWN-LABEL: test_workgroup_max_id_x:
 ; CHECK-UNKNOWN:       ; %bb.0:
+; CHECK-UNKNOWN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; CHECK-UNKNOWN-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; CHECK-UNKNOWN-NEXT:    s_bfe_u32 s2, ttmp6, 0x4000c
 ; CHECK-UNKNOWN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
@@ -89,6 +90,7 @@ define amdgpu_kernel void @test_workgroup_max_id_x(ptr addrspace(1) %out) #1 {
 ; CHECK-MESA3D-NEXT:     runtime_loader_kernel_symbol = 0
 ; CHECK-MESA3D-NEXT:    .end_amd_kernel_code_t
 ; CHECK-MESA3D-NEXT:  ; %bb.0:
+; CHECK-MESA3D-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; CHECK-MESA3D-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; CHECK-MESA3D-NEXT:    s_bfe_u32 s2, ttmp6, 0x4000c
 ; CHECK-MESA3D-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
@@ -99,6 +101,7 @@ define amdgpu_kernel void @test_workgroup_max_id_x(ptr addrspace(1) %out) #1 {
 ;
 ; CHECK-G-UNKNOWN-LABEL: test_workgroup_max_id_x:
 ; CHECK-G-UNKNOWN:       ; %bb.0:
+; CHECK-G-UNKNOWN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; CHECK-G-UNKNOWN-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; CHECK-G-UNKNOWN-NEXT:    s_bfe_u32 s2, ttmp6, 0x4000c
 ; CHECK-G-UNKNOWN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
@@ -177,6 +180,7 @@ define amdgpu_kernel void @test_workgroup_max_id_x(ptr addrspace(1) %out) #1 {
 ; CHECK-G-MESA3D-NEXT:     runtime_loader_kernel_symbol = 0
 ; CHECK-G-MESA3D-NEXT:    .end_amd_kernel_code_t
 ; CHECK-G-MESA3D-NEXT:  ; %bb.0:
+; CHECK-G-MESA3D-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; CHECK-G-MESA3D-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; CHECK-G-MESA3D-NEXT:    s_bfe_u32 s2, ttmp6, 0x4000c
 ; CHECK-G-MESA3D-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
@@ -192,6 +196,7 @@ define amdgpu_kernel void @test_workgroup_max_id_x(ptr addrspace(1) %out) #1 {
 define amdgpu_kernel void @test_workgroup_max_id_x_optimized(ptr addrspace(1) %out) "amdgpu-cluster-dims"="5,6,7" {
 ; CHECK-UNKNOWN-LABEL: test_workgroup_max_id_x_optimized:
 ; CHECK-UNKNOWN:       ; %bb.0:
+; CHECK-UNKNOWN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; CHECK-UNKNOWN-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; CHECK-UNKNOWN-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4
 ; CHECK-UNKNOWN-NEXT:    s_wait_kmcnt 0x0
@@ -268,6 +273,7 @@ define amdgpu_kernel void @test_workgroup_max_id_x_optimized(ptr addrspace(1) %o
 ; CHECK-MESA3D-NEXT:     runtime_loader_kernel_symbol = 0
 ; CHECK-MESA3D-NEXT:    .end_amd_kernel_code_t
 ; CHECK-MESA3D-NEXT:  ; %bb.0:
+; CHECK-MESA3D-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; CHECK-MESA3D-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; CHECK-MESA3D-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 4
 ; CHECK-MESA3D-NEXT:    s_wait_kmcnt 0x0
@@ -276,6 +282,7 @@ define amdgpu_kernel void @test_workgroup_max_id_x_optimized(ptr addrspace(1) %o
 ;
 ; CHECK-G-UNKNOWN-LABEL: test_workgroup_max_id_x_optimized:
 ; CHECK-G-UNKNOWN:       ; %bb.0:
+; CHECK-G-UNKNOWN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; CHECK-G-UNKNOWN-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; CHECK-G-UNKNOWN-NEXT:    v_dual_mov_b32 v0, 4 :: v_dual_mov_b32 v1, 0
 ; CHECK-G-UNKNOWN-NEXT:    s_wait_kmcnt 0x0
@@ -352,6 +359,7 @@ define amdgpu_kernel void @test_workgroup_max_id_x_optimized(ptr addrspace(1) %o
 ; CHECK-G-MESA3D-NEXT:     runtime_loader_kernel_symbol = 0
 ; CHECK-G-MESA3D-NEXT:    .end_amd_kernel_code_t
 ; CHECK-G-MESA3D-NEXT:  ; %bb.0:
+; CHECK-G-MESA3D-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; CHECK-G-MESA3D-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; CHECK-G-MESA3D-NEXT:    v_dual_mov_b32 v0, 4 :: v_dual_mov_b32 v1, 0
 ; CHECK-G-MESA3D-NEXT:    s_wait_kmcnt 0x0
@@ -365,6 +373,7 @@ define amdgpu_kernel void @test_workgroup_max_id_x_optimized(ptr addrspace(1) %o
 define amdgpu_kernel void @test_workgroup_max_id_y(ptr addrspace(1) %out) #1 {
 ; CHECK-UNKNOWN-LABEL: test_workgroup_max_id_y:
 ; CHECK-UNKNOWN:       ; %bb.0:
+; CHECK-UNKNOWN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; CHECK-UNKNOWN-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; CHECK-UNKNOWN-NEXT:    s_bfe_u32 s2, ttmp6, 0x40010
 ; CHECK-UNKNOWN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
@@ -443,6 +452,7 @@ define amdgpu_kernel void @test_workgroup_max_id_y(ptr addrspace(1) %out) #1 {
 ; CHECK-MESA3D-NEXT:     runtime_loader_kernel_symbol = 0
 ; CHECK-MESA3D-NEXT:    .end_amd_kernel_code_t
 ; CHECK-MESA3D-NEXT:  ; %bb.0:
+; CHECK-MESA3D-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; CHECK-MESA3D-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; CHECK-MESA3D-NEXT:    s_bfe_u32 s2, ttmp6, 0x40010
 ; CHECK-MESA3D-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
@@ -453,6 +463,7 @@ define amdgpu_kernel void @test_workgroup_max_id_y(ptr addrspace(1) %out) #1 {
 ;
 ; CHECK-G-UNKNOWN-LABEL: test_workgroup_max_id_y:
 ; CHECK-G-UNKNOWN:       ; %bb.0:
+; CHECK-G-UNKNOWN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; CHECK-G-UNKNOWN-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; CHECK-G-UNKNOWN-NEXT:    s_bfe_u32 s2, ttmp6, 0x40010
 ; CHECK-G-UNKNOWN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
@@ -531,6 +542,7 @@ define amdgpu_kernel void @test_workgroup_max_id_y(ptr addrspace(1) %out) #1 {
 ; CHECK-G-MESA3D-NEXT:     runtime_loader_kernel_symbol = 0
 ; CHECK-G-MESA3D-NEXT:    .end_amd_kernel_code_t
 ; CHECK-G-MESA3D-NEXT:  ; %bb.0:
+; CHECK-G-MESA3D-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; CHECK-G-MESA3D-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; CHECK-G-MESA3D-NEXT:    s_bfe_u32 s2, ttmp6, 0x40010
 ; CHECK-G-MESA3D-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
@@ -546,6 +558,7 @@ define amdgpu_kernel void @test_workgroup_max_id_y(ptr addrspace(1) %out) #1 {
 define amdgpu_kernel void @test_workgroup_max_id_y_optimized(ptr addrspace(1) %out) "amdgpu-cluster-dims"="5,6,7" {
 ; CHECK-UNKNOWN-LABEL: test_workgroup_max_id_y_optimized:
 ; CHECK-UNKNOWN:       ; %bb.0:
+; CHECK-UNKNOWN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; CHECK-UNKNOWN-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; CHECK-UNKNOWN-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 5
 ; CHECK-UNKNOWN-NEXT:    s_wait_kmcnt 0x0
@@ -622,6 +635,7 @@ define amdgpu_kernel void @test_workgroup_max_id_y_optimized(ptr addrspace(1) %o
 ; CHECK-MESA3D-NEXT:     runtime_loader_kernel_symbol = 0
 ; CHECK-MESA3D-NEXT:    .end_amd_kernel_code_t
 ; CHECK-MESA3D-NEXT:  ; %bb.0:
+; CHECK-MESA3D-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; CHECK-MESA3D-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; CHECK-MESA3D-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 5
 ; CHECK-MESA3D-NEXT:    s_wait_kmcnt 0x0
@@ -630,6 +644,7 @@ define amdgpu_kernel void @test_workgroup_max_id_y_optimized(ptr addrspace(1) %o
 ;
 ; CHECK-G-UNKNOWN-LABEL: test_workgroup_max_id_y_optimized:
 ; CHECK-G-UNKNOWN:       ; %bb.0:
+; CHECK-G-UNKNOWN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; CHECK-G-UNKNOWN-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; CHECK-G-UNKNOWN-NEXT:    v_dual_mov_b32 v0, 5 :: v_dual_mov_b32 v1, 0
 ; CHECK-G-UNKNOWN-NEXT:    s_wait_kmcnt 0x0
@@ -706,6 +721,7 @@ define amdgpu_kernel void @test_workgroup_max_id_y_optimized(ptr addrspace(1) %o
 ; CHECK-G-MESA3D-NEXT:     runtime_loader_kernel_symbol = 0
 ; CHECK-G-MESA3D-NEXT:    .end_amd_kernel_code_t
 ; CHECK-G-MESA3D-NEXT:  ; %bb.0:
+; CHECK-G-MESA3D-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; CHECK-G-MESA3D-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; CHECK-G-MESA3D-NEXT:    v_dual_mov_b32 v0, 5 :: v_dual_mov_b32 v1, 0
 ; CHECK-G-MESA3D-NEXT:    s_wait_kmcnt 0x0
@@ -719,6 +735,7 @@ define amdgpu_kernel void @test_workgroup_max_id_y_optimized(ptr addrspace(1) %o
 define amdgpu_kernel void @test_workgroup_max_id_z(ptr addrspace(1) %out) #1 {
 ; CHECK-UNKNOWN-LABEL: test_workgroup_max_id_z:
 ; CHECK-UNKNOWN:       ; %bb.0:
+; CHECK-UNKNOWN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; CHECK-UNKNOWN-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; CHECK-UNKNOWN-NEXT:    s_bfe_u32 s2, ttmp6, 0x40014
 ; CHECK-UNKNOWN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
@@ -797,6 +814,7 @@ define amdgpu_kernel void @test_workgroup_max_id_z(ptr addrspace(1) %out) #1 {
 ; CHECK-MESA3D-NEXT:     runtime_loader_kernel_symbol = 0
 ; CHECK-MESA3D-NEXT:    .end_amd_kernel_code_t
 ; CHECK-MESA3D-NEXT:  ; %bb.0:
+; CHECK-MESA3D-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; CHECK-MESA3D-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; CHECK-MESA3D-NEXT:    s_bfe_u32 s2, ttmp6, 0x40014
 ; CHECK-MESA3D-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
@@ -807,6 +825,7 @@ define amdgpu_kernel void @test_workgroup_max_id_z(ptr addrspace(1) %out) #1 {
 ;
 ; CHECK-G-UNKNOWN-LABEL: test_workgroup_max_id_z:
 ; CHECK-G-UNKNOWN:       ; %bb.0:
+; CHECK-G-UNKNOWN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; CHECK-G-UNKNOWN-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; CHECK-G-UNKNOWN-NEXT:    s_bfe_u32 s2, ttmp6, 0x40014
 ; CHECK-G-UNKNOWN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
@@ -885,6 +904,7 @@ define amdgpu_kernel void @test_workgroup_max_id_z(ptr addrspace(1) %out) #1 {
 ; CHECK-G-MESA3D-NEXT:     runtime_loader_kernel_symbol = 0
 ; CHECK-G-MESA3D-NEXT:    .end_amd_kernel_code_t
 ; CHECK-G-MESA3D-NEXT:  ; %bb.0:
+; CHECK-G-MESA3D-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; CHECK-G-MESA3D-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; CHECK-G-MESA3D-NEXT:    s_bfe_u32 s2, ttmp6, 0x40014
 ; CHECK-G-MESA3D-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
@@ -900,6 +920,7 @@ define amdgpu_kernel void @test_workgroup_max_id_z(ptr addrspace(1) %out) #1 {
 define amdgpu_kernel void @test_workgroup_max_id_z_optimized(ptr addrspace(1) %out) "amdgpu-cluster-dims"="5,6,7" {
 ; CHECK-UNKNOWN-LABEL: test_workgroup_max_id_z_optimized:
 ; CHECK-UNKNOWN:       ; %bb.0:
+; CHECK-UNKNOWN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; CHECK-UNKNOWN-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; CHECK-UNKNOWN-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 6
 ; CHECK-UNKNOWN-NEXT:    s_wait_kmcnt 0x0
@@ -976,6 +997,7 @@ define amdgpu_kernel void @test_workgroup_max_id_z_optimized(ptr addrspace(1) %o
 ; CHECK-MESA3D-NEXT:     runtime_loader_kernel_symbol = 0
 ; CHECK-MESA3D-NEXT:    .end_amd_kernel_code_t
 ; CHECK-MESA3D-NEXT:  ; %bb.0:
+; CHECK-MESA3D-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; CHECK-MESA3D-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; CHECK-MESA3D-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 6
 ; CHECK-MESA3D-NEXT:    s_wait_kmcnt 0x0
@@ -984,6 +1006,7 @@ define amdgpu_kernel void @test_workgroup_max_id_z_optimized(ptr addrspace(1) %o
 ;
 ; CHECK-G-UNKNOWN-LABEL: test_workgroup_max_id_z_optimized:
 ; CHECK-G-UNKNOWN:       ; %bb.0:
+; CHECK-G-UNKNOWN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; CHECK-G-UNKNOWN-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; CHECK-G-UNKNOWN-NEXT:    v_dual_mov_b32 v0, 6 :: v_dual_mov_b32 v1, 0
 ; CHECK-G-UNKNOWN-NEXT:    s_wait_kmcnt 0x0
@@ -1060,6 +1083,7 @@ define amdgpu_kernel void @test_workgroup_max_id_z_optimized(ptr addrspace(1) %o
 ; CHECK-G-MESA3D-NEXT:     runtime_loader_kernel_symbol = 0
 ; CHECK-G-MESA3D-NEXT:    .end_amd_kernel_code_t
 ; CHECK-G-MESA3D-NEXT:  ; %bb.0:
+; CHECK-G-MESA3D-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; CHECK-G-MESA3D-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; CHECK-G-MESA3D-NEXT:    v_dual_mov_b32 v0, 6 :: v_dual_mov_b32 v1, 0
 ; CHECK-G-MESA3D-NEXT:    s_wait_kmcnt 0x0

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.f16.fp8.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.f16.fp8.ll
index 95653148b09f4..72611843aff95 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.f16.fp8.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.f16.fp8.ll
@@ -12,6 +12,7 @@ declare <2 x half> @llvm.amdgcn.cvt.pk.f16.fp8(i16)
 define amdgpu_ps float @test_cvt_f16_bf8_byte0(i32 %a) {
 ; GFX1250-SDAG-REAL16-LABEL: test_cvt_f16_bf8_byte0:
 ; GFX1250-SDAG-REAL16:       ; %bb.0:
+; GFX1250-SDAG-REAL16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-REAL16-NEXT:    v_cvt_f16_bf8_e32 v0.l, v0
 ; GFX1250-SDAG-REAL16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-SDAG-REAL16-NEXT:    v_cvt_f32_f16_e32 v0, v0.l
@@ -19,6 +20,7 @@ define amdgpu_ps float @test_cvt_f16_bf8_byte0(i32 %a) {
 ;
 ; GFX1250-SDAG-FAKE16-LABEL: test_cvt_f16_bf8_byte0:
 ; GFX1250-SDAG-FAKE16:       ; %bb.0:
+; GFX1250-SDAG-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-FAKE16-NEXT:    v_cvt_f16_bf8_e32 v0, v0
 ; GFX1250-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-SDAG-FAKE16-NEXT:    v_cvt_f32_f16_e32 v0, v0
@@ -26,6 +28,7 @@ define amdgpu_ps float @test_cvt_f16_bf8_byte0(i32 %a) {
 ;
 ; GFX1250-GISEL-REAL16-LABEL: test_cvt_f16_bf8_byte0:
 ; GFX1250-GISEL-REAL16:       ; %bb.0:
+; GFX1250-GISEL-REAL16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-REAL16-NEXT:    v_cvt_f16_bf8_e32 v0.l, v0
 ; GFX1250-GISEL-REAL16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-GISEL-REAL16-NEXT:    v_cvt_f32_f16_e32 v0, v0.l
@@ -33,6 +36,7 @@ define amdgpu_ps float @test_cvt_f16_bf8_byte0(i32 %a) {
 ;
 ; GFX1250-GISEL-FAKE16-LABEL: test_cvt_f16_bf8_byte0:
 ; GFX1250-GISEL-FAKE16:       ; %bb.0:
+; GFX1250-GISEL-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-FAKE16-NEXT:    v_cvt_f16_bf8_e32 v0, v0
 ; GFX1250-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-GISEL-FAKE16-NEXT:    v_cvt_f32_f16_e32 v0, v0
@@ -45,6 +49,7 @@ define amdgpu_ps float @test_cvt_f16_bf8_byte0(i32 %a) {
 define amdgpu_ps float @test_cvt_f16_bf8_byte1(i32 %a) {
 ; GFX1250-SDAG-REAL16-LABEL: test_cvt_f16_bf8_byte1:
 ; GFX1250-SDAG-REAL16:       ; %bb.0:
+; GFX1250-SDAG-REAL16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-REAL16-NEXT:    v_cvt_f16_bf8_e64 v0.l, v0 byte_sel:1
 ; GFX1250-SDAG-REAL16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-SDAG-REAL16-NEXT:    v_cvt_f32_f16_e32 v0, v0.l
@@ -52,6 +57,7 @@ define amdgpu_ps float @test_cvt_f16_bf8_byte1(i32 %a) {
 ;
 ; GFX1250-SDAG-FAKE16-LABEL: test_cvt_f16_bf8_byte1:
 ; GFX1250-SDAG-FAKE16:       ; %bb.0:
+; GFX1250-SDAG-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-FAKE16-NEXT:    v_cvt_f16_bf8_e64 v0, v0 byte_sel:1
 ; GFX1250-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-SDAG-FAKE16-NEXT:    v_cvt_f32_f16_e32 v0, v0
@@ -59,6 +65,7 @@ define amdgpu_ps float @test_cvt_f16_bf8_byte1(i32 %a) {
 ;
 ; GFX1250-GISEL-REAL16-LABEL: test_cvt_f16_bf8_byte1:
 ; GFX1250-GISEL-REAL16:       ; %bb.0:
+; GFX1250-GISEL-REAL16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-REAL16-NEXT:    v_cvt_f16_bf8_e64 v0.l, v0 byte_sel:1
 ; GFX1250-GISEL-REAL16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-GISEL-REAL16-NEXT:    v_cvt_f32_f16_e32 v0, v0.l
@@ -66,6 +73,7 @@ define amdgpu_ps float @test_cvt_f16_bf8_byte1(i32 %a) {
 ;
 ; GFX1250-GISEL-FAKE16-LABEL: test_cvt_f16_bf8_byte1:
 ; GFX1250-GISEL-FAKE16:       ; %bb.0:
+; GFX1250-GISEL-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-FAKE16-NEXT:    v_cvt_f16_bf8_e64 v0, v0 byte_sel:1
 ; GFX1250-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-GISEL-FAKE16-NEXT:    v_cvt_f32_f16_e32 v0, v0
@@ -78,6 +86,7 @@ define amdgpu_ps float @test_cvt_f16_bf8_byte1(i32 %a) {
 define amdgpu_ps float @test_cvt_f16_bf8_byte2(i32 %a) {
 ; GFX1250-SDAG-REAL16-LABEL: test_cvt_f16_bf8_byte2:
 ; GFX1250-SDAG-REAL16:       ; %bb.0:
+; GFX1250-SDAG-REAL16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-REAL16-NEXT:    v_cvt_f16_bf8_e64 v0.l, v0 byte_sel:2
 ; GFX1250-SDAG-REAL16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-SDAG-REAL16-NEXT:    v_cvt_f32_f16_e32 v0, v0.l
@@ -85,6 +94,7 @@ define amdgpu_ps float @test_cvt_f16_bf8_byte2(i32 %a) {
 ;
 ; GFX1250-SDAG-FAKE16-LABEL: test_cvt_f16_bf8_byte2:
 ; GFX1250-SDAG-FAKE16:       ; %bb.0:
+; GFX1250-SDAG-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-FAKE16-NEXT:    v_cvt_f16_bf8_e64 v0, v0 byte_sel:2
 ; GFX1250-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-SDAG-FAKE16-NEXT:    v_cvt_f32_f16_e32 v0, v0
@@ -92,6 +102,7 @@ define amdgpu_ps float @test_cvt_f16_bf8_byte2(i32 %a) {
 ;
 ; GFX1250-GISEL-REAL16-LABEL: test_cvt_f16_bf8_byte2:
 ; GFX1250-GISEL-REAL16:       ; %bb.0:
+; GFX1250-GISEL-REAL16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-REAL16-NEXT:    v_cvt_f16_bf8_e64 v0.l, v0 byte_sel:2
 ; GFX1250-GISEL-REAL16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-GISEL-REAL16-NEXT:    v_cvt_f32_f16_e32 v0, v0.l
@@ -99,6 +110,7 @@ define amdgpu_ps float @test_cvt_f16_bf8_byte2(i32 %a) {
 ;
 ; GFX1250-GISEL-FAKE16-LABEL: test_cvt_f16_bf8_byte2:
 ; GFX1250-GISEL-FAKE16:       ; %bb.0:
+; GFX1250-GISEL-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-FAKE16-NEXT:    v_cvt_f16_bf8_e64 v0, v0 byte_sel:2
 ; GFX1250-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-GISEL-FAKE16-NEXT:    v_cvt_f32_f16_e32 v0, v0
@@ -111,6 +123,7 @@ define amdgpu_ps float @test_cvt_f16_bf8_byte2(i32 %a) {
 define amdgpu_ps float @test_cvt_f16_bf8_byte3(i32 %a) {
 ; GFX1250-SDAG-REAL16-LABEL: test_cvt_f16_bf8_byte3:
 ; GFX1250-SDAG-REAL16:       ; %bb.0:
+; GFX1250-SDAG-REAL16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-REAL16-NEXT:    v_cvt_f16_bf8_e64 v0.l, v0 byte_sel:3
 ; GFX1250-SDAG-REAL16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-SDAG-REAL16-NEXT:    v_cvt_f32_f16_e32 v0, v0.l
@@ -118,6 +131,7 @@ define amdgpu_ps float @test_cvt_f16_bf8_byte3(i32 %a) {
 ;
 ; GFX1250-SDAG-FAKE16-LABEL: test_cvt_f16_bf8_byte3:
 ; GFX1250-SDAG-FAKE16:       ; %bb.0:
+; GFX1250-SDAG-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-FAKE16-NEXT:    v_cvt_f16_bf8_e64 v0, v0 byte_sel:3
 ; GFX1250-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-SDAG-FAKE16-NEXT:    v_cvt_f32_f16_e32 v0, v0
@@ -125,6 +139,7 @@ define amdgpu_ps float @test_cvt_f16_bf8_byte3(i32 %a) {
 ;
 ; GFX1250-GISEL-REAL16-LABEL: test_cvt_f16_bf8_byte3:
 ; GFX1250-GISEL-REAL16:       ; %bb.0:
+; GFX1250-GISEL-REAL16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-REAL16-NEXT:    v_cvt_f16_bf8_e64 v0.l, v0 byte_sel:3
 ; GFX1250-GISEL-REAL16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-GISEL-REAL16-NEXT:    v_cvt_f32_f16_e32 v0, v0.l
@@ -132,6 +147,7 @@ define amdgpu_ps float @test_cvt_f16_bf8_byte3(i32 %a) {
 ;
 ; GFX1250-GISEL-FAKE16-LABEL: test_cvt_f16_bf8_byte3:
 ; GFX1250-GISEL-FAKE16:       ; %bb.0:
+; GFX1250-GISEL-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-FAKE16-NEXT:    v_cvt_f16_bf8_e64 v0, v0 byte_sel:3
 ; GFX1250-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-GISEL-FAKE16-NEXT:    v_cvt_f32_f16_e32 v0, v0
@@ -144,12 +160,14 @@ define amdgpu_ps float @test_cvt_f16_bf8_byte3(i32 %a) {
 define amdgpu_ps float @test_cvt_f16_bf8_byte3_hi(i32 %a) {
 ; GFX1250-SDAG-REAL16-LABEL: test_cvt_f16_bf8_byte3_hi:
 ; GFX1250-SDAG-REAL16:       ; %bb.0:
+; GFX1250-SDAG-REAL16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-REAL16-NEXT:    v_cvt_f16_bf8_e64 v0.h, v0 byte_sel:3
 ; GFX1250-SDAG-REAL16-NEXT:    v_mov_b16_e32 v0.l, 0
 ; GFX1250-SDAG-REAL16-NEXT:    ; return to shader part epilog
 ;
 ; GFX1250-SDAG-FAKE16-LABEL: test_cvt_f16_bf8_byte3_hi:
 ; GFX1250-SDAG-FAKE16:       ; %bb.0:
+; GFX1250-SDAG-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-FAKE16-NEXT:    v_cvt_f16_bf8_e64 v0, v0 byte_sel:3
 ; GFX1250-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-SDAG-FAKE16-NEXT:    v_perm_b32 v0, v0, 0, 0x5040100
@@ -157,6 +175,7 @@ define amdgpu_ps float @test_cvt_f16_bf8_byte3_hi(i32 %a) {
 ;
 ; GFX1250-GISEL-REAL16-LABEL: test_cvt_f16_bf8_byte3_hi:
 ; GFX1250-GISEL-REAL16:       ; %bb.0:
+; GFX1250-GISEL-REAL16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-REAL16-NEXT:    v_cvt_f16_bf8_e64 v0.l, v0 byte_sel:3
 ; GFX1250-GISEL-REAL16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-GISEL-REAL16-NEXT:    v_lshl_or_b32 v0, v0, 16, 0
@@ -164,6 +183,7 @@ define amdgpu_ps float @test_cvt_f16_bf8_byte3_hi(i32 %a) {
 ;
 ; GFX1250-GISEL-FAKE16-LABEL: test_cvt_f16_bf8_byte3_hi:
 ; GFX1250-GISEL-FAKE16:       ; %bb.0:
+; GFX1250-GISEL-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-FAKE16-NEXT:    v_cvt_f16_bf8_e64 v0, v0 byte_sel:3
 ; GFX1250-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-GISEL-FAKE16-NEXT:    v_lshl_or_b32 v0, v0, 16, 0
@@ -178,6 +198,7 @@ define amdgpu_ps float @test_cvt_f16_bf8_byte3_hi(i32 %a) {
 define amdgpu_ps float @test_cvt_f16_fp8_byte0(i32 %a) {
 ; GFX1250-SDAG-REAL16-LABEL: test_cvt_f16_fp8_byte0:
 ; GFX1250-SDAG-REAL16:       ; %bb.0:
+; GFX1250-SDAG-REAL16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-REAL16-NEXT:    v_cvt_f16_fp8_e32 v0.l, v0
 ; GFX1250-SDAG-REAL16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-SDAG-REAL16-NEXT:    v_cvt_f32_f16_e32 v0, v0.l
@@ -185,6 +206,7 @@ define amdgpu_ps float @test_cvt_f16_fp8_byte0(i32 %a) {
 ;
 ; GFX1250-SDAG-FAKE16-LABEL: test_cvt_f16_fp8_byte0:
 ; GFX1250-SDAG-FAKE16:       ; %bb.0:
+; GFX1250-SDAG-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-FAKE16-NEXT:    v_cvt_f16_fp8_e32 v0, v0
 ; GFX1250-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-SDAG-FAKE16-NEXT:    v_cvt_f32_f16_e32 v0, v0
@@ -192,6 +214,7 @@ define amdgpu_ps float @test_cvt_f16_fp8_byte0(i32 %a) {
 ;
 ; GFX1250-GISEL-REAL16-LABEL: test_cvt_f16_fp8_byte0:
 ; GFX1250-GISEL-REAL16:       ; %bb.0:
+; GFX1250-GISEL-REAL16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-REAL16-NEXT:    v_cvt_f16_fp8_e32 v0.l, v0
 ; GFX1250-GISEL-REAL16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-GISEL-REAL16-NEXT:    v_cvt_f32_f16_e32 v0, v0.l
@@ -199,6 +222,7 @@ define amdgpu_ps float @test_cvt_f16_fp8_byte0(i32 %a) {
 ;
 ; GFX1250-GISEL-FAKE16-LABEL: test_cvt_f16_fp8_byte0:
 ; GFX1250-GISEL-FAKE16:       ; %bb.0:
+; GFX1250-GISEL-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-FAKE16-NEXT:    v_cvt_f16_fp8_e32 v0, v0
 ; GFX1250-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-GISEL-FAKE16-NEXT:    v_cvt_f32_f16_e32 v0, v0
@@ -211,6 +235,7 @@ define amdgpu_ps float @test_cvt_f16_fp8_byte0(i32 %a) {
 define amdgpu_ps float @test_cvt_f16_fp8_byte1(i32 %a) {
 ; GFX1250-SDAG-REAL16-LABEL: test_cvt_f16_fp8_byte1:
 ; GFX1250-SDAG-REAL16:       ; %bb.0:
+; GFX1250-SDAG-REAL16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-REAL16-NEXT:    v_cvt_f16_fp8_e64 v0.l, v0 byte_sel:1
 ; GFX1250-SDAG-REAL16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-SDAG-REAL16-NEXT:    v_cvt_f32_f16_e32 v0, v0.l
@@ -218,6 +243,7 @@ define amdgpu_ps float @test_cvt_f16_fp8_byte1(i32 %a) {
 ;
 ; GFX1250-SDAG-FAKE16-LABEL: test_cvt_f16_fp8_byte1:
 ; GFX1250-SDAG-FAKE16:       ; %bb.0:
+; GFX1250-SDAG-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-FAKE16-NEXT:    v_cvt_f16_fp8_e64 v0, v0 byte_sel:1
 ; GFX1250-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-SDAG-FAKE16-NEXT:    v_cvt_f32_f16_e32 v0, v0
@@ -225,6 +251,7 @@ define amdgpu_ps float @test_cvt_f16_fp8_byte1(i32 %a) {
 ;
 ; GFX1250-GISEL-REAL16-LABEL: test_cvt_f16_fp8_byte1:
 ; GFX1250-GISEL-REAL16:       ; %bb.0:
+; GFX1250-GISEL-REAL16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-REAL16-NEXT:    v_cvt_f16_fp8_e64 v0.l, v0 byte_sel:1
 ; GFX1250-GISEL-REAL16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-GISEL-REAL16-NEXT:    v_cvt_f32_f16_e32 v0, v0.l
@@ -232,6 +259,7 @@ define amdgpu_ps float @test_cvt_f16_fp8_byte1(i32 %a) {
 ;
 ; GFX1250-GISEL-FAKE16-LABEL: test_cvt_f16_fp8_byte1:
 ; GFX1250-GISEL-FAKE16:       ; %bb.0:
+; GFX1250-GISEL-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-FAKE16-NEXT:    v_cvt_f16_fp8_e64 v0, v0 byte_sel:1
 ; GFX1250-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-GISEL-FAKE16-NEXT:    v_cvt_f32_f16_e32 v0, v0
@@ -244,6 +272,7 @@ define amdgpu_ps float @test_cvt_f16_fp8_byte1(i32 %a) {
 define amdgpu_ps float @test_cvt_f16_fp8_byte2(i32 %a) {
 ; GFX1250-SDAG-REAL16-LABEL: test_cvt_f16_fp8_byte2:
 ; GFX1250-SDAG-REAL16:       ; %bb.0:
+; GFX1250-SDAG-REAL16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-REAL16-NEXT:    v_cvt_f16_fp8_e64 v0.l, v0 byte_sel:2
 ; GFX1250-SDAG-REAL16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-SDAG-REAL16-NEXT:    v_cvt_f32_f16_e32 v0, v0.l
@@ -251,6 +280,7 @@ define amdgpu_ps float @test_cvt_f16_fp8_byte2(i32 %a) {
 ;
 ; GFX1250-SDAG-FAKE16-LABEL: test_cvt_f16_fp8_byte2:
 ; GFX1250-SDAG-FAKE16:       ; %bb.0:
+; GFX1250-SDAG-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-FAKE16-NEXT:    v_cvt_f16_fp8_e64 v0, v0 byte_sel:2
 ; GFX1250-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-SDAG-FAKE16-NEXT:    v_cvt_f32_f16_e32 v0, v0
@@ -258,6 +288,7 @@ define amdgpu_ps float @test_cvt_f16_fp8_byte2(i32 %a) {
 ;
 ; GFX1250-GISEL-REAL16-LABEL: test_cvt_f16_fp8_byte2:
 ; GFX1250-GISEL-REAL16:       ; %bb.0:
+; GFX1250-GISEL-REAL16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-REAL16-NEXT:    v_cvt_f16_fp8_e64 v0.l, v0 byte_sel:2
 ; GFX1250-GISEL-REAL16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-GISEL-REAL16-NEXT:    v_cvt_f32_f16_e32 v0, v0.l
@@ -265,6 +296,7 @@ define amdgpu_ps float @test_cvt_f16_fp8_byte2(i32 %a) {
 ;
 ; GFX1250-GISEL-FAKE16-LABEL: test_cvt_f16_fp8_byte2:
 ; GFX1250-GISEL-FAKE16:       ; %bb.0:
+; GFX1250-GISEL-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-FAKE16-NEXT:    v_cvt_f16_fp8_e64 v0, v0 byte_sel:2
 ; GFX1250-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-GISEL-FAKE16-NEXT:    v_cvt_f32_f16_e32 v0, v0
@@ -277,6 +309,7 @@ define amdgpu_ps float @test_cvt_f16_fp8_byte2(i32 %a) {
 define amdgpu_ps float @test_cvt_f16_fp8_byte3(i32 %a) {
 ; GFX1250-SDAG-REAL16-LABEL: test_cvt_f16_fp8_byte3:
 ; GFX1250-SDAG-REAL16:       ; %bb.0:
+; GFX1250-SDAG-REAL16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-REAL16-NEXT:    v_cvt_f16_fp8_e64 v0.l, v0 byte_sel:3
 ; GFX1250-SDAG-REAL16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-SDAG-REAL16-NEXT:    v_cvt_f32_f16_e32 v0, v0.l
@@ -284,6 +317,7 @@ define amdgpu_ps float @test_cvt_f16_fp8_byte3(i32 %a) {
 ;
 ; GFX1250-SDAG-FAKE16-LABEL: test_cvt_f16_fp8_byte3:
 ; GFX1250-SDAG-FAKE16:       ; %bb.0:
+; GFX1250-SDAG-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-FAKE16-NEXT:    v_cvt_f16_fp8_e64 v0, v0 byte_sel:3
 ; GFX1250-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-SDAG-FAKE16-NEXT:    v_cvt_f32_f16_e32 v0, v0
@@ -291,6 +325,7 @@ define amdgpu_ps float @test_cvt_f16_fp8_byte3(i32 %a) {
 ;
 ; GFX1250-GISEL-REAL16-LABEL: test_cvt_f16_fp8_byte3:
 ; GFX1250-GISEL-REAL16:       ; %bb.0:
+; GFX1250-GISEL-REAL16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-REAL16-NEXT:    v_cvt_f16_fp8_e64 v0.l, v0 byte_sel:3
 ; GFX1250-GISEL-REAL16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-GISEL-REAL16-NEXT:    v_cvt_f32_f16_e32 v0, v0.l
@@ -298,6 +333,7 @@ define amdgpu_ps float @test_cvt_f16_fp8_byte3(i32 %a) {
 ;
 ; GFX1250-GISEL-FAKE16-LABEL: test_cvt_f16_fp8_byte3:
 ; GFX1250-GISEL-FAKE16:       ; %bb.0:
+; GFX1250-GISEL-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-FAKE16-NEXT:    v_cvt_f16_fp8_e64 v0, v0 byte_sel:3
 ; GFX1250-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-GISEL-FAKE16-NEXT:    v_cvt_f32_f16_e32 v0, v0
@@ -310,12 +346,14 @@ define amdgpu_ps float @test_cvt_f16_fp8_byte3(i32 %a) {
 define amdgpu_ps float @test_cvt_f16_fp8_byte3_hi(i32 %a) {
 ; GFX1250-SDAG-REAL16-LABEL: test_cvt_f16_fp8_byte3_hi:
 ; GFX1250-SDAG-REAL16:       ; %bb.0:
+; GFX1250-SDAG-REAL16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-REAL16-NEXT:    v_cvt_f16_fp8_e64 v0.h, v0 byte_sel:3
 ; GFX1250-SDAG-REAL16-NEXT:    v_mov_b16_e32 v0.l, 0
 ; GFX1250-SDAG-REAL16-NEXT:    ; return to shader part epilog
 ;
 ; GFX1250-SDAG-FAKE16-LABEL: test_cvt_f16_fp8_byte3_hi:
 ; GFX1250-SDAG-FAKE16:       ; %bb.0:
+; GFX1250-SDAG-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-FAKE16-NEXT:    v_cvt_f16_fp8_e64 v0, v0 byte_sel:3
 ; GFX1250-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-SDAG-FAKE16-NEXT:    v_perm_b32 v0, v0, 0, 0x5040100
@@ -323,6 +361,7 @@ define amdgpu_ps float @test_cvt_f16_fp8_byte3_hi(i32 %a) {
 ;
 ; GFX1250-GISEL-REAL16-LABEL: test_cvt_f16_fp8_byte3_hi:
 ; GFX1250-GISEL-REAL16:       ; %bb.0:
+; GFX1250-GISEL-REAL16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-REAL16-NEXT:    v_cvt_f16_fp8_e64 v0.l, v0 byte_sel:3
 ; GFX1250-GISEL-REAL16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-GISEL-REAL16-NEXT:    v_lshl_or_b32 v0, v0, 16, 0
@@ -330,6 +369,7 @@ define amdgpu_ps float @test_cvt_f16_fp8_byte3_hi(i32 %a) {
 ;
 ; GFX1250-GISEL-FAKE16-LABEL: test_cvt_f16_fp8_byte3_hi:
 ; GFX1250-GISEL-FAKE16:       ; %bb.0:
+; GFX1250-GISEL-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-FAKE16-NEXT:    v_cvt_f16_fp8_e64 v0, v0 byte_sel:3
 ; GFX1250-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-GISEL-FAKE16-NEXT:    v_lshl_or_b32 v0, v0, 16, 0
@@ -344,21 +384,25 @@ define amdgpu_ps float @test_cvt_f16_fp8_byte3_hi(i32 %a) {
 define amdgpu_ps float @test_cvt_pk_f16_bf8_v(i16 %a) {
 ; GFX1250-SDAG-REAL16-LABEL: test_cvt_pk_f16_bf8_v:
 ; GFX1250-SDAG-REAL16:       ; %bb.0:
+; GFX1250-SDAG-REAL16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-REAL16-NEXT:    v_cvt_pk_f16_bf8 v0, v0.l
 ; GFX1250-SDAG-REAL16-NEXT:    ; return to shader part epilog
 ;
 ; GFX1250-SDAG-FAKE16-LABEL: test_cvt_pk_f16_bf8_v:
 ; GFX1250-SDAG-FAKE16:       ; %bb.0:
+; GFX1250-SDAG-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-FAKE16-NEXT:    v_cvt_pk_f16_bf8 v0, v0
 ; GFX1250-SDAG-FAKE16-NEXT:    ; return to shader part epilog
 ;
 ; GFX1250-GISEL-REAL16-LABEL: test_cvt_pk_f16_bf8_v:
 ; GFX1250-GISEL-REAL16:       ; %bb.0:
+; GFX1250-GISEL-REAL16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-REAL16-NEXT:    v_cvt_pk_f16_bf8 v0, v0.l
 ; GFX1250-GISEL-REAL16-NEXT:    ; return to shader part epilog
 ;
 ; GFX1250-GISEL-FAKE16-LABEL: test_cvt_pk_f16_bf8_v:
 ; GFX1250-GISEL-FAKE16:       ; %bb.0:
+; GFX1250-GISEL-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-FAKE16-NEXT:    v_cvt_pk_f16_bf8 v0, v0
 ; GFX1250-GISEL-FAKE16-NEXT:    ; return to shader part epilog
   %cvt = tail call <2 x half> @llvm.amdgcn.cvt.pk.f16.bf8(i16 %a)
@@ -369,6 +413,7 @@ define amdgpu_ps float @test_cvt_pk_f16_bf8_v(i16 %a) {
 define amdgpu_ps float @test_cvt_pk_f16_bf8_s(i16 inreg %a) {
 ; GFX1250-LABEL: test_cvt_pk_f16_bf8_s:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_cvt_pk_f16_bf8 v0, s0
 ; GFX1250-NEXT:    ; return to shader part epilog
   %cvt = tail call <2 x half> @llvm.amdgcn.cvt.pk.f16.bf8(i16 %a)
@@ -379,21 +424,25 @@ define amdgpu_ps float @test_cvt_pk_f16_bf8_s(i16 inreg %a) {
 define amdgpu_ps float @test_cvt_pk_f16_fp8_v(i16 %a) {
 ; GFX1250-SDAG-REAL16-LABEL: test_cvt_pk_f16_fp8_v:
 ; GFX1250-SDAG-REAL16:       ; %bb.0:
+; GFX1250-SDAG-REAL16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-REAL16-NEXT:    v_cvt_pk_f16_fp8 v0, v0.l
 ; GFX1250-SDAG-REAL16-NEXT:    ; return to shader part epilog
 ;
 ; GFX1250-SDAG-FAKE16-LABEL: test_cvt_pk_f16_fp8_v:
 ; GFX1250-SDAG-FAKE16:       ; %bb.0:
+; GFX1250-SDAG-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-FAKE16-NEXT:    v_cvt_pk_f16_fp8 v0, v0
 ; GFX1250-SDAG-FAKE16-NEXT:    ; return to shader part epilog
 ;
 ; GFX1250-GISEL-REAL16-LABEL: test_cvt_pk_f16_fp8_v:
 ; GFX1250-GISEL-REAL16:       ; %bb.0:
+; GFX1250-GISEL-REAL16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-REAL16-NEXT:    v_cvt_pk_f16_fp8 v0, v0.l
 ; GFX1250-GISEL-REAL16-NEXT:    ; return to shader part epilog
 ;
 ; GFX1250-GISEL-FAKE16-LABEL: test_cvt_pk_f16_fp8_v:
 ; GFX1250-GISEL-FAKE16:       ; %bb.0:
+; GFX1250-GISEL-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-FAKE16-NEXT:    v_cvt_pk_f16_fp8 v0, v0
 ; GFX1250-GISEL-FAKE16-NEXT:    ; return to shader part epilog
   %cvt = tail call <2 x half> @llvm.amdgcn.cvt.pk.f16.fp8(i16 %a)
@@ -404,6 +453,7 @@ define amdgpu_ps float @test_cvt_pk_f16_fp8_v(i16 %a) {
 define amdgpu_ps float @test_cvt_pk_f16_fp8_s(i16 inreg %a) {
 ; GFX1250-LABEL: test_cvt_pk_f16_fp8_s:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_cvt_pk_f16_fp8 v0, s0
 ; GFX1250-NEXT:    ; return to shader part epilog
   %cvt = tail call <2 x half> @llvm.amdgcn.cvt.pk.f16.fp8(i16 %a)
@@ -414,11 +464,13 @@ define amdgpu_ps float @test_cvt_pk_f16_fp8_s(i16 inreg %a) {
 define amdgpu_ps float @test_cvt_pk_f16_fp8_v_hi(<2 x i16> %a) {
 ; GFX1250-SDAG-REAL16-LABEL: test_cvt_pk_f16_fp8_v_hi:
 ; GFX1250-SDAG-REAL16:       ; %bb.0:
+; GFX1250-SDAG-REAL16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-REAL16-NEXT:    v_cvt_pk_f16_fp8 v0, v0.h
 ; GFX1250-SDAG-REAL16-NEXT:    ; return to shader part epilog
 ;
 ; GFX1250-SDAG-FAKE16-LABEL: test_cvt_pk_f16_fp8_v_hi:
 ; GFX1250-SDAG-FAKE16:       ; %bb.0:
+; GFX1250-SDAG-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX1250-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-SDAG-FAKE16-NEXT:    v_cvt_pk_f16_fp8 v0, v0
@@ -426,6 +478,7 @@ define amdgpu_ps float @test_cvt_pk_f16_fp8_v_hi(<2 x i16> %a) {
 ;
 ; GFX1250-GISEL-REAL16-LABEL: test_cvt_pk_f16_fp8_v_hi:
 ; GFX1250-GISEL-REAL16:       ; %bb.0:
+; GFX1250-GISEL-REAL16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-REAL16-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX1250-GISEL-REAL16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-GISEL-REAL16-NEXT:    v_cvt_pk_f16_fp8 v0, v0.l
@@ -433,6 +486,7 @@ define amdgpu_ps float @test_cvt_pk_f16_fp8_v_hi(<2 x i16> %a) {
 ;
 ; GFX1250-GISEL-FAKE16-LABEL: test_cvt_pk_f16_fp8_v_hi:
 ; GFX1250-GISEL-FAKE16:       ; %bb.0:
+; GFX1250-GISEL-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-FAKE16-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX1250-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-GISEL-FAKE16-NEXT:    v_cvt_pk_f16_fp8 v0, v0

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.dpp.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.dpp.ll
index 856290ab2e868..3372868455d65 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.dpp.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.dpp.ll
@@ -10,6 +10,7 @@ define amdgpu_cs float @test_cvt_f32_bf8_byte0(i32 %a) {
 ;
 ; GFX1250-LABEL: test_cvt_f32_bf8_byte0:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_cvt_f32_bf8_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1250-NEXT:    ; return to shader part epilog
   %tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %a, i32 228, i32 15, i32 15, i1 1)
@@ -25,6 +26,7 @@ define amdgpu_cs float @test_cvt_f32_bf8_byte1(i32 %a) {
 ;
 ; GFX1250-LABEL: test_cvt_f32_bf8_byte1:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_cvt_f32_bf8_e64_dpp v0, v0 byte_sel:1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1250-NEXT:    ; return to shader part epilog
   %tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %a, i32 228, i32 15, i32 15, i1 1)
@@ -40,6 +42,7 @@ define amdgpu_cs float @test_cvt_f32_bf8_byte2(i32 %a) {
 ;
 ; GFX1250-LABEL: test_cvt_f32_bf8_byte2:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_cvt_f32_bf8_e64_dpp v0, v0 byte_sel:2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1250-NEXT:    ; return to shader part epilog
   %tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %a, i32 228, i32 15, i32 15, i1 1)
@@ -55,6 +58,7 @@ define amdgpu_cs float @test_cvt_f32_fp8_byte3(i32 %a) {
 ;
 ; GFX1250-LABEL: test_cvt_f32_fp8_byte3:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_cvt_f32_fp8_e64_dpp v0, v0 byte_sel:3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1250-NEXT:    ; return to shader part epilog
   %tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %a, i32 228, i32 15, i32 15, i1 1)
@@ -71,6 +75,7 @@ define amdgpu_cs void @test_cvt_pk_bf8_f32_word0(i32 %a, float %y, i32 %old, ptr
 ;
 ; GFX1250-LABEL: test_cvt_pk_bf8_f32_word0:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3
 ; GFX1250-NEXT:    v_cvt_pk_bf8_f32_e64_dpp v2, v0, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1250-NEXT:    global_store_b32 v[4:5], v2, off
@@ -93,6 +98,7 @@ define amdgpu_cs void @test_cvt_pk_fp8_f32_word1(i32 %a, float %y, i32 %old, ptr
 ;
 ; GFX1250-LABEL: test_cvt_pk_fp8_f32_word1:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1250-NEXT:    v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_2)
@@ -115,6 +121,7 @@ define amdgpu_cs void @test_cvt_sr_bf8_f32_byte0(i32 %a, i32 %r, i32 %old, ptr a
 ;
 ; GFX1250-LABEL: test_cvt_sr_bf8_f32_byte0:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3
 ; GFX1250-NEXT:    v_cvt_sr_bf8_f32_e64_dpp v2, v0, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1250-NEXT:    global_store_b32 v[4:5], v2, off
@@ -135,6 +142,7 @@ define amdgpu_cs void @test_cvt_sr_fp8_f32_byte1(i32 %a, i32 %r, i32 %old, ptr a
 ;
 ; GFX1250-LABEL: test_cvt_sr_fp8_f32_byte1:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3
 ; GFX1250-NEXT:    v_cvt_sr_fp8_f32_e64_dpp v2, v0, v1 byte_sel:1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1250-NEXT:    global_store_b32 v[4:5], v2, off
@@ -155,6 +163,7 @@ define amdgpu_cs void @test_cvt_sr_fp8_f32_byte2(i32 %a, i32 %r, i32 %old, ptr a
 ;
 ; GFX1250-LABEL: test_cvt_sr_fp8_f32_byte2:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3
 ; GFX1250-NEXT:    v_cvt_sr_fp8_f32_e64_dpp v2, v0, v1 byte_sel:2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1250-NEXT:    global_store_b32 v[4:5], v2, off

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.e5m3.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.e5m3.ll
index fd51759f50d48..0dd0f7648c94a 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.e5m3.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.e5m3.ll
@@ -72,6 +72,7 @@ define i32 @test_cvt_pk_fp8_f32_word1(float %x, float %y, i32 %old) {
 define amdgpu_cs void @test_cvt_pk_fp8_f32_word1_dpp(i32 %a, float %y, i32 %old, ptr addrspace(1) %out) {
 ; GFX1250-TRUE16-LABEL: test_cvt_pk_fp8_f32_word1_dpp:
 ; GFX1250-TRUE16:       ; %bb.0:
+; GFX1250-TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-TRUE16-NEXT:    v_mov_b32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1250-TRUE16-NEXT:    v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3
 ; GFX1250-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
@@ -81,6 +82,7 @@ define amdgpu_cs void @test_cvt_pk_fp8_f32_word1_dpp(i32 %a, float %y, i32 %old,
 ;
 ; GFX1250-FAKE16-LABEL: test_cvt_pk_fp8_f32_word1_dpp:
 ; GFX1250-FAKE16:       ; %bb.0:
+; GFX1250-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-FAKE16-NEXT:    v_mov_b32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1250-FAKE16-NEXT:    v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3
 ; GFX1250-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
@@ -90,6 +92,7 @@ define amdgpu_cs void @test_cvt_pk_fp8_f32_word1_dpp(i32 %a, float %y, i32 %old,
 ;
 ; GFX1250-GISEL-LABEL: test_cvt_pk_fp8_f32_word1_dpp:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_mov_b32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v6, v3 :: v_dual_mov_b32 v7, v4
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
@@ -158,6 +161,7 @@ define i32 @test_cvt_sr_fp8_f32_byte3(float %x, i32 %r, i32 %old) {
 define amdgpu_cs void @test_cvt_sr_fp8_f32_byte1_dpp(i32 %a, i32 %r, i32 %old, ptr addrspace(1) %out) {
 ; GFX1250-TRUE16-LABEL: test_cvt_sr_fp8_f32_byte1_dpp:
 ; GFX1250-TRUE16:       ; %bb.0:
+; GFX1250-TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-TRUE16-NEXT:    v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3
 ; GFX1250-TRUE16-NEXT:    v_cvt_sr_fp8_f32_e64_dpp v2, v0, v1 byte_sel:1 clamp quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1250-TRUE16-NEXT:    global_store_b32 v[4:5], v2, off
@@ -165,6 +169,7 @@ define amdgpu_cs void @test_cvt_sr_fp8_f32_byte1_dpp(i32 %a, i32 %r, i32 %old, p
 ;
 ; GFX1250-FAKE16-LABEL: test_cvt_sr_fp8_f32_byte1_dpp:
 ; GFX1250-FAKE16:       ; %bb.0:
+; GFX1250-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-FAKE16-NEXT:    v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3
 ; GFX1250-FAKE16-NEXT:    v_cvt_sr_fp8_f32_e64_dpp v2, v0, v1 byte_sel:1 clamp quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1250-FAKE16-NEXT:    global_store_b32 v[4:5], v2, off
@@ -172,6 +177,7 @@ define amdgpu_cs void @test_cvt_sr_fp8_f32_byte1_dpp(i32 %a, i32 %r, i32 %old, p
 ;
 ; GFX1250-GISEL-LABEL: test_cvt_sr_fp8_f32_byte1_dpp:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v6, v3 :: v_dual_mov_b32 v7, v4
 ; GFX1250-GISEL-NEXT:    v_cvt_sr_fp8_f32_e64_dpp v2, v0, v1 byte_sel:1 clamp quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1
 ; GFX1250-GISEL-NEXT:    global_store_b32 v[6:7], v2, off

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.f16.ll
index ff4756318b2dc..dea104ee0b42a 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.f16.ll
@@ -12,6 +12,7 @@ declare i32 @llvm.amdgcn.cvt.sr.fp8.f16(half, i32, i32, i32)
 define amdgpu_ps void @test_cvt_pk_bf8_f16_v(<2 x half> %a, ptr addrspace(1) %out) {
 ; GFX1250-SDAG-REAL16-LABEL: test_cvt_pk_bf8_f16_v:
 ; GFX1250-SDAG-REAL16:       ; %bb.0:
+; GFX1250-SDAG-REAL16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-REAL16-NEXT:    v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
 ; GFX1250-SDAG-REAL16-NEXT:    v_cvt_pk_bf8_f16 v0.l, v0
 ; GFX1250-SDAG-REAL16-NEXT:    global_store_b16 v[2:3], v0, off
@@ -19,6 +20,7 @@ define amdgpu_ps void @test_cvt_pk_bf8_f16_v(<2 x half> %a, ptr addrspace(1) %ou
 ;
 ; GFX1250-SDAG-FAKE16-LABEL: test_cvt_pk_bf8_f16_v:
 ; GFX1250-SDAG-FAKE16:       ; %bb.0:
+; GFX1250-SDAG-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-FAKE16-NEXT:    v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
 ; GFX1250-SDAG-FAKE16-NEXT:    v_cvt_pk_bf8_f16 v0, v0
 ; GFX1250-SDAG-FAKE16-NEXT:    global_store_b16 v[2:3], v0, off
@@ -26,6 +28,7 @@ define amdgpu_ps void @test_cvt_pk_bf8_f16_v(<2 x half> %a, ptr addrspace(1) %ou
 ;
 ; GFX1250-GISEL-REAL16-LABEL: test_cvt_pk_bf8_f16_v:
 ; GFX1250-GISEL-REAL16:       ; %bb.0:
+; GFX1250-GISEL-REAL16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-REAL16-NEXT:    v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
 ; GFX1250-GISEL-REAL16-NEXT:    v_cvt_pk_bf8_f16 v0.l, v0
 ; GFX1250-GISEL-REAL16-NEXT:    global_store_b16 v[4:5], v0, off
@@ -33,6 +36,7 @@ define amdgpu_ps void @test_cvt_pk_bf8_f16_v(<2 x half> %a, ptr addrspace(1) %ou
 ;
 ; GFX1250-GISEL-FAKE16-LABEL: test_cvt_pk_bf8_f16_v:
 ; GFX1250-GISEL-FAKE16:       ; %bb.0:
+; GFX1250-GISEL-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-FAKE16-NEXT:    v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
 ; GFX1250-GISEL-FAKE16-NEXT:    v_cvt_pk_bf8_f16 v0, v0
 ; GFX1250-GISEL-FAKE16-NEXT:    global_store_b16 v[4:5], v0, off
@@ -45,24 +49,28 @@ define amdgpu_ps void @test_cvt_pk_bf8_f16_v(<2 x half> %a, ptr addrspace(1) %ou
 define amdgpu_ps void @test_cvt_pk_bf8_f16_s(<2 x half> inreg %a, ptr addrspace(1) %out) {
 ; GFX1250-SDAG-REAL16-LABEL: test_cvt_pk_bf8_f16_s:
 ; GFX1250-SDAG-REAL16:       ; %bb.0:
+; GFX1250-SDAG-REAL16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-REAL16-NEXT:    v_cvt_pk_bf8_f16 v2.l, s0
 ; GFX1250-SDAG-REAL16-NEXT:    global_store_b16 v[0:1], v2, off
 ; GFX1250-SDAG-REAL16-NEXT:    s_endpgm
 ;
 ; GFX1250-SDAG-FAKE16-LABEL: test_cvt_pk_bf8_f16_s:
 ; GFX1250-SDAG-FAKE16:       ; %bb.0:
+; GFX1250-SDAG-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-FAKE16-NEXT:    v_cvt_pk_bf8_f16 v2, s0
 ; GFX1250-SDAG-FAKE16-NEXT:    global_store_b16 v[0:1], v2, off
 ; GFX1250-SDAG-FAKE16-NEXT:    s_endpgm
 ;
 ; GFX1250-GISEL-REAL16-LABEL: test_cvt_pk_bf8_f16_s:
 ; GFX1250-GISEL-REAL16:       ; %bb.0:
+; GFX1250-GISEL-REAL16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-REAL16-NEXT:    v_cvt_pk_bf8_f16 v2.l, s0
 ; GFX1250-GISEL-REAL16-NEXT:    global_store_b16 v[0:1], v2, off
 ; GFX1250-GISEL-REAL16-NEXT:    s_endpgm
 ;
 ; GFX1250-GISEL-FAKE16-LABEL: test_cvt_pk_bf8_f16_s:
 ; GFX1250-GISEL-FAKE16:       ; %bb.0:
+; GFX1250-GISEL-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-FAKE16-NEXT:    v_cvt_pk_bf8_f16 v2, s0
 ; GFX1250-GISEL-FAKE16-NEXT:    global_store_b16 v[0:1], v2, off
 ; GFX1250-GISEL-FAKE16-NEXT:    s_endpgm
@@ -74,24 +82,28 @@ define amdgpu_ps void @test_cvt_pk_bf8_f16_s(<2 x half> inreg %a, ptr addrspace(
 define amdgpu_ps void @test_cvt_pk_bf8_f16_l(ptr addrspace(1) %out) {
 ; GFX1250-SDAG-REAL16-LABEL: test_cvt_pk_bf8_f16_l:
 ; GFX1250-SDAG-REAL16:       ; %bb.0:
+; GFX1250-SDAG-REAL16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-REAL16-NEXT:    v_cvt_pk_bf8_f16 v2.l, 0x56400000
 ; GFX1250-SDAG-REAL16-NEXT:    global_store_b16 v[0:1], v2, off
 ; GFX1250-SDAG-REAL16-NEXT:    s_endpgm
 ;
 ; GFX1250-SDAG-FAKE16-LABEL: test_cvt_pk_bf8_f16_l:
 ; GFX1250-SDAG-FAKE16:       ; %bb.0:
+; GFX1250-SDAG-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-FAKE16-NEXT:    v_cvt_pk_bf8_f16 v2, 0x56400000
 ; GFX1250-SDAG-FAKE16-NEXT:    global_store_b16 v[0:1], v2, off
 ; GFX1250-SDAG-FAKE16-NEXT:    s_endpgm
 ;
 ; GFX1250-GISEL-REAL16-LABEL: test_cvt_pk_bf8_f16_l:
 ; GFX1250-GISEL-REAL16:       ; %bb.0:
+; GFX1250-GISEL-REAL16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-REAL16-NEXT:    v_cvt_pk_bf8_f16 v2.l, 0x56400000
 ; GFX1250-GISEL-REAL16-NEXT:    global_store_b16 v[0:1], v2, off
 ; GFX1250-GISEL-REAL16-NEXT:    s_endpgm
 ;
 ; GFX1250-GISEL-FAKE16-LABEL: test_cvt_pk_bf8_f16_l:
 ; GFX1250-GISEL-FAKE16:       ; %bb.0:
+; GFX1250-GISEL-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-FAKE16-NEXT:    v_cvt_pk_bf8_f16 v2, 0x56400000
 ; GFX1250-GISEL-FAKE16-NEXT:    global_store_b16 v[0:1], v2, off
 ; GFX1250-GISEL-FAKE16-NEXT:    s_endpgm
@@ -103,6 +115,7 @@ define amdgpu_ps void @test_cvt_pk_bf8_f16_l(ptr addrspace(1) %out) {
 define amdgpu_ps void @test_cvt_pk_fp8_f16_v(<2 x half> %a, ptr addrspace(1) %out) {
 ; GFX1250-SDAG-REAL16-LABEL: test_cvt_pk_fp8_f16_v:
 ; GFX1250-SDAG-REAL16:       ; %bb.0:
+; GFX1250-SDAG-REAL16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-REAL16-NEXT:    v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
 ; GFX1250-SDAG-REAL16-NEXT:    v_cvt_pk_fp8_f16 v0.l, v0
 ; GFX1250-SDAG-REAL16-NEXT:    global_store_b16 v[2:3], v0, off
@@ -110,6 +123,7 @@ define amdgpu_ps void @test_cvt_pk_fp8_f16_v(<2 x half> %a, ptr addrspace(1) %ou
 ;
 ; GFX1250-SDAG-FAKE16-LABEL: test_cvt_pk_fp8_f16_v:
 ; GFX1250-SDAG-FAKE16:       ; %bb.0:
+; GFX1250-SDAG-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-FAKE16-NEXT:    v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
 ; GFX1250-SDAG-FAKE16-NEXT:    v_cvt_pk_fp8_f16 v0, v0
 ; GFX1250-SDAG-FAKE16-NEXT:    global_store_b16 v[2:3], v0, off
@@ -117,6 +131,7 @@ define amdgpu_ps void @test_cvt_pk_fp8_f16_v(<2 x half> %a, ptr addrspace(1) %ou
 ;
 ; GFX1250-GISEL-REAL16-LABEL: test_cvt_pk_fp8_f16_v:
 ; GFX1250-GISEL-REAL16:       ; %bb.0:
+; GFX1250-GISEL-REAL16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-REAL16-NEXT:    v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
 ; GFX1250-GISEL-REAL16-NEXT:    v_cvt_pk_fp8_f16 v0.l, v0
 ; GFX1250-GISEL-REAL16-NEXT:    global_store_b16 v[4:5], v0, off
@@ -124,6 +139,7 @@ define amdgpu_ps void @test_cvt_pk_fp8_f16_v(<2 x half> %a, ptr addrspace(1) %ou
 ;
 ; GFX1250-GISEL-FAKE16-LABEL: test_cvt_pk_fp8_f16_v:
 ; GFX1250-GISEL-FAKE16:       ; %bb.0:
+; GFX1250-GISEL-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-FAKE16-NEXT:    v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
 ; GFX1250-GISEL-FAKE16-NEXT:    v_cvt_pk_fp8_f16 v0, v0
 ; GFX1250-GISEL-FAKE16-NEXT:    global_store_b16 v[4:5], v0, off
@@ -136,24 +152,28 @@ define amdgpu_ps void @test_cvt_pk_fp8_f16_v(<2 x half> %a, ptr addrspace(1) %ou
 define amdgpu_ps void @test_cvt_pk_fp8_f16_s(<2 x half> inreg %a, ptr addrspace(1) %out) {
 ; GFX1250-SDAG-REAL16-LABEL: test_cvt_pk_fp8_f16_s:
 ; GFX1250-SDAG-REAL16:       ; %bb.0:
+; GFX1250-SDAG-REAL16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-REAL16-NEXT:    v_cvt_pk_fp8_f16 v2.l, s0
 ; GFX1250-SDAG-REAL16-NEXT:    global_store_b16 v[0:1], v2, off
 ; GFX1250-SDAG-REAL16-NEXT:    s_endpgm
 ;
 ; GFX1250-SDAG-FAKE16-LABEL: test_cvt_pk_fp8_f16_s:
 ; GFX1250-SDAG-FAKE16:       ; %bb.0:
+; GFX1250-SDAG-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-FAKE16-NEXT:    v_cvt_pk_fp8_f16 v2, s0
 ; GFX1250-SDAG-FAKE16-NEXT:    global_store_b16 v[0:1], v2, off
 ; GFX1250-SDAG-FAKE16-NEXT:    s_endpgm
 ;
 ; GFX1250-GISEL-REAL16-LABEL: test_cvt_pk_fp8_f16_s:
 ; GFX1250-GISEL-REAL16:       ; %bb.0:
+; GFX1250-GISEL-REAL16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-REAL16-NEXT:    v_cvt_pk_fp8_f16 v2.l, s0
 ; GFX1250-GISEL-REAL16-NEXT:    global_store_b16 v[0:1], v2, off
 ; GFX1250-GISEL-REAL16-NEXT:    s_endpgm
 ;
 ; GFX1250-GISEL-FAKE16-LABEL: test_cvt_pk_fp8_f16_s:
 ; GFX1250-GISEL-FAKE16:       ; %bb.0:
+; GFX1250-GISEL-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-FAKE16-NEXT:    v_cvt_pk_fp8_f16 v2, s0
 ; GFX1250-GISEL-FAKE16-NEXT:    global_store_b16 v[0:1], v2, off
 ; GFX1250-GISEL-FAKE16-NEXT:    s_endpgm
@@ -165,24 +185,28 @@ define amdgpu_ps void @test_cvt_pk_fp8_f16_s(<2 x half> inreg %a, ptr addrspace(
 define amdgpu_ps void @test_cvt_pk_fp8_f16_l(ptr addrspace(1) %out) {
 ; GFX1250-SDAG-REAL16-LABEL: test_cvt_pk_fp8_f16_l:
 ; GFX1250-SDAG-REAL16:       ; %bb.0:
+; GFX1250-SDAG-REAL16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-REAL16-NEXT:    v_cvt_pk_fp8_f16 v2.l, 0x56400000
 ; GFX1250-SDAG-REAL16-NEXT:    global_store_b16 v[0:1], v2, off
 ; GFX1250-SDAG-REAL16-NEXT:    s_endpgm
 ;
 ; GFX1250-SDAG-FAKE16-LABEL: test_cvt_pk_fp8_f16_l:
 ; GFX1250-SDAG-FAKE16:       ; %bb.0:
+; GFX1250-SDAG-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-FAKE16-NEXT:    v_cvt_pk_fp8_f16 v2, 0x56400000
 ; GFX1250-SDAG-FAKE16-NEXT:    global_store_b16 v[0:1], v2, off
 ; GFX1250-SDAG-FAKE16-NEXT:    s_endpgm
 ;
 ; GFX1250-GISEL-REAL16-LABEL: test_cvt_pk_fp8_f16_l:
 ; GFX1250-GISEL-REAL16:       ; %bb.0:
+; GFX1250-GISEL-REAL16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-REAL16-NEXT:    v_cvt_pk_fp8_f16 v2.l, 0x56400000
 ; GFX1250-GISEL-REAL16-NEXT:    global_store_b16 v[0:1], v2, off
 ; GFX1250-GISEL-REAL16-NEXT:    s_endpgm
 ;
 ; GFX1250-GISEL-FAKE16-LABEL: test_cvt_pk_fp8_f16_l:
 ; GFX1250-GISEL-FAKE16:       ; %bb.0:
+; GFX1250-GISEL-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-FAKE16-NEXT:    v_cvt_pk_fp8_f16 v2, 0x56400000
 ; GFX1250-GISEL-FAKE16-NEXT:    global_store_b16 v[0:1], v2, off
 ; GFX1250-GISEL-FAKE16-NEXT:    s_endpgm
@@ -194,6 +218,7 @@ define amdgpu_ps void @test_cvt_pk_fp8_f16_l(ptr addrspace(1) %out) {
 define amdgpu_ps void @test_cvt_sr_bf8_f16_byte0(half %a, i32 %sr, i32 %old, ptr addrspace(1) %out) {
 ; GFX1250-SDAG-REAL16-LABEL: test_cvt_sr_bf8_f16_byte0:
 ; GFX1250-SDAG-REAL16:       ; %bb.0:
+; GFX1250-SDAG-REAL16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-REAL16-NEXT:    v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3
 ; GFX1250-SDAG-REAL16-NEXT:    v_cvt_sr_bf8_f16 v2, v0.l, v1
 ; GFX1250-SDAG-REAL16-NEXT:    global_store_b32 v[4:5], v2, off
@@ -201,6 +226,7 @@ define amdgpu_ps void @test_cvt_sr_bf8_f16_byte0(half %a, i32 %sr, i32 %old, ptr
 ;
 ; GFX1250-SDAG-FAKE16-LABEL: test_cvt_sr_bf8_f16_byte0:
 ; GFX1250-SDAG-FAKE16:       ; %bb.0:
+; GFX1250-SDAG-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-FAKE16-NEXT:    v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3
 ; GFX1250-SDAG-FAKE16-NEXT:    v_cvt_sr_bf8_f16 v2, v0, v1
 ; GFX1250-SDAG-FAKE16-NEXT:    global_store_b32 v[4:5], v2, off
@@ -208,6 +234,7 @@ define amdgpu_ps void @test_cvt_sr_bf8_f16_byte0(half %a, i32 %sr, i32 %old, ptr
 ;
 ; GFX1250-GISEL-REAL16-LABEL: test_cvt_sr_bf8_f16_byte0:
 ; GFX1250-GISEL-REAL16:       ; %bb.0:
+; GFX1250-GISEL-REAL16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-REAL16-NEXT:    v_dual_mov_b32 v6, v3 :: v_dual_mov_b32 v7, v4
 ; GFX1250-GISEL-REAL16-NEXT:    v_cvt_sr_bf8_f16 v2, v0.l, v1
 ; GFX1250-GISEL-REAL16-NEXT:    global_store_b32 v[6:7], v2, off
@@ -215,6 +242,7 @@ define amdgpu_ps void @test_cvt_sr_bf8_f16_byte0(half %a, i32 %sr, i32 %old, ptr
 ;
 ; GFX1250-GISEL-FAKE16-LABEL: test_cvt_sr_bf8_f16_byte0:
 ; GFX1250-GISEL-FAKE16:       ; %bb.0:
+; GFX1250-GISEL-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-FAKE16-NEXT:    v_dual_mov_b32 v6, v3 :: v_dual_mov_b32 v7, v4
 ; GFX1250-GISEL-FAKE16-NEXT:    v_cvt_sr_bf8_f16 v2, v0, v1
 ; GFX1250-GISEL-FAKE16-NEXT:    global_store_b32 v[6:7], v2, off
@@ -227,6 +255,7 @@ define amdgpu_ps void @test_cvt_sr_bf8_f16_byte0(half %a, i32 %sr, i32 %old, ptr
 define amdgpu_ps void @test_cvt_sr_bf8_f16_byte1(half %a, i32 %sr, i32 %old, ptr addrspace(1) %out) {
 ; GFX1250-SDAG-REAL16-LABEL: test_cvt_sr_bf8_f16_byte1:
 ; GFX1250-SDAG-REAL16:       ; %bb.0:
+; GFX1250-SDAG-REAL16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-REAL16-NEXT:    v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3
 ; GFX1250-SDAG-REAL16-NEXT:    v_cvt_sr_bf8_f16 v2, v0.l, v1 byte_sel:1
 ; GFX1250-SDAG-REAL16-NEXT:    global_store_b32 v[4:5], v2, off
@@ -234,6 +263,7 @@ define amdgpu_ps void @test_cvt_sr_bf8_f16_byte1(half %a, i32 %sr, i32 %old, ptr
 ;
 ; GFX1250-SDAG-FAKE16-LABEL: test_cvt_sr_bf8_f16_byte1:
 ; GFX1250-SDAG-FAKE16:       ; %bb.0:
+; GFX1250-SDAG-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-FAKE16-NEXT:    v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3
 ; GFX1250-SDAG-FAKE16-NEXT:    v_cvt_sr_bf8_f16 v2, v0, v1 byte_sel:1
 ; GFX1250-SDAG-FAKE16-NEXT:    global_store_b32 v[4:5], v2, off
@@ -241,6 +271,7 @@ define amdgpu_ps void @test_cvt_sr_bf8_f16_byte1(half %a, i32 %sr, i32 %old, ptr
 ;
 ; GFX1250-GISEL-REAL16-LABEL: test_cvt_sr_bf8_f16_byte1:
 ; GFX1250-GISEL-REAL16:       ; %bb.0:
+; GFX1250-GISEL-REAL16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-REAL16-NEXT:    v_dual_mov_b32 v6, v3 :: v_dual_mov_b32 v7, v4
 ; GFX1250-GISEL-REAL16-NEXT:    v_cvt_sr_bf8_f16 v2, v0.l, v1 byte_sel:1
 ; GFX1250-GISEL-REAL16-NEXT:    global_store_b32 v[6:7], v2, off
@@ -248,6 +279,7 @@ define amdgpu_ps void @test_cvt_sr_bf8_f16_byte1(half %a, i32 %sr, i32 %old, ptr
 ;
 ; GFX1250-GISEL-FAKE16-LABEL: test_cvt_sr_bf8_f16_byte1:
 ; GFX1250-GISEL-FAKE16:       ; %bb.0:
+; GFX1250-GISEL-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-FAKE16-NEXT:    v_dual_mov_b32 v6, v3 :: v_dual_mov_b32 v7, v4
 ; GFX1250-GISEL-FAKE16-NEXT:    v_cvt_sr_bf8_f16 v2, v0, v1 byte_sel:1
 ; GFX1250-GISEL-FAKE16-NEXT:    global_store_b32 v[6:7], v2, off
@@ -260,6 +292,7 @@ define amdgpu_ps void @test_cvt_sr_bf8_f16_byte1(half %a, i32 %sr, i32 %old, ptr
 define amdgpu_ps void @test_cvt_sr_bf8_f16_byte2(half %a, i32 %sr, i32 %old, ptr addrspace(1) %out) {
 ; GFX1250-SDAG-REAL16-LABEL: test_cvt_sr_bf8_f16_byte2:
 ; GFX1250-SDAG-REAL16:       ; %bb.0:
+; GFX1250-SDAG-REAL16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-REAL16-NEXT:    v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3
 ; GFX1250-SDAG-REAL16-NEXT:    v_cvt_sr_bf8_f16 v2, v0.l, v1 byte_sel:2
 ; GFX1250-SDAG-REAL16-NEXT:    global_store_b32 v[4:5], v2, off
@@ -267,6 +300,7 @@ define amdgpu_ps void @test_cvt_sr_bf8_f16_byte2(half %a, i32 %sr, i32 %old, ptr
 ;
 ; GFX1250-SDAG-FAKE16-LABEL: test_cvt_sr_bf8_f16_byte2:
 ; GFX1250-SDAG-FAKE16:       ; %bb.0:
+; GFX1250-SDAG-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-FAKE16-NEXT:    v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3
 ; GFX1250-SDAG-FAKE16-NEXT:    v_cvt_sr_bf8_f16 v2, v0, v1 byte_sel:2
 ; GFX1250-SDAG-FAKE16-NEXT:    global_store_b32 v[4:5], v2, off
@@ -274,6 +308,7 @@ define amdgpu_ps void @test_cvt_sr_bf8_f16_byte2(half %a, i32 %sr, i32 %old, ptr
 ;
 ; GFX1250-GISEL-REAL16-LABEL: test_cvt_sr_bf8_f16_byte2:
 ; GFX1250-GISEL-REAL16:       ; %bb.0:
+; GFX1250-GISEL-REAL16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-REAL16-NEXT:    v_dual_mov_b32 v6, v3 :: v_dual_mov_b32 v7, v4
 ; GFX1250-GISEL-REAL16-NEXT:    v_cvt_sr_bf8_f16 v2, v0.l, v1 byte_sel:2
 ; GFX1250-GISEL-REAL16-NEXT:    global_store_b32 v[6:7], v2, off
@@ -281,6 +316,7 @@ define amdgpu_ps void @test_cvt_sr_bf8_f16_byte2(half %a, i32 %sr, i32 %old, ptr
 ;
 ; GFX1250-GISEL-FAKE16-LABEL: test_cvt_sr_bf8_f16_byte2:
 ; GFX1250-GISEL-FAKE16:       ; %bb.0:
+; GFX1250-GISEL-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-FAKE16-NEXT:    v_dual_mov_b32 v6, v3 :: v_dual_mov_b32 v7, v4
 ; GFX1250-GISEL-FAKE16-NEXT:    v_cvt_sr_bf8_f16 v2, v0, v1 byte_sel:2
 ; GFX1250-GISEL-FAKE16-NEXT:    global_store_b32 v[6:7], v2, off
@@ -293,6 +329,7 @@ define amdgpu_ps void @test_cvt_sr_bf8_f16_byte2(half %a, i32 %sr, i32 %old, ptr
 define amdgpu_ps void @test_cvt_sr_bf8_f16_byte3(half %a, i32 %sr, i32 %old, ptr addrspace(1) %out) {
 ; GFX1250-SDAG-REAL16-LABEL: test_cvt_sr_bf8_f16_byte3:
 ; GFX1250-SDAG-REAL16:       ; %bb.0:
+; GFX1250-SDAG-REAL16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-REAL16-NEXT:    v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3
 ; GFX1250-SDAG-REAL16-NEXT:    v_cvt_sr_bf8_f16 v2, v0.l, v1 byte_sel:3
 ; GFX1250-SDAG-REAL16-NEXT:    global_store_b32 v[4:5], v2, off
@@ -300,6 +337,7 @@ define amdgpu_ps void @test_cvt_sr_bf8_f16_byte3(half %a, i32 %sr, i32 %old, ptr
 ;
 ; GFX1250-SDAG-FAKE16-LABEL: test_cvt_sr_bf8_f16_byte3:
 ; GFX1250-SDAG-FAKE16:       ; %bb.0:
+; GFX1250-SDAG-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-FAKE16-NEXT:    v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3
 ; GFX1250-SDAG-FAKE16-NEXT:    v_cvt_sr_bf8_f16 v2, v0, v1 byte_sel:3
 ; GFX1250-SDAG-FAKE16-NEXT:    global_store_b32 v[4:5], v2, off
@@ -307,6 +345,7 @@ define amdgpu_ps void @test_cvt_sr_bf8_f16_byte3(half %a, i32 %sr, i32 %old, ptr
 ;
 ; GFX1250-GISEL-REAL16-LABEL: test_cvt_sr_bf8_f16_byte3:
 ; GFX1250-GISEL-REAL16:       ; %bb.0:
+; GFX1250-GISEL-REAL16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-REAL16-NEXT:    v_dual_mov_b32 v6, v3 :: v_dual_mov_b32 v7, v4
 ; GFX1250-GISEL-REAL16-NEXT:    v_cvt_sr_bf8_f16 v2, v0.l, v1 byte_sel:3
 ; GFX1250-GISEL-REAL16-NEXT:    global_store_b32 v[6:7], v2, off
@@ -314,6 +353,7 @@ define amdgpu_ps void @test_cvt_sr_bf8_f16_byte3(half %a, i32 %sr, i32 %old, ptr
 ;
 ; GFX1250-GISEL-FAKE16-LABEL: test_cvt_sr_bf8_f16_byte3:
 ; GFX1250-GISEL-FAKE16:       ; %bb.0:
+; GFX1250-GISEL-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-FAKE16-NEXT:    v_dual_mov_b32 v6, v3 :: v_dual_mov_b32 v7, v4
 ; GFX1250-GISEL-FAKE16-NEXT:    v_cvt_sr_bf8_f16 v2, v0, v1 byte_sel:3
 ; GFX1250-GISEL-FAKE16-NEXT:    global_store_b32 v[6:7], v2, off
@@ -326,6 +366,7 @@ define amdgpu_ps void @test_cvt_sr_bf8_f16_byte3(half %a, i32 %sr, i32 %old, ptr
 define amdgpu_ps void @test_cvt_sr_bf8_f16_hi_byte0(<2 x half> %a, i32 %sr, i32 %old, ptr addrspace(1) %out) {
 ; GFX1250-SDAG-REAL16-LABEL: test_cvt_sr_bf8_f16_hi_byte0:
 ; GFX1250-SDAG-REAL16:       ; %bb.0:
+; GFX1250-SDAG-REAL16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-REAL16-NEXT:    v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3
 ; GFX1250-SDAG-REAL16-NEXT:    v_cvt_sr_bf8_f16 v2, v0.h, v1
 ; GFX1250-SDAG-REAL16-NEXT:    global_store_b32 v[4:5], v2, off
@@ -333,6 +374,7 @@ define amdgpu_ps void @test_cvt_sr_bf8_f16_hi_byte0(<2 x half> %a, i32 %sr, i32
 ;
 ; GFX1250-SDAG-FAKE16-LABEL: test_cvt_sr_bf8_f16_hi_byte0:
 ; GFX1250-SDAG-FAKE16:       ; %bb.0:
+; GFX1250-SDAG-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-FAKE16-NEXT:    v_dual_mov_b32 v5, v4 :: v_dual_lshrrev_b32 v0, 16, v0
 ; GFX1250-SDAG-FAKE16-NEXT:    v_mov_b32_e32 v4, v3
 ; GFX1250-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
@@ -342,6 +384,7 @@ define amdgpu_ps void @test_cvt_sr_bf8_f16_hi_byte0(<2 x half> %a, i32 %sr, i32
 ;
 ; GFX1250-GISEL-REAL16-LABEL: test_cvt_sr_bf8_f16_hi_byte0:
 ; GFX1250-GISEL-REAL16:       ; %bb.0:
+; GFX1250-GISEL-REAL16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-REAL16-NEXT:    v_dual_lshrrev_b32 v0, 16, v0 :: v_dual_mov_b32 v6, v3
 ; GFX1250-GISEL-REAL16-NEXT:    v_mov_b32_e32 v7, v4
 ; GFX1250-GISEL-REAL16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
@@ -351,6 +394,7 @@ define amdgpu_ps void @test_cvt_sr_bf8_f16_hi_byte0(<2 x half> %a, i32 %sr, i32
 ;
 ; GFX1250-GISEL-FAKE16-LABEL: test_cvt_sr_bf8_f16_hi_byte0:
 ; GFX1250-GISEL-FAKE16:       ; %bb.0:
+; GFX1250-GISEL-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-FAKE16-NEXT:    v_dual_lshrrev_b32 v0, 16, v0 :: v_dual_mov_b32 v6, v3
 ; GFX1250-GISEL-FAKE16-NEXT:    v_mov_b32_e32 v7, v4
 ; GFX1250-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
@@ -366,6 +410,7 @@ define amdgpu_ps void @test_cvt_sr_bf8_f16_hi_byte0(<2 x half> %a, i32 %sr, i32
 define amdgpu_ps void @test_cvt_sr_fp8_f16_byte0(half %a, i32 %sr, i32 %old, ptr addrspace(1) %out) {
 ; GFX1250-SDAG-REAL16-LABEL: test_cvt_sr_fp8_f16_byte0:
 ; GFX1250-SDAG-REAL16:       ; %bb.0:
+; GFX1250-SDAG-REAL16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-REAL16-NEXT:    v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3
 ; GFX1250-SDAG-REAL16-NEXT:    v_cvt_sr_fp8_f16 v2, v0.l, v1
 ; GFX1250-SDAG-REAL16-NEXT:    global_store_b32 v[4:5], v2, off
@@ -373,6 +418,7 @@ define amdgpu_ps void @test_cvt_sr_fp8_f16_byte0(half %a, i32 %sr, i32 %old, ptr
 ;
 ; GFX1250-SDAG-FAKE16-LABEL: test_cvt_sr_fp8_f16_byte0:
 ; GFX1250-SDAG-FAKE16:       ; %bb.0:
+; GFX1250-SDAG-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-FAKE16-NEXT:    v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3
 ; GFX1250-SDAG-FAKE16-NEXT:    v_cvt_sr_fp8_f16 v2, v0, v1
 ; GFX1250-SDAG-FAKE16-NEXT:    global_store_b32 v[4:5], v2, off
@@ -380,6 +426,7 @@ define amdgpu_ps void @test_cvt_sr_fp8_f16_byte0(half %a, i32 %sr, i32 %old, ptr
 ;
 ; GFX1250-GISEL-REAL16-LABEL: test_cvt_sr_fp8_f16_byte0:
 ; GFX1250-GISEL-REAL16:       ; %bb.0:
+; GFX1250-GISEL-REAL16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-REAL16-NEXT:    v_dual_mov_b32 v6, v3 :: v_dual_mov_b32 v7, v4
 ; GFX1250-GISEL-REAL16-NEXT:    v_cvt_sr_fp8_f16 v2, v0.l, v1
 ; GFX1250-GISEL-REAL16-NEXT:    global_store_b32 v[6:7], v2, off
@@ -387,6 +434,7 @@ define amdgpu_ps void @test_cvt_sr_fp8_f16_byte0(half %a, i32 %sr, i32 %old, ptr
 ;
 ; GFX1250-GISEL-FAKE16-LABEL: test_cvt_sr_fp8_f16_byte0:
 ; GFX1250-GISEL-FAKE16:       ; %bb.0:
+; GFX1250-GISEL-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-FAKE16-NEXT:    v_dual_mov_b32 v6, v3 :: v_dual_mov_b32 v7, v4
 ; GFX1250-GISEL-FAKE16-NEXT:    v_cvt_sr_fp8_f16 v2, v0, v1
 ; GFX1250-GISEL-FAKE16-NEXT:    global_store_b32 v[6:7], v2, off
@@ -399,6 +447,7 @@ define amdgpu_ps void @test_cvt_sr_fp8_f16_byte0(half %a, i32 %sr, i32 %old, ptr
 define amdgpu_ps void @test_cvt_sr_fp8_f16_byte1(half %a, i32 %sr, i32 %old, ptr addrspace(1) %out) {
 ; GFX1250-SDAG-REAL16-LABEL: test_cvt_sr_fp8_f16_byte1:
 ; GFX1250-SDAG-REAL16:       ; %bb.0:
+; GFX1250-SDAG-REAL16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-REAL16-NEXT:    v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3
 ; GFX1250-SDAG-REAL16-NEXT:    v_cvt_sr_fp8_f16 v2, v0.l, v1 byte_sel:1
 ; GFX1250-SDAG-REAL16-NEXT:    global_store_b32 v[4:5], v2, off
@@ -406,6 +455,7 @@ define amdgpu_ps void @test_cvt_sr_fp8_f16_byte1(half %a, i32 %sr, i32 %old, ptr
 ;
 ; GFX1250-SDAG-FAKE16-LABEL: test_cvt_sr_fp8_f16_byte1:
 ; GFX1250-SDAG-FAKE16:       ; %bb.0:
+; GFX1250-SDAG-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-FAKE16-NEXT:    v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3
 ; GFX1250-SDAG-FAKE16-NEXT:    v_cvt_sr_fp8_f16 v2, v0, v1 byte_sel:1
 ; GFX1250-SDAG-FAKE16-NEXT:    global_store_b32 v[4:5], v2, off
@@ -413,6 +463,7 @@ define amdgpu_ps void @test_cvt_sr_fp8_f16_byte1(half %a, i32 %sr, i32 %old, ptr
 ;
 ; GFX1250-GISEL-REAL16-LABEL: test_cvt_sr_fp8_f16_byte1:
 ; GFX1250-GISEL-REAL16:       ; %bb.0:
+; GFX1250-GISEL-REAL16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-REAL16-NEXT:    v_dual_mov_b32 v6, v3 :: v_dual_mov_b32 v7, v4
 ; GFX1250-GISEL-REAL16-NEXT:    v_cvt_sr_fp8_f16 v2, v0.l, v1 byte_sel:1
 ; GFX1250-GISEL-REAL16-NEXT:    global_store_b32 v[6:7], v2, off
@@ -420,6 +471,7 @@ define amdgpu_ps void @test_cvt_sr_fp8_f16_byte1(half %a, i32 %sr, i32 %old, ptr
 ;
 ; GFX1250-GISEL-FAKE16-LABEL: test_cvt_sr_fp8_f16_byte1:
 ; GFX1250-GISEL-FAKE16:       ; %bb.0:
+; GFX1250-GISEL-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-FAKE16-NEXT:    v_dual_mov_b32 v6, v3 :: v_dual_mov_b32 v7, v4
 ; GFX1250-GISEL-FAKE16-NEXT:    v_cvt_sr_fp8_f16 v2, v0, v1 byte_sel:1
 ; GFX1250-GISEL-FAKE16-NEXT:    global_store_b32 v[6:7], v2, off
@@ -432,6 +484,7 @@ define amdgpu_ps void @test_cvt_sr_fp8_f16_byte1(half %a, i32 %sr, i32 %old, ptr
 define amdgpu_ps void @test_cvt_sr_fp8_f16_byte2(half %a, i32 %sr, i32 %old, ptr addrspace(1) %out) {
 ; GFX1250-SDAG-REAL16-LABEL: test_cvt_sr_fp8_f16_byte2:
 ; GFX1250-SDAG-REAL16:       ; %bb.0:
+; GFX1250-SDAG-REAL16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-REAL16-NEXT:    v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3
 ; GFX1250-SDAG-REAL16-NEXT:    v_cvt_sr_fp8_f16 v2, v0.l, v1 byte_sel:2
 ; GFX1250-SDAG-REAL16-NEXT:    global_store_b32 v[4:5], v2, off
@@ -439,6 +492,7 @@ define amdgpu_ps void @test_cvt_sr_fp8_f16_byte2(half %a, i32 %sr, i32 %old, ptr
 ;
 ; GFX1250-SDAG-FAKE16-LABEL: test_cvt_sr_fp8_f16_byte2:
 ; GFX1250-SDAG-FAKE16:       ; %bb.0:
+; GFX1250-SDAG-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-FAKE16-NEXT:    v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3
 ; GFX1250-SDAG-FAKE16-NEXT:    v_cvt_sr_fp8_f16 v2, v0, v1 byte_sel:2
 ; GFX1250-SDAG-FAKE16-NEXT:    global_store_b32 v[4:5], v2, off
@@ -446,6 +500,7 @@ define amdgpu_ps void @test_cvt_sr_fp8_f16_byte2(half %a, i32 %sr, i32 %old, ptr
 ;
 ; GFX1250-GISEL-REAL16-LABEL: test_cvt_sr_fp8_f16_byte2:
 ; GFX1250-GISEL-REAL16:       ; %bb.0:
+; GFX1250-GISEL-REAL16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-REAL16-NEXT:    v_dual_mov_b32 v6, v3 :: v_dual_mov_b32 v7, v4
 ; GFX1250-GISEL-REAL16-NEXT:    v_cvt_sr_fp8_f16 v2, v0.l, v1 byte_sel:2
 ; GFX1250-GISEL-REAL16-NEXT:    global_store_b32 v[6:7], v2, off
@@ -453,6 +508,7 @@ define amdgpu_ps void @test_cvt_sr_fp8_f16_byte2(half %a, i32 %sr, i32 %old, ptr
 ;
 ; GFX1250-GISEL-FAKE16-LABEL: test_cvt_sr_fp8_f16_byte2:
 ; GFX1250-GISEL-FAKE16:       ; %bb.0:
+; GFX1250-GISEL-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-FAKE16-NEXT:    v_dual_mov_b32 v6, v3 :: v_dual_mov_b32 v7, v4
 ; GFX1250-GISEL-FAKE16-NEXT:    v_cvt_sr_fp8_f16 v2, v0, v1 byte_sel:2
 ; GFX1250-GISEL-FAKE16-NEXT:    global_store_b32 v[6:7], v2, off
@@ -465,6 +521,7 @@ define amdgpu_ps void @test_cvt_sr_fp8_f16_byte2(half %a, i32 %sr, i32 %old, ptr
 define amdgpu_ps void @test_cvt_sr_fp8_f16_byte3(half %a, i32 %sr, i32 %old, ptr addrspace(1) %out) {
 ; GFX1250-SDAG-REAL16-LABEL: test_cvt_sr_fp8_f16_byte3:
 ; GFX1250-SDAG-REAL16:       ; %bb.0:
+; GFX1250-SDAG-REAL16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-REAL16-NEXT:    v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3
 ; GFX1250-SDAG-REAL16-NEXT:    v_cvt_sr_fp8_f16 v2, v0.l, v1 byte_sel:3
 ; GFX1250-SDAG-REAL16-NEXT:    global_store_b32 v[4:5], v2, off
@@ -472,6 +529,7 @@ define amdgpu_ps void @test_cvt_sr_fp8_f16_byte3(half %a, i32 %sr, i32 %old, ptr
 ;
 ; GFX1250-SDAG-FAKE16-LABEL: test_cvt_sr_fp8_f16_byte3:
 ; GFX1250-SDAG-FAKE16:       ; %bb.0:
+; GFX1250-SDAG-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-FAKE16-NEXT:    v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3
 ; GFX1250-SDAG-FAKE16-NEXT:    v_cvt_sr_fp8_f16 v2, v0, v1 byte_sel:3
 ; GFX1250-SDAG-FAKE16-NEXT:    global_store_b32 v[4:5], v2, off
@@ -479,6 +537,7 @@ define amdgpu_ps void @test_cvt_sr_fp8_f16_byte3(half %a, i32 %sr, i32 %old, ptr
 ;
 ; GFX1250-GISEL-REAL16-LABEL: test_cvt_sr_fp8_f16_byte3:
 ; GFX1250-GISEL-REAL16:       ; %bb.0:
+; GFX1250-GISEL-REAL16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-REAL16-NEXT:    v_dual_mov_b32 v6, v3 :: v_dual_mov_b32 v7, v4
 ; GFX1250-GISEL-REAL16-NEXT:    v_cvt_sr_fp8_f16 v2, v0.l, v1 byte_sel:3
 ; GFX1250-GISEL-REAL16-NEXT:    global_store_b32 v[6:7], v2, off
@@ -486,6 +545,7 @@ define amdgpu_ps void @test_cvt_sr_fp8_f16_byte3(half %a, i32 %sr, i32 %old, ptr
 ;
 ; GFX1250-GISEL-FAKE16-LABEL: test_cvt_sr_fp8_f16_byte3:
 ; GFX1250-GISEL-FAKE16:       ; %bb.0:
+; GFX1250-GISEL-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-FAKE16-NEXT:    v_dual_mov_b32 v6, v3 :: v_dual_mov_b32 v7, v4
 ; GFX1250-GISEL-FAKE16-NEXT:    v_cvt_sr_fp8_f16 v2, v0, v1 byte_sel:3
 ; GFX1250-GISEL-FAKE16-NEXT:    global_store_b32 v[6:7], v2, off
@@ -498,6 +558,7 @@ define amdgpu_ps void @test_cvt_sr_fp8_f16_byte3(half %a, i32 %sr, i32 %old, ptr
 define amdgpu_ps void @test_cvt_sr_fp8_f16_hi_byte0(<2 x half> %a, i32 %sr, i32 %old, ptr addrspace(1) %out) {
 ; GFX1250-SDAG-REAL16-LABEL: test_cvt_sr_fp8_f16_hi_byte0:
 ; GFX1250-SDAG-REAL16:       ; %bb.0:
+; GFX1250-SDAG-REAL16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-REAL16-NEXT:    v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3
 ; GFX1250-SDAG-REAL16-NEXT:    v_cvt_sr_fp8_f16 v2, v0.h, v1
 ; GFX1250-SDAG-REAL16-NEXT:    global_store_b32 v[4:5], v2, off
@@ -505,6 +566,7 @@ define amdgpu_ps void @test_cvt_sr_fp8_f16_hi_byte0(<2 x half> %a, i32 %sr, i32
 ;
 ; GFX1250-SDAG-FAKE16-LABEL: test_cvt_sr_fp8_f16_hi_byte0:
 ; GFX1250-SDAG-FAKE16:       ; %bb.0:
+; GFX1250-SDAG-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-FAKE16-NEXT:    v_dual_mov_b32 v5, v4 :: v_dual_lshrrev_b32 v0, 16, v0
 ; GFX1250-SDAG-FAKE16-NEXT:    v_mov_b32_e32 v4, v3
 ; GFX1250-SDAG-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
@@ -514,6 +576,7 @@ define amdgpu_ps void @test_cvt_sr_fp8_f16_hi_byte0(<2 x half> %a, i32 %sr, i32
 ;
 ; GFX1250-GISEL-REAL16-LABEL: test_cvt_sr_fp8_f16_hi_byte0:
 ; GFX1250-GISEL-REAL16:       ; %bb.0:
+; GFX1250-GISEL-REAL16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-REAL16-NEXT:    v_dual_lshrrev_b32 v0, 16, v0 :: v_dual_mov_b32 v6, v3
 ; GFX1250-GISEL-REAL16-NEXT:    v_mov_b32_e32 v7, v4
 ; GFX1250-GISEL-REAL16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
@@ -523,6 +586,7 @@ define amdgpu_ps void @test_cvt_sr_fp8_f16_hi_byte0(<2 x half> %a, i32 %sr, i32
 ;
 ; GFX1250-GISEL-FAKE16-LABEL: test_cvt_sr_fp8_f16_hi_byte0:
 ; GFX1250-GISEL-FAKE16:       ; %bb.0:
+; GFX1250-GISEL-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-FAKE16-NEXT:    v_dual_lshrrev_b32 v0, 16, v0 :: v_dual_mov_b32 v6, v3
 ; GFX1250-GISEL-FAKE16-NEXT:    v_mov_b32_e32 v7, v4
 ; GFX1250-GISEL-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pk.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pk.f16.ll
index 2179800f9d317..d6ed869d28354 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pk.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pk.f16.ll
@@ -7,6 +7,7 @@ declare <2 x half> @llvm.amdgcn.cvt.sr.pk.f16.f32(float, float, i32) #0
 define amdgpu_ps float @cvt_sr_pk_f16_f32_vvv(float %src0, float %src1, i32 %src2) #1 {
 ; GCN-LABEL: cvt_sr_pk_f16_f32_vvv:
 ; GCN:       ; %bb.0:
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    v_cvt_sr_pk_f16_f32 v0, v0, v1, v2
 ; GCN-NEXT:    ; return to shader part epilog
   %cvt = call <2 x half> @llvm.amdgcn.cvt.sr.pk.f16.f32(float %src0, float %src1, i32 %src2) #0
@@ -17,6 +18,7 @@ define amdgpu_ps float @cvt_sr_pk_f16_f32_vvv(float %src0, float %src1, i32 %src
 define amdgpu_ps float @cvt_sr_pk_f16_f32_sss(float inreg %src0, float inreg %src1, i32 inreg %src2) #1 {
 ; GCN-LABEL: cvt_sr_pk_f16_f32_sss:
 ; GCN:       ; %bb.0:
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GCN-NEXT:    v_cvt_sr_pk_f16_f32 v0, s0, s1, v0
@@ -29,6 +31,7 @@ define amdgpu_ps float @cvt_sr_pk_f16_f32_sss(float inreg %src0, float inreg %sr
 define amdgpu_ps float @cvt_sr_pk_f16_f32_vvi(float %src0, float %src1) #1 {
 ; GCN-LABEL: cvt_sr_pk_f16_f32_vvi:
 ; GCN:       ; %bb.0:
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    v_cvt_sr_pk_f16_f32 v0, v0, v1, 0x10002
 ; GCN-NEXT:    ; return to shader part epilog
   %cvt = call <2 x half> @llvm.amdgcn.cvt.sr.pk.f16.f32(float %src0, float %src1, i32 65538) #0
@@ -39,6 +42,7 @@ define amdgpu_ps float @cvt_sr_pk_f16_f32_vvi(float %src0, float %src1) #1 {
 define amdgpu_ps float @cvt_sr_pk_f16_f32_vvi_mods(float %src0, float %src1) #1 {
 ; GCN-LABEL: cvt_sr_pk_f16_f32_vvi_mods:
 ; GCN:       ; %bb.0:
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    v_cvt_sr_pk_f16_f32 v0, -v0, |v1|, 1
 ; GCN-NEXT:    ; return to shader part epilog
   %s0 = fneg float %src0
@@ -51,6 +55,7 @@ define amdgpu_ps float @cvt_sr_pk_f16_f32_vvi_mods(float %src0, float %src1) #1
 define amdgpu_ps float @cvt_sr_pk_f16_f32_ssi(float inreg %src0, float inreg %src1) #1 {
 ; GCN-LABEL: cvt_sr_pk_f16_f32_ssi:
 ; GCN:       ; %bb.0:
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    v_cvt_sr_pk_f16_f32 v0, s0, s1, 1
 ; GCN-NEXT:    ; return to shader part epilog
   %cvt = call <2 x half> @llvm.amdgcn.cvt.sr.pk.f16.f32(float %src0, float %src1, i32 1) #0

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.sat.pk.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.sat.pk.ll
index 559b1b171031d..05988c908cde7 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.sat.pk.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.sat.pk.ll
@@ -10,6 +10,7 @@ declare i16 @llvm.amdgcn.sat.pk4.u4.u8(i32) #0
 define amdgpu_kernel void @sat_pk4_i4_i8_f32_v(i32 %src, ptr %out) #1 {
 ; SDAG-REAL16-LABEL: sat_pk4_i4_i8_f32_v:
 ; SDAG-REAL16:       ; %bb.0:
+; SDAG-REAL16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; SDAG-REAL16-NEXT:    s_clause 0x1
 ; SDAG-REAL16-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; SDAG-REAL16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
@@ -21,6 +22,7 @@ define amdgpu_kernel void @sat_pk4_i4_i8_f32_v(i32 %src, ptr %out) #1 {
 ;
 ; SDAG-FAKE16-LABEL: sat_pk4_i4_i8_f32_v:
 ; SDAG-FAKE16:       ; %bb.0:
+; SDAG-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; SDAG-FAKE16-NEXT:    s_clause 0x1
 ; SDAG-FAKE16-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
@@ -32,6 +34,7 @@ define amdgpu_kernel void @sat_pk4_i4_i8_f32_v(i32 %src, ptr %out) #1 {
 ;
 ; GISEL-REAL16-LABEL: sat_pk4_i4_i8_f32_v:
 ; GISEL-REAL16:       ; %bb.0:
+; GISEL-REAL16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-REAL16-NEXT:    s_clause 0x1
 ; GISEL-REAL16-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GISEL-REAL16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
@@ -43,6 +46,7 @@ define amdgpu_kernel void @sat_pk4_i4_i8_f32_v(i32 %src, ptr %out) #1 {
 ;
 ; GISEL-FAKE16-LABEL: sat_pk4_i4_i8_f32_v:
 ; GISEL-FAKE16:       ; %bb.0:
+; GISEL-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-FAKE16-NEXT:    s_clause 0x1
 ; GISEL-FAKE16-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GISEL-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
@@ -59,6 +63,7 @@ define amdgpu_kernel void @sat_pk4_i4_i8_f32_v(i32 %src, ptr %out) #1 {
 define amdgpu_kernel void @sat_pk4_i4_i8_f32_s(i32 inreg %src, ptr %out) #1 {
 ; SDAG-REAL16-LABEL: sat_pk4_i4_i8_f32_s:
 ; SDAG-REAL16:       ; %bb.0:
+; SDAG-REAL16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; SDAG-REAL16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
 ; SDAG-REAL16-NEXT:    v_sat_pk4_i4_i8_e32 v0.l, s8
 ; SDAG-REAL16-NEXT:    v_mov_b32_e32 v1, 0
@@ -68,6 +73,7 @@ define amdgpu_kernel void @sat_pk4_i4_i8_f32_s(i32 inreg %src, ptr %out) #1 {
 ;
 ; SDAG-FAKE16-LABEL: sat_pk4_i4_i8_f32_s:
 ; SDAG-FAKE16:       ; %bb.0:
+; SDAG-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
 ; SDAG-FAKE16-NEXT:    v_mov_b32_e32 v0, 0
 ; SDAG-FAKE16-NEXT:    v_sat_pk4_i4_i8_e32 v1, s8
@@ -77,6 +83,7 @@ define amdgpu_kernel void @sat_pk4_i4_i8_f32_s(i32 inreg %src, ptr %out) #1 {
 ;
 ; GISEL-REAL16-LABEL: sat_pk4_i4_i8_f32_s:
 ; GISEL-REAL16:       ; %bb.0:
+; GISEL-REAL16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-REAL16-NEXT:    s_clause 0x1
 ; GISEL-REAL16-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GISEL-REAL16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
@@ -88,6 +95,7 @@ define amdgpu_kernel void @sat_pk4_i4_i8_f32_s(i32 inreg %src, ptr %out) #1 {
 ;
 ; GISEL-FAKE16-LABEL: sat_pk4_i4_i8_f32_s:
 ; GISEL-FAKE16:       ; %bb.0:
+; GISEL-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-FAKE16-NEXT:    s_clause 0x1
 ; GISEL-FAKE16-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GISEL-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
@@ -104,6 +112,7 @@ define amdgpu_kernel void @sat_pk4_i4_i8_f32_s(i32 inreg %src, ptr %out) #1 {
 define amdgpu_kernel void @sat_pk4_i4_i8_f32_i(ptr %out) #1 {
 ; SDAG-REAL16-LABEL: sat_pk4_i4_i8_f32_i:
 ; SDAG-REAL16:       ; %bb.0:
+; SDAG-REAL16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; SDAG-REAL16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; SDAG-REAL16-NEXT:    v_sat_pk4_i4_i8_e32 v0.l, 0x64
 ; SDAG-REAL16-NEXT:    v_mov_b32_e32 v1, 0
@@ -113,6 +122,7 @@ define amdgpu_kernel void @sat_pk4_i4_i8_f32_i(ptr %out) #1 {
 ;
 ; SDAG-FAKE16-LABEL: sat_pk4_i4_i8_f32_i:
 ; SDAG-FAKE16:       ; %bb.0:
+; SDAG-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; SDAG-FAKE16-NEXT:    v_mov_b32_e32 v0, 0
 ; SDAG-FAKE16-NEXT:    v_sat_pk4_i4_i8_e32 v1, 0x64
@@ -122,6 +132,7 @@ define amdgpu_kernel void @sat_pk4_i4_i8_f32_i(ptr %out) #1 {
 ;
 ; GISEL-REAL16-LABEL: sat_pk4_i4_i8_f32_i:
 ; GISEL-REAL16:       ; %bb.0:
+; GISEL-REAL16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-REAL16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GISEL-REAL16-NEXT:    v_sat_pk4_i4_i8_e32 v0.l, 0x64
 ; GISEL-REAL16-NEXT:    v_mov_b32_e32 v1, 0
@@ -131,6 +142,7 @@ define amdgpu_kernel void @sat_pk4_i4_i8_f32_i(ptr %out) #1 {
 ;
 ; GISEL-FAKE16-LABEL: sat_pk4_i4_i8_f32_i:
 ; GISEL-FAKE16:       ; %bb.0:
+; GISEL-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GISEL-FAKE16-NEXT:    v_sat_pk4_i4_i8_e32 v0, 0x64
 ; GISEL-FAKE16-NEXT:    v_mov_b32_e32 v1, 0
@@ -145,6 +157,7 @@ define amdgpu_kernel void @sat_pk4_i4_i8_f32_i(ptr %out) #1 {
 define amdgpu_kernel void @sat_pk4_u4_u8_f32_v(i32 %src, ptr %out) #1 {
 ; SDAG-REAL16-LABEL: sat_pk4_u4_u8_f32_v:
 ; SDAG-REAL16:       ; %bb.0:
+; SDAG-REAL16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; SDAG-REAL16-NEXT:    s_clause 0x1
 ; SDAG-REAL16-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; SDAG-REAL16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
@@ -156,6 +169,7 @@ define amdgpu_kernel void @sat_pk4_u4_u8_f32_v(i32 %src, ptr %out) #1 {
 ;
 ; SDAG-FAKE16-LABEL: sat_pk4_u4_u8_f32_v:
 ; SDAG-FAKE16:       ; %bb.0:
+; SDAG-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; SDAG-FAKE16-NEXT:    s_clause 0x1
 ; SDAG-FAKE16-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
@@ -167,6 +181,7 @@ define amdgpu_kernel void @sat_pk4_u4_u8_f32_v(i32 %src, ptr %out) #1 {
 ;
 ; GISEL-REAL16-LABEL: sat_pk4_u4_u8_f32_v:
 ; GISEL-REAL16:       ; %bb.0:
+; GISEL-REAL16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-REAL16-NEXT:    s_clause 0x1
 ; GISEL-REAL16-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GISEL-REAL16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
@@ -178,6 +193,7 @@ define amdgpu_kernel void @sat_pk4_u4_u8_f32_v(i32 %src, ptr %out) #1 {
 ;
 ; GISEL-FAKE16-LABEL: sat_pk4_u4_u8_f32_v:
 ; GISEL-FAKE16:       ; %bb.0:
+; GISEL-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-FAKE16-NEXT:    s_clause 0x1
 ; GISEL-FAKE16-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GISEL-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
@@ -194,6 +210,7 @@ define amdgpu_kernel void @sat_pk4_u4_u8_f32_v(i32 %src, ptr %out) #1 {
 define amdgpu_kernel void @sat_pk4_u4_u8_f32_s(i32 inreg %src, ptr %out) #1 {
 ; SDAG-REAL16-LABEL: sat_pk4_u4_u8_f32_s:
 ; SDAG-REAL16:       ; %bb.0:
+; SDAG-REAL16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; SDAG-REAL16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
 ; SDAG-REAL16-NEXT:    v_sat_pk4_u4_u8_e32 v0.l, s8
 ; SDAG-REAL16-NEXT:    v_mov_b32_e32 v1, 0
@@ -203,6 +220,7 @@ define amdgpu_kernel void @sat_pk4_u4_u8_f32_s(i32 inreg %src, ptr %out) #1 {
 ;
 ; SDAG-FAKE16-LABEL: sat_pk4_u4_u8_f32_s:
 ; SDAG-FAKE16:       ; %bb.0:
+; SDAG-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
 ; SDAG-FAKE16-NEXT:    v_mov_b32_e32 v0, 0
 ; SDAG-FAKE16-NEXT:    v_sat_pk4_u4_u8_e32 v1, s8
@@ -212,6 +230,7 @@ define amdgpu_kernel void @sat_pk4_u4_u8_f32_s(i32 inreg %src, ptr %out) #1 {
 ;
 ; GISEL-REAL16-LABEL: sat_pk4_u4_u8_f32_s:
 ; GISEL-REAL16:       ; %bb.0:
+; GISEL-REAL16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-REAL16-NEXT:    s_clause 0x1
 ; GISEL-REAL16-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GISEL-REAL16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
@@ -223,6 +242,7 @@ define amdgpu_kernel void @sat_pk4_u4_u8_f32_s(i32 inreg %src, ptr %out) #1 {
 ;
 ; GISEL-FAKE16-LABEL: sat_pk4_u4_u8_f32_s:
 ; GISEL-FAKE16:       ; %bb.0:
+; GISEL-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-FAKE16-NEXT:    s_clause 0x1
 ; GISEL-FAKE16-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GISEL-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
@@ -239,6 +259,7 @@ define amdgpu_kernel void @sat_pk4_u4_u8_f32_s(i32 inreg %src, ptr %out) #1 {
 define amdgpu_kernel void @sat_pk4_u4_u8_f32_i(ptr %out) #1 {
 ; SDAG-REAL16-LABEL: sat_pk4_u4_u8_f32_i:
 ; SDAG-REAL16:       ; %bb.0:
+; SDAG-REAL16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; SDAG-REAL16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; SDAG-REAL16-NEXT:    v_sat_pk4_u4_u8_e32 v0.l, 0x64
 ; SDAG-REAL16-NEXT:    v_mov_b32_e32 v1, 0
@@ -248,6 +269,7 @@ define amdgpu_kernel void @sat_pk4_u4_u8_f32_i(ptr %out) #1 {
 ;
 ; SDAG-FAKE16-LABEL: sat_pk4_u4_u8_f32_i:
 ; SDAG-FAKE16:       ; %bb.0:
+; SDAG-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; SDAG-FAKE16-NEXT:    v_mov_b32_e32 v0, 0
 ; SDAG-FAKE16-NEXT:    v_sat_pk4_u4_u8_e32 v1, 0x64
@@ -257,6 +279,7 @@ define amdgpu_kernel void @sat_pk4_u4_u8_f32_i(ptr %out) #1 {
 ;
 ; GISEL-REAL16-LABEL: sat_pk4_u4_u8_f32_i:
 ; GISEL-REAL16:       ; %bb.0:
+; GISEL-REAL16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-REAL16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GISEL-REAL16-NEXT:    v_sat_pk4_u4_u8_e32 v0.l, 0x64
 ; GISEL-REAL16-NEXT:    v_mov_b32_e32 v1, 0
@@ -266,6 +289,7 @@ define amdgpu_kernel void @sat_pk4_u4_u8_f32_i(ptr %out) #1 {
 ;
 ; GISEL-FAKE16-LABEL: sat_pk4_u4_u8_f32_i:
 ; GISEL-FAKE16:       ; %bb.0:
+; GISEL-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GISEL-FAKE16-NEXT:    v_sat_pk4_u4_u8_e32 v0, 0x64
 ; GISEL-FAKE16-NEXT:    v_mov_b32_e32 v1, 0

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scale.pk.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scale.pk.ll
index 5c439f631a426..74b1818c44d45 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scale.pk.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scale.pk.ll
@@ -21,6 +21,7 @@ declare <16 x float> @llvm.amdgcn.cvt.scale.pk16.f32.bf6(<3 x i32> %src, i32 %sc
 define amdgpu_ps void @test_cvt_scale_pk8_f16_fp8_vv(<2 x i32> %src, i32 %scale, ptr addrspace(1) %out) {
 ; GFX1250-SDAG-LABEL: test_cvt_scale_pk8_f16_fp8_vv:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v9, v4 :: v_dual_mov_b32 v8, v3
 ; GFX1250-SDAG-NEXT:    v_cvt_scale_pk8_f16_fp8 v[4:7], v[0:1], v2 scale_sel:1
 ; GFX1250-SDAG-NEXT:    global_store_b128 v[8:9], v[4:7], off
@@ -28,6 +29,7 @@ define amdgpu_ps void @test_cvt_scale_pk8_f16_fp8_vv(<2 x i32> %src, i32 %scale,
 ;
 ; GFX1250-GISEL-LABEL: test_cvt_scale_pk8_f16_fp8_vv:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v9, v4
 ; GFX1250-GISEL-NEXT:    v_cvt_scale_pk8_f16_fp8 v[4:7], v[0:1], v2 scale_sel:1
 ; GFX1250-GISEL-NEXT:    global_store_b128 v[8:9], v[4:7], off
@@ -40,6 +42,7 @@ define amdgpu_ps void @test_cvt_scale_pk8_f16_fp8_vv(<2 x i32> %src, i32 %scale,
 define amdgpu_ps void @test_cvt_scale_pk8_f16_bf8_vv(<2 x i32> %src, i32 %scale, ptr addrspace(1) %out) {
 ; GFX1250-SDAG-LABEL: test_cvt_scale_pk8_f16_bf8_vv:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v9, v4 :: v_dual_mov_b32 v8, v3
 ; GFX1250-SDAG-NEXT:    v_cvt_scale_pk8_f16_bf8 v[4:7], v[0:1], v2
 ; GFX1250-SDAG-NEXT:    global_store_b128 v[8:9], v[4:7], off
@@ -47,6 +50,7 @@ define amdgpu_ps void @test_cvt_scale_pk8_f16_bf8_vv(<2 x i32> %src, i32 %scale,
 ;
 ; GFX1250-GISEL-LABEL: test_cvt_scale_pk8_f16_bf8_vv:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v9, v4
 ; GFX1250-GISEL-NEXT:    v_cvt_scale_pk8_f16_bf8 v[4:7], v[0:1], v2
 ; GFX1250-GISEL-NEXT:    global_store_b128 v[8:9], v[4:7], off
@@ -59,6 +63,7 @@ define amdgpu_ps void @test_cvt_scale_pk8_f16_bf8_vv(<2 x i32> %src, i32 %scale,
 define amdgpu_ps void @test_cvt_scale_pk8_bf16_fp8_vv(<2 x i32> %src, i32 %scale, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_cvt_scale_pk8_bf16_fp8_vv:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_dual_mov_b32 v9, v4 :: v_dual_mov_b32 v8, v3
 ; GFX1250-NEXT:    v_cvt_scale_pk8_bf16_fp8 v[4:7], v[0:1], v2 scale_sel:1
 ; GFX1250-NEXT:    global_store_b128 v[8:9], v[4:7], off
@@ -71,6 +76,7 @@ define amdgpu_ps void @test_cvt_scale_pk8_bf16_fp8_vv(<2 x i32> %src, i32 %scale
 define amdgpu_ps void @test_cvt_scale_pk8_bf16_bf8_vv(<2 x i32> %src, i32 %scale, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_cvt_scale_pk8_bf16_bf8_vv:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_dual_mov_b32 v9, v4 :: v_dual_mov_b32 v8, v3
 ; GFX1250-NEXT:    v_cvt_scale_pk8_bf16_bf8 v[4:7], v[0:1], v2 scale_sel:2
 ; GFX1250-NEXT:    global_store_b128 v[8:9], v[4:7], off
@@ -83,6 +89,7 @@ define amdgpu_ps void @test_cvt_scale_pk8_bf16_bf8_vv(<2 x i32> %src, i32 %scale
 define amdgpu_ps void @test_cvt_scale_pk8_f16_fp4_vv(i32 %src, i32 %scale, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_cvt_scale_pk8_f16_fp4_vv:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_cvt_scale_pk8_f16_fp4 v[4:7], v0, v1 scale_sel:3
 ; GFX1250-NEXT:    global_store_b128 v[2:3], v[4:7], off
 ; GFX1250-NEXT:    s_endpgm
@@ -94,6 +101,7 @@ define amdgpu_ps void @test_cvt_scale_pk8_f16_fp4_vv(i32 %src, i32 %scale, ptr a
 define amdgpu_ps void @test_cvt_scale_pk8_bf16_fp4_vv(i32 %src, i32 %scale, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_cvt_scale_pk8_bf16_fp4_vv:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_cvt_scale_pk8_bf16_fp4 v[4:7], v0, v1 scale_sel:4
 ; GFX1250-NEXT:    global_store_b128 v[2:3], v[4:7], off
 ; GFX1250-NEXT:    s_endpgm
@@ -105,6 +113,7 @@ define amdgpu_ps void @test_cvt_scale_pk8_bf16_fp4_vv(i32 %src, i32 %scale, ptr
 define amdgpu_ps void @test_cvt_scale_pk8_f32_fp8_vv(<2 x i32> %src, i32 %scale, ptr addrspace(1) %out) {
 ; GFX1250-SDAG-LABEL: test_cvt_scale_pk8_f32_fp8_vv:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v13, v4 :: v_dual_mov_b32 v12, v3
 ; GFX1250-SDAG-NEXT:    v_cvt_scale_pk8_f32_fp8 v[4:11], v[0:1], v2 scale_sel:8
 ; GFX1250-SDAG-NEXT:    s_clause 0x1
@@ -114,6 +123,7 @@ define amdgpu_ps void @test_cvt_scale_pk8_f32_fp8_vv(<2 x i32> %src, i32 %scale,
 ;
 ; GFX1250-GISEL-LABEL: test_cvt_scale_pk8_f32_fp8_vv:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v12, v3 :: v_dual_mov_b32 v13, v4
 ; GFX1250-GISEL-NEXT:    v_cvt_scale_pk8_f32_fp8 v[4:11], v[0:1], v2 scale_sel:8
 ; GFX1250-GISEL-NEXT:    s_clause 0x1
@@ -128,6 +138,7 @@ define amdgpu_ps void @test_cvt_scale_pk8_f32_fp8_vv(<2 x i32> %src, i32 %scale,
 define amdgpu_ps void @test_cvt_scale_pk8_f32_bf8_vv(<2 x i32> %src, i32 %scale, ptr addrspace(1) %out) {
 ; GFX1250-SDAG-LABEL: test_cvt_scale_pk8_f32_bf8_vv:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v13, v4 :: v_dual_mov_b32 v12, v3
 ; GFX1250-SDAG-NEXT:    v_cvt_scale_pk8_f32_bf8 v[4:11], v[0:1], v2
 ; GFX1250-SDAG-NEXT:    s_clause 0x1
@@ -137,6 +148,7 @@ define amdgpu_ps void @test_cvt_scale_pk8_f32_bf8_vv(<2 x i32> %src, i32 %scale,
 ;
 ; GFX1250-GISEL-LABEL: test_cvt_scale_pk8_f32_bf8_vv:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v12, v3 :: v_dual_mov_b32 v13, v4
 ; GFX1250-GISEL-NEXT:    v_cvt_scale_pk8_f32_bf8 v[4:11], v[0:1], v2
 ; GFX1250-GISEL-NEXT:    s_clause 0x1
@@ -151,6 +163,7 @@ define amdgpu_ps void @test_cvt_scale_pk8_f32_bf8_vv(<2 x i32> %src, i32 %scale,
 define amdgpu_ps void @test_cvt_scale_pk8_f32_fp4_vv(i32 %src, i32 %scale, ptr addrspace(1) %out) {
 ; GFX1250-SDAG-LABEL: test_cvt_scale_pk8_f32_fp4_vv:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_cvt_scale_pk8_f32_fp4 v[4:11], v0, v1 scale_sel:1
 ; GFX1250-SDAG-NEXT:    s_clause 0x1
 ; GFX1250-SDAG-NEXT:    global_store_b128 v[2:3], v[8:11], off offset:16
@@ -159,6 +172,7 @@ define amdgpu_ps void @test_cvt_scale_pk8_f32_fp4_vv(i32 %src, i32 %scale, ptr a
 ;
 ; GFX1250-GISEL-LABEL: test_cvt_scale_pk8_f32_fp4_vv:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_cvt_scale_pk8_f32_fp4 v[4:11], v0, v1 scale_sel:1
 ; GFX1250-GISEL-NEXT:    s_clause 0x1
 ; GFX1250-GISEL-NEXT:    global_store_b128 v[2:3], v[4:7], off
@@ -172,6 +186,7 @@ define amdgpu_ps void @test_cvt_scale_pk8_f32_fp4_vv(i32 %src, i32 %scale, ptr a
 define amdgpu_ps void @test_cvt_scale_pk16_f16_fp6_vv(<3 x i32> %src, i32 %scale, ptr addrspace(1) %out) {
 ; GFX1250-SDAG-LABEL: test_cvt_scale_pk16_f16_fp6_vv:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_cvt_scale_pk16_f16_fp6 v[6:13], v[0:2], v3
 ; GFX1250-SDAG-NEXT:    s_clause 0x1
 ; GFX1250-SDAG-NEXT:    global_store_b128 v[4:5], v[10:13], off offset:16
@@ -180,6 +195,7 @@ define amdgpu_ps void @test_cvt_scale_pk16_f16_fp6_vv(<3 x i32> %src, i32 %scale
 ;
 ; GFX1250-GISEL-LABEL: test_cvt_scale_pk16_f16_fp6_vv:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_cvt_scale_pk16_f16_fp6 v[6:13], v[0:2], v3
 ; GFX1250-GISEL-NEXT:    s_clause 0x1
 ; GFX1250-GISEL-NEXT:    global_store_b128 v[4:5], v[6:9], off
@@ -193,6 +209,7 @@ define amdgpu_ps void @test_cvt_scale_pk16_f16_fp6_vv(<3 x i32> %src, i32 %scale
 define amdgpu_ps void @test_cvt_scale_pk16_f16_fp6_sl(<3 x i32> inreg %src, ptr addrspace(1) %out) {
 ; GFX1250-SDAG-LABEL: test_cvt_scale_pk16_f16_fp6_sl:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v10, s0 :: v_dual_mov_b32 v11, s1
 ; GFX1250-SDAG-NEXT:    v_mov_b32_e32 v12, s2
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
@@ -204,6 +221,7 @@ define amdgpu_ps void @test_cvt_scale_pk16_f16_fp6_sl(<3 x i32> inreg %src, ptr
 ;
 ; GFX1250-GISEL-LABEL: test_cvt_scale_pk16_f16_fp6_sl:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v12, s2 :: v_dual_mov_b32 v11, s1
 ; GFX1250-GISEL-NEXT:    v_mov_b32_e32 v10, s0
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
@@ -220,6 +238,7 @@ define amdgpu_ps void @test_cvt_scale_pk16_f16_fp6_sl(<3 x i32> inreg %src, ptr
 define amdgpu_ps void @test_cvt_scale_pk16_bf16_fp6_vv(<3 x i32> %src, i32 %scale, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_cvt_scale_pk16_bf16_fp6_vv:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_cvt_scale_pk16_bf16_fp6 v[6:13], v[0:2], v3 scale_sel:2
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[4:5], v[10:13], off offset:16
@@ -233,6 +252,7 @@ define amdgpu_ps void @test_cvt_scale_pk16_bf16_fp6_vv(<3 x i32> %src, i32 %scal
 define amdgpu_ps void @test_cvt_scale_pk16_bf16_fp6_sl(<3 x i32> inreg %src, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_cvt_scale_pk16_bf16_fp6_sl:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_dual_mov_b32 v10, s0 :: v_dual_mov_b32 v11, s1
 ; GFX1250-NEXT:    v_mov_b32_e32 v12, s2
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
@@ -249,6 +269,7 @@ define amdgpu_ps void @test_cvt_scale_pk16_bf16_fp6_sl(<3 x i32> inreg %src, ptr
 define amdgpu_ps void @test_cvt_scale_pk16_f16_bf6_vv(<3 x i32> %src, i32 %scale, ptr addrspace(1) %out) {
 ; GFX1250-SDAG-LABEL: test_cvt_scale_pk16_f16_bf6_vv:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_cvt_scale_pk16_f16_bf6 v[6:13], v[0:2], v3 scale_sel:4
 ; GFX1250-SDAG-NEXT:    s_clause 0x1
 ; GFX1250-SDAG-NEXT:    global_store_b128 v[4:5], v[10:13], off offset:16
@@ -257,6 +278,7 @@ define amdgpu_ps void @test_cvt_scale_pk16_f16_bf6_vv(<3 x i32> %src, i32 %scale
 ;
 ; GFX1250-GISEL-LABEL: test_cvt_scale_pk16_f16_bf6_vv:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_cvt_scale_pk16_f16_bf6 v[6:13], v[0:2], v3 scale_sel:4
 ; GFX1250-GISEL-NEXT:    s_clause 0x1
 ; GFX1250-GISEL-NEXT:    global_store_b128 v[4:5], v[6:9], off
@@ -270,6 +292,7 @@ define amdgpu_ps void @test_cvt_scale_pk16_f16_bf6_vv(<3 x i32> %src, i32 %scale
 define amdgpu_ps void @test_cvt_scale_pk16_f16_bf6_sl(<3 x i32> inreg %src, ptr addrspace(1) %out) {
 ; GFX1250-SDAG-LABEL: test_cvt_scale_pk16_f16_bf6_sl:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v10, s0 :: v_dual_mov_b32 v11, s1
 ; GFX1250-SDAG-NEXT:    v_mov_b32_e32 v12, s2
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
@@ -281,6 +304,7 @@ define amdgpu_ps void @test_cvt_scale_pk16_f16_bf6_sl(<3 x i32> inreg %src, ptr
 ;
 ; GFX1250-GISEL-LABEL: test_cvt_scale_pk16_f16_bf6_sl:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v12, s2 :: v_dual_mov_b32 v11, s1
 ; GFX1250-GISEL-NEXT:    v_mov_b32_e32 v10, s0
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
@@ -297,6 +321,7 @@ define amdgpu_ps void @test_cvt_scale_pk16_f16_bf6_sl(<3 x i32> inreg %src, ptr
 define amdgpu_ps void @test_cvt_scale_pk16_bf16_bf6_vv(<3 x i32> %src, i32 %scale, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_cvt_scale_pk16_bf16_bf6_vv:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_cvt_scale_pk16_bf16_bf6 v[6:13], v[0:2], v3 scale_sel:6
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[4:5], v[10:13], off offset:16
@@ -310,6 +335,7 @@ define amdgpu_ps void @test_cvt_scale_pk16_bf16_bf6_vv(<3 x i32> %src, i32 %scal
 define amdgpu_ps void @test_cvt_scale_pk16_bf16_bf6_sl(<3 x i32> inreg %src, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_cvt_scale_pk16_bf16_bf6_sl:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_dual_mov_b32 v10, s0 :: v_dual_mov_b32 v11, s1
 ; GFX1250-NEXT:    v_mov_b32_e32 v12, s2
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
@@ -326,6 +352,7 @@ define amdgpu_ps void @test_cvt_scale_pk16_bf16_bf6_sl(<3 x i32> inreg %src, ptr
 define amdgpu_ps void @test_cvt_scale_pk16_f32_fp6_vv(<3 x i32> %src, i32 %scale, ptr addrspace(1) %out) {
 ; GFX1250-SDAG-LABEL: test_cvt_scale_pk16_f32_fp6_vv:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_cvt_scale_pk16_f32_fp6 v[6:21], v[0:2], v3 scale_sel:5
 ; GFX1250-SDAG-NEXT:    s_clause 0x3
 ; GFX1250-SDAG-NEXT:    global_store_b128 v[4:5], v[18:21], off offset:48
@@ -336,6 +363,7 @@ define amdgpu_ps void @test_cvt_scale_pk16_f32_fp6_vv(<3 x i32> %src, i32 %scale
 ;
 ; GFX1250-GISEL-LABEL: test_cvt_scale_pk16_f32_fp6_vv:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_cvt_scale_pk16_f32_fp6 v[6:21], v[0:2], v3 scale_sel:5
 ; GFX1250-GISEL-NEXT:    s_clause 0x3
 ; GFX1250-GISEL-NEXT:    global_store_b128 v[4:5], v[6:9], off
@@ -351,6 +379,7 @@ define amdgpu_ps void @test_cvt_scale_pk16_f32_fp6_vv(<3 x i32> %src, i32 %scale
 define amdgpu_ps void @test_cvt_scale_pk16_f32_bf6_vv(<3 x i32> %src, i32 %scale, ptr addrspace(1) %out) {
 ; GFX1250-SDAG-LABEL: test_cvt_scale_pk16_f32_bf6_vv:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_cvt_scale_pk16_f32_bf6 v[6:21], v[0:2], v3 scale_sel:6
 ; GFX1250-SDAG-NEXT:    s_clause 0x3
 ; GFX1250-SDAG-NEXT:    global_store_b128 v[4:5], v[18:21], off offset:48
@@ -361,6 +390,7 @@ define amdgpu_ps void @test_cvt_scale_pk16_f32_bf6_vv(<3 x i32> %src, i32 %scale
 ;
 ; GFX1250-GISEL-LABEL: test_cvt_scale_pk16_f32_bf6_vv:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_cvt_scale_pk16_f32_bf6 v[6:21], v[0:2], v3 scale_sel:6
 ; GFX1250-GISEL-NEXT:    s_clause 0x3
 ; GFX1250-GISEL-NEXT:    global_store_b128 v[4:5], v[6:9], off

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk16.gfx1250.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk16.gfx1250.ll
index dfb908930750f..ef3147ec5c5a6 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk16.gfx1250.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk16.gfx1250.ll
@@ -13,6 +13,7 @@ define amdgpu_ps void @test_scalef32_pk16_bf6_f32_vv(<16 x float> %src, float %s
 ; GFX1210-SDAG-LABEL: test_scalef32_pk16_bf6_f32_vv:
 ; GFX1250-SDAG-LABEL: test_scalef32_pk16_bf6_f32_vv:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v23, v18 :: v_dual_mov_b32 v22, v17
 ; GFX1250-SDAG-NEXT:    v_cvt_scalef32_pk16_bf6_f32 v[18:20], v[0:15], v16
 ; GFX1250-SDAG-NEXT:    global_store_b96 v[22:23], v[18:20], off
@@ -20,6 +21,7 @@ define amdgpu_ps void @test_scalef32_pk16_bf6_f32_vv(<16 x float> %src, float %s
 ;
 ; GFX1250-GISEL-LABEL: test_scalef32_pk16_bf6_f32_vv:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v22, v17 :: v_dual_mov_b32 v23, v18
 ; GFX1250-GISEL-NEXT:    v_cvt_scalef32_pk16_bf6_f32 v[18:20], v[0:15], v16
 ; GFX1250-GISEL-NEXT:    global_store_b96 v[22:23], v[18:20], off
@@ -32,6 +34,7 @@ define amdgpu_ps void @test_scalef32_pk16_bf6_f32_vv(<16 x float> %src, float %s
 define amdgpu_ps void @test_scalef32_pk16_bf6_f32_sl(<16 x float> inreg %src, ptr addrspace(1) %out) {
 ; GFX1250-SDAG-LABEL: test_scalef32_pk16_bf6_f32_sl:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v6, s4 :: v_dual_mov_b32 v7, s5
@@ -47,6 +50,7 @@ define amdgpu_ps void @test_scalef32_pk16_bf6_f32_sl(<16 x float> inreg %src, pt
 ;
 ; GFX1250-GISEL-LABEL: test_scalef32_pk16_bf6_f32_sl:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[16:17], s[14:15]
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[14:15], s[12:13]
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[12:13], s[10:11]
@@ -67,6 +71,7 @@ define amdgpu_ps void @test_scalef32_pk16_bf6_f32_sl(<16 x float> inreg %src, pt
 define amdgpu_ps void @test_scalef32_pk16_fp6_f32_vv(<16 x float> %src, float %scale, ptr addrspace(1) %out) {
 ; GFX1250-SDAG-LABEL: test_scalef32_pk16_fp6_f32_vv:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v23, v18 :: v_dual_mov_b32 v22, v17
 ; GFX1250-SDAG-NEXT:    v_cvt_scalef32_pk16_fp6_f32 v[18:20], v[0:15], v16
 ; GFX1250-SDAG-NEXT:    global_store_b96 v[22:23], v[18:20], off
@@ -74,6 +79,7 @@ define amdgpu_ps void @test_scalef32_pk16_fp6_f32_vv(<16 x float> %src, float %s
 ;
 ; GFX1250-GISEL-LABEL: test_scalef32_pk16_fp6_f32_vv:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v22, v17 :: v_dual_mov_b32 v23, v18
 ; GFX1250-GISEL-NEXT:    v_cvt_scalef32_pk16_fp6_f32 v[18:20], v[0:15], v16
 ; GFX1250-GISEL-NEXT:    global_store_b96 v[22:23], v[18:20], off
@@ -86,6 +92,7 @@ define amdgpu_ps void @test_scalef32_pk16_fp6_f32_vv(<16 x float> %src, float %s
 define amdgpu_ps void @test_scalef32_pk16_fp6_f32_sl(<16 x float> inreg %src, ptr addrspace(1) %out) {
 ; GFX1250-SDAG-LABEL: test_scalef32_pk16_fp6_f32_sl:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v6, s4 :: v_dual_mov_b32 v7, s5
@@ -101,6 +108,7 @@ define amdgpu_ps void @test_scalef32_pk16_fp6_f32_sl(<16 x float> inreg %src, pt
 ;
 ; GFX1250-GISEL-LABEL: test_scalef32_pk16_fp6_f32_sl:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[16:17], s[14:15]
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[14:15], s[12:13]
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[12:13], s[10:11]
@@ -121,6 +129,7 @@ define amdgpu_ps void @test_scalef32_pk16_fp6_f32_sl(<16 x float> inreg %src, pt
 define amdgpu_ps void @test_scalef32_pk16_bf6_bf16_vv(<16 x bfloat> %src, float %scale, ptr addrspace(1) %out) {
 ; GFX1250-SDAG-LABEL: test_scalef32_pk16_bf6_bf16_vv:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v15, v10 :: v_dual_mov_b32 v14, v9
 ; GFX1250-SDAG-NEXT:    v_cvt_scalef32_pk16_bf6_bf16 v[10:12], v[0:7], v8
 ; GFX1250-SDAG-NEXT:    global_store_b96 v[14:15], v[10:12], off
@@ -128,6 +137,7 @@ define amdgpu_ps void @test_scalef32_pk16_bf6_bf16_vv(<16 x bfloat> %src, float
 ;
 ; GFX1250-GISEL-LABEL: test_scalef32_pk16_bf6_bf16_vv:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v15, v10 :: v_dual_mov_b32 v14, v9
 ; GFX1250-GISEL-NEXT:    v_cvt_scalef32_pk16_bf6_bf16 v[10:12], v[0:7], v8
 ; GFX1250-GISEL-NEXT:    global_store_b96 v[14:15], v[10:12], off
@@ -140,6 +150,7 @@ define amdgpu_ps void @test_scalef32_pk16_bf6_bf16_vv(<16 x bfloat> %src, float
 define amdgpu_ps void @test_scalef32_pk16_bf6_bf16_sl(<16 x bfloat> inreg %src, ptr addrspace(1) %out) {
 ; GFX1250-SDAG-LABEL: test_scalef32_pk16_bf6_bf16_sl:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v6, s4 :: v_dual_mov_b32 v7, s5
@@ -151,6 +162,7 @@ define amdgpu_ps void @test_scalef32_pk16_bf6_bf16_sl(<16 x bfloat> inreg %src,
 ;
 ; GFX1250-GISEL-LABEL: test_scalef32_pk16_bf6_bf16_sl:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v6, s4 :: v_dual_mov_b32 v7, s5
@@ -167,6 +179,7 @@ define amdgpu_ps void @test_scalef32_pk16_bf6_bf16_sl(<16 x bfloat> inreg %src,
 define amdgpu_ps void @test_scalef32_pk16_bf6_f16_vv(<16 x half> %src, float %scale, ptr addrspace(1) %out) {
 ; GFX1250-SDAG-LABEL: test_scalef32_pk16_bf6_f16_vv:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v15, v10 :: v_dual_mov_b32 v14, v9
 ; GFX1250-SDAG-NEXT:    v_cvt_scalef32_pk16_bf6_f16 v[10:12], v[0:7], v8
 ; GFX1250-SDAG-NEXT:    global_store_b96 v[14:15], v[10:12], off
@@ -174,6 +187,7 @@ define amdgpu_ps void @test_scalef32_pk16_bf6_f16_vv(<16 x half> %src, float %sc
 ;
 ; GFX1250-GISEL-LABEL: test_scalef32_pk16_bf6_f16_vv:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v14, v9 :: v_dual_mov_b32 v15, v10
 ; GFX1250-GISEL-NEXT:    v_cvt_scalef32_pk16_bf6_f16 v[10:12], v[0:7], v8
 ; GFX1250-GISEL-NEXT:    global_store_b96 v[14:15], v[10:12], off
@@ -186,6 +200,7 @@ define amdgpu_ps void @test_scalef32_pk16_bf6_f16_vv(<16 x half> %src, float %sc
 define amdgpu_ps void @test_scalef32_pk16_bf6_f16_sl(<16 x half> inreg %src, ptr addrspace(1) %out) {
 ; GFX1250-SDAG-LABEL: test_scalef32_pk16_bf6_f16_sl:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v6, s4 :: v_dual_mov_b32 v7, s5
@@ -197,6 +212,7 @@ define amdgpu_ps void @test_scalef32_pk16_bf6_f16_sl(<16 x half> inreg %src, ptr
 ;
 ; GFX1250-GISEL-LABEL: test_scalef32_pk16_bf6_f16_sl:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[6:7]
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[4:5]
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[2:3]
@@ -213,6 +229,7 @@ define amdgpu_ps void @test_scalef32_pk16_bf6_f16_sl(<16 x half> inreg %src, ptr
 define amdgpu_ps void @test_scalef32_pk16_fp6_bf16_vv(<16 x bfloat> %src, float %scale, ptr addrspace(1) %out) {
 ; GFX1250-SDAG-LABEL: test_scalef32_pk16_fp6_bf16_vv:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v15, v10 :: v_dual_mov_b32 v14, v9
 ; GFX1250-SDAG-NEXT:    v_cvt_scalef32_pk16_fp6_bf16 v[10:12], v[0:7], v8
 ; GFX1250-SDAG-NEXT:    global_store_b96 v[14:15], v[10:12], off
@@ -220,6 +237,7 @@ define amdgpu_ps void @test_scalef32_pk16_fp6_bf16_vv(<16 x bfloat> %src, float
 ;
 ; GFX1250-GISEL-LABEL: test_scalef32_pk16_fp6_bf16_vv:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v15, v10 :: v_dual_mov_b32 v14, v9
 ; GFX1250-GISEL-NEXT:    v_cvt_scalef32_pk16_fp6_bf16 v[10:12], v[0:7], v8
 ; GFX1250-GISEL-NEXT:    global_store_b96 v[14:15], v[10:12], off
@@ -232,6 +250,7 @@ define amdgpu_ps void @test_scalef32_pk16_fp6_bf16_vv(<16 x bfloat> %src, float
 define amdgpu_ps void @test_scalef32_pk16_fp6_bf16_sl(<16 x bfloat> inreg %src, ptr addrspace(1) %out) {
 ; GFX1250-SDAG-LABEL: test_scalef32_pk16_fp6_bf16_sl:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v6, s4 :: v_dual_mov_b32 v7, s5
@@ -243,6 +262,7 @@ define amdgpu_ps void @test_scalef32_pk16_fp6_bf16_sl(<16 x bfloat> inreg %src,
 ;
 ; GFX1250-GISEL-LABEL: test_scalef32_pk16_fp6_bf16_sl:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v6, s4 :: v_dual_mov_b32 v7, s5
@@ -259,6 +279,7 @@ define amdgpu_ps void @test_scalef32_pk16_fp6_bf16_sl(<16 x bfloat> inreg %src,
 define amdgpu_ps void @test_scalef32_pk16_fp6_f16_vv(<16 x half> %src, float %scale, ptr addrspace(1) %out) {
 ; GFX1250-SDAG-LABEL: test_scalef32_pk16_fp6_f16_vv:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v15, v10 :: v_dual_mov_b32 v14, v9
 ; GFX1250-SDAG-NEXT:    v_cvt_scalef32_pk16_fp6_f16 v[10:12], v[0:7], v8
 ; GFX1250-SDAG-NEXT:    global_store_b96 v[14:15], v[10:12], off
@@ -266,6 +287,7 @@ define amdgpu_ps void @test_scalef32_pk16_fp6_f16_vv(<16 x half> %src, float %sc
 ;
 ; GFX1250-GISEL-LABEL: test_scalef32_pk16_fp6_f16_vv:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v14, v9 :: v_dual_mov_b32 v15, v10
 ; GFX1250-GISEL-NEXT:    v_cvt_scalef32_pk16_fp6_f16 v[10:12], v[0:7], v8
 ; GFX1250-GISEL-NEXT:    global_store_b96 v[14:15], v[10:12], off
@@ -278,6 +300,7 @@ define amdgpu_ps void @test_scalef32_pk16_fp6_f16_vv(<16 x half> %src, float %sc
 define amdgpu_ps void @test_scalef32_pk16_fp6_f16_sl(<16 x half> inreg %src, ptr addrspace(1) %out) {
 ; GFX1250-SDAG-LABEL: test_scalef32_pk16_fp6_f16_sl:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v6, s4 :: v_dual_mov_b32 v7, s5
@@ -289,6 +312,7 @@ define amdgpu_ps void @test_scalef32_pk16_fp6_f16_sl(<16 x half> inreg %src, ptr
 ;
 ; GFX1250-GISEL-LABEL: test_scalef32_pk16_fp6_f16_sl:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[6:7]
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[4:5]
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[2:3]

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk8.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk8.ll
index cd0b081bf6f10..6839975c3270c 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk8.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.pk8.ll
@@ -15,6 +15,7 @@ declare i32 @llvm.amdgcn.cvt.scalef32.pk8.fp4.bf16(<8 x bfloat> %src, float %sca
 define amdgpu_ps void @test_scalef32_pk8_fp8_bf16_vv(<8 x bfloat> %src, float %scale, ptr addrspace(1) %out) {
 ; GFX1250-SDAG-LABEL: test_scalef32_pk8_fp8_bf16_vv:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v7, v6 :: v_dual_mov_b32 v6, v5
 ; GFX1250-SDAG-NEXT:    v_cvt_scalef32_pk8_fp8_bf16 v[8:9], v[0:3], v4
 ; GFX1250-SDAG-NEXT:    global_store_b64 v[6:7], v[8:9], off
@@ -22,6 +23,7 @@ define amdgpu_ps void @test_scalef32_pk8_fp8_bf16_vv(<8 x bfloat> %src, float %s
 ;
 ; GFX1250-GISEL-LABEL: test_scalef32_pk8_fp8_bf16_vv:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v7, v6 :: v_dual_mov_b32 v6, v5
 ; GFX1250-GISEL-NEXT:    v_cvt_scalef32_pk8_fp8_bf16 v[8:9], v[0:3], v4
 ; GFX1250-GISEL-NEXT:    global_store_b64 v[6:7], v[8:9], off
@@ -34,6 +36,7 @@ define amdgpu_ps void @test_scalef32_pk8_fp8_bf16_vv(<8 x bfloat> %src, float %s
 define amdgpu_ps void @test_scalef32_pk8_fp8_bf16_sl(<8 x bfloat> inreg %src, ptr addrspace(1) %out) {
 ; GFX1250-SDAG-LABEL: test_scalef32_pk8_fp8_bf16_sl:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
@@ -43,6 +46,7 @@ define amdgpu_ps void @test_scalef32_pk8_fp8_bf16_sl(<8 x bfloat> inreg %src, pt
 ;
 ; GFX1250-GISEL-LABEL: test_scalef32_pk8_fp8_bf16_sl:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
@@ -57,6 +61,7 @@ define amdgpu_ps void @test_scalef32_pk8_fp8_bf16_sl(<8 x bfloat> inreg %src, pt
 define amdgpu_ps void @test_scalef32_pk8_bf8_bf16_vv(<8 x bfloat> %src, float %scale, ptr addrspace(1) %out) {
 ; GFX1250-SDAG-LABEL: test_scalef32_pk8_bf8_bf16_vv:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v7, v6 :: v_dual_mov_b32 v6, v5
 ; GFX1250-SDAG-NEXT:    v_cvt_scalef32_pk8_bf8_bf16 v[8:9], v[0:3], v4
 ; GFX1250-SDAG-NEXT:    global_store_b64 v[6:7], v[8:9], off
@@ -64,6 +69,7 @@ define amdgpu_ps void @test_scalef32_pk8_bf8_bf16_vv(<8 x bfloat> %src, float %s
 ;
 ; GFX1250-GISEL-LABEL: test_scalef32_pk8_bf8_bf16_vv:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v7, v6 :: v_dual_mov_b32 v6, v5
 ; GFX1250-GISEL-NEXT:    v_cvt_scalef32_pk8_bf8_bf16 v[8:9], v[0:3], v4
 ; GFX1250-GISEL-NEXT:    global_store_b64 v[6:7], v[8:9], off
@@ -76,6 +82,7 @@ define amdgpu_ps void @test_scalef32_pk8_bf8_bf16_vv(<8 x bfloat> %src, float %s
 define amdgpu_ps void @test_scalef32_pk8_bf8_bf16_sl(<8 x bfloat> inreg %src, ptr addrspace(1) %out) {
 ; GFX1250-SDAG-LABEL: test_scalef32_pk8_bf8_bf16_sl:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
@@ -85,6 +92,7 @@ define amdgpu_ps void @test_scalef32_pk8_bf8_bf16_sl(<8 x bfloat> inreg %src, pt
 ;
 ; GFX1250-GISEL-LABEL: test_scalef32_pk8_bf8_bf16_sl:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
@@ -99,6 +107,7 @@ define amdgpu_ps void @test_scalef32_pk8_bf8_bf16_sl(<8 x bfloat> inreg %src, pt
 define amdgpu_ps void @test_scalef32_pk8_fp8_f16_vv(<8 x half> %src, float %scale, ptr addrspace(1) %out) {
 ; GFX1250-SDAG-LABEL: test_scalef32_pk8_fp8_f16_vv:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v7, v6 :: v_dual_mov_b32 v6, v5
 ; GFX1250-SDAG-NEXT:    v_cvt_scalef32_pk8_fp8_f16 v[8:9], v[0:3], v4
 ; GFX1250-SDAG-NEXT:    global_store_b64 v[6:7], v[8:9], off
@@ -106,6 +115,7 @@ define amdgpu_ps void @test_scalef32_pk8_fp8_f16_vv(<8 x half> %src, float %scal
 ;
 ; GFX1250-GISEL-LABEL: test_scalef32_pk8_fp8_f16_vv:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v8, v5 :: v_dual_mov_b32 v9, v6
 ; GFX1250-GISEL-NEXT:    v_cvt_scalef32_pk8_fp8_f16 v[6:7], v[0:3], v4
 ; GFX1250-GISEL-NEXT:    global_store_b64 v[8:9], v[6:7], off
@@ -118,6 +128,7 @@ define amdgpu_ps void @test_scalef32_pk8_fp8_f16_vv(<8 x half> %src, float %scal
 define amdgpu_ps void @test_scalef32_pk8_fp8_f16_sl(<8 x half> inreg %src, ptr addrspace(1) %out) {
 ; GFX1250-SDAG-LABEL: test_scalef32_pk8_fp8_f16_sl:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
@@ -127,6 +138,7 @@ define amdgpu_ps void @test_scalef32_pk8_fp8_f16_sl(<8 x half> inreg %src, ptr a
 ;
 ; GFX1250-GISEL-LABEL: test_scalef32_pk8_fp8_f16_sl:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[2:3]
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[0:1]
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
@@ -141,6 +153,7 @@ define amdgpu_ps void @test_scalef32_pk8_fp8_f16_sl(<8 x half> inreg %src, ptr a
 define amdgpu_ps void @test_scalef32_pk8_bf8_f16_vv(<8 x half> %src, float %scale, ptr addrspace(1) %out) {
 ; GFX1250-SDAG-LABEL: test_scalef32_pk8_bf8_f16_vv:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v7, v6 :: v_dual_mov_b32 v6, v5
 ; GFX1250-SDAG-NEXT:    v_cvt_scalef32_pk8_bf8_f16 v[8:9], v[0:3], v4
 ; GFX1250-SDAG-NEXT:    global_store_b64 v[6:7], v[8:9], off
@@ -148,6 +161,7 @@ define amdgpu_ps void @test_scalef32_pk8_bf8_f16_vv(<8 x half> %src, float %scal
 ;
 ; GFX1250-GISEL-LABEL: test_scalef32_pk8_bf8_f16_vv:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v8, v5 :: v_dual_mov_b32 v9, v6
 ; GFX1250-GISEL-NEXT:    v_cvt_scalef32_pk8_bf8_f16 v[6:7], v[0:3], v4
 ; GFX1250-GISEL-NEXT:    global_store_b64 v[8:9], v[6:7], off
@@ -160,6 +174,7 @@ define amdgpu_ps void @test_scalef32_pk8_bf8_f16_vv(<8 x half> %src, float %scal
 define amdgpu_ps void @test_scalef32_pk8_bf8_f16_sl(<8 x half> inreg %src, ptr addrspace(1) %out) {
 ; GFX1250-SDAG-LABEL: test_scalef32_pk8_bf8_f16_sl:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
@@ -169,6 +184,7 @@ define amdgpu_ps void @test_scalef32_pk8_bf8_f16_sl(<8 x half> inreg %src, ptr a
 ;
 ; GFX1250-GISEL-LABEL: test_scalef32_pk8_bf8_f16_sl:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[2:3]
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[0:1]
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
@@ -183,6 +199,7 @@ define amdgpu_ps void @test_scalef32_pk8_bf8_f16_sl(<8 x half> inreg %src, ptr a
 define amdgpu_ps void @test_scalef32_pk8_bf8_f32_vv(<8 x float> %src, float %scale, ptr addrspace(1) %out) {
 ; GFX1250-SDAG-LABEL: test_scalef32_pk8_bf8_f32_vv:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v11, v10 :: v_dual_mov_b32 v10, v9
 ; GFX1250-SDAG-NEXT:    v_cvt_scalef32_pk8_bf8_f32 v[12:13], v[0:7], v8
 ; GFX1250-SDAG-NEXT:    global_store_b64 v[10:11], v[12:13], off
@@ -190,6 +207,7 @@ define amdgpu_ps void @test_scalef32_pk8_bf8_f32_vv(<8 x float> %src, float %sca
 ;
 ; GFX1250-GISEL-LABEL: test_scalef32_pk8_bf8_f32_vv:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v12, v9 :: v_dual_mov_b32 v13, v10
 ; GFX1250-GISEL-NEXT:    v_cvt_scalef32_pk8_bf8_f32 v[10:11], v[0:7], v8
 ; GFX1250-GISEL-NEXT:    global_store_b64 v[12:13], v[10:11], off
@@ -202,6 +220,7 @@ define amdgpu_ps void @test_scalef32_pk8_bf8_f32_vv(<8 x float> %src, float %sca
 define amdgpu_ps void @test_scalef32_pk8_bf8_f32_sl(<8 x float> inreg %src, ptr addrspace(1) %out) {
 ; GFX1250-SDAG-LABEL: test_scalef32_pk8_bf8_f32_sl:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v6, s4 :: v_dual_mov_b32 v7, s5
@@ -213,6 +232,7 @@ define amdgpu_ps void @test_scalef32_pk8_bf8_f32_sl(<8 x float> inreg %src, ptr
 ;
 ; GFX1250-GISEL-LABEL: test_scalef32_pk8_bf8_f32_sl:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[6:7]
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[4:5]
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[2:3]
@@ -229,6 +249,7 @@ define amdgpu_ps void @test_scalef32_pk8_bf8_f32_sl(<8 x float> inreg %src, ptr
 define amdgpu_ps void @test_scalef32_pk8_fp8_f32_vv(<8 x float> %src, float %scale, ptr addrspace(1) %out) {
 ; GFX1250-SDAG-LABEL: test_scalef32_pk8_fp8_f32_vv:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v11, v10 :: v_dual_mov_b32 v10, v9
 ; GFX1250-SDAG-NEXT:    v_cvt_scalef32_pk8_fp8_f32 v[12:13], v[0:7], v8
 ; GFX1250-SDAG-NEXT:    global_store_b64 v[10:11], v[12:13], off
@@ -236,6 +257,7 @@ define amdgpu_ps void @test_scalef32_pk8_fp8_f32_vv(<8 x float> %src, float %sca
 ;
 ; GFX1250-GISEL-LABEL: test_scalef32_pk8_fp8_f32_vv:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v12, v9 :: v_dual_mov_b32 v13, v10
 ; GFX1250-GISEL-NEXT:    v_cvt_scalef32_pk8_fp8_f32 v[10:11], v[0:7], v8
 ; GFX1250-GISEL-NEXT:    global_store_b64 v[12:13], v[10:11], off
@@ -248,6 +270,7 @@ define amdgpu_ps void @test_scalef32_pk8_fp8_f32_vv(<8 x float> %src, float %sca
 define amdgpu_ps void @test_scalef32_pk8_fp8_f32_sl(<8 x float> inreg %src, ptr addrspace(1) %out) {
 ; GFX1250-SDAG-LABEL: test_scalef32_pk8_fp8_f32_sl:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v6, s4 :: v_dual_mov_b32 v7, s5
@@ -259,6 +282,7 @@ define amdgpu_ps void @test_scalef32_pk8_fp8_f32_sl(<8 x float> inreg %src, ptr
 ;
 ; GFX1250-GISEL-LABEL: test_scalef32_pk8_fp8_f32_sl:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[6:7]
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[4:5]
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[2:3]
@@ -275,6 +299,7 @@ define amdgpu_ps void @test_scalef32_pk8_fp8_f32_sl(<8 x float> inreg %src, ptr
 define amdgpu_ps void @test_scalef32_pk8_fp4_f32_vv(<8 x float> %src, float %scale, ptr addrspace(1) %out) {
 ; GFX1250-SDAG-LABEL: test_scalef32_pk8_fp4_f32_vv:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v11, v10 :: v_dual_mov_b32 v10, v9
 ; GFX1250-SDAG-NEXT:    v_cvt_scalef32_pk8_fp4_f32 v9, v[0:7], v8
 ; GFX1250-SDAG-NEXT:    global_store_b32 v[10:11], v9, off
@@ -282,6 +307,7 @@ define amdgpu_ps void @test_scalef32_pk8_fp4_f32_vv(<8 x float> %src, float %sca
 ;
 ; GFX1250-GISEL-LABEL: test_scalef32_pk8_fp4_f32_vv:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v12, v9 :: v_dual_mov_b32 v13, v10
 ; GFX1250-GISEL-NEXT:    v_cvt_scalef32_pk8_fp4_f32 v9, v[0:7], v8
 ; GFX1250-GISEL-NEXT:    global_store_b32 v[12:13], v9, off
@@ -294,6 +320,7 @@ define amdgpu_ps void @test_scalef32_pk8_fp4_f32_vv(<8 x float> %src, float %sca
 define amdgpu_ps void @test_scalef32_pk8_fp4_f32_sl(<8 x float> inreg %src, ptr addrspace(1) %out) {
 ; GFX1250-SDAG-LABEL: test_scalef32_pk8_fp4_f32_sl:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v6, s4 :: v_dual_mov_b32 v7, s5
@@ -305,6 +332,7 @@ define amdgpu_ps void @test_scalef32_pk8_fp4_f32_sl(<8 x float> inreg %src, ptr
 ;
 ; GFX1250-GISEL-LABEL: test_scalef32_pk8_fp4_f32_sl:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[6:7]
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[4:5]
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[2:3]
@@ -321,6 +349,7 @@ define amdgpu_ps void @test_scalef32_pk8_fp4_f32_sl(<8 x float> inreg %src, ptr
 define amdgpu_ps void @test_scalef32_pk8_fp4_f16_vv(<8 x half> %src, float %scale, ptr addrspace(1) %out) {
 ; GFX1250-SDAG-LABEL: test_scalef32_pk8_fp4_f16_vv:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v7, v6 :: v_dual_mov_b32 v6, v5
 ; GFX1250-SDAG-NEXT:    v_cvt_scalef32_pk8_fp4_f16 v5, v[0:3], v4
 ; GFX1250-SDAG-NEXT:    global_store_b32 v[6:7], v5, off
@@ -328,6 +357,7 @@ define amdgpu_ps void @test_scalef32_pk8_fp4_f16_vv(<8 x half> %src, float %scal
 ;
 ; GFX1250-GISEL-LABEL: test_scalef32_pk8_fp4_f16_vv:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v8, v5 :: v_dual_mov_b32 v9, v6
 ; GFX1250-GISEL-NEXT:    v_cvt_scalef32_pk8_fp4_f16 v5, v[0:3], v4
 ; GFX1250-GISEL-NEXT:    global_store_b32 v[8:9], v5, off
@@ -340,6 +370,7 @@ define amdgpu_ps void @test_scalef32_pk8_fp4_f16_vv(<8 x half> %src, float %scal
 define amdgpu_ps void @test_scalef32_pk8_fp4_f16_sl(<8 x half> inreg %src, ptr addrspace(1) %out) {
 ; GFX1250-SDAG-LABEL: test_scalef32_pk8_fp4_f16_sl:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
@@ -349,6 +380,7 @@ define amdgpu_ps void @test_scalef32_pk8_fp4_f16_sl(<8 x half> inreg %src, ptr a
 ;
 ; GFX1250-GISEL-LABEL: test_scalef32_pk8_fp4_f16_sl:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[2:3]
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[0:1]
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
@@ -363,6 +395,7 @@ define amdgpu_ps void @test_scalef32_pk8_fp4_f16_sl(<8 x half> inreg %src, ptr a
 define amdgpu_ps void @test_scalef32_pk8_fp4_bf16_vv(<8 x bfloat> %src, float %scale, ptr addrspace(1) %out) {
 ; GFX1250-SDAG-LABEL: test_scalef32_pk8_fp4_bf16_vv:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v7, v6 :: v_dual_mov_b32 v6, v5
 ; GFX1250-SDAG-NEXT:    v_cvt_scalef32_pk8_fp4_bf16 v5, v[0:3], v4
 ; GFX1250-SDAG-NEXT:    global_store_b32 v[6:7], v5, off
@@ -370,6 +403,7 @@ define amdgpu_ps void @test_scalef32_pk8_fp4_bf16_vv(<8 x bfloat> %src, float %s
 ;
 ; GFX1250-GISEL-LABEL: test_scalef32_pk8_fp4_bf16_vv:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v7, v6 :: v_dual_mov_b32 v6, v5
 ; GFX1250-GISEL-NEXT:    v_cvt_scalef32_pk8_fp4_bf16 v5, v[0:3], v4
 ; GFX1250-GISEL-NEXT:    global_store_b32 v[6:7], v5, off
@@ -382,6 +416,7 @@ define amdgpu_ps void @test_scalef32_pk8_fp4_bf16_vv(<8 x bfloat> %src, float %s
 define amdgpu_ps void @test_scalef32_pk8_fp4_bf16_sl(<8 x bfloat> inreg %src, ptr addrspace(1) %out) {
 ; GFX1250-SDAG-LABEL: test_scalef32_pk8_fp4_bf16_sl:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
@@ -391,6 +426,7 @@ define amdgpu_ps void @test_scalef32_pk8_fp4_bf16_sl(<8 x bfloat> inreg %src, pt
 ;
 ; GFX1250-GISEL-LABEL: test_scalef32_pk8_fp4_bf16_sl:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.sr.pk.gfx1250.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.sr.pk.gfx1250.ll
index d33acf6ca7f76..b4789541044b1 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.sr.pk.gfx1250.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.sr.pk.gfx1250.ll
@@ -15,12 +15,14 @@ declare i32 @llvm.amdgcn.cvt.scalef32.sr.pk8.fp4.bf16(<8 x bfloat> %src, i32 %sr
 define amdgpu_ps void @test_scalef32_sr_pk8_fp8_bf16_vv(<8 x bfloat> %src, i32 %sr, float %scale, ptr addrspace(1) %out) {
 ; GFX1250-SDAG-LABEL: test_scalef32_sr_pk8_fp8_bf16_vv:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_cvt_scalef32_sr_pk8_fp8_bf16 v[8:9], v[0:3], v4, v5
 ; GFX1250-SDAG-NEXT:    global_store_b64 v[6:7], v[8:9], off
 ; GFX1250-SDAG-NEXT:    s_endpgm
 ;
 ; GFX1250-GISEL-LABEL: test_scalef32_sr_pk8_fp8_bf16_vv:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_cvt_scalef32_sr_pk8_fp8_bf16 v[8:9], v[0:3], v4, v5
 ; GFX1250-GISEL-NEXT:    global_store_b64 v[6:7], v[8:9], off
 ; GFX1250-GISEL-NEXT:    s_endpgm
@@ -32,6 +34,7 @@ define amdgpu_ps void @test_scalef32_sr_pk8_fp8_bf16_vv(<8 x bfloat> %src, i32 %
 define amdgpu_ps void @test_scalef32_sr_pk8_fp8_bf16_sl(<8 x bfloat> inreg %src, i32 inreg %sr, ptr addrspace(1) %out) {
 ; GFX1250-SDAG-LABEL: test_scalef32_sr_pk8_fp8_bf16_sl:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
@@ -41,6 +44,7 @@ define amdgpu_ps void @test_scalef32_sr_pk8_fp8_bf16_sl(<8 x bfloat> inreg %src,
 ;
 ; GFX1250-GISEL-LABEL: test_scalef32_sr_pk8_fp8_bf16_sl:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
@@ -55,12 +59,14 @@ define amdgpu_ps void @test_scalef32_sr_pk8_fp8_bf16_sl(<8 x bfloat> inreg %src,
 define amdgpu_ps void @test_scalef32_sr_pk8_bf8_bf16_vv(<8 x bfloat> %src, i32 %sr, float %scale, ptr addrspace(1) %out) {
 ; GFX1250-SDAG-LABEL: test_scalef32_sr_pk8_bf8_bf16_vv:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_cvt_scalef32_sr_pk8_bf8_bf16 v[8:9], v[0:3], v4, v5
 ; GFX1250-SDAG-NEXT:    global_store_b64 v[6:7], v[8:9], off
 ; GFX1250-SDAG-NEXT:    s_endpgm
 ;
 ; GFX1250-GISEL-LABEL: test_scalef32_sr_pk8_bf8_bf16_vv:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_cvt_scalef32_sr_pk8_bf8_bf16 v[8:9], v[0:3], v4, v5
 ; GFX1250-GISEL-NEXT:    global_store_b64 v[6:7], v[8:9], off
 ; GFX1250-GISEL-NEXT:    s_endpgm
@@ -72,6 +78,7 @@ define amdgpu_ps void @test_scalef32_sr_pk8_bf8_bf16_vv(<8 x bfloat> %src, i32 %
 define amdgpu_ps void @test_scalef32_sr_pk8_bf8_bf16_sl(<8 x bfloat> inreg %src, i32 inreg %sr, ptr addrspace(1) %out) {
 ; GFX1250-SDAG-LABEL: test_scalef32_sr_pk8_bf8_bf16_sl:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
@@ -81,6 +88,7 @@ define amdgpu_ps void @test_scalef32_sr_pk8_bf8_bf16_sl(<8 x bfloat> inreg %src,
 ;
 ; GFX1250-GISEL-LABEL: test_scalef32_sr_pk8_bf8_bf16_sl:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
@@ -95,12 +103,14 @@ define amdgpu_ps void @test_scalef32_sr_pk8_bf8_bf16_sl(<8 x bfloat> inreg %src,
 define amdgpu_ps void @test_scalef32_sr_pk8_fp8_f16_vv(<8 x half> %src, i32 %sr, float %scale, ptr addrspace(1) %out) {
 ; GFX1250-SDAG-LABEL: test_scalef32_sr_pk8_fp8_f16_vv:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_cvt_scalef32_sr_pk8_fp8_f16 v[8:9], v[0:3], v4, v5
 ; GFX1250-SDAG-NEXT:    global_store_b64 v[6:7], v[8:9], off
 ; GFX1250-SDAG-NEXT:    s_endpgm
 ;
 ; GFX1250-GISEL-LABEL: test_scalef32_sr_pk8_fp8_f16_vv:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_cvt_scalef32_sr_pk8_fp8_f16 v[8:9], v[0:3], v4, v5
 ; GFX1250-GISEL-NEXT:    global_store_b64 v[6:7], v[8:9], off
 ; GFX1250-GISEL-NEXT:    s_endpgm
@@ -112,6 +122,7 @@ define amdgpu_ps void @test_scalef32_sr_pk8_fp8_f16_vv(<8 x half> %src, i32 %sr,
 define amdgpu_ps void @test_scalef32_sr_pk8_fp8_f16_sl(<8 x half> inreg %src, i32 inreg %sr, ptr addrspace(1) %out) {
 ; GFX1250-SDAG-LABEL: test_scalef32_sr_pk8_fp8_f16_sl:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
@@ -121,6 +132,7 @@ define amdgpu_ps void @test_scalef32_sr_pk8_fp8_f16_sl(<8 x half> inreg %src, i3
 ;
 ; GFX1250-GISEL-LABEL: test_scalef32_sr_pk8_fp8_f16_sl:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[2:3]
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[0:1]
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
@@ -135,12 +147,14 @@ define amdgpu_ps void @test_scalef32_sr_pk8_fp8_f16_sl(<8 x half> inreg %src, i3
 define amdgpu_ps void @test_scalef32_sr_pk8_bf8_f16_vv(<8 x half> %src, i32 %sr, float %scale, ptr addrspace(1) %out) {
 ; GFX1250-SDAG-LABEL: test_scalef32_sr_pk8_bf8_f16_vv:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_cvt_scalef32_sr_pk8_bf8_f16 v[8:9], v[0:3], v4, v5
 ; GFX1250-SDAG-NEXT:    global_store_b64 v[6:7], v[8:9], off
 ; GFX1250-SDAG-NEXT:    s_endpgm
 ;
 ; GFX1250-GISEL-LABEL: test_scalef32_sr_pk8_bf8_f16_vv:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_cvt_scalef32_sr_pk8_bf8_f16 v[8:9], v[0:3], v4, v5
 ; GFX1250-GISEL-NEXT:    global_store_b64 v[6:7], v[8:9], off
 ; GFX1250-GISEL-NEXT:    s_endpgm
@@ -152,6 +166,7 @@ define amdgpu_ps void @test_scalef32_sr_pk8_bf8_f16_vv(<8 x half> %src, i32 %sr,
 define amdgpu_ps void @test_scalef32_sr_pk8_bf8_f16_sl(<8 x half> inreg %src, i32 inreg %sr, ptr addrspace(1) %out) {
 ; GFX1250-SDAG-LABEL: test_scalef32_sr_pk8_bf8_f16_sl:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
@@ -161,6 +176,7 @@ define amdgpu_ps void @test_scalef32_sr_pk8_bf8_f16_sl(<8 x half> inreg %src, i3
 ;
 ; GFX1250-GISEL-LABEL: test_scalef32_sr_pk8_bf8_f16_sl:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[2:3]
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[0:1]
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
@@ -175,12 +191,14 @@ define amdgpu_ps void @test_scalef32_sr_pk8_bf8_f16_sl(<8 x half> inreg %src, i3
 define amdgpu_ps void @test_scalef32_sr_pk8_bf8_f32_vv(<8 x float> %src, i32 %sr, float %scale, ptr addrspace(1) %out) {
 ; GFX1250-SDAG-LABEL: test_scalef32_sr_pk8_bf8_f32_vv:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_cvt_scalef32_sr_pk8_bf8_f32 v[12:13], v[0:7], v8, v9
 ; GFX1250-SDAG-NEXT:    global_store_b64 v[10:11], v[12:13], off
 ; GFX1250-SDAG-NEXT:    s_endpgm
 ;
 ; GFX1250-GISEL-LABEL: test_scalef32_sr_pk8_bf8_f32_vv:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_cvt_scalef32_sr_pk8_bf8_f32 v[12:13], v[0:7], v8, v9
 ; GFX1250-GISEL-NEXT:    global_store_b64 v[10:11], v[12:13], off
 ; GFX1250-GISEL-NEXT:    s_endpgm
@@ -192,6 +210,7 @@ define amdgpu_ps void @test_scalef32_sr_pk8_bf8_f32_vv(<8 x float> %src, i32 %sr
 define amdgpu_ps void @test_scalef32_sr_pk8_bf8_f32_sl(<8 x float> inreg %src, i32 inreg %sr, ptr addrspace(1) %out) {
 ; GFX1250-SDAG-LABEL: test_scalef32_sr_pk8_bf8_f32_sl:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v6, s4 :: v_dual_mov_b32 v7, s5
@@ -203,6 +222,7 @@ define amdgpu_ps void @test_scalef32_sr_pk8_bf8_f32_sl(<8 x float> inreg %src, i
 ;
 ; GFX1250-GISEL-LABEL: test_scalef32_sr_pk8_bf8_f32_sl:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[6:7]
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[4:5]
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[2:3]
@@ -219,12 +239,14 @@ define amdgpu_ps void @test_scalef32_sr_pk8_bf8_f32_sl(<8 x float> inreg %src, i
 define amdgpu_ps void @test_scalef32_sr_pk8_fp8_f32_vv(<8 x float> %src, i32 %sr, float %scale, ptr addrspace(1) %out) {
 ; GFX1250-SDAG-LABEL: test_scalef32_sr_pk8_fp8_f32_vv:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_cvt_scalef32_sr_pk8_fp8_f32 v[12:13], v[0:7], v8, v9
 ; GFX1250-SDAG-NEXT:    global_store_b64 v[10:11], v[12:13], off
 ; GFX1250-SDAG-NEXT:    s_endpgm
 ;
 ; GFX1250-GISEL-LABEL: test_scalef32_sr_pk8_fp8_f32_vv:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_cvt_scalef32_sr_pk8_fp8_f32 v[12:13], v[0:7], v8, v9
 ; GFX1250-GISEL-NEXT:    global_store_b64 v[10:11], v[12:13], off
 ; GFX1250-GISEL-NEXT:    s_endpgm
@@ -236,6 +258,7 @@ define amdgpu_ps void @test_scalef32_sr_pk8_fp8_f32_vv(<8 x float> %src, i32 %sr
 define amdgpu_ps void @test_scalef32_sr_pk8_fp8_f32_sl(<8 x float> inreg %src, i32 inreg %sr, ptr addrspace(1) %out) {
 ; GFX1250-SDAG-LABEL: test_scalef32_sr_pk8_fp8_f32_sl:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v6, s4 :: v_dual_mov_b32 v7, s5
@@ -247,6 +270,7 @@ define amdgpu_ps void @test_scalef32_sr_pk8_fp8_f32_sl(<8 x float> inreg %src, i
 ;
 ; GFX1250-GISEL-LABEL: test_scalef32_sr_pk8_fp8_f32_sl:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[6:7]
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[4:5]
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[2:3]
@@ -263,12 +287,14 @@ define amdgpu_ps void @test_scalef32_sr_pk8_fp8_f32_sl(<8 x float> inreg %src, i
 define amdgpu_ps void @test_scalef32_sr_pk8_fp4_f32_vv(<8 x float> %src, i32 %sr, float %scale, ptr addrspace(1) %out) {
 ; GFX1250-SDAG-LABEL: test_scalef32_sr_pk8_fp4_f32_vv:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_cvt_scalef32_sr_pk8_fp4_f32 v12, v[0:7], v8, v9
 ; GFX1250-SDAG-NEXT:    global_store_b32 v[10:11], v12, off
 ; GFX1250-SDAG-NEXT:    s_endpgm
 ;
 ; GFX1250-GISEL-LABEL: test_scalef32_sr_pk8_fp4_f32_vv:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_cvt_scalef32_sr_pk8_fp4_f32 v12, v[0:7], v8, v9
 ; GFX1250-GISEL-NEXT:    global_store_b32 v[10:11], v12, off
 ; GFX1250-GISEL-NEXT:    s_endpgm
@@ -280,6 +306,7 @@ define amdgpu_ps void @test_scalef32_sr_pk8_fp4_f32_vv(<8 x float> %src, i32 %sr
 define amdgpu_ps void @test_scalef32_sr_pk8_fp4_f32_sl(<8 x float> inreg %src, i32 inreg %sr, ptr addrspace(1) %out) {
 ; GFX1250-SDAG-LABEL: test_scalef32_sr_pk8_fp4_f32_sl:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v6, s4 :: v_dual_mov_b32 v7, s5
@@ -291,6 +318,7 @@ define amdgpu_ps void @test_scalef32_sr_pk8_fp4_f32_sl(<8 x float> inreg %src, i
 ;
 ; GFX1250-GISEL-LABEL: test_scalef32_sr_pk8_fp4_f32_sl:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[6:7]
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[4:5]
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[2:3]
@@ -307,12 +335,14 @@ define amdgpu_ps void @test_scalef32_sr_pk8_fp4_f32_sl(<8 x float> inreg %src, i
 define amdgpu_ps void @test_scalef32_sr_pk8_fp4_f16_vv(<8 x half> %src, i32 %sr, float %scale, ptr addrspace(1) %out) {
 ; GFX1250-SDAG-LABEL: test_scalef32_sr_pk8_fp4_f16_vv:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_cvt_scalef32_sr_pk8_fp4_f16 v8, v[0:3], v4, v5
 ; GFX1250-SDAG-NEXT:    global_store_b32 v[6:7], v8, off
 ; GFX1250-SDAG-NEXT:    s_endpgm
 ;
 ; GFX1250-GISEL-LABEL: test_scalef32_sr_pk8_fp4_f16_vv:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_cvt_scalef32_sr_pk8_fp4_f16 v8, v[0:3], v4, v5
 ; GFX1250-GISEL-NEXT:    global_store_b32 v[6:7], v8, off
 ; GFX1250-GISEL-NEXT:    s_endpgm
@@ -324,6 +354,7 @@ define amdgpu_ps void @test_scalef32_sr_pk8_fp4_f16_vv(<8 x half> %src, i32 %sr,
 define amdgpu_ps void @test_scalef32_sr_pk8_fp4_f16_sl(<8 x half> inreg %src, i32 inreg %sr, ptr addrspace(1) %out) {
 ; GFX1250-SDAG-LABEL: test_scalef32_sr_pk8_fp4_f16_sl:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
@@ -333,6 +364,7 @@ define amdgpu_ps void @test_scalef32_sr_pk8_fp4_f16_sl(<8 x half> inreg %src, i3
 ;
 ; GFX1250-GISEL-LABEL: test_scalef32_sr_pk8_fp4_f16_sl:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[2:3]
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[2:3], s[0:1]
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
@@ -347,12 +379,14 @@ define amdgpu_ps void @test_scalef32_sr_pk8_fp4_f16_sl(<8 x half> inreg %src, i3
 define amdgpu_ps void @test_scalef32_sr_pk8_fp4_bf16_vv(<8 x bfloat> %src, i32 %sr, float %scale, ptr addrspace(1) %out) {
 ; GFX1250-SDAG-LABEL: test_scalef32_sr_pk8_fp4_bf16_vv:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_cvt_scalef32_sr_pk8_fp4_bf16 v8, v[0:3], v4, v5
 ; GFX1250-SDAG-NEXT:    global_store_b32 v[6:7], v8, off
 ; GFX1250-SDAG-NEXT:    s_endpgm
 ;
 ; GFX1250-GISEL-LABEL: test_scalef32_sr_pk8_fp4_bf16_vv:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_cvt_scalef32_sr_pk8_fp4_bf16 v8, v[0:3], v4, v5
 ; GFX1250-GISEL-NEXT:    global_store_b32 v[6:7], v8, off
 ; GFX1250-GISEL-NEXT:    s_endpgm
@@ -364,6 +398,7 @@ define amdgpu_ps void @test_scalef32_sr_pk8_fp4_bf16_vv(<8 x bfloat> %src, i32 %
 define amdgpu_ps void @test_scalef32_sr_pk8_fp4_bf16_sl(<8 x bfloat> inreg %src, i32 inreg %sr, ptr addrspace(1) %out) {
 ; GFX1250-SDAG-LABEL: test_scalef32_sr_pk8_fp4_bf16_sl:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3
 ; GFX1250-SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1)
@@ -373,6 +408,7 @@ define amdgpu_ps void @test_scalef32_sr_pk8_fp4_bf16_sl(<8 x bfloat> inreg %src,
 ;
 ; GFX1250-GISEL-LABEL: test_scalef32_sr_pk8_fp4_bf16_sl:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.sr.pk16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.sr.pk16.ll
index c4395182d6719..0e20fc69886db 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.sr.pk16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.scalef32.sr.pk16.ll
@@ -12,6 +12,7 @@ declare <3 x i32> @llvm.amdgcn.cvt.scalef32.sr.pk16.fp6.f32(<16 x float> %src, i
 define amdgpu_ps void @test_scalef32_sr_pk16_bf6_bf16_vv(<16 x bfloat> %src, i32 %sr, float %scale, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_scalef32_sr_pk16_bf6_bf16_vv:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_cvt_scalef32_sr_pk16_bf6_bf16 v[12:14], v[0:7], v8, v9
 ; GFX1250-NEXT:    global_store_b96 v[10:11], v[12:14], off
 ; GFX1250-NEXT:    s_endpgm
@@ -23,6 +24,7 @@ define amdgpu_ps void @test_scalef32_sr_pk16_bf6_bf16_vv(<16 x bfloat> %src, i32
 define amdgpu_ps void @test_scalef32_sr_pk16_bf6_bf16_sl(<16 x bfloat> inreg %src, i32 inreg %sr, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_scalef32_sr_pk16_bf6_bf16_sl:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
 ; GFX1250-NEXT:    v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3
 ; GFX1250-NEXT:    v_dual_mov_b32 v6, s4 :: v_dual_mov_b32 v7, s5
@@ -39,6 +41,7 @@ define amdgpu_ps void @test_scalef32_sr_pk16_bf6_bf16_sl(<16 x bfloat> inreg %sr
 define amdgpu_ps void @test_scalef32_sr_pk16_bf6_f16_vv(<16 x half> %src, i32 %sr, float %scale, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_scalef32_sr_pk16_bf6_f16_vv:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_cvt_scalef32_sr_pk16_bf6_f16 v[12:14], v[0:7], v8, v9
 ; GFX1250-NEXT:    global_store_b96 v[10:11], v[12:14], off
 ; GFX1250-NEXT:    s_endpgm
@@ -50,6 +53,7 @@ define amdgpu_ps void @test_scalef32_sr_pk16_bf6_f16_vv(<16 x half> %src, i32 %s
 define amdgpu_ps void @test_scalef32_sr_pk16_bf6_f16_sl(<16 x half> inreg %src, i32 inreg %sr, ptr addrspace(1) %out) {
 ; GFX1250-SDAG-LABEL: test_scalef32_sr_pk16_bf6_f16_sl:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v6, s4 :: v_dual_mov_b32 v7, s5
@@ -61,6 +65,7 @@ define amdgpu_ps void @test_scalef32_sr_pk16_bf6_f16_sl(<16 x half> inreg %src,
 ;
 ; GFX1250-GISEL-LABEL: test_scalef32_sr_pk16_bf6_f16_sl:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[6:7]
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[4:5]
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[2:3]
@@ -77,6 +82,7 @@ define amdgpu_ps void @test_scalef32_sr_pk16_bf6_f16_sl(<16 x half> inreg %src,
 define amdgpu_ps void @test_scalef32_sr_pk16_fp6_bf16_vv(<16 x bfloat> %src, i32 %sr, float %scale, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_scalef32_sr_pk16_fp6_bf16_vv:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_cvt_scalef32_sr_pk16_fp6_bf16 v[12:14], v[0:7], v8, v9
 ; GFX1250-NEXT:    global_store_b96 v[10:11], v[12:14], off
 ; GFX1250-NEXT:    s_endpgm
@@ -88,6 +94,7 @@ define amdgpu_ps void @test_scalef32_sr_pk16_fp6_bf16_vv(<16 x bfloat> %src, i32
 define amdgpu_ps void @test_scalef32_sr_pk16_fp6_bf16_sl(<16 x bfloat> inreg %src, i32 inreg %sr, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_scalef32_sr_pk16_fp6_bf16_sl:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
 ; GFX1250-NEXT:    v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3
 ; GFX1250-NEXT:    v_dual_mov_b32 v6, s4 :: v_dual_mov_b32 v7, s5
@@ -104,6 +111,7 @@ define amdgpu_ps void @test_scalef32_sr_pk16_fp6_bf16_sl(<16 x bfloat> inreg %sr
 define amdgpu_ps void @test_scalef32_sr_pk16_fp6_f16_vv(<16 x half> %src, i32 %sr, float %scale, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_scalef32_sr_pk16_fp6_f16_vv:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_cvt_scalef32_sr_pk16_fp6_f16 v[12:14], v[0:7], v8, v9
 ; GFX1250-NEXT:    global_store_b96 v[10:11], v[12:14], off
 ; GFX1250-NEXT:    s_endpgm
@@ -115,6 +123,7 @@ define amdgpu_ps void @test_scalef32_sr_pk16_fp6_f16_vv(<16 x half> %src, i32 %s
 define amdgpu_ps void @test_scalef32_sr_pk16_fp6_f16_sl(<16 x half> inreg %src, i32 inreg %sr, ptr addrspace(1) %out) {
 ; GFX1250-SDAG-LABEL: test_scalef32_sr_pk16_fp6_f16_sl:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v6, s4 :: v_dual_mov_b32 v7, s5
@@ -126,6 +135,7 @@ define amdgpu_ps void @test_scalef32_sr_pk16_fp6_f16_sl(<16 x half> inreg %src,
 ;
 ; GFX1250-GISEL-LABEL: test_scalef32_sr_pk16_fp6_f16_sl:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[8:9], s[6:7]
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[6:7], s[4:5]
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[4:5], s[2:3]
@@ -142,6 +152,7 @@ define amdgpu_ps void @test_scalef32_sr_pk16_fp6_f16_sl(<16 x half> inreg %src,
 define amdgpu_ps void @test_scalef32_sr_pk16_bf6_f32_vv(<16 x float> %src, i32 %sr, float %scale, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_scalef32_sr_pk16_bf6_f32_vv:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_cvt_scalef32_sr_pk16_bf6_f32 v[20:22], v[0:15], v16, v17
 ; GFX1250-NEXT:    global_store_b96 v[18:19], v[20:22], off
 ; GFX1250-NEXT:    s_endpgm
@@ -153,6 +164,7 @@ define amdgpu_ps void @test_scalef32_sr_pk16_bf6_f32_vv(<16 x float> %src, i32 %
 define amdgpu_ps void @test_scalef32_sr_pk16_bf6_f32_sl(<16 x float> inreg %src, i32 inreg %sr, ptr addrspace(1) %out) {
 ; GFX1250-SDAG-LABEL: test_scalef32_sr_pk16_bf6_f32_sl:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v6, s4 :: v_dual_mov_b32 v7, s5
@@ -168,6 +180,7 @@ define amdgpu_ps void @test_scalef32_sr_pk16_bf6_f32_sl(<16 x float> inreg %src,
 ;
 ; GFX1250-GISEL-LABEL: test_scalef32_sr_pk16_bf6_f32_sl:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[16:17], s[14:15]
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[14:15], s[12:13]
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[12:13], s[10:11]
@@ -188,6 +201,7 @@ define amdgpu_ps void @test_scalef32_sr_pk16_bf6_f32_sl(<16 x float> inreg %src,
 define amdgpu_ps void @test_scalef32_sr_pk16_fp6_f32_vv(<16 x float> %src, i32 %sr, float %scale, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_scalef32_sr_pk16_fp6_f32_vv:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_cvt_scalef32_sr_pk16_fp6_f32 v[20:22], v[0:15], v16, v17
 ; GFX1250-NEXT:    global_store_b96 v[18:19], v[20:22], off
 ; GFX1250-NEXT:    s_endpgm
@@ -199,6 +213,7 @@ define amdgpu_ps void @test_scalef32_sr_pk16_fp6_f32_vv(<16 x float> %src, i32 %
 define amdgpu_ps void @test_scalef32_sr_pk16_fp6_f32_sl(<16 x float> inreg %src, i32 inreg %sr, ptr addrspace(1) %out) {
 ; GFX1250-SDAG-LABEL: test_scalef32_sr_pk16_fp6_f32_sl:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v6, s4 :: v_dual_mov_b32 v7, s5
@@ -214,6 +229,7 @@ define amdgpu_ps void @test_scalef32_sr_pk16_fp6_f32_sl(<16 x float> inreg %src,
 ;
 ; GFX1250-GISEL-LABEL: test_scalef32_sr_pk16_fp6_f32_sl:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[16:17], s[14:15]
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[14:15], s[12:13]
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[12:13], s[10:11]

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.sr.pk.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.sr.pk.bf16.ll
index 82991aed706a1..1fb3b1a97f0e4 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.sr.pk.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.sr.pk.bf16.ll
@@ -9,6 +9,7 @@ declare <2 x bfloat> @llvm.amdgcn.cvt.sr.pk.bf16.f32(float, float, i32) #0
 define amdgpu_ps float @cvt_sr_pk_bf16_f32_vvv(float %src0, float %src1, i32 %src2) #1 {
 ; GCN-LABEL: cvt_sr_pk_bf16_f32_vvv:
 ; GCN:       ; %bb.0:
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    v_cvt_sr_pk_bf16_f32 v0, v0, v1, v2
 ; GCN-NEXT:    ; return to shader part epilog
   %cvt = call <2 x bfloat> @llvm.amdgcn.cvt.sr.pk.bf16.f32(float %src0, float %src1, i32 %src2) #0
@@ -19,6 +20,7 @@ define amdgpu_ps float @cvt_sr_pk_bf16_f32_vvv(float %src0, float %src1, i32 %sr
 define amdgpu_ps float @cvt_sr_pk_bf16_f32_sss(float inreg %src0, float inreg %src1, i32 inreg %src2) #1 {
 ; GCN-LABEL: cvt_sr_pk_bf16_f32_sss:
 ; GCN:       ; %bb.0:
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    v_mov_b32_e32 v0, s2
 ; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GCN-NEXT:    v_cvt_sr_pk_bf16_f32 v0, s0, s1, v0
@@ -31,6 +33,7 @@ define amdgpu_ps float @cvt_sr_pk_bf16_f32_sss(float inreg %src0, float inreg %s
 define amdgpu_ps float @cvt_sr_pk_bf16_f32_vvi(float %src0, float %src1) #1 {
 ; GCN-LABEL: cvt_sr_pk_bf16_f32_vvi:
 ; GCN:       ; %bb.0:
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    v_cvt_sr_pk_bf16_f32 v0, v0, v1, 0x10002
 ; GCN-NEXT:    ; return to shader part epilog
   %cvt = call <2 x bfloat> @llvm.amdgcn.cvt.sr.pk.bf16.f32(float %src0, float %src1, i32 65538) #0
@@ -41,6 +44,7 @@ define amdgpu_ps float @cvt_sr_pk_bf16_f32_vvi(float %src0, float %src1) #1 {
 define amdgpu_ps float @cvt_sr_pk_bf16_f32_vvi_mods(float %src0, float %src1) #1 {
 ; GCN-LABEL: cvt_sr_pk_bf16_f32_vvi_mods:
 ; GCN:       ; %bb.0:
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    v_cvt_sr_pk_bf16_f32 v0, -v0, |v1|, 1
 ; GCN-NEXT:    ; return to shader part epilog
   %s0 = fneg float %src0
@@ -53,6 +57,7 @@ define amdgpu_ps float @cvt_sr_pk_bf16_f32_vvi_mods(float %src0, float %src1) #1
 define amdgpu_ps float @cvt_sr_pk_bf16_f32_ssi(float inreg %src0, float inreg %src1) #1 {
 ; GCN-LABEL: cvt_sr_pk_bf16_f32_ssi:
 ; GCN:       ; %bb.0:
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    v_cvt_sr_pk_bf16_f32 v0, s0, s1, 1
 ; GCN-NEXT:    ; return to shader part epilog
   %cvt = call <2 x bfloat> @llvm.amdgcn.cvt.sr.pk.bf16.f32(float %src0, float %src1, i32 1) #0

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.flat.prefetch.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.flat.prefetch.ll
index d5fba2df0b828..8f4a473a2d5ea 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.flat.prefetch.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.flat.prefetch.ll
@@ -7,6 +7,7 @@ declare void @llvm.amdgcn.flat.prefetch(ptr %ptr, i32 %col)
 define amdgpu_ps void @flat_prefetch(ptr %ptr) {
 ; GCN-LABEL: flat_prefetch:
 ; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    flat_prefetch_b8 v[0:1]
 ; GCN-NEXT:    s_endpgm
 entry:
@@ -17,6 +18,7 @@ entry:
 define amdgpu_ps void @flat_prefetch_sgpr(ptr inreg %ptr) {
 ; GCN-LABEL: flat_prefetch_sgpr:
 ; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0
 ; GCN-NEXT:    flat_prefetch_b8 v0, s[0:1]
 ; GCN-NEXT:    s_endpgm
@@ -28,6 +30,7 @@ entry:
 define amdgpu_ps void @flat_prefetch_offset(ptr %ptr) {
 ; GCN-LABEL: flat_prefetch_offset:
 ; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    flat_prefetch_b8 v[0:1] offset:512
 ; GCN-NEXT:    s_endpgm
 entry:
@@ -39,6 +42,7 @@ entry:
 define amdgpu_ps void @flat_prefetch_sgpr_voffset(ptr inreg %ptr, i32 %offset) {
 ; GCN-LABEL: flat_prefetch_sgpr_voffset:
 ; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    flat_prefetch_b8 v0, s[0:1]
 ; GCN-NEXT:    s_endpgm
 entry:
@@ -50,6 +54,7 @@ entry:
 define amdgpu_ps void @flat_prefetch_sgpr_voffset_offset(ptr inreg %ptr, i32 %offset) {
 ; GCN-LABEL: flat_prefetch_sgpr_voffset_offset:
 ; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    flat_prefetch_b8 v0, s[0:1] offset:128
 ; GCN-NEXT:    s_endpgm
 entry:
@@ -62,6 +67,7 @@ entry:
 define amdgpu_ps void @flat_prefetch_se(ptr %ptr) {
 ; GCN-LABEL: flat_prefetch_se:
 ; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    flat_prefetch_b8 v[0:1] scope:SCOPE_SE
 ; GCN-NEXT:    s_endpgm
 entry:
@@ -72,6 +78,7 @@ entry:
 define amdgpu_ps void @flat_prefetch_se_nt(ptr %ptr) {
 ; GCN-LABEL: flat_prefetch_se_nt:
 ; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    flat_prefetch_b8 v[0:1] th:TH_LOAD_NT scope:SCOPE_SE
 ; GCN-NEXT:    s_endpgm
 entry:
@@ -82,6 +89,7 @@ entry:
 define amdgpu_ps void @flat_prefetch_dev_ht(ptr %ptr) {
 ; GCN-LABEL: flat_prefetch_dev_ht:
 ; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    flat_prefetch_b8 v[0:1] th:TH_LOAD_HT scope:SCOPE_DEV
 ; GCN-NEXT:    s_endpgm
 entry:
@@ -92,6 +100,7 @@ entry:
 define amdgpu_ps void @flat_prefetch_sys_lu(ptr %ptr) {
 ; GCN-LABEL: flat_prefetch_sys_lu:
 ; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    flat_prefetch_b8 v[0:1] th:TH_LOAD_BYPASS scope:SCOPE_SYS
 ; GCN-NEXT:    s_endpgm
 entry:

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.async.to.lds.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.async.to.lds.ll
index d5fae1e4a9657..bf7cce9877f86 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.async.to.lds.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.async.to.lds.ll
@@ -10,12 +10,14 @@ declare void @llvm.amdgcn.global.load.async.to.lds.b128(ptr addrspace(1) %gaddr,
 define amdgpu_ps void @global_load_async_to_lds_b8_vaddr(ptr addrspace(1) %gaddr, ptr addrspace(3) %laddr) {
 ; GFX1250-SDAG-LABEL: global_load_async_to_lds_b8_vaddr:
 ; GFX1250-SDAG:       ; %bb.0: ; %entry
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[0:1], 32, v[0:1]
 ; GFX1250-SDAG-NEXT:    global_load_async_to_lds_b8 v2, v[0:1], off offset:16 th:TH_LOAD_NT
 ; GFX1250-SDAG-NEXT:    s_endpgm
 ;
 ; GFX1250-GISEL-LABEL: global_load_async_to_lds_b8_vaddr:
 ; GFX1250-GISEL:       ; %bb.0: ; %entry
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, 32
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-GISEL-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
@@ -30,6 +32,7 @@ entry:
 define amdgpu_ps void @global_load_async_to_lds_b8_saddr(ptr addrspace(1) inreg %gaddr, ptr addrspace(3) %laddr) {
 ; GFX1250-LABEL: global_load_async_to_lds_b8_saddr:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v1, 32
 ; GFX1250-NEXT:    global_load_async_to_lds_b8 v0, v1, s[0:1] offset:16
 ; GFX1250-NEXT:    s_endpgm
@@ -42,12 +45,14 @@ entry:
 define amdgpu_ps void @global_load_async_to_lds_b32_vaddr(ptr addrspace(1) %gaddr, ptr addrspace(3) %laddr) {
 ; GFX1250-SDAG-LABEL: global_load_async_to_lds_b32_vaddr:
 ; GFX1250-SDAG:       ; %bb.0: ; %entry
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[0:1], 32, v[0:1]
 ; GFX1250-SDAG-NEXT:    global_load_async_to_lds_b32 v2, v[0:1], off offset:16 th:TH_LOAD_HT scope:SCOPE_SE
 ; GFX1250-SDAG-NEXT:    s_endpgm
 ;
 ; GFX1250-GISEL-LABEL: global_load_async_to_lds_b32_vaddr:
 ; GFX1250-GISEL:       ; %bb.0: ; %entry
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, 32
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-GISEL-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
@@ -62,6 +67,7 @@ entry:
 define amdgpu_ps void @global_load_async_to_lds_b32_saddr(ptr addrspace(1) inreg %gaddr, ptr addrspace(3) %laddr) {
 ; GFX1250-LABEL: global_load_async_to_lds_b32_saddr:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v1, 32
 ; GFX1250-NEXT:    global_load_async_to_lds_b32 v0, v1, s[0:1] offset:16
 ; GFX1250-NEXT:    s_endpgm
@@ -74,12 +80,14 @@ entry:
 define amdgpu_ps void @global_load_async_to_lds_b64_vaddr(ptr addrspace(1) %gaddr, ptr addrspace(3) %laddr) {
 ; GFX1250-SDAG-LABEL: global_load_async_to_lds_b64_vaddr:
 ; GFX1250-SDAG:       ; %bb.0: ; %entry
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[0:1], 32, v[0:1]
 ; GFX1250-SDAG-NEXT:    global_load_async_to_lds_b64 v2, v[0:1], off offset:16 th:TH_LOAD_NT_HT scope:SCOPE_DEV
 ; GFX1250-SDAG-NEXT:    s_endpgm
 ;
 ; GFX1250-GISEL-LABEL: global_load_async_to_lds_b64_vaddr:
 ; GFX1250-GISEL:       ; %bb.0: ; %entry
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, 32
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-GISEL-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
@@ -94,6 +102,7 @@ entry:
 define amdgpu_ps void @global_load_async_to_lds_b64_saddr(ptr addrspace(1) inreg %gaddr, ptr addrspace(3) %laddr) {
 ; GFX1250-LABEL: global_load_async_to_lds_b64_saddr:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v1, 32
 ; GFX1250-NEXT:    global_load_async_to_lds_b64 v0, v1, s[0:1] offset:16
 ; GFX1250-NEXT:    s_endpgm
@@ -106,12 +115,14 @@ entry:
 define amdgpu_ps void @global_load_async_to_lds_b128_vaddr(ptr addrspace(1) %gaddr, ptr addrspace(3) %laddr) {
 ; GFX1250-SDAG-LABEL: global_load_async_to_lds_b128_vaddr:
 ; GFX1250-SDAG:       ; %bb.0: ; %entry
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[0:1], 32, v[0:1]
 ; GFX1250-SDAG-NEXT:    global_load_async_to_lds_b128 v2, v[0:1], off offset:16 th:TH_LOAD_BYPASS scope:SCOPE_SYS
 ; GFX1250-SDAG-NEXT:    s_endpgm
 ;
 ; GFX1250-GISEL-LABEL: global_load_async_to_lds_b128_vaddr:
 ; GFX1250-GISEL:       ; %bb.0: ; %entry
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, 32
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-GISEL-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
@@ -126,6 +137,7 @@ entry:
 define amdgpu_ps void @global_load_async_to_lds_b128_saddr(ptr addrspace(1) inreg %gaddr, ptr addrspace(3) %laddr) {
 ; GFX1250-LABEL: global_load_async_to_lds_b128_saddr:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v1, 32
 ; GFX1250-NEXT:    global_load_async_to_lds_b128 v0, v1, s[0:1] offset:16
 ; GFX1250-NEXT:    s_endpgm
@@ -138,6 +150,7 @@ entry:
 define amdgpu_ps void @global_load_async_to_lds_b32_saddr_scale_offset(ptr addrspace(1) inreg %gaddr, ptr addrspace(3) %laddr, i32 %idx) {
 ; GFX1250-LABEL: global_load_async_to_lds_b32_saddr_scale_offset:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    global_load_async_to_lds_b32 v0, v1, s[0:1] offset:16 scale_offset th:TH_LOAD_NT
 ; GFX1250-NEXT:    s_endpgm
 entry:
@@ -150,6 +163,7 @@ entry:
 define amdgpu_ps void @global_load_async_to_lds_b64_saddr_scale_offset(ptr addrspace(1) inreg %gaddr, ptr addrspace(3) %laddr, i32 %idx) {
 ; GFX1250-LABEL: global_load_async_to_lds_b64_saddr_scale_offset:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    global_load_async_to_lds_b64 v0, v1, s[0:1] offset:16 scale_offset th:TH_LOAD_NT
 ; GFX1250-NEXT:    s_endpgm
 entry:
@@ -162,6 +176,7 @@ entry:
 define amdgpu_ps void @global_load_async_to_lds_b64_saddr_no_scale_offset(ptr addrspace(1) inreg %gaddr, ptr addrspace(3) %laddr, i32 %idx) {
 ; GFX1250-LABEL: global_load_async_to_lds_b64_saddr_no_scale_offset:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1250-NEXT:    v_ashrrev_i32_e32 v3, 31, v2

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.prefetch.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.prefetch.ll
index 80f9eeb25ebc0..25989990c50de 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.prefetch.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.prefetch.ll
@@ -7,6 +7,7 @@ declare void @llvm.amdgcn.global.prefetch(ptr addrspace(1) %ptr, i32 %col)
 define amdgpu_ps void @global_prefetch(ptr addrspace(1) %ptr) {
 ; GCN-LABEL: global_prefetch:
 ; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    global_prefetch_b8 v[0:1], off
 ; GCN-NEXT:    s_endpgm
 entry:
@@ -17,6 +18,7 @@ entry:
 define amdgpu_ps void @global_prefetch_sgpr(ptr addrspace(1) inreg %ptr) {
 ; GCN-LABEL: global_prefetch_sgpr:
 ; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    v_mov_b32_e32 v0, 0
 ; GCN-NEXT:    global_prefetch_b8 v0, s[0:1]
 ; GCN-NEXT:    s_endpgm
@@ -28,6 +30,7 @@ entry:
 define amdgpu_ps void @global_prefetch_offset(ptr addrspace(1) %ptr) {
 ; GCN-LABEL: global_prefetch_offset:
 ; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    global_prefetch_b8 v[0:1], off offset:512
 ; GCN-NEXT:    s_endpgm
 entry:
@@ -39,6 +42,7 @@ entry:
 define amdgpu_ps void @global_prefetch_sgpr_voffset(ptr addrspace(1) inreg %ptr, i32 %offset) {
 ; GCN-LABEL: global_prefetch_sgpr_voffset:
 ; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    global_prefetch_b8 v0, s[0:1]
 ; GCN-NEXT:    s_endpgm
 entry:
@@ -50,6 +54,7 @@ entry:
 define amdgpu_ps void @global_prefetch_sgpr_voffset_offset(ptr addrspace(1) inreg %ptr, i32 %offset) {
 ; GCN-LABEL: global_prefetch_sgpr_voffset_offset:
 ; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    global_prefetch_b8 v0, s[0:1] offset:128
 ; GCN-NEXT:    s_endpgm
 entry:
@@ -62,6 +67,7 @@ entry:
 define amdgpu_ps void @global_prefetch_se(ptr addrspace(1) %ptr) {
 ; GCN-LABEL: global_prefetch_se:
 ; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    global_prefetch_b8 v[0:1], off scope:SCOPE_SE
 ; GCN-NEXT:    s_endpgm
 entry:
@@ -72,6 +78,7 @@ entry:
 define amdgpu_ps void @global_prefetch_se_nt(ptr addrspace(1) %ptr) {
 ; GCN-LABEL: global_prefetch_se_nt:
 ; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    global_prefetch_b8 v[0:1], off th:TH_LOAD_NT scope:SCOPE_SE
 ; GCN-NEXT:    s_endpgm
 entry:
@@ -82,6 +89,7 @@ entry:
 define amdgpu_ps void @global_prefetch_dev_ht(ptr addrspace(1) %ptr) {
 ; GCN-LABEL: global_prefetch_dev_ht:
 ; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    global_prefetch_b8 v[0:1], off th:TH_LOAD_HT scope:SCOPE_DEV
 ; GCN-NEXT:    s_endpgm
 entry:
@@ -92,6 +100,7 @@ entry:
 define amdgpu_ps void @global_prefetch_sys_lu(ptr addrspace(1) %ptr) {
 ; GCN-LABEL: global_prefetch_sys_lu:
 ; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    global_prefetch_b8 v[0:1], off th:TH_LOAD_BYPASS scope:SCOPE_SYS
 ; GCN-NEXT:    s_endpgm
 entry:

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.store.async.from.lds.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.store.async.from.lds.ll
index 22563f8e5ff46..2e964fd8fb2c7 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.store.async.from.lds.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.store.async.from.lds.ll
@@ -10,12 +10,14 @@ declare void @llvm.amdgcn.global.store.async.from.lds.b128(ptr addrspace(1) %gad
 define amdgpu_ps void @global_store_async_from_lds_b8_vaddr(ptr addrspace(1) %gaddr, ptr addrspace(3) %laddr) {
 ; GFX1250-SDAG-LABEL: global_store_async_from_lds_b8_vaddr:
 ; GFX1250-SDAG:       ; %bb.0: ; %entry
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[0:1], 32, v[0:1]
 ; GFX1250-SDAG-NEXT:    global_store_async_from_lds_b8 v[0:1], v2, off offset:16 th:TH_STORE_NT
 ; GFX1250-SDAG-NEXT:    s_endpgm
 ;
 ; GFX1250-GISEL-LABEL: global_store_async_from_lds_b8_vaddr:
 ; GFX1250-GISEL:       ; %bb.0: ; %entry
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, 32
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-GISEL-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
@@ -30,6 +32,7 @@ entry:
 define amdgpu_ps void @global_store_async_from_lds_b8_saddr(ptr addrspace(1) inreg %gaddr, ptr addrspace(3) %laddr) {
 ; GFX1250-LABEL: global_store_async_from_lds_b8_saddr:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v1, 32
 ; GFX1250-NEXT:    global_store_async_from_lds_b8 v1, v0, s[0:1] offset:16
 ; GFX1250-NEXT:    s_endpgm
@@ -42,12 +45,14 @@ entry:
 define amdgpu_ps void @global_store_async_from_lds_b32(ptr addrspace(1) %gaddr, ptr addrspace(3) %laddr) {
 ; GFX1250-SDAG-LABEL: global_store_async_from_lds_b32:
 ; GFX1250-SDAG:       ; %bb.0: ; %entry
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[0:1], 32, v[0:1]
 ; GFX1250-SDAG-NEXT:    global_store_async_from_lds_b32 v[0:1], v2, off offset:16 th:TH_STORE_HT scope:SCOPE_SE
 ; GFX1250-SDAG-NEXT:    s_endpgm
 ;
 ; GFX1250-GISEL-LABEL: global_store_async_from_lds_b32:
 ; GFX1250-GISEL:       ; %bb.0: ; %entry
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, 32
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-GISEL-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
@@ -62,6 +67,7 @@ entry:
 define amdgpu_ps void @global_store_async_from_lds_b32_saddr(ptr addrspace(1) inreg %gaddr, ptr addrspace(3) %laddr) {
 ; GFX1250-LABEL: global_store_async_from_lds_b32_saddr:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v1, 32
 ; GFX1250-NEXT:    global_store_async_from_lds_b32 v1, v0, s[0:1] offset:16
 ; GFX1250-NEXT:    s_endpgm
@@ -74,12 +80,14 @@ entry:
 define amdgpu_ps void @global_store_async_from_lds_b64_vaddr(ptr addrspace(1) %gaddr, ptr addrspace(3) %laddr) {
 ; GFX1250-SDAG-LABEL: global_store_async_from_lds_b64_vaddr:
 ; GFX1250-SDAG:       ; %bb.0: ; %entry
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[0:1], 32, v[0:1]
 ; GFX1250-SDAG-NEXT:    global_store_async_from_lds_b64 v[0:1], v2, off offset:16 th:TH_STORE_NT_HT scope:SCOPE_DEV
 ; GFX1250-SDAG-NEXT:    s_endpgm
 ;
 ; GFX1250-GISEL-LABEL: global_store_async_from_lds_b64_vaddr:
 ; GFX1250-GISEL:       ; %bb.0: ; %entry
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, 32
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-GISEL-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
@@ -94,6 +102,7 @@ entry:
 define amdgpu_ps void @global_store_async_from_lds_b64_saddr(ptr addrspace(1) inreg %gaddr, ptr addrspace(3) %laddr) {
 ; GFX1250-LABEL: global_store_async_from_lds_b64_saddr:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v1, 32
 ; GFX1250-NEXT:    global_store_async_from_lds_b64 v1, v0, s[0:1] offset:16
 ; GFX1250-NEXT:    s_endpgm
@@ -106,12 +115,14 @@ entry:
 define amdgpu_ps void @global_store_async_from_lds_b128_vaddr(ptr addrspace(1) %gaddr, ptr addrspace(3) %laddr) {
 ; GFX1250-SDAG-LABEL: global_store_async_from_lds_b128_vaddr:
 ; GFX1250-SDAG:       ; %bb.0: ; %entry
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_add_nc_u64_e32 v[0:1], 32, v[0:1]
 ; GFX1250-SDAG-NEXT:    global_store_async_from_lds_b128 v[0:1], v2, off offset:16 th:TH_STORE_BYPASS scope:SCOPE_SYS
 ; GFX1250-SDAG-NEXT:    s_endpgm
 ;
 ; GFX1250-GISEL-LABEL: global_store_async_from_lds_b128_vaddr:
 ; GFX1250-GISEL:       ; %bb.0: ; %entry
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_add_co_u32 v0, vcc_lo, v0, 32
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-GISEL-NEXT:    v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
@@ -126,6 +137,7 @@ entry:
 define amdgpu_ps void @global_store_async_from_lds_b128_saddr(ptr addrspace(1) inreg %gaddr, ptr addrspace(3) %laddr) {
 ; GFX1250-LABEL: global_store_async_from_lds_b128_saddr:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v1, 32
 ; GFX1250-NEXT:    global_store_async_from_lds_b128 v1, v0, s[0:1] offset:16
 ; GFX1250-NEXT:    s_endpgm
@@ -138,6 +150,7 @@ entry:
 define amdgpu_ps void @global_store_async_from_lds_b32_saddr_scale_offset(ptr addrspace(1) inreg %gaddr, ptr addrspace(3) %laddr, i32 %idx) {
 ; GFX1250-LABEL: global_store_async_from_lds_b32_saddr_scale_offset:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    global_store_async_from_lds_b32 v1, v0, s[0:1] offset:16 scale_offset th:TH_STORE_NT
 ; GFX1250-NEXT:    s_endpgm
 entry:
@@ -150,6 +163,7 @@ entry:
 define amdgpu_ps void @global_store_async_from_lds_b64_saddr_scale_offset(ptr addrspace(1) inreg %gaddr, ptr addrspace(3) %laddr, i32 %idx) {
 ; GFX1250-LABEL: global_store_async_from_lds_b64_saddr_scale_offset:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    global_store_async_from_lds_b64 v1, v0, s[0:1] offset:16 scale_offset th:TH_STORE_NT
 ; GFX1250-NEXT:    s_endpgm
 entry:
@@ -162,6 +176,7 @@ entry:
 define amdgpu_ps void @global_store_async_from_lds_b64_saddr_no_scale_offset(ptr addrspace(1) inreg %gaddr, ptr addrspace(3) %laddr, i32 %idx) {
 ; GFX1250-LABEL: global_store_async_from_lds_b64_saddr_no_scale_offset:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v2, v1
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1250-NEXT:    v_ashrrev_i32_e32 v3, 31, v2

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll
index 5b36d4cefa2e3..7de4aa95b3cc2 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.private.ll
@@ -61,6 +61,7 @@ define amdgpu_kernel void @is_private_vgpr(ptr addrspace(1) %ptr.ptr) {
 ;
 ; GFX1250-LABEL: is_private_vgpr:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -187,6 +188,7 @@ define amdgpu_kernel void @is_private_sgpr(ptr %ptr) {
 ;
 ; GFX1250-SDAG-LABEL: is_private_sgpr:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-SDAG-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-SDAG-NEXT:    s_xor_b32 s0, s0, src_flat_scratch_base_hi
@@ -264,6 +266,7 @@ define amdgpu_kernel void @is_private_sgpr(ptr %ptr) {
 ;
 ; GFX1250-GISEL-LABEL: is_private_sgpr:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-GISEL-NEXT:    s_xor_b32 s0, s1, src_flat_scratch_base_hi

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll
index 7aecae901becf..3aa1a10ffc918 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.is.shared.ll
@@ -94,6 +94,7 @@ define amdgpu_kernel void @is_local_vgpr(ptr addrspace(1) %ptr.ptr) {
 ;
 ; GFX1250-LABEL: is_local_vgpr:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
 ; GFX1250-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX1250-NEXT:    s_mov_b64 s[0:1], src_shared_base
@@ -253,6 +254,7 @@ define amdgpu_kernel void @is_local_sgpr(ptr %ptr) {
 ;
 ; GFX1250-SDAG-LABEL: is_local_sgpr:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    s_mov_b64 s[0:1], src_shared_base
 ; GFX1250-SDAG-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-SDAG-NEXT:    s_wait_kmcnt 0x0
@@ -330,6 +332,7 @@ define amdgpu_kernel void @is_local_sgpr(ptr %ptr) {
 ;
 ; GFX1250-GISEL-LABEL: is_local_sgpr:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
 ; GFX1250-GISEL-NEXT:    s_mov_b64 s[0:1], src_shared_base
 ; GFX1250-GISEL-NEXT:    s_wait_kmcnt 0x0

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.load.monitor.gfx1250.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.load.monitor.gfx1250.ll
index 350d468344f65..910c55a041ede 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.load.monitor.gfx1250.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.load.monitor.gfx1250.ll
@@ -12,6 +12,7 @@ declare <4 x i32> @llvm.amdgcn.flat.load.monitor.b128.v4i32(ptr, i32)
 define amdgpu_ps void @global_load_monitor_b32_vaddr(ptr addrspace(1) %addr, ptr addrspace(1) %use) {
 ; GFX1250-LABEL: global_load_monitor_b32_vaddr:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    global_load_monitor_b32 v0, v[0:1], off offset:32 th:TH_LOAD_NT
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
 ; GFX1250-NEXT:    global_store_b32 v[2:3], v0, off
@@ -26,6 +27,7 @@ entry:
 define amdgpu_ps void @global_load_monitor_b32_saddr(ptr addrspace(1) inreg %addr, ptr addrspace(1) %use) {
 ; GFX1250-LABEL: global_load_monitor_b32_saddr:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1250-NEXT:    global_load_monitor_b32 v2, v2, s[0:1] offset:32 th:TH_LOAD_HT scope:SCOPE_SE
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
@@ -41,6 +43,7 @@ entry:
 define amdgpu_ps void @global_load_monitor_b64_vaddr(ptr addrspace(1) %addr, ptr addrspace(1) %use) {
 ; GFX1250-LABEL: global_load_monitor_b64_vaddr:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    global_load_monitor_b64 v[0:1], v[0:1], off offset:32 th:TH_LOAD_NT_HT scope:SCOPE_DEV
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
 ; GFX1250-NEXT:    global_store_b64 v[2:3], v[0:1], off
@@ -55,6 +58,7 @@ entry:
 define amdgpu_ps void @global_load_monitor_b64_saddr(ptr addrspace(1) inreg %addr, ptr addrspace(1) %use) {
 ; GFX1250-LABEL: global_load_monitor_b64_saddr:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1250-NEXT:    global_load_monitor_b64 v[2:3], v2, s[0:1] offset:32 th:TH_LOAD_BYPASS scope:SCOPE_SYS
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
@@ -70,6 +74,7 @@ entry:
 define amdgpu_ps void @global_load_monitor_b128_vaddr(ptr addrspace(1) %addr, ptr addrspace(1) %use) {
 ; GFX1250-LABEL: global_load_monitor_b128_vaddr:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    global_load_monitor_b128 v[4:7], v[0:1], off offset:32
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
 ; GFX1250-NEXT:    global_store_b128 v[2:3], v[4:7], off
@@ -84,6 +89,7 @@ entry:
 define amdgpu_ps void @global_load_monitor_b128_saddr(ptr addrspace(1) inreg %addr, ptr addrspace(1) %use) {
 ; GFX1250-LABEL: global_load_monitor_b128_saddr:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1250-NEXT:    global_load_monitor_b128 v[2:5], v2, s[0:1] offset:32 th:TH_LOAD_NT
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
@@ -99,6 +105,7 @@ entry:
 define amdgpu_ps void @flat_load_monitor_b32(ptr %addr, ptr addrspace(1) %use) {
 ; GFX1250-LABEL: flat_load_monitor_b32:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    flat_load_monitor_b32 v0, v[0:1] offset:32 th:TH_LOAD_HT scope:SCOPE_SE
 ; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    global_store_b32 v[2:3], v0, off
@@ -113,6 +120,7 @@ entry:
 define amdgpu_ps void @flat_load_monitor_b64(ptr %addr, ptr addrspace(1) %use) {
 ; GFX1250-LABEL: flat_load_monitor_b64:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    flat_load_monitor_b64 v[0:1], v[0:1] offset:32 th:TH_LOAD_NT_HT scope:SCOPE_DEV
 ; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    global_store_b64 v[2:3], v[0:1], off
@@ -127,6 +135,7 @@ entry:
 define amdgpu_ps void @flat_load_monitor_b128(ptr %addr, ptr addrspace(1) %use) {
 ; GFX1250-LABEL: flat_load_monitor_b128:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    flat_load_monitor_b128 v[4:7], v[0:1] offset:32 th:TH_LOAD_BYPASS scope:SCOPE_SYS
 ; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    global_store_b128 v[2:3], v[4:7], off
@@ -141,6 +150,7 @@ entry:
 define amdgpu_ps void @global_load_monitor_b32_saddr_scale_offset(ptr addrspace(1) inreg %addr, ptr addrspace(1) %use, i32 %idx) {
 ; GFX1250-LABEL: global_load_monitor_b32_saddr_scale_offset:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    global_load_monitor_b32 v2, v2, s[0:1] scale_offset th:TH_LOAD_NT
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
 ; GFX1250-NEXT:    global_store_b32 v[0:1], v2, off
@@ -156,6 +166,7 @@ entry:
 define amdgpu_ps void @global_load_monitor_b64_saddr_scale_offset(ptr addrspace(1) inreg %addr, ptr addrspace(1) %use, i32 %idx) {
 ; GFX1250-LABEL: global_load_monitor_b64_saddr_scale_offset:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    global_load_monitor_b64 v[2:3], v2, s[0:1] scale_offset th:TH_LOAD_NT
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
 ; GFX1250-NEXT:    global_store_b64 v[0:1], v[2:3], off
@@ -171,6 +182,7 @@ entry:
 define amdgpu_ps void @global_load_monitor_b64_saddr_no_scale_offset(ptr addrspace(1) inreg %addr, ptr addrspace(1) %use, i32 %idx) {
 ; GFX1250-LABEL: global_load_monitor_b64_saddr_no_scale_offset:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_ashrrev_i32_e32 v3, 31, v2
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-NEXT:    v_lshl_add_u64 v[2:3], v[2:3], 2, s[0:1]

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.load.tr.gfx1250.w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.load.tr.gfx1250.w32.ll
index f7ed5341141d4..68e9e2b1bdf5b 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.load.tr.gfx1250.w32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.load.tr.gfx1250.w32.ll
@@ -20,6 +20,7 @@ declare <8 x bfloat> @llvm.amdgcn.ds.load.tr16.b128.v8bf16.p3(ptr addrspace(3))
 define amdgpu_ps void @global_load_tr4_b64_vaddr(ptr addrspace(1) %addr, ptr addrspace(1) %use) {
 ; GFX1250-LABEL: global_load_tr4_b64_vaddr:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    global_load_tr4_b64 v[0:1], v[0:1], off offset:32
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
 ; GFX1250-NEXT:    global_store_b64 v[2:3], v[0:1], off
@@ -34,6 +35,7 @@ entry:
 define amdgpu_ps void @global_load_tr4_b64_saddr(ptr addrspace(1) inreg %addr, ptr addrspace(1) %use) {
 ; GFX1250-LABEL: global_load_tr4_b64_saddr:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1250-NEXT:    global_load_tr4_b64 v[2:3], v2, s[0:1] offset:32
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
@@ -49,6 +51,7 @@ entry:
 define amdgpu_ps void @global_load_tr8_b64_vaddr(ptr addrspace(1) %addr, ptr addrspace(1) %use) {
 ; GFX1250-LABEL: global_load_tr8_b64_vaddr:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    global_load_tr8_b64 v[0:1], v[0:1], off offset:32
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
 ; GFX1250-NEXT:    global_store_b64 v[2:3], v[0:1], off
@@ -63,6 +66,7 @@ entry:
 define amdgpu_ps void @global_load_tr8_b64_saddr(ptr addrspace(1) inreg %addr, ptr addrspace(1) %use) {
 ; GFX1250-LABEL: global_load_tr8_b64_saddr:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1250-NEXT:    global_load_tr8_b64 v[2:3], v2, s[0:1] offset:32
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
@@ -78,6 +82,7 @@ entry:
 define amdgpu_ps void @global_load_tr6_b96_vaddr(ptr addrspace(1) %addr, ptr addrspace(1) %use) {
 ; GFX1250-LABEL: global_load_tr6_b96_vaddr:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    global_load_tr6_b96 v[4:6], v[0:1], off offset:32
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
 ; GFX1250-NEXT:    global_store_b96 v[2:3], v[4:6], off
@@ -92,6 +97,7 @@ entry:
 define amdgpu_ps void @global_load_tr6_b96_saddr(ptr addrspace(1) inreg %addr, ptr addrspace(1) %use) {
 ; GFX1250-LABEL: global_load_tr6_b96_saddr:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1250-NEXT:    global_load_tr6_b96 v[2:4], v2, s[0:1] offset:32
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
@@ -107,6 +113,7 @@ entry:
 define amdgpu_ps void @global_load_tr16_b128_v8i16_vaddr(ptr addrspace(1) %addr, ptr addrspace(1) %use) {
 ; GFX1250-LABEL: global_load_tr16_b128_v8i16_vaddr:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    global_load_tr16_b128 v[4:7], v[0:1], off offset:32
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
 ; GFX1250-NEXT:    global_store_b128 v[2:3], v[4:7], off
@@ -121,6 +128,7 @@ entry:
 define amdgpu_ps void @global_load_tr16_b128_v8i16_saddr(ptr addrspace(1) inreg %addr, ptr addrspace(1) %use) {
 ; GFX1250-LABEL: global_load_tr16_b128_v8i16_saddr:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1250-NEXT:    global_load_tr16_b128 v[2:5], v2, s[0:1] offset:32
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
@@ -136,6 +144,7 @@ entry:
 define amdgpu_ps void @global_load_tr16_b128_v8f16_vaddr(ptr addrspace(1) %addr, ptr addrspace(1) %use) {
 ; GFX1250-LABEL: global_load_tr16_b128_v8f16_vaddr:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    global_load_tr16_b128 v[4:7], v[0:1], off offset:32
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
 ; GFX1250-NEXT:    global_store_b128 v[2:3], v[4:7], off
@@ -150,6 +159,7 @@ entry:
 define amdgpu_ps void @global_load_tr16_b128_v8f16_saddr(ptr addrspace(1) inreg %addr, ptr addrspace(1) %use) {
 ; GFX1250-LABEL: global_load_tr16_b128_v8f16_saddr:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1250-NEXT:    global_load_tr16_b128 v[2:5], v2, s[0:1] offset:32
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
@@ -165,6 +175,7 @@ entry:
 define amdgpu_ps void @global_load_tr16_b128_v8b16_vaddr(ptr addrspace(1) %addr, ptr addrspace(1) %use) {
 ; GFX1250-LABEL: global_load_tr16_b128_v8b16_vaddr:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    global_load_tr16_b128 v[4:7], v[0:1], off offset:32
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
 ; GFX1250-NEXT:    global_store_b128 v[2:3], v[4:7], off
@@ -179,6 +190,7 @@ entry:
 define amdgpu_ps void @global_load_tr16_b128_v8bf16_saddr(ptr addrspace(1) inreg %addr, ptr addrspace(1) %use) {
 ; GFX1250-LABEL: global_load_tr16_b128_v8bf16_saddr:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1250-NEXT:    global_load_tr16_b128 v[2:5], v2, s[0:1] offset:32
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
@@ -194,6 +206,7 @@ entry:
 define amdgpu_ps void @ds_load_tr4_b64(ptr addrspace(3) %addr, ptr addrspace(1) %use) {
 ; GFX1250-SDAG-LABEL: ds_load_tr4_b64:
 ; GFX1250-SDAG:       ; %bb.0: ; %entry
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
 ; GFX1250-SDAG-NEXT:    ds_load_tr4_b64 v[0:1], v0 offset:32
 ; GFX1250-SDAG-NEXT:    s_wait_dscnt 0x0
@@ -202,6 +215,7 @@ define amdgpu_ps void @ds_load_tr4_b64(ptr addrspace(3) %addr, ptr addrspace(1)
 ;
 ; GFX1250-GISEL-LABEL: ds_load_tr4_b64:
 ; GFX1250-GISEL:       ; %bb.0: ; %entry
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
 ; GFX1250-GISEL-NEXT:    ds_load_tr4_b64 v[0:1], v0 offset:32
 ; GFX1250-GISEL-NEXT:    s_wait_dscnt 0x0
@@ -217,6 +231,7 @@ entry:
 define amdgpu_ps void @ds_load_tr8_b64(ptr addrspace(3) %addr, ptr addrspace(1) %use) {
 ; GFX1250-SDAG-LABEL: ds_load_tr8_b64:
 ; GFX1250-SDAG:       ; %bb.0: ; %entry
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
 ; GFX1250-SDAG-NEXT:    ds_load_tr8_b64 v[0:1], v0 offset:32
 ; GFX1250-SDAG-NEXT:    s_wait_dscnt 0x0
@@ -225,6 +240,7 @@ define amdgpu_ps void @ds_load_tr8_b64(ptr addrspace(3) %addr, ptr addrspace(1)
 ;
 ; GFX1250-GISEL-LABEL: ds_load_tr8_b64:
 ; GFX1250-GISEL:       ; %bb.0: ; %entry
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
 ; GFX1250-GISEL-NEXT:    ds_load_tr8_b64 v[0:1], v0 offset:32
 ; GFX1250-GISEL-NEXT:    s_wait_dscnt 0x0
@@ -240,6 +256,7 @@ entry:
 define amdgpu_ps void @ds_load_tr6_b96(ptr addrspace(3) %addr, ptr addrspace(1) %use) {
 ; GFX1250-SDAG-LABEL: ds_load_tr6_b96:
 ; GFX1250-SDAG:       ; %bb.0: ; %entry
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1
 ; GFX1250-SDAG-NEXT:    ds_load_tr6_b96 v[0:2], v0 offset:32
 ; GFX1250-SDAG-NEXT:    s_wait_dscnt 0x0
@@ -248,6 +265,7 @@ define amdgpu_ps void @ds_load_tr6_b96(ptr addrspace(3) %addr, ptr addrspace(1)
 ;
 ; GFX1250-GISEL-LABEL: ds_load_tr6_b96:
 ; GFX1250-GISEL:       ; %bb.0: ; %entry
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
 ; GFX1250-GISEL-NEXT:    ds_load_tr6_b96 v[0:2], v0 offset:32
 ; GFX1250-GISEL-NEXT:    s_wait_dscnt 0x0
@@ -263,6 +281,7 @@ entry:
 define amdgpu_ps void @ds_load_tr16_b128_v8i16(ptr addrspace(3) %addr, ptr addrspace(1) %use) {
 ; GFX1250-SDAG-LABEL: ds_load_tr16_b128_v8i16:
 ; GFX1250-SDAG:       ; %bb.0: ; %entry
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1
 ; GFX1250-SDAG-NEXT:    ds_load_tr16_b128 v[0:3], v0 offset:32
 ; GFX1250-SDAG-NEXT:    s_wait_dscnt 0x0
@@ -271,6 +290,7 @@ define amdgpu_ps void @ds_load_tr16_b128_v8i16(ptr addrspace(3) %addr, ptr addrs
 ;
 ; GFX1250-GISEL-LABEL: ds_load_tr16_b128_v8i16:
 ; GFX1250-GISEL:       ; %bb.0: ; %entry
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
 ; GFX1250-GISEL-NEXT:    ds_load_tr16_b128 v[0:3], v0 offset:32
 ; GFX1250-GISEL-NEXT:    s_wait_dscnt 0x0
@@ -286,6 +306,7 @@ entry:
 define amdgpu_ps void @ds_load_tr16_b128_v8f16(ptr addrspace(3) %addr, ptr addrspace(1) %use) {
 ; GFX1250-SDAG-LABEL: ds_load_tr16_b128_v8f16:
 ; GFX1250-SDAG:       ; %bb.0: ; %entry
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1
 ; GFX1250-SDAG-NEXT:    ds_load_tr16_b128 v[0:3], v0 offset:32
 ; GFX1250-SDAG-NEXT:    s_wait_dscnt 0x0
@@ -294,6 +315,7 @@ define amdgpu_ps void @ds_load_tr16_b128_v8f16(ptr addrspace(3) %addr, ptr addrs
 ;
 ; GFX1250-GISEL-LABEL: ds_load_tr16_b128_v8f16:
 ; GFX1250-GISEL:       ; %bb.0: ; %entry
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
 ; GFX1250-GISEL-NEXT:    ds_load_tr16_b128 v[0:3], v0 offset:32
 ; GFX1250-GISEL-NEXT:    s_wait_dscnt 0x0
@@ -309,6 +331,7 @@ entry:
 define amdgpu_ps void @ds_load_tr16_b128_v8bf16(ptr addrspace(3) %addr, ptr addrspace(1) %use) {
 ; GFX1250-LABEL: ds_load_tr16_b128_v8bf16:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1
 ; GFX1250-NEXT:    ds_load_tr16_b128 v[0:3], v0 offset:32
 ; GFX1250-NEXT:    s_wait_dscnt 0x0

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.gfx1250.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.gfx1250.ll
index 42a50bb304bc9..1095c39d257f2 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.gfx1250.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane.gfx1250.ll
@@ -5,6 +5,7 @@
 define amdgpu_kernel void @v_permlane_bcast_b32_vss(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) {
 ; GFX1250-LABEL: v_permlane_bcast_b32_vss:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1250-NEXT:    s_load_b32 s6, s[4:5], 0x34
@@ -22,6 +23,7 @@ define amdgpu_kernel void @v_permlane_bcast_b32_vss(ptr addrspace(1) %out, i32 %
 define amdgpu_kernel void @v_permlane_bcast_b32_vii(ptr addrspace(1) %out, i32 %src0) {
 ; GFX1250-LABEL: v_permlane_bcast_b32_vii:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b96 s[0:2], s[4:5], 0x24
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
@@ -37,6 +39,7 @@ define amdgpu_kernel void @v_permlane_bcast_b32_vii(ptr addrspace(1) %out, i32 %
 define amdgpu_kernel void @v_permlane_bcast_b32_vll(ptr addrspace(1) %out, i32 %src0) {
 ; GFX1250-LABEL: v_permlane_bcast_b32_vll:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b96 s[0:2], s[4:5], 0x24
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
@@ -53,6 +56,7 @@ define amdgpu_kernel void @v_permlane_bcast_b32_vll(ptr addrspace(1) %out, i32 %
 define amdgpu_kernel void @v_permlane_bcast_b32_vvv(ptr addrspace(1) %out, i32 %src0) {
 ; GFX1250-SDAG-LABEL: v_permlane_bcast_b32_vvv:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    s_load_b96 s[0:2], s[4:5], 0x24
 ; GFX1250-SDAG-NEXT:    v_and_b32_e32 v1, 0x3ff, v0
 ; GFX1250-SDAG-NEXT:    v_bfe_u32 v0, v0, 10, 10
@@ -69,6 +73,7 @@ define amdgpu_kernel void @v_permlane_bcast_b32_vvv(ptr addrspace(1) %out, i32 %
 ;
 ; GFX1250-GISEL-LABEL: v_permlane_bcast_b32_vvv:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    s_load_b96 s[0:2], s[4:5], 0x24
 ; GFX1250-GISEL-NEXT:    v_and_b32_e32 v1, 0x3ff, v0
 ; GFX1250-GISEL-NEXT:    v_bfe_u32 v0, v0, 10, 10
@@ -92,6 +97,7 @@ define amdgpu_kernel void @v_permlane_bcast_b32_vvv(ptr addrspace(1) %out, i32 %
 define amdgpu_kernel void @v_permlane_down_b32_vss(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) {
 ; GFX1250-LABEL: v_permlane_down_b32_vss:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1250-NEXT:    s_load_b32 s6, s[4:5], 0x34
@@ -109,6 +115,7 @@ define amdgpu_kernel void @v_permlane_down_b32_vss(ptr addrspace(1) %out, i32 %s
 define amdgpu_kernel void @v_permlane_down_b32_vii(ptr addrspace(1) %out, i32 %src0) {
 ; GFX1250-LABEL: v_permlane_down_b32_vii:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b96 s[0:2], s[4:5], 0x24
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
@@ -124,6 +131,7 @@ define amdgpu_kernel void @v_permlane_down_b32_vii(ptr addrspace(1) %out, i32 %s
 define amdgpu_kernel void @v_permlane_down_b32_vll(ptr addrspace(1) %out, i32 %src0) {
 ; GFX1250-LABEL: v_permlane_down_b32_vll:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b96 s[0:2], s[4:5], 0x24
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
@@ -140,6 +148,7 @@ define amdgpu_kernel void @v_permlane_down_b32_vll(ptr addrspace(1) %out, i32 %s
 define amdgpu_kernel void @v_permlane_down_b32_vvv(ptr addrspace(1) %out, i32 %src0) {
 ; GFX1250-SDAG-LABEL: v_permlane_down_b32_vvv:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    s_load_b96 s[0:2], s[4:5], 0x24
 ; GFX1250-SDAG-NEXT:    v_and_b32_e32 v1, 0x3ff, v0
 ; GFX1250-SDAG-NEXT:    v_bfe_u32 v0, v0, 10, 10
@@ -156,6 +165,7 @@ define amdgpu_kernel void @v_permlane_down_b32_vvv(ptr addrspace(1) %out, i32 %s
 ;
 ; GFX1250-GISEL-LABEL: v_permlane_down_b32_vvv:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    s_load_b96 s[0:2], s[4:5], 0x24
 ; GFX1250-GISEL-NEXT:    v_and_b32_e32 v1, 0x3ff, v0
 ; GFX1250-GISEL-NEXT:    v_bfe_u32 v0, v0, 10, 10
@@ -179,6 +189,7 @@ define amdgpu_kernel void @v_permlane_down_b32_vvv(ptr addrspace(1) %out, i32 %s
 define amdgpu_kernel void @v_permlane_up_b32_vss(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) {
 ; GFX1250-LABEL: v_permlane_up_b32_vss:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1250-NEXT:    s_load_b32 s6, s[4:5], 0x34
@@ -196,6 +207,7 @@ define amdgpu_kernel void @v_permlane_up_b32_vss(ptr addrspace(1) %out, i32 %src
 define amdgpu_kernel void @v_permlane_up_b32_vii(ptr addrspace(1) %out, i32 %src0) {
 ; GFX1250-LABEL: v_permlane_up_b32_vii:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b96 s[0:2], s[4:5], 0x24
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
@@ -211,6 +223,7 @@ define amdgpu_kernel void @v_permlane_up_b32_vii(ptr addrspace(1) %out, i32 %src
 define amdgpu_kernel void @v_permlane_up_b32_vll(ptr addrspace(1) %out, i32 %src0) {
 ; GFX1250-LABEL: v_permlane_up_b32_vll:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b96 s[0:2], s[4:5], 0x24
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
@@ -227,6 +240,7 @@ define amdgpu_kernel void @v_permlane_up_b32_vll(ptr addrspace(1) %out, i32 %src
 define amdgpu_kernel void @v_permlane_up_b32_vvv(ptr addrspace(1) %out, i32 %src0) {
 ; GFX1250-SDAG-LABEL: v_permlane_up_b32_vvv:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    s_load_b96 s[0:2], s[4:5], 0x24
 ; GFX1250-SDAG-NEXT:    v_and_b32_e32 v1, 0x3ff, v0
 ; GFX1250-SDAG-NEXT:    v_bfe_u32 v0, v0, 10, 10
@@ -243,6 +257,7 @@ define amdgpu_kernel void @v_permlane_up_b32_vvv(ptr addrspace(1) %out, i32 %src
 ;
 ; GFX1250-GISEL-LABEL: v_permlane_up_b32_vvv:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    s_load_b96 s[0:2], s[4:5], 0x24
 ; GFX1250-GISEL-NEXT:    v_and_b32_e32 v1, 0x3ff, v0
 ; GFX1250-GISEL-NEXT:    v_bfe_u32 v0, v0, 10, 10
@@ -266,6 +281,7 @@ define amdgpu_kernel void @v_permlane_up_b32_vvv(ptr addrspace(1) %out, i32 %src
 define amdgpu_kernel void @v_permlane_xor_b32_vss(ptr addrspace(1) %out, i32 %src0, i32 %src1, i32 %src2) {
 ; GFX1250-LABEL: v_permlane_xor_b32_vss:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1250-NEXT:    s_load_b32 s6, s[4:5], 0x34
@@ -283,6 +299,7 @@ define amdgpu_kernel void @v_permlane_xor_b32_vss(ptr addrspace(1) %out, i32 %sr
 define amdgpu_kernel void @v_permlane_xor_b32_vii(ptr addrspace(1) %out, i32 %src0) {
 ; GFX1250-LABEL: v_permlane_xor_b32_vii:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b96 s[0:2], s[4:5], 0x24
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
@@ -298,6 +315,7 @@ define amdgpu_kernel void @v_permlane_xor_b32_vii(ptr addrspace(1) %out, i32 %sr
 define amdgpu_kernel void @v_permlane_xor_b32_vll(ptr addrspace(1) %out, i32 %src0) {
 ; GFX1250-LABEL: v_permlane_xor_b32_vll:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b96 s[0:2], s[4:5], 0x24
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
@@ -314,6 +332,7 @@ define amdgpu_kernel void @v_permlane_xor_b32_vll(ptr addrspace(1) %out, i32 %sr
 define amdgpu_kernel void @v_permlane_xor_b32_vvv(ptr addrspace(1) %out, i32 %src0) {
 ; GFX1250-SDAG-LABEL: v_permlane_xor_b32_vvv:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    s_load_b96 s[0:2], s[4:5], 0x24
 ; GFX1250-SDAG-NEXT:    v_and_b32_e32 v1, 0x3ff, v0
 ; GFX1250-SDAG-NEXT:    v_bfe_u32 v0, v0, 10, 10
@@ -330,6 +349,7 @@ define amdgpu_kernel void @v_permlane_xor_b32_vvv(ptr addrspace(1) %out, i32 %sr
 ;
 ; GFX1250-GISEL-LABEL: v_permlane_xor_b32_vvv:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    s_load_b96 s[0:2], s[4:5], 0x24
 ; GFX1250-GISEL-NEXT:    v_and_b32_e32 v1, 0x3ff, v0
 ; GFX1250-GISEL-NEXT:    v_bfe_u32 v0, v0, 10, 10
@@ -353,6 +373,7 @@ define amdgpu_kernel void @v_permlane_xor_b32_vvv(ptr addrspace(1) %out, i32 %sr
 define amdgpu_kernel void @v_permlane_idx_gen_b32_vs(ptr addrspace(1) %out, i32 %src0, i32 %src1) {
 ; GFX1250-LABEL: v_permlane_idx_gen_b32_vs:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
@@ -368,6 +389,7 @@ define amdgpu_kernel void @v_permlane_idx_gen_b32_vs(ptr addrspace(1) %out, i32
 define amdgpu_kernel void @v_permlane_idx_gen_b32_vi(ptr addrspace(1) %out, i32 %src0) {
 ; GFX1250-LABEL: v_permlane_idx_gen_b32_vi:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b96 s[0:2], s[4:5], 0x24
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
@@ -383,6 +405,7 @@ define amdgpu_kernel void @v_permlane_idx_gen_b32_vi(ptr addrspace(1) %out, i32
 define amdgpu_kernel void @v_permlane_idx_gen_b32_vl(ptr addrspace(1) %out, i32 %src0) {
 ; GFX1250-LABEL: v_permlane_idx_gen_b32_vl:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b96 s[0:2], s[4:5], 0x24
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
@@ -398,6 +421,7 @@ define amdgpu_kernel void @v_permlane_idx_gen_b32_vl(ptr addrspace(1) %out, i32
 define amdgpu_kernel void @v_permlane_idx_gen_b32_vv(ptr addrspace(1) %out) {
 ; GFX1250-LABEL: v_permlane_idx_gen_b32_vv:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1250-NEXT:    v_bfe_u32 v1, v0, 10, 10
 ; GFX1250-NEXT:    v_and_b32_e32 v0, 0x3ff, v0

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.atomic.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.atomic.buffer.load.ll
index dc9e23b0a457d..b0f1caefa5cbd 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.atomic.buffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.atomic.buffer.load.ll
@@ -31,6 +31,7 @@ define amdgpu_kernel void @raw_atomic_buffer_load_i32(<4 x i32> %addr) {
 ;
 ; GFX12-LABEL: raw_atomic_buffer_load_i32:
 ; GFX12:       ; %bb.0: ; %bb
+; GFX12-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX12-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX12-NEXT:    s_wait_xcnt 0x0
@@ -79,6 +80,7 @@ define amdgpu_kernel void @raw_atomic_buffer_load_i32_off(<4 x i32> %addr) {
 ;
 ; GFX12-LABEL: raw_atomic_buffer_load_i32_off:
 ; GFX12:       ; %bb.0: ; %bb
+; GFX12-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX12-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX12-NEXT:    s_wait_xcnt 0x0
@@ -126,6 +128,7 @@ define amdgpu_kernel void @raw_atomic_buffer_load_i32_soff(<4 x i32> %addr) {
 ;
 ; GFX12-LABEL: raw_atomic_buffer_load_i32_soff:
 ; GFX12:       ; %bb.0: ; %bb
+; GFX12-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX12-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX12-NEXT:    s_wait_xcnt 0x0
@@ -174,6 +177,7 @@ define amdgpu_kernel void @raw_atomic_buffer_load_i32_dlc(<4 x i32> %addr) {
 ;
 ; GFX12-LABEL: raw_atomic_buffer_load_i32_dlc:
 ; GFX12:       ; %bb.0: ; %bb
+; GFX12-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX12-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX12-NEXT:    s_wait_xcnt 0x0
@@ -223,6 +227,7 @@ define amdgpu_kernel void @raw_nonatomic_buffer_load_i32(<4 x i32> %addr) {
 ;
 ; GFX12-LABEL: raw_nonatomic_buffer_load_i32:
 ; GFX12:       ; %bb.0: ; %bb
+; GFX12-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX12-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
@@ -272,6 +277,7 @@ define amdgpu_kernel void @raw_atomic_buffer_load_i64(<4 x i32> %addr) {
 ;
 ; GFX12-SDAG-TRUE16-LABEL: raw_atomic_buffer_load_i64:
 ; GFX12-SDAG-TRUE16:       ; %bb.0: ; %bb
+; GFX12-SDAG-TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX12-SDAG-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX12-SDAG-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX12-SDAG-TRUE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
@@ -292,6 +298,7 @@ define amdgpu_kernel void @raw_atomic_buffer_load_i64(<4 x i32> %addr) {
 ;
 ; GFX12-GISEL-TRUE16-LABEL: raw_atomic_buffer_load_i64:
 ; GFX12-GISEL-TRUE16:       ; %bb.0: ; %bb
+; GFX12-GISEL-TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX12-GISEL-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX12-GISEL-TRUE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX12-GISEL-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
@@ -342,6 +349,7 @@ define amdgpu_kernel void @raw_atomic_buffer_load_v2i16(<4 x i32> %addr) {
 ;
 ; GFX12-LABEL: raw_atomic_buffer_load_v2i16:
 ; GFX12:       ; %bb.0: ; %bb
+; GFX12-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX12-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX12-NEXT:    s_wait_xcnt 0x0
@@ -453,6 +461,7 @@ define amdgpu_kernel void @raw_atomic_buffer_load_v4i16(<4 x i32> %addr) {
 ;
 ; GFX12-SDAG-TRUE16-LABEL: raw_atomic_buffer_load_v4i16:
 ; GFX12-SDAG-TRUE16:       ; %bb.0: ; %bb
+; GFX12-SDAG-TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX12-SDAG-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX12-SDAG-TRUE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX12-SDAG-TRUE16-NEXT:    s_wait_xcnt 0x0
@@ -473,6 +482,7 @@ define amdgpu_kernel void @raw_atomic_buffer_load_v4i16(<4 x i32> %addr) {
 ;
 ; GFX12-FAKE16-LABEL: raw_atomic_buffer_load_v4i16:
 ; GFX12-FAKE16:       ; %bb.0: ; %bb
+; GFX12-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX12-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX12-FAKE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX12-FAKE16-NEXT:    s_wait_xcnt 0x0
@@ -495,6 +505,7 @@ define amdgpu_kernel void @raw_atomic_buffer_load_v4i16(<4 x i32> %addr) {
 ;
 ; GFX12-GISEL-TRUE16-LABEL: raw_atomic_buffer_load_v4i16:
 ; GFX12-GISEL-TRUE16:       ; %bb.0: ; %bb
+; GFX12-GISEL-TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX12-GISEL-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX12-GISEL-TRUE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX12-GISEL-TRUE16-NEXT:    s_wait_xcnt 0x0
@@ -546,6 +557,7 @@ define amdgpu_kernel void @raw_atomic_buffer_load_v4i32(<4 x i32> %addr) {
 ;
 ; GFX12-LABEL: raw_atomic_buffer_load_v4i32:
 ; GFX12:       ; %bb.0: ; %bb
+; GFX12-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX12-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX12-NEXT:    s_wait_xcnt 0x0
@@ -597,6 +609,7 @@ define amdgpu_kernel void @raw_atomic_buffer_load_ptr(<4 x i32> %addr) {
 ;
 ; GFX12-LABEL: raw_atomic_buffer_load_ptr:
 ; GFX12:       ; %bb.0: ; %bb
+; GFX12-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX12-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX12-NEXT:    s_wait_xcnt 0x0

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.tfe.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.tfe.ll
index 1a1a1f784464d..77fb0ed96d761 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.tfe.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.load.tfe.ll
@@ -5,8 +5,8 @@
 ; RUN: llc -mcpu=gfx900 -mtriple=amdgcn-- < %s | FileCheck %s -check-prefixes=GFX910,GFX9
 ; RUN: llc -mcpu=gfx1010 -mtriple=amdgcn-- < %s | FileCheck %s -check-prefixes=GFX910,GFX10
 ; RUN: llc -mcpu=gfx1100 -mtriple=amdgcn-- < %s | FileCheck %s -check-prefix=GFX11
-; RUN: llc -mcpu=gfx1200 -mtriple=amdgcn-- < %s | FileCheck %s -check-prefix=GFX12
-; RUN: llc -mcpu=gfx1250 -mtriple=amdgcn-- < %s | FileCheck %s -check-prefix=GFX12
+; RUN: llc -mcpu=gfx1200 -mtriple=amdgcn-- < %s | FileCheck %s -check-prefix=GFX1200
+; RUN: llc -mcpu=gfx1250 -mtriple=amdgcn-- < %s | FileCheck %s -check-prefix=GFX1250
 
 define amdgpu_ps void @raw_buffer_load_i8_tfe(<4 x i32> inreg %rsrc, ptr addrspace(1) %data_addr, ptr addrspace(1) %tfe_addr) {
 ; GFX67-LABEL: raw_buffer_load_i8_tfe:
@@ -54,16 +54,28 @@ define amdgpu_ps void @raw_buffer_load_i8_tfe(<4 x i32> inreg %rsrc, ptr addrspa
 ; GFX11-NEXT:    global_store_b32 v[2:3], v5, off
 ; GFX11-NEXT:    s_endpgm
 ;
-; GFX12-LABEL: raw_buffer_load_i8_tfe:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    v_mov_b32_e32 v4, 0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_mov_b32_e32 v5, v4
-; GFX12-NEXT:    buffer_load_u8 v[4:5], off, s[0:3], null tfe
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    global_store_b8 v[0:1], v4, off
-; GFX12-NEXT:    global_store_b32 v[2:3], v5, off
-; GFX12-NEXT:    s_endpgm
+; GFX1200-LABEL: raw_buffer_load_i8_tfe:
+; GFX1200:       ; %bb.0:
+; GFX1200-NEXT:    v_mov_b32_e32 v4, 0
+; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1200-NEXT:    v_mov_b32_e32 v5, v4
+; GFX1200-NEXT:    buffer_load_u8 v[4:5], off, s[0:3], null tfe
+; GFX1200-NEXT:    s_wait_loadcnt 0x0
+; GFX1200-NEXT:    global_store_b8 v[0:1], v4, off
+; GFX1200-NEXT:    global_store_b32 v[2:3], v5, off
+; GFX1200-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: raw_buffer_load_i8_tfe:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, 0
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT:    v_mov_b32_e32 v5, v4
+; GFX1250-NEXT:    buffer_load_u8 v[4:5], off, s[0:3], null tfe
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    global_store_b8 v[0:1], v4, off
+; GFX1250-NEXT:    global_store_b32 v[2:3], v5, off
+; GFX1250-NEXT:    s_endpgm
   %res = call { i8, i32 } @llvm.amdgcn.raw.buffer.load.sl_i8i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0)
   %data = extractvalue { i8, i32 } %res, 0
   store i8 %data, ptr addrspace(1) %data_addr
@@ -118,16 +130,28 @@ define amdgpu_ps void @raw_buffer_load_i16_tfe(<4 x i32> inreg %rsrc, ptr addrsp
 ; GFX11-NEXT:    global_store_b32 v[2:3], v5, off
 ; GFX11-NEXT:    s_endpgm
 ;
-; GFX12-LABEL: raw_buffer_load_i16_tfe:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    v_mov_b32_e32 v4, 0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_mov_b32_e32 v5, v4
-; GFX12-NEXT:    buffer_load_u16 v[4:5], off, s[0:3], null tfe
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    global_store_b16 v[0:1], v4, off
-; GFX12-NEXT:    global_store_b32 v[2:3], v5, off
-; GFX12-NEXT:    s_endpgm
+; GFX1200-LABEL: raw_buffer_load_i16_tfe:
+; GFX1200:       ; %bb.0:
+; GFX1200-NEXT:    v_mov_b32_e32 v4, 0
+; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1200-NEXT:    v_mov_b32_e32 v5, v4
+; GFX1200-NEXT:    buffer_load_u16 v[4:5], off, s[0:3], null tfe
+; GFX1200-NEXT:    s_wait_loadcnt 0x0
+; GFX1200-NEXT:    global_store_b16 v[0:1], v4, off
+; GFX1200-NEXT:    global_store_b32 v[2:3], v5, off
+; GFX1200-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: raw_buffer_load_i16_tfe:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, 0
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT:    v_mov_b32_e32 v5, v4
+; GFX1250-NEXT:    buffer_load_u16 v[4:5], off, s[0:3], null tfe
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    global_store_b16 v[0:1], v4, off
+; GFX1250-NEXT:    global_store_b32 v[2:3], v5, off
+; GFX1250-NEXT:    s_endpgm
   %res = call { i16, i32 } @llvm.amdgcn.raw.buffer.load.sl_i16i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0)
   %data = extractvalue { i16, i32 } %res, 0
   store i16 %data, ptr addrspace(1) %data_addr
@@ -182,16 +206,28 @@ define amdgpu_ps void @raw_buffer_load_f16_tfe(<4 x i32> inreg %rsrc, ptr addrsp
 ; GFX11-NEXT:    global_store_b32 v[2:3], v5, off
 ; GFX11-NEXT:    s_endpgm
 ;
-; GFX12-LABEL: raw_buffer_load_f16_tfe:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    v_mov_b32_e32 v4, 0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_mov_b32_e32 v5, v4
-; GFX12-NEXT:    buffer_load_u16 v[4:5], off, s[0:3], null tfe
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    global_store_b16 v[0:1], v4, off
-; GFX12-NEXT:    global_store_b32 v[2:3], v5, off
-; GFX12-NEXT:    s_endpgm
+; GFX1200-LABEL: raw_buffer_load_f16_tfe:
+; GFX1200:       ; %bb.0:
+; GFX1200-NEXT:    v_mov_b32_e32 v4, 0
+; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1200-NEXT:    v_mov_b32_e32 v5, v4
+; GFX1200-NEXT:    buffer_load_u16 v[4:5], off, s[0:3], null tfe
+; GFX1200-NEXT:    s_wait_loadcnt 0x0
+; GFX1200-NEXT:    global_store_b16 v[0:1], v4, off
+; GFX1200-NEXT:    global_store_b32 v[2:3], v5, off
+; GFX1200-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: raw_buffer_load_f16_tfe:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, 0
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT:    v_mov_b32_e32 v5, v4
+; GFX1250-NEXT:    buffer_load_u16 v[4:5], off, s[0:3], null tfe
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    global_store_b16 v[0:1], v4, off
+; GFX1250-NEXT:    global_store_b32 v[2:3], v5, off
+; GFX1250-NEXT:    s_endpgm
   %res = call { half, i32 } @llvm.amdgcn.raw.buffer.load.sl_f16i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0)
   %data = extractvalue { half, i32 } %res, 0
   store half %data, ptr addrspace(1) %data_addr
@@ -246,16 +282,28 @@ define amdgpu_ps void @raw_buffer_load_i32_tfe(<4 x i32> inreg %rsrc, ptr addrsp
 ; GFX11-NEXT:    global_store_b32 v[2:3], v5, off
 ; GFX11-NEXT:    s_endpgm
 ;
-; GFX12-LABEL: raw_buffer_load_i32_tfe:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    v_mov_b32_e32 v4, 0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_mov_b32_e32 v5, v4
-; GFX12-NEXT:    buffer_load_b32 v[4:5], off, s[0:3], null tfe
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    global_store_b32 v[0:1], v4, off
-; GFX12-NEXT:    global_store_b32 v[2:3], v5, off
-; GFX12-NEXT:    s_endpgm
+; GFX1200-LABEL: raw_buffer_load_i32_tfe:
+; GFX1200:       ; %bb.0:
+; GFX1200-NEXT:    v_mov_b32_e32 v4, 0
+; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1200-NEXT:    v_mov_b32_e32 v5, v4
+; GFX1200-NEXT:    buffer_load_b32 v[4:5], off, s[0:3], null tfe
+; GFX1200-NEXT:    s_wait_loadcnt 0x0
+; GFX1200-NEXT:    global_store_b32 v[0:1], v4, off
+; GFX1200-NEXT:    global_store_b32 v[2:3], v5, off
+; GFX1200-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: raw_buffer_load_i32_tfe:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, 0
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT:    v_mov_b32_e32 v5, v4
+; GFX1250-NEXT:    buffer_load_b32 v[4:5], off, s[0:3], null tfe
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    global_store_b32 v[0:1], v4, off
+; GFX1250-NEXT:    global_store_b32 v[2:3], v5, off
+; GFX1250-NEXT:    s_endpgm
   %res = call { i32, i32 } @llvm.amdgcn.raw.buffer.load.sl_i32i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0)
   %data = extractvalue { i32, i32 } %res, 0
   store i32 %data, ptr addrspace(1) %data_addr
@@ -330,16 +378,28 @@ define amdgpu_ps void @raw_buffer_load_v2i32_tfe(<4 x i32> inreg %rsrc, ptr addr
 ; GFX11-NEXT:    global_store_b32 v[2:3], v6, off
 ; GFX11-NEXT:    s_endpgm
 ;
-; GFX12-LABEL: raw_buffer_load_v2i32_tfe:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    v_mov_b32_e32 v4, 0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v6, v4
-; GFX12-NEXT:    buffer_load_b64 v[4:6], off, s[0:3], null tfe
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    global_store_b64 v[0:1], v[4:5], off
-; GFX12-NEXT:    global_store_b32 v[2:3], v6, off
-; GFX12-NEXT:    s_endpgm
+; GFX1200-LABEL: raw_buffer_load_v2i32_tfe:
+; GFX1200:       ; %bb.0:
+; GFX1200-NEXT:    v_mov_b32_e32 v4, 0
+; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1200-NEXT:    v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v6, v4
+; GFX1200-NEXT:    buffer_load_b64 v[4:6], off, s[0:3], null tfe
+; GFX1200-NEXT:    s_wait_loadcnt 0x0
+; GFX1200-NEXT:    global_store_b64 v[0:1], v[4:5], off
+; GFX1200-NEXT:    global_store_b32 v[2:3], v6, off
+; GFX1200-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: raw_buffer_load_v2i32_tfe:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, 0
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT:    v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v6, v4
+; GFX1250-NEXT:    buffer_load_b64 v[4:6], off, s[0:3], null tfe
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    global_store_b64 v[0:1], v[4:5], off
+; GFX1250-NEXT:    global_store_b32 v[2:3], v6, off
+; GFX1250-NEXT:    s_endpgm
   %res = call { <2 x i32>, i32 } @llvm.amdgcn.raw.buffer.load.sl_v2i32i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0)
   %data = extractvalue { <2 x i32>, i32 } %res, 0
   store <2 x i32> %data, ptr addrspace(1) %data_addr
@@ -414,16 +474,28 @@ define amdgpu_ps void @raw_buffer_load_v2f32_tfe(<4 x i32> inreg %rsrc, ptr addr
 ; GFX11-NEXT:    global_store_b32 v[2:3], v6, off
 ; GFX11-NEXT:    s_endpgm
 ;
-; GFX12-LABEL: raw_buffer_load_v2f32_tfe:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    v_mov_b32_e32 v4, 0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v6, v4
-; GFX12-NEXT:    buffer_load_b64 v[4:6], off, s[0:3], null tfe
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    global_store_b64 v[0:1], v[4:5], off
-; GFX12-NEXT:    global_store_b32 v[2:3], v6, off
-; GFX12-NEXT:    s_endpgm
+; GFX1200-LABEL: raw_buffer_load_v2f32_tfe:
+; GFX1200:       ; %bb.0:
+; GFX1200-NEXT:    v_mov_b32_e32 v4, 0
+; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1200-NEXT:    v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v6, v4
+; GFX1200-NEXT:    buffer_load_b64 v[4:6], off, s[0:3], null tfe
+; GFX1200-NEXT:    s_wait_loadcnt 0x0
+; GFX1200-NEXT:    global_store_b64 v[0:1], v[4:5], off
+; GFX1200-NEXT:    global_store_b32 v[2:3], v6, off
+; GFX1200-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: raw_buffer_load_v2f32_tfe:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, 0
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT:    v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v6, v4
+; GFX1250-NEXT:    buffer_load_b64 v[4:6], off, s[0:3], null tfe
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    global_store_b64 v[0:1], v[4:5], off
+; GFX1250-NEXT:    global_store_b32 v[2:3], v6, off
+; GFX1250-NEXT:    s_endpgm
   %res = call { <2 x float>, i32 } @llvm.amdgcn.raw.buffer.load.sl_v2f32i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0)
   %data = extractvalue { <2 x float>, i32 } %res, 0
   store <2 x float> %data, ptr addrspace(1) %data_addr
@@ -503,17 +575,30 @@ define amdgpu_ps void @raw_buffer_load_v3i32_tfe(<4 x i32> inreg %rsrc, ptr addr
 ; GFX11-NEXT:    global_store_b32 v[2:3], v7, off
 ; GFX11-NEXT:    s_endpgm
 ;
-; GFX12-LABEL: raw_buffer_load_v3i32_tfe:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    v_mov_b32_e32 v4, 0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v6, v4
-; GFX12-NEXT:    v_mov_b32_e32 v7, v4
-; GFX12-NEXT:    buffer_load_b96 v[4:7], off, s[0:3], null tfe
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    global_store_b96 v[0:1], v[4:6], off
-; GFX12-NEXT:    global_store_b32 v[2:3], v7, off
-; GFX12-NEXT:    s_endpgm
+; GFX1200-LABEL: raw_buffer_load_v3i32_tfe:
+; GFX1200:       ; %bb.0:
+; GFX1200-NEXT:    v_mov_b32_e32 v4, 0
+; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1200-NEXT:    v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v6, v4
+; GFX1200-NEXT:    v_mov_b32_e32 v7, v4
+; GFX1200-NEXT:    buffer_load_b96 v[4:7], off, s[0:3], null tfe
+; GFX1200-NEXT:    s_wait_loadcnt 0x0
+; GFX1200-NEXT:    global_store_b96 v[0:1], v[4:6], off
+; GFX1200-NEXT:    global_store_b32 v[2:3], v7, off
+; GFX1200-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: raw_buffer_load_v3i32_tfe:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, 0
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT:    v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v6, v4
+; GFX1250-NEXT:    v_mov_b32_e32 v7, v4
+; GFX1250-NEXT:    buffer_load_b96 v[4:7], off, s[0:3], null tfe
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    global_store_b96 v[0:1], v[4:6], off
+; GFX1250-NEXT:    global_store_b32 v[2:3], v7, off
+; GFX1250-NEXT:    s_endpgm
   %res = call { <3 x i32>, i32 } @llvm.amdgcn.raw.buffer.load.sl_v3i32i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0)
   %data = extractvalue { <3 x i32>, i32 } %res, 0
   store <3 x i32> %data, ptr addrspace(1) %data_addr
@@ -593,17 +678,30 @@ define amdgpu_ps void @raw_buffer_load_v3f32_tfe(<4 x i32> inreg %rsrc, ptr addr
 ; GFX11-NEXT:    global_store_b32 v[2:3], v7, off
 ; GFX11-NEXT:    s_endpgm
 ;
-; GFX12-LABEL: raw_buffer_load_v3f32_tfe:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    v_mov_b32_e32 v4, 0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v6, v4
-; GFX12-NEXT:    v_mov_b32_e32 v7, v4
-; GFX12-NEXT:    buffer_load_b96 v[4:7], off, s[0:3], null tfe
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    global_store_b96 v[0:1], v[4:6], off
-; GFX12-NEXT:    global_store_b32 v[2:3], v7, off
-; GFX12-NEXT:    s_endpgm
+; GFX1200-LABEL: raw_buffer_load_v3f32_tfe:
+; GFX1200:       ; %bb.0:
+; GFX1200-NEXT:    v_mov_b32_e32 v4, 0
+; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1200-NEXT:    v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v6, v4
+; GFX1200-NEXT:    v_mov_b32_e32 v7, v4
+; GFX1200-NEXT:    buffer_load_b96 v[4:7], off, s[0:3], null tfe
+; GFX1200-NEXT:    s_wait_loadcnt 0x0
+; GFX1200-NEXT:    global_store_b96 v[0:1], v[4:6], off
+; GFX1200-NEXT:    global_store_b32 v[2:3], v7, off
+; GFX1200-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: raw_buffer_load_v3f32_tfe:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, 0
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT:    v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v6, v4
+; GFX1250-NEXT:    v_mov_b32_e32 v7, v4
+; GFX1250-NEXT:    buffer_load_b96 v[4:7], off, s[0:3], null tfe
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    global_store_b96 v[0:1], v[4:6], off
+; GFX1250-NEXT:    global_store_b32 v[2:3], v7, off
+; GFX1250-NEXT:    s_endpgm
   %res = call { <3 x float>, i32 } @llvm.amdgcn.raw.buffer.load.sl_v3f32i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0)
   %data = extractvalue { <3 x float>, i32 } %res, 0
   store <3 x float> %data, ptr addrspace(1) %data_addr
@@ -670,17 +768,30 @@ define amdgpu_ps void @raw_buffer_load_v4i32_tfe(<4 x i32> inreg %rsrc, ptr addr
 ; GFX11-NEXT:    global_store_b32 v[2:3], v8, off
 ; GFX11-NEXT:    s_endpgm
 ;
-; GFX12-LABEL: raw_buffer_load_v4i32_tfe:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    v_mov_b32_e32 v4, 0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v6, v4
-; GFX12-NEXT:    v_dual_mov_b32 v7, v4 :: v_dual_mov_b32 v8, v4
-; GFX12-NEXT:    buffer_load_b128 v[4:8], off, s[0:3], null tfe
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    global_store_b128 v[0:1], v[4:7], off
-; GFX12-NEXT:    global_store_b32 v[2:3], v8, off
-; GFX12-NEXT:    s_endpgm
+; GFX1200-LABEL: raw_buffer_load_v4i32_tfe:
+; GFX1200:       ; %bb.0:
+; GFX1200-NEXT:    v_mov_b32_e32 v4, 0
+; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1200-NEXT:    v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v6, v4
+; GFX1200-NEXT:    v_dual_mov_b32 v7, v4 :: v_dual_mov_b32 v8, v4
+; GFX1200-NEXT:    buffer_load_b128 v[4:8], off, s[0:3], null tfe
+; GFX1200-NEXT:    s_wait_loadcnt 0x0
+; GFX1200-NEXT:    global_store_b128 v[0:1], v[4:7], off
+; GFX1200-NEXT:    global_store_b32 v[2:3], v8, off
+; GFX1200-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: raw_buffer_load_v4i32_tfe:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, 0
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT:    v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v6, v4
+; GFX1250-NEXT:    v_dual_mov_b32 v7, v4 :: v_dual_mov_b32 v8, v4
+; GFX1250-NEXT:    buffer_load_b128 v[4:8], off, s[0:3], null tfe
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    global_store_b128 v[0:1], v[4:7], off
+; GFX1250-NEXT:    global_store_b32 v[2:3], v8, off
+; GFX1250-NEXT:    s_endpgm
   %res = call { <4 x i32>, i32 } @llvm.amdgcn.raw.buffer.load.sl_v4i32i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0)
   %data = extractvalue { <4 x i32>, i32 } %res, 0
   store <4 x i32> %data, ptr addrspace(1) %data_addr
@@ -747,17 +858,30 @@ define amdgpu_ps void @raw_buffer_load_v4f32_tfe(<4 x i32> inreg %rsrc, ptr addr
 ; GFX11-NEXT:    global_store_b32 v[2:3], v8, off
 ; GFX11-NEXT:    s_endpgm
 ;
-; GFX12-LABEL: raw_buffer_load_v4f32_tfe:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    v_mov_b32_e32 v4, 0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v6, v4
-; GFX12-NEXT:    v_dual_mov_b32 v7, v4 :: v_dual_mov_b32 v8, v4
-; GFX12-NEXT:    buffer_load_b128 v[4:8], off, s[0:3], null tfe
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    global_store_b128 v[0:1], v[4:7], off
-; GFX12-NEXT:    global_store_b32 v[2:3], v8, off
-; GFX12-NEXT:    s_endpgm
+; GFX1200-LABEL: raw_buffer_load_v4f32_tfe:
+; GFX1200:       ; %bb.0:
+; GFX1200-NEXT:    v_mov_b32_e32 v4, 0
+; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1200-NEXT:    v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v6, v4
+; GFX1200-NEXT:    v_dual_mov_b32 v7, v4 :: v_dual_mov_b32 v8, v4
+; GFX1200-NEXT:    buffer_load_b128 v[4:8], off, s[0:3], null tfe
+; GFX1200-NEXT:    s_wait_loadcnt 0x0
+; GFX1200-NEXT:    global_store_b128 v[0:1], v[4:7], off
+; GFX1200-NEXT:    global_store_b32 v[2:3], v8, off
+; GFX1200-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: raw_buffer_load_v4f32_tfe:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, 0
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT:    v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v6, v4
+; GFX1250-NEXT:    v_dual_mov_b32 v7, v4 :: v_dual_mov_b32 v8, v4
+; GFX1250-NEXT:    buffer_load_b128 v[4:8], off, s[0:3], null tfe
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    global_store_b128 v[0:1], v[4:7], off
+; GFX1250-NEXT:    global_store_b32 v[2:3], v8, off
+; GFX1250-NEXT:    s_endpgm
   %res = call { <4 x float>, i32 } @llvm.amdgcn.raw.buffer.load.sl_v4f32i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0)
   %data = extractvalue { <4 x float>, i32 } %res, 0
   store <4 x float> %data, ptr addrspace(1) %data_addr

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.ll
index eeea1456792af..f770133e3559f 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.buffer.store.ll
@@ -2,8 +2,8 @@
 ; RUN: llc < %s -mtriple=amdgcn -mcpu=verde | FileCheck -check-prefixes=GFX68,VERDE %s
 ; RUN: llc < %s -mtriple=amdgcn -mcpu=tonga | FileCheck -check-prefixes=GFX68,GFX8 %s
 ; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 | FileCheck -check-prefixes=GFX11 %s
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1200 | FileCheck -check-prefixes=GFX12 %s
-; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1250 | FileCheck -check-prefixes=GFX12 %s
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1200 | FileCheck -check-prefixes=GFX1200 %s
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1250 | FileCheck -check-prefixes=GFX1250 %s
 
 define amdgpu_ps void @buffer_store(<4 x i32> inreg, <4 x float>, <4 x float>, <4 x float>) {
 ; GFX68-LABEL: buffer_store:
@@ -20,6 +20,23 @@ define amdgpu_ps void @buffer_store(<4 x i32> inreg, <4 x float>, <4 x float>, <
 ; GFX11-NEXT:    buffer_store_b128 v[4:7], off, s[0:3], 0 glc
 ; GFX11-NEXT:    buffer_store_b128 v[8:11], off, s[0:3], 0 slc
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX1200-LABEL: buffer_store:
+; GFX1200:       ; %bb.0: ; %main_body
+; GFX1200-NEXT:    s_clause 0x2
+; GFX1200-NEXT:    buffer_store_b128 v[0:3], off, s[0:3], null
+; GFX1200-NEXT:    buffer_store_b128 v[4:7], off, s[0:3], null th:TH_STORE_NT
+; GFX1200-NEXT:    buffer_store_b128 v[8:11], off, s[0:3], null th:TH_STORE_HT
+; GFX1200-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: buffer_store:
+; GFX1250:       ; %bb.0: ; %main_body
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-NEXT:    s_clause 0x2
+; GFX1250-NEXT:    buffer_store_b128 v[0:3], off, s[0:3], null
+; GFX1250-NEXT:    buffer_store_b128 v[4:7], off, s[0:3], null th:TH_STORE_NT
+; GFX1250-NEXT:    buffer_store_b128 v[8:11], off, s[0:3], null th:TH_STORE_HT
+; GFX1250-NEXT:    s_endpgm
 main_body:
   call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 0, i32 0, i32 0)
   call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> %2, <4 x i32> %0, i32 0, i32 0, i32 1)
@@ -37,6 +54,17 @@ define amdgpu_ps void @buffer_store_immoffs(<4 x i32> inreg, <4 x float>) {
 ; GFX11:       ; %bb.0: ; %main_body
 ; GFX11-NEXT:    buffer_store_b128 v[0:3], off, s[0:3], 0 offset:42
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX1200-LABEL: buffer_store_immoffs:
+; GFX1200:       ; %bb.0: ; %main_body
+; GFX1200-NEXT:    buffer_store_b128 v[0:3], off, s[0:3], null offset:42
+; GFX1200-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: buffer_store_immoffs:
+; GFX1250:       ; %bb.0: ; %main_body
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-NEXT:    buffer_store_b128 v[0:3], off, s[0:3], null offset:42
+; GFX1250-NEXT:    s_endpgm
 main_body:
   call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 42, i32 0, i32 0)
   ret void
@@ -52,6 +80,17 @@ define amdgpu_ps void @buffer_store_ofs(<4 x i32> inreg, <4 x float>, i32) {
 ; GFX11:       ; %bb.0: ; %main_body
 ; GFX11-NEXT:    buffer_store_b128 v[0:3], v4, s[0:3], 0 offen
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX1200-LABEL: buffer_store_ofs:
+; GFX1200:       ; %bb.0: ; %main_body
+; GFX1200-NEXT:    buffer_store_b128 v[0:3], v4, s[0:3], null offen
+; GFX1200-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: buffer_store_ofs:
+; GFX1250:       ; %bb.0: ; %main_body
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-NEXT:    buffer_store_b128 v[0:3], v4, s[0:3], null offen
+; GFX1250-NEXT:    s_endpgm
 main_body:
   call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 0, i32 0)
   ret void
@@ -83,6 +122,24 @@ define amdgpu_ps void @buffer_store_wait(<4 x i32> inreg, <4 x float>, i32, i32,
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    buffer_store_b128 v[0:3], v6, s[0:3], 0 offen
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX1200-LABEL: buffer_store_wait:
+; GFX1200:       ; %bb.0: ; %main_body
+; GFX1200-NEXT:    buffer_store_b128 v[0:3], v4, s[0:3], null offen
+; GFX1200-NEXT:    buffer_load_b128 v[0:3], v5, s[0:3], null offen
+; GFX1200-NEXT:    s_wait_loadcnt 0x0
+; GFX1200-NEXT:    buffer_store_b128 v[0:3], v6, s[0:3], null offen
+; GFX1200-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: buffer_store_wait:
+; GFX1250:       ; %bb.0: ; %main_body
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-NEXT:    s_clause 0x1
+; GFX1250-NEXT:    buffer_store_b128 v[0:3], v4, s[0:3], null offen
+; GFX1250-NEXT:    buffer_load_b128 v[0:3], v5, s[0:3], null offen
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    buffer_store_b128 v[0:3], v6, s[0:3], null offen
+; GFX1250-NEXT:    s_endpgm
 main_body:
   call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> %1, <4 x i32> %0, i32 %2, i32 0, i32 0)
   %data = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> %0, i32 %3, i32 0, i32 0)
@@ -100,6 +157,17 @@ define amdgpu_ps void @buffer_store_x1(<4 x i32> inreg %rsrc, float %data, i32 %
 ; GFX11:       ; %bb.0: ; %main_body
 ; GFX11-NEXT:    buffer_store_b32 v0, v1, s[0:3], 0 offen
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX1200-LABEL: buffer_store_x1:
+; GFX1200:       ; %bb.0: ; %main_body
+; GFX1200-NEXT:    buffer_store_b32 v0, v1, s[0:3], null offen
+; GFX1200-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: buffer_store_x1:
+; GFX1250:       ; %bb.0: ; %main_body
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-NEXT:    buffer_store_b32 v0, v1, s[0:3], null offen
+; GFX1250-NEXT:    s_endpgm
 main_body:
   call void @llvm.amdgcn.raw.buffer.store.f32(float %data, <4 x i32> %rsrc, i32 %offset, i32 0, i32 0)
   ret void
@@ -115,6 +183,17 @@ define amdgpu_ps void @buffer_store_x2(<4 x i32> inreg %rsrc, <2 x float> %data,
 ; GFX11:       ; %bb.0: ; %main_body
 ; GFX11-NEXT:    buffer_store_b64 v[0:1], v2, s[0:3], 0 offen
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX1200-LABEL: buffer_store_x2:
+; GFX1200:       ; %bb.0: ; %main_body
+; GFX1200-NEXT:    buffer_store_b64 v[0:1], v2, s[0:3], null offen
+; GFX1200-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: buffer_store_x2:
+; GFX1250:       ; %bb.0: ; %main_body
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-NEXT:    buffer_store_b64 v[0:1], v2, s[0:3], null offen
+; GFX1250-NEXT:    s_endpgm
 main_body:
   call void @llvm.amdgcn.raw.buffer.store.v2f32(<2 x float> %data, <4 x i32> %rsrc, i32 %offset, i32 0, i32 0)
   ret void
@@ -133,6 +212,28 @@ define amdgpu_ps void @buffer_store_x1_offen_merged_and(<4 x i32> inreg %rsrc, i
 ; GFX11-NEXT:    buffer_store_b128 v[1:4], v0, s[0:3], 0 offen offset:4
 ; GFX11-NEXT:    buffer_store_b64 v[5:6], v0, s[0:3], 0 offen offset:28
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX1200-LABEL: buffer_store_x1_offen_merged_and:
+; GFX1200:       ; %bb.0:
+; GFX1200-NEXT:    s_clause 0x1
+; GFX1200-NEXT:    buffer_store_b128 v[1:4], v0, s[0:3], null offen offset:4
+; GFX1200-NEXT:    buffer_store_b64 v[5:6], v0, s[0:3], null offen offset:28
+; GFX1200-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: buffer_store_x1_offen_merged_and:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-NEXT:    v_dual_add_nc_u32 v7, 4, v0 :: v_dual_add_nc_u32 v8, 8, v0
+; GFX1250-NEXT:    v_dual_add_nc_u32 v9, 12, v0 :: v_dual_add_nc_u32 v10, 16, v0
+; GFX1250-NEXT:    v_dual_add_nc_u32 v11, 28, v0 :: v_dual_add_nc_u32 v0, 32, v0
+; GFX1250-NEXT:    s_clause 0x5
+; GFX1250-NEXT:    buffer_store_b32 v1, v7, s[0:3], null offen
+; GFX1250-NEXT:    buffer_store_b32 v2, v8, s[0:3], null offen
+; GFX1250-NEXT:    buffer_store_b32 v3, v9, s[0:3], null offen
+; GFX1250-NEXT:    buffer_store_b32 v4, v10, s[0:3], null offen
+; GFX1250-NEXT:    buffer_store_b32 v5, v11, s[0:3], null offen
+; GFX1250-NEXT:    buffer_store_b32 v6, v0, s[0:3], null offen
+; GFX1250-NEXT:    s_endpgm
   %a1 = add i32 %a, 4
   %a2 = add i32 %a, 8
   %a3 = add i32 %a, 12
@@ -163,6 +264,26 @@ define amdgpu_ps void @buffer_store_x1_offen_merged_or(<4 x i32> inreg %rsrc, i3
 ; GFX11-NEXT:    buffer_store_b128 v[1:4], v0, s[0:3], 0 offen offset:4
 ; GFX11-NEXT:    buffer_store_b64 v[5:6], v0, s[0:3], 0 offen offset:28
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX1200-LABEL: buffer_store_x1_offen_merged_or:
+; GFX1200:       ; %bb.0:
+; GFX1200-NEXT:    v_lshlrev_b32_e32 v0, 6, v0
+; GFX1200-NEXT:    s_clause 0x1
+; GFX1200-NEXT:    buffer_store_b128 v[1:4], v0, s[0:3], null offen offset:4
+; GFX1200-NEXT:    buffer_store_b64 v[5:6], v0, s[0:3], null offen offset:28
+; GFX1200-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: buffer_store_x1_offen_merged_or:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-NEXT:    v_dual_mov_b32 v7, v6 :: v_dual_mov_b32 v6, v5
+; GFX1250-NEXT:    v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3
+; GFX1250-NEXT:    v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
+; GFX1250-NEXT:    v_lshlrev_b32_e32 v0, 6, v0
+; GFX1250-NEXT:    s_clause 0x1
+; GFX1250-NEXT:    buffer_store_b128 v[2:5], v0, s[0:3], null offen offset:4
+; GFX1250-NEXT:    buffer_store_b64 v[6:7], v0, s[0:3], null offen offset:28
+; GFX1250-NEXT:    s_endpgm
   %a = shl i32 %inp, 6
   %a1 = add i32 %a, 4
   %a2 = add i32 %a, 8
@@ -194,6 +315,29 @@ define amdgpu_ps void @buffer_store_x1_offen_merged_glc_slc(<4 x i32> inreg %rsr
 ; GFX11-NEXT:    buffer_store_b64 v[3:4], v0, s[0:3], 0 offen offset:12 glc
 ; GFX11-NEXT:    buffer_store_b64 v[5:6], v0, s[0:3], 0 offen offset:28 glc slc
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX1200-LABEL: buffer_store_x1_offen_merged_glc_slc:
+; GFX1200:       ; %bb.0:
+; GFX1200-NEXT:    s_clause 0x2
+; GFX1200-NEXT:    buffer_store_b64 v[1:2], v0, s[0:3], null offen offset:4
+; GFX1200-NEXT:    buffer_store_b64 v[3:4], v0, s[0:3], null offen offset:12 th:TH_STORE_NT
+; GFX1200-NEXT:    buffer_store_b64 v[5:6], v0, s[0:3], null offen offset:28 th:TH_STORE_WB
+; GFX1200-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: buffer_store_x1_offen_merged_glc_slc:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-NEXT:    v_dual_add_nc_u32 v7, 4, v0 :: v_dual_add_nc_u32 v8, 8, v0
+; GFX1250-NEXT:    v_dual_add_nc_u32 v9, 12, v0 :: v_dual_add_nc_u32 v10, 16, v0
+; GFX1250-NEXT:    v_dual_add_nc_u32 v11, 28, v0 :: v_dual_add_nc_u32 v0, 32, v0
+; GFX1250-NEXT:    s_clause 0x5
+; GFX1250-NEXT:    buffer_store_b32 v1, v7, s[0:3], null offen
+; GFX1250-NEXT:    buffer_store_b32 v2, v8, s[0:3], null offen
+; GFX1250-NEXT:    buffer_store_b32 v3, v9, s[0:3], null offen th:TH_STORE_NT
+; GFX1250-NEXT:    buffer_store_b32 v4, v10, s[0:3], null offen th:TH_STORE_NT
+; GFX1250-NEXT:    buffer_store_b32 v5, v11, s[0:3], null offen th:TH_STORE_WB
+; GFX1250-NEXT:    buffer_store_b32 v6, v0, s[0:3], null offen th:TH_STORE_WB
+; GFX1250-NEXT:    s_endpgm
   %a1 = add i32 %a, 4
   %a2 = add i32 %a, 8
   %a3 = add i32 %a, 12
@@ -219,6 +363,22 @@ define amdgpu_ps void @buffer_store_x2_offen_merged_and(<4 x i32> inreg %rsrc, i
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    buffer_store_b128 v[1:4], v0, s[0:3], 0 offen offset:4
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX1200-LABEL: buffer_store_x2_offen_merged_and:
+; GFX1200:       ; %bb.0:
+; GFX1200-NEXT:    buffer_store_b128 v[1:4], v0, s[0:3], null offen offset:4
+; GFX1200-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: buffer_store_x2_offen_merged_and:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-NEXT:    v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3
+; GFX1250-NEXT:    v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
+; GFX1250-NEXT:    v_dual_add_nc_u32 v1, 4, v0 :: v_dual_add_nc_u32 v0, 12, v0
+; GFX1250-NEXT:    s_clause 0x1
+; GFX1250-NEXT:    buffer_store_b64 v[2:3], v1, s[0:3], null offen
+; GFX1250-NEXT:    buffer_store_b64 v[4:5], v0, s[0:3], null offen
+; GFX1250-NEXT:    s_endpgm
   %a1 = add i32 %a, 4
   %a2 = add i32 %a, 12
   call void @llvm.amdgcn.raw.buffer.store.v2f32(<2 x float> %v1, <4 x i32> %rsrc, i32 %a1, i32 0, i32 0)
@@ -238,6 +398,21 @@ define amdgpu_ps void @buffer_store_x2_offen_merged_or(<4 x i32> inreg %rsrc, i3
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
 ; GFX11-NEXT:    buffer_store_b128 v[1:4], v0, s[0:3], 0 offen offset:4
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX1200-LABEL: buffer_store_x2_offen_merged_or:
+; GFX1200:       ; %bb.0:
+; GFX1200-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
+; GFX1200-NEXT:    buffer_store_b128 v[1:4], v0, s[0:3], null offen offset:4
+; GFX1200-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: buffer_store_x2_offen_merged_or:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-NEXT:    v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, v3
+; GFX1250-NEXT:    v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
+; GFX1250-NEXT:    v_lshlrev_b32_e32 v0, 4, v0
+; GFX1250-NEXT:    buffer_store_b128 v[2:5], v0, s[0:3], null offen offset:4
+; GFX1250-NEXT:    s_endpgm
   %a = shl i32 %inp, 4
   %a1 = add i32 %a, 4
   %a2 = add i32 %a, 12
@@ -259,6 +434,21 @@ define amdgpu_ps void @buffer_store_x1_offset_merged(<4 x i32> inreg %rsrc, floa
 ; GFX11-NEXT:    buffer_store_b128 v[0:3], off, s[0:3], 0 offset:4
 ; GFX11-NEXT:    buffer_store_b64 v[4:5], off, s[0:3], 0 offset:28
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX1200-LABEL: buffer_store_x1_offset_merged:
+; GFX1200:       ; %bb.0:
+; GFX1200-NEXT:    s_clause 0x1
+; GFX1200-NEXT:    buffer_store_b128 v[0:3], off, s[0:3], null offset:4
+; GFX1200-NEXT:    buffer_store_b64 v[4:5], off, s[0:3], null offset:28
+; GFX1200-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: buffer_store_x1_offset_merged:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-NEXT:    s_clause 0x1
+; GFX1250-NEXT:    buffer_store_b128 v[0:3], off, s[0:3], null offset:4
+; GFX1250-NEXT:    buffer_store_b64 v[4:5], off, s[0:3], null offset:28
+; GFX1250-NEXT:    s_endpgm
   call void @llvm.amdgcn.raw.buffer.store.f32(float %v1, <4 x i32> %rsrc, i32 4, i32 0, i32 0)
   call void @llvm.amdgcn.raw.buffer.store.f32(float %v2, <4 x i32> %rsrc, i32 8, i32 0, i32 0)
   call void @llvm.amdgcn.raw.buffer.store.f32(float %v3, <4 x i32> %rsrc, i32 12, i32 0, i32 0)
@@ -278,6 +468,17 @@ define amdgpu_ps void @buffer_store_x2_offset_merged(<4 x i32> inreg %rsrc, <2 x
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    buffer_store_b128 v[0:3], off, s[0:3], 0 offset:4
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX1200-LABEL: buffer_store_x2_offset_merged:
+; GFX1200:       ; %bb.0:
+; GFX1200-NEXT:    buffer_store_b128 v[0:3], off, s[0:3], null offset:4
+; GFX1200-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: buffer_store_x2_offset_merged:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-NEXT:    buffer_store_b128 v[0:3], off, s[0:3], null offset:4
+; GFX1250-NEXT:    s_endpgm
   call void @llvm.amdgcn.raw.buffer.store.v2f32(<2 x float> %v1, <4 x i32> %rsrc, i32 4, i32 0, i32 0)
   call void @llvm.amdgcn.raw.buffer.store.v2f32(<2 x float> %v2, <4 x i32> %rsrc, i32 12, i32 0, i32 0)
   ret void
@@ -298,6 +499,23 @@ define amdgpu_ps void @buffer_store_int(<4 x i32> inreg, <4 x i32>, <2 x i32>, i
 ; GFX11-NEXT:    buffer_store_b64 v[4:5], off, s[0:3], 0 glc
 ; GFX11-NEXT:    buffer_store_b32 v6, off, s[0:3], 0 slc
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX1200-LABEL: buffer_store_int:
+; GFX1200:       ; %bb.0: ; %main_body
+; GFX1200-NEXT:    s_clause 0x2
+; GFX1200-NEXT:    buffer_store_b128 v[0:3], off, s[0:3], null
+; GFX1200-NEXT:    buffer_store_b64 v[4:5], off, s[0:3], null th:TH_STORE_NT
+; GFX1200-NEXT:    buffer_store_b32 v6, off, s[0:3], null th:TH_STORE_HT
+; GFX1200-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: buffer_store_int:
+; GFX1250:       ; %bb.0: ; %main_body
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-NEXT:    s_clause 0x2
+; GFX1250-NEXT:    buffer_store_b128 v[0:3], off, s[0:3], null
+; GFX1250-NEXT:    buffer_store_b64 v[4:5], off, s[0:3], null th:TH_STORE_NT
+; GFX1250-NEXT:    buffer_store_b32 v6, off, s[0:3], null th:TH_STORE_HT
+; GFX1250-NEXT:    s_endpgm
 main_body:
   call void @llvm.amdgcn.raw.buffer.store.v4i32(<4 x i32> %1, <4 x i32> %0, i32 0, i32 0, i32 0)
   call void @llvm.amdgcn.raw.buffer.store.v2i32(<2 x i32> %2, <4 x i32> %0, i32 0, i32 0, i32 1)
@@ -317,6 +535,19 @@ define amdgpu_ps void @raw_buffer_store_byte(<4 x i32> inreg %rsrc, float %v1) {
 ; GFX11-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX11-NEXT:    buffer_store_b8 v0, off, s[0:3], 0
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX1200-LABEL: raw_buffer_store_byte:
+; GFX1200:       ; %bb.0: ; %main_body
+; GFX1200-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX1200-NEXT:    buffer_store_b8 v0, off, s[0:3], null
+; GFX1200-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: raw_buffer_store_byte:
+; GFX1250:       ; %bb.0: ; %main_body
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX1250-NEXT:    buffer_store_b8 v0, off, s[0:3], null
+; GFX1250-NEXT:    s_endpgm
 main_body:
   %v2 = fptoui float %v1 to i32
   %v3 = trunc i32 %v2 to i8
@@ -336,6 +567,19 @@ define amdgpu_ps void @raw_buffer_store_short(<4 x i32> inreg %rsrc, float %v1)
 ; GFX11-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX11-NEXT:    buffer_store_b16 v0, off, s[0:3], 0
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX1200-LABEL: raw_buffer_store_short:
+; GFX1200:       ; %bb.0: ; %main_body
+; GFX1200-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX1200-NEXT:    buffer_store_b16 v0, off, s[0:3], null
+; GFX1200-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: raw_buffer_store_short:
+; GFX1250:       ; %bb.0: ; %main_body
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX1250-NEXT:    buffer_store_b16 v0, off, s[0:3], null
+; GFX1250-NEXT:    s_endpgm
 main_body:
   %v2 = fptoui float %v1 to i32
   %v3 = trunc i32 %v2 to i16
@@ -353,6 +597,17 @@ define amdgpu_ps void @raw_buffer_store_f16(<4 x i32> inreg %rsrc, i32 %v1) {
 ; GFX11:       ; %bb.0: ; %main_body
 ; GFX11-NEXT:    buffer_store_b16 v0, off, s[0:3], 0
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX1200-LABEL: raw_buffer_store_f16:
+; GFX1200:       ; %bb.0: ; %main_body
+; GFX1200-NEXT:    buffer_store_b16 v0, off, s[0:3], null
+; GFX1200-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: raw_buffer_store_f16:
+; GFX1250:       ; %bb.0: ; %main_body
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-NEXT:    buffer_store_b16 v0, off, s[0:3], null
+; GFX1250-NEXT:    s_endpgm
 main_body:
   %trunc = trunc i32 %v1 to i16
   %cast = bitcast i16 %trunc to half
@@ -379,6 +634,17 @@ define amdgpu_ps void @buffer_store_v2f16(<4 x i32> inreg %rsrc, <2 x half> %dat
 ; GFX11:       ; %bb.0: ; %main_body
 ; GFX11-NEXT:    buffer_store_b32 v0, v1, s[0:3], 0 offen
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX1200-LABEL: buffer_store_v2f16:
+; GFX1200:       ; %bb.0: ; %main_body
+; GFX1200-NEXT:    buffer_store_b32 v0, v1, s[0:3], null offen
+; GFX1200-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: buffer_store_v2f16:
+; GFX1250:       ; %bb.0: ; %main_body
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-NEXT:    buffer_store_b32 v0, v1, s[0:3], null offen
+; GFX1250-NEXT:    s_endpgm
 main_body:
   call void @llvm.amdgcn.raw.buffer.store.v2f16(<2 x half> %data, <4 x i32> %rsrc, i32 %offset, i32 0, i32 0)
   ret void
@@ -407,6 +673,17 @@ define amdgpu_ps void @buffer_store_v4f16(<4 x i32> inreg %rsrc, <4 x half> %dat
 ; GFX11:       ; %bb.0: ; %main_body
 ; GFX11-NEXT:    buffer_store_b64 v[0:1], v2, s[0:3], 0 offen
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX1200-LABEL: buffer_store_v4f16:
+; GFX1200:       ; %bb.0: ; %main_body
+; GFX1200-NEXT:    buffer_store_b64 v[0:1], v2, s[0:3], null offen
+; GFX1200-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: buffer_store_v4f16:
+; GFX1250:       ; %bb.0: ; %main_body
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-NEXT:    buffer_store_b64 v[0:1], v2, s[0:3], null offen
+; GFX1250-NEXT:    s_endpgm
 main_body:
   call void @llvm.amdgcn.raw.buffer.store.v4f16(<4 x half> %data, <4 x i32> %rsrc, i32 %offset, i32 0, i32 0)
   ret void
@@ -422,6 +699,17 @@ define amdgpu_ps void @raw_buffer_store_i16(<4 x i32> inreg %rsrc, i32 %v1) {
 ; GFX11:       ; %bb.0: ; %main_body
 ; GFX11-NEXT:    buffer_store_b16 v0, off, s[0:3], 0
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX1200-LABEL: raw_buffer_store_i16:
+; GFX1200:       ; %bb.0: ; %main_body
+; GFX1200-NEXT:    buffer_store_b16 v0, off, s[0:3], null
+; GFX1200-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: raw_buffer_store_i16:
+; GFX1250:       ; %bb.0: ; %main_body
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-NEXT:    buffer_store_b16 v0, off, s[0:3], null
+; GFX1250-NEXT:    s_endpgm
 main_body:
   %trunc = trunc i32 %v1 to i16
   call void @llvm.amdgcn.raw.buffer.store.i16(i16 %trunc, <4 x i32> %rsrc, i32 0, i32 0, i32 0)
@@ -446,6 +734,17 @@ define amdgpu_ps void @buffer_store_v2i16(<4 x i32> inreg %rsrc, <2 x i16> %data
 ; GFX11:       ; %bb.0: ; %main_body
 ; GFX11-NEXT:    buffer_store_b32 v0, v1, s[0:3], 0 offen
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX1200-LABEL: buffer_store_v2i16:
+; GFX1200:       ; %bb.0: ; %main_body
+; GFX1200-NEXT:    buffer_store_b32 v0, v1, s[0:3], null offen
+; GFX1200-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: buffer_store_v2i16:
+; GFX1250:       ; %bb.0: ; %main_body
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-NEXT:    buffer_store_b32 v0, v1, s[0:3], null offen
+; GFX1250-NEXT:    s_endpgm
 main_body:
   call void @llvm.amdgcn.raw.buffer.store.v2i16(<2 x i16> %data, <4 x i32> %rsrc, i32 %offset, i32 0, i32 0)
   ret void
@@ -472,6 +771,17 @@ define amdgpu_ps void @buffer_store_v4i16(<4 x i32> inreg %rsrc, <4 x i16> %data
 ; GFX11:       ; %bb.0: ; %main_body
 ; GFX11-NEXT:    buffer_store_b64 v[0:1], v2, s[0:3], 0 offen
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX1200-LABEL: buffer_store_v4i16:
+; GFX1200:       ; %bb.0: ; %main_body
+; GFX1200-NEXT:    buffer_store_b64 v[0:1], v2, s[0:3], null offen
+; GFX1200-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: buffer_store_v4i16:
+; GFX1250:       ; %bb.0: ; %main_body
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-NEXT:    buffer_store_b64 v[0:1], v2, s[0:3], null offen
+; GFX1250-NEXT:    s_endpgm
 main_body:
   call void @llvm.amdgcn.raw.buffer.store.v4i16(<4 x i16> %data, <4 x i32> %rsrc, i32 %offset, i32 0, i32 0)
   ret void
@@ -490,6 +800,21 @@ define amdgpu_ps void @raw_buffer_store_x1_offset_merged(<4 x i32> inreg %rsrc,
 ; GFX11-NEXT:    buffer_store_b128 v[0:3], off, s[0:3], 0 offset:4
 ; GFX11-NEXT:    buffer_store_b64 v[4:5], off, s[0:3], 0 offset:28
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX1200-LABEL: raw_buffer_store_x1_offset_merged:
+; GFX1200:       ; %bb.0:
+; GFX1200-NEXT:    s_clause 0x1
+; GFX1200-NEXT:    buffer_store_b128 v[0:3], off, s[0:3], null offset:4
+; GFX1200-NEXT:    buffer_store_b64 v[4:5], off, s[0:3], null offset:28
+; GFX1200-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: raw_buffer_store_x1_offset_merged:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-NEXT:    s_clause 0x1
+; GFX1250-NEXT:    buffer_store_b128 v[0:3], off, s[0:3], null offset:4
+; GFX1250-NEXT:    buffer_store_b64 v[4:5], off, s[0:3], null offset:28
+; GFX1250-NEXT:    s_endpgm
   call void @llvm.amdgcn.raw.buffer.store.f32(float %v1, <4 x i32> %rsrc, i32 4, i32 0, i32 0)
   call void @llvm.amdgcn.raw.buffer.store.f32(float %v2, <4 x i32> %rsrc, i32 8, i32 0, i32 0)
   call void @llvm.amdgcn.raw.buffer.store.f32(float %v3, <4 x i32> %rsrc, i32 12, i32 0, i32 0)
@@ -520,6 +845,21 @@ define amdgpu_ps void @raw_buffer_store_x1_offset_swizzled_not_merged_pregfx12(<
 ; GFX11-NEXT:    buffer_store_b32 v4, off, s[0:3], 0 offset:28
 ; GFX11-NEXT:    buffer_store_b32 v5, off, s[0:3], 0 offset:32
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX1200-LABEL: raw_buffer_store_x1_offset_swizzled_not_merged_pregfx12:
+; GFX1200:       ; %bb.0:
+; GFX1200-NEXT:    s_clause 0x1
+; GFX1200-NEXT:    buffer_store_b128 v[0:3], off, s[0:3], null offset:4 scope:SCOPE_SE
+; GFX1200-NEXT:    buffer_store_b64 v[4:5], off, s[0:3], null offset:28 scope:SCOPE_SE
+; GFX1200-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: raw_buffer_store_x1_offset_swizzled_not_merged_pregfx12:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-NEXT:    s_clause 0x1
+; GFX1250-NEXT:    buffer_store_b128 v[0:3], off, s[0:3], null offset:4 scope:SCOPE_SE
+; GFX1250-NEXT:    buffer_store_b64 v[4:5], off, s[0:3], null offset:28 scope:SCOPE_SE
+; GFX1250-NEXT:    s_endpgm
   call void @llvm.amdgcn.raw.buffer.store.f32(float %v1, <4 x i32> %rsrc, i32 4, i32 0, i32 8)
   call void @llvm.amdgcn.raw.buffer.store.f32(float %v2, <4 x i32> %rsrc, i32 8, i32 0, i32 8)
   call void @llvm.amdgcn.raw.buffer.store.f32(float %v3, <4 x i32> %rsrc, i32 12, i32 0, i32 8)
@@ -530,16 +870,41 @@ define amdgpu_ps void @raw_buffer_store_x1_offset_swizzled_not_merged_pregfx12(<
 }
 
 define amdgpu_ps void @raw_buffer_store_x1_offset_swizzled_not_merged(<4 x i32> inreg %rsrc, float %v1, float %v2, float %v3, float %v4, float %v5, float %v6) {
-; GFX12-LABEL: raw_buffer_store_x1_offset_swizzled_not_merged:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    s_clause 0x5
-; GFX12-NEXT:    buffer_store_b32 v0, off, s[0:3], null offset:4
-; GFX12-NEXT:    buffer_store_b32 v1, off, s[0:3], null offset:8
-; GFX12-NEXT:    buffer_store_b32 v2, off, s[0:3], null offset:12
-; GFX12-NEXT:    buffer_store_b32 v3, off, s[0:3], null offset:16
-; GFX12-NEXT:    buffer_store_b32 v4, off, s[0:3], null offset:28
-; GFX12-NEXT:    buffer_store_b32 v5, off, s[0:3], null offset:32
-; GFX12-NEXT:    s_endpgm
+; GFX68-LABEL: raw_buffer_store_x1_offset_swizzled_not_merged:
+; GFX68:       ; %bb.0:
+; GFX68-NEXT:    buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:4
+; GFX68-NEXT:    buffer_store_dwordx2 v[4:5], off, s[0:3], 0 offset:28
+; GFX68-NEXT:    s_endpgm
+;
+; GFX11-LABEL: raw_buffer_store_x1_offset_swizzled_not_merged:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_clause 0x1
+; GFX11-NEXT:    buffer_store_b128 v[0:3], off, s[0:3], 0 offset:4
+; GFX11-NEXT:    buffer_store_b64 v[4:5], off, s[0:3], 0 offset:28
+; GFX11-NEXT:    s_endpgm
+;
+; GFX1200-LABEL: raw_buffer_store_x1_offset_swizzled_not_merged:
+; GFX1200:       ; %bb.0:
+; GFX1200-NEXT:    s_clause 0x5
+; GFX1200-NEXT:    buffer_store_b32 v0, off, s[0:3], null offset:4
+; GFX1200-NEXT:    buffer_store_b32 v1, off, s[0:3], null offset:8
+; GFX1200-NEXT:    buffer_store_b32 v2, off, s[0:3], null offset:12
+; GFX1200-NEXT:    buffer_store_b32 v3, off, s[0:3], null offset:16
+; GFX1200-NEXT:    buffer_store_b32 v4, off, s[0:3], null offset:28
+; GFX1200-NEXT:    buffer_store_b32 v5, off, s[0:3], null offset:32
+; GFX1200-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: raw_buffer_store_x1_offset_swizzled_not_merged:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-NEXT:    s_clause 0x5
+; GFX1250-NEXT:    buffer_store_b32 v0, off, s[0:3], null offset:4
+; GFX1250-NEXT:    buffer_store_b32 v1, off, s[0:3], null offset:8
+; GFX1250-NEXT:    buffer_store_b32 v2, off, s[0:3], null offset:12
+; GFX1250-NEXT:    buffer_store_b32 v3, off, s[0:3], null offset:16
+; GFX1250-NEXT:    buffer_store_b32 v4, off, s[0:3], null offset:28
+; GFX1250-NEXT:    buffer_store_b32 v5, off, s[0:3], null offset:32
+; GFX1250-NEXT:    s_endpgm
   call void @llvm.amdgcn.raw.buffer.store.f32(float %v1, <4 x i32> %rsrc, i32 4, i32 0, i32 64)
   call void @llvm.amdgcn.raw.buffer.store.f32(float %v2, <4 x i32> %rsrc, i32 8, i32 0, i32 64)
   call void @llvm.amdgcn.raw.buffer.store.f32(float %v3, <4 x i32> %rsrc, i32 12, i32 0, i32 64)

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.atomic.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.atomic.buffer.load.ll
index 7e8f490e6a7f9..1f8e30152f09d 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.atomic.buffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.atomic.buffer.load.ll
@@ -31,6 +31,7 @@ define amdgpu_kernel void @raw_ptr_atomic_buffer_ptr_load_i32(ptr addrspace(8) %
 ;
 ; GFX12-LABEL: raw_ptr_atomic_buffer_ptr_load_i32:
 ; GFX12:       ; %bb.0: ; %bb
+; GFX12-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX12-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX12-NEXT:    s_wait_xcnt 0x0
@@ -79,6 +80,7 @@ define amdgpu_kernel void @raw_ptr_atomic_buffer_load_i32_off(ptr addrspace(8) %
 ;
 ; GFX12-LABEL: raw_ptr_atomic_buffer_load_i32_off:
 ; GFX12:       ; %bb.0: ; %bb
+; GFX12-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX12-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX12-NEXT:    s_wait_xcnt 0x0
@@ -126,6 +128,7 @@ define amdgpu_kernel void @raw_ptr_atomic_buffer_load_i32_soff(ptr addrspace(8)
 ;
 ; GFX12-LABEL: raw_ptr_atomic_buffer_load_i32_soff:
 ; GFX12:       ; %bb.0: ; %bb
+; GFX12-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX12-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX12-NEXT:    s_wait_xcnt 0x0
@@ -174,6 +177,7 @@ define amdgpu_kernel void @raw_ptr_atomic_buffer_load_i32_dlc(ptr addrspace(8) %
 ;
 ; GFX12-LABEL: raw_ptr_atomic_buffer_load_i32_dlc:
 ; GFX12:       ; %bb.0: ; %bb
+; GFX12-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX12-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX12-NEXT:    s_wait_xcnt 0x0
@@ -223,6 +227,7 @@ define amdgpu_kernel void @raw_nonptr_atomic_buffer_load_i32(ptr addrspace(8) %p
 ;
 ; GFX12-LABEL: raw_nonptr_atomic_buffer_load_i32:
 ; GFX12:       ; %bb.0: ; %bb
+; GFX12-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX12-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX12-NEXT:    s_wait_kmcnt 0x0
@@ -272,6 +277,7 @@ define amdgpu_kernel void @raw_ptr_atomic_buffer_load_i64(ptr addrspace(8) %ptr)
 ;
 ; GFX12-SDAG-TRUE16-LABEL: raw_ptr_atomic_buffer_load_i64:
 ; GFX12-SDAG-TRUE16:       ; %bb.0: ; %bb
+; GFX12-SDAG-TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX12-SDAG-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX12-SDAG-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX12-SDAG-TRUE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
@@ -292,6 +298,7 @@ define amdgpu_kernel void @raw_ptr_atomic_buffer_load_i64(ptr addrspace(8) %ptr)
 ;
 ; GFX12-GISEL-TRUE16-LABEL: raw_ptr_atomic_buffer_load_i64:
 ; GFX12-GISEL-TRUE16:       ; %bb.0: ; %bb
+; GFX12-GISEL-TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX12-GISEL-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX12-GISEL-TRUE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX12-GISEL-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
@@ -342,6 +349,7 @@ define amdgpu_kernel void @raw_ptr_atomic_buffer_load_v2i16(ptr addrspace(8) %pt
 ;
 ; GFX12-LABEL: raw_ptr_atomic_buffer_load_v2i16:
 ; GFX12:       ; %bb.0: ; %bb
+; GFX12-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX12-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX12-NEXT:    s_wait_xcnt 0x0
@@ -453,6 +461,7 @@ define amdgpu_kernel void @raw_ptr_atomic_buffer_load_v4i16(ptr addrspace(8) %pt
 ;
 ; GFX12-SDAG-TRUE16-LABEL: raw_ptr_atomic_buffer_load_v4i16:
 ; GFX12-SDAG-TRUE16:       ; %bb.0: ; %bb
+; GFX12-SDAG-TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX12-SDAG-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX12-SDAG-TRUE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX12-SDAG-TRUE16-NEXT:    s_wait_xcnt 0x0
@@ -473,6 +482,7 @@ define amdgpu_kernel void @raw_ptr_atomic_buffer_load_v4i16(ptr addrspace(8) %pt
 ;
 ; GFX12-FAKE16-LABEL: raw_ptr_atomic_buffer_load_v4i16:
 ; GFX12-FAKE16:       ; %bb.0: ; %bb
+; GFX12-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX12-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX12-FAKE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX12-FAKE16-NEXT:    s_wait_xcnt 0x0
@@ -495,6 +505,7 @@ define amdgpu_kernel void @raw_ptr_atomic_buffer_load_v4i16(ptr addrspace(8) %pt
 ;
 ; GFX12-GISEL-TRUE16-LABEL: raw_ptr_atomic_buffer_load_v4i16:
 ; GFX12-GISEL-TRUE16:       ; %bb.0: ; %bb
+; GFX12-GISEL-TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX12-GISEL-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX12-GISEL-TRUE16-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX12-GISEL-TRUE16-NEXT:    s_wait_xcnt 0x0
@@ -546,6 +557,7 @@ define amdgpu_kernel void @raw_ptr_atomic_buffer_load_v4i32(ptr addrspace(8) %pt
 ;
 ; GFX12-LABEL: raw_ptr_atomic_buffer_load_v4i32:
 ; GFX12:       ; %bb.0: ; %bb
+; GFX12-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX12-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX12-NEXT:    s_wait_xcnt 0x0
@@ -597,6 +609,7 @@ define amdgpu_kernel void @raw_ptr_atomic_buffer_load_ptr(ptr addrspace(8) %ptr)
 ;
 ; GFX12-LABEL: raw_ptr_atomic_buffer_load_ptr:
 ; GFX12:       ; %bb.0: ; %bb
+; GFX12-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX12-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX12-NEXT:    s_wait_xcnt 0x0

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.bf16.ll
index ec7d7d467ffc6..5894073ea47e3 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.bf16.ll
@@ -36,6 +36,7 @@ define amdgpu_ps void @buffer_store_bf16(ptr addrspace(8) inreg %rsrc, bfloat %d
 ;
 ; GFX12-LABEL: buffer_store_bf16:
 ; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX12-NEXT:    buffer_store_b16 v0, v1, s[0:3], null offen
 ; GFX12-NEXT:    s_endpgm
   call void @llvm.amdgcn.raw.ptr.buffer.store.bf16(bfloat %data, ptr addrspace(8) %rsrc, i32 %offset, i32 0, i32 0)
@@ -74,6 +75,7 @@ define amdgpu_ps void @buffer_store_v2bf16(ptr addrspace(8) inreg %rsrc, <2 x bf
 ;
 ; GFX12-LABEL: buffer_store_v2bf16:
 ; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX12-NEXT:    buffer_store_b32 v0, v1, s[0:3], null offen
 ; GFX12-NEXT:    s_endpgm
   call void @llvm.amdgcn.raw.ptr.buffer.store.v2bf16(<2 x bfloat> %data, ptr addrspace(8) %rsrc, i32 %offset, i32 0, i32 0)
@@ -116,6 +118,7 @@ define amdgpu_ps void @buffer_store_v4bf16(ptr addrspace(8) inreg %rsrc, <4 x bf
 ;
 ; GFX12-LABEL: buffer_store_v4bf16:
 ; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX12-NEXT:    buffer_store_b64 v[0:1], v2, s[0:3], null offen
 ; GFX12-NEXT:    s_endpgm
   call void @llvm.amdgcn.raw.ptr.buffer.store.v4bf16(<4 x bfloat> %data, ptr addrspace(8) %rsrc, i32 %offset, i32 0, i32 0)
@@ -172,6 +175,7 @@ define amdgpu_ps void @buffer_store_v8bf16(ptr addrspace(8) inreg %rsrc, <8 x bf
 ;
 ; GFX12-LABEL: buffer_store_v8bf16:
 ; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX12-NEXT:    buffer_store_b128 v[0:3], v4, s[0:3], null offen
 ; GFX12-NEXT:    s_endpgm
   call void @llvm.amdgcn.raw.ptr.buffer.store.v8bf16(<8 x bfloat> %data, ptr addrspace(8) %rsrc, i32 %offset, i32 0, i32 0)

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.bf16.ll
index dbea8325d89f6..24dd47a978a38 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.bf16.ll
@@ -11,6 +11,7 @@ declare bfloat @llvm.amdgcn.rcp.bf16(bfloat) #0
 define amdgpu_kernel void @rcp_bf16(ptr addrspace(1) %out, bfloat %src) #1 {
 ; SDAG-TRUE16-LABEL: rcp_bf16:
 ; SDAG-TRUE16:       ; %bb.0:
+; SDAG-TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; SDAG-TRUE16-NEXT:    s_load_b96 s[0:2], s[4:5], 0x0
 ; SDAG-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
 ; SDAG-TRUE16-NEXT:    s_wait_kmcnt 0x0
@@ -20,6 +21,7 @@ define amdgpu_kernel void @rcp_bf16(ptr addrspace(1) %out, bfloat %src) #1 {
 ;
 ; SDAG-FAKE16-LABEL: rcp_bf16:
 ; SDAG-FAKE16:       ; %bb.0:
+; SDAG-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; SDAG-FAKE16-NEXT:    s_load_b96 s[0:2], s[4:5], 0x0
 ; SDAG-FAKE16-NEXT:    v_mov_b32_e32 v1, 0
 ; SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
@@ -34,6 +36,7 @@ define amdgpu_kernel void @rcp_bf16(ptr addrspace(1) %out, bfloat %src) #1 {
 define amdgpu_kernel void @rcp_bf16_constant_4(ptr addrspace(1) %out) #1 {
 ; SDAG-TRUE16-LABEL: rcp_bf16_constant_4:
 ; SDAG-TRUE16:       ; %bb.0:
+; SDAG-TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; SDAG-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
 ; SDAG-TRUE16-NEXT:    v_mov_b16_e32 v0.l, 0x3e80
@@ -43,6 +46,7 @@ define amdgpu_kernel void @rcp_bf16_constant_4(ptr addrspace(1) %out) #1 {
 ;
 ; SDAG-FAKE16-LABEL: rcp_bf16_constant_4:
 ; SDAG-FAKE16:       ; %bb.0:
+; SDAG-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; SDAG-FAKE16-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3e80
 ; SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
@@ -56,6 +60,7 @@ define amdgpu_kernel void @rcp_bf16_constant_4(ptr addrspace(1) %out) #1 {
 define amdgpu_kernel void @rcp_bf16_constant_100(ptr addrspace(1) %out) #1 {
 ; SDAG-TRUE16-LABEL: rcp_bf16_constant_100:
 ; SDAG-TRUE16:       ; %bb.0:
+; SDAG-TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; SDAG-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
 ; SDAG-TRUE16-NEXT:    v_mov_b16_e32 v0.l, 0x3c24
@@ -65,6 +70,7 @@ define amdgpu_kernel void @rcp_bf16_constant_100(ptr addrspace(1) %out) #1 {
 ;
 ; SDAG-FAKE16-LABEL: rcp_bf16_constant_100:
 ; SDAG-FAKE16:       ; %bb.0:
+; SDAG-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; SDAG-FAKE16-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x3c24
 ; SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
@@ -78,6 +84,7 @@ define amdgpu_kernel void @rcp_bf16_constant_100(ptr addrspace(1) %out) #1 {
 define amdgpu_kernel void @rcp_undef_bf16(ptr addrspace(1) %out) #1 {
 ; SDAG-TRUE16-LABEL: rcp_undef_bf16:
 ; SDAG-TRUE16:       ; %bb.0:
+; SDAG-TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; SDAG-TRUE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; SDAG-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
 ; SDAG-TRUE16-NEXT:    v_mov_b16_e32 v0.l, 0x7fc0
@@ -87,6 +94,7 @@ define amdgpu_kernel void @rcp_undef_bf16(ptr addrspace(1) %out) #1 {
 ;
 ; SDAG-FAKE16-LABEL: rcp_undef_bf16:
 ; SDAG-FAKE16:       ; %bb.0:
+; SDAG-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; SDAG-FAKE16-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x7fc0
 ; SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.bf16.ll
index 662dc61301032..7f98d96737f21 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.bf16.ll
@@ -11,6 +11,7 @@ declare bfloat @llvm.amdgcn.rsq.bf16(bfloat) #0
 define amdgpu_kernel void @rsq_bf16(ptr addrspace(1) %out, bfloat %src) #1 {
 ; SDAG-REAL16-LABEL: rsq_bf16:
 ; SDAG-REAL16:       ; %bb.0:
+; SDAG-REAL16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; SDAG-REAL16-NEXT:    s_load_b96 s[0:2], s[4:5], 0x0
 ; SDAG-REAL16-NEXT:    v_mov_b32_e32 v1, 0
 ; SDAG-REAL16-NEXT:    s_wait_kmcnt 0x0
@@ -20,6 +21,7 @@ define amdgpu_kernel void @rsq_bf16(ptr addrspace(1) %out, bfloat %src) #1 {
 ;
 ; SDAG-FAKE16-LABEL: rsq_bf16:
 ; SDAG-FAKE16:       ; %bb.0:
+; SDAG-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; SDAG-FAKE16-NEXT:    s_load_b96 s[0:2], s[4:5], 0x0
 ; SDAG-FAKE16-NEXT:    v_mov_b32_e32 v1, 0
 ; SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
@@ -34,6 +36,7 @@ define amdgpu_kernel void @rsq_bf16(ptr addrspace(1) %out, bfloat %src) #1 {
 define amdgpu_kernel void @rsq_bf16_constant_4(ptr addrspace(1) %out) #1 {
 ; SDAG-REAL16-LABEL: rsq_bf16_constant_4:
 ; SDAG-REAL16:       ; %bb.0:
+; SDAG-REAL16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; SDAG-REAL16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; SDAG-REAL16-NEXT:    v_rsq_bf16_e32 v0.l, 4.0
 ; SDAG-REAL16-NEXT:    v_mov_b32_e32 v1, 0
@@ -43,6 +46,7 @@ define amdgpu_kernel void @rsq_bf16_constant_4(ptr addrspace(1) %out) #1 {
 ;
 ; SDAG-FAKE16-LABEL: rsq_bf16_constant_4:
 ; SDAG-FAKE16:       ; %bb.0:
+; SDAG-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; SDAG-FAKE16-NEXT:    v_rsq_bf16_e32 v0, 4.0
 ; SDAG-FAKE16-NEXT:    v_mov_b32_e32 v1, 0
@@ -57,6 +61,7 @@ define amdgpu_kernel void @rsq_bf16_constant_4(ptr addrspace(1) %out) #1 {
 define amdgpu_kernel void @rsq_bf16_constant_100(ptr addrspace(1) %out) #1 {
 ; SDAG-REAL16-LABEL: rsq_bf16_constant_100:
 ; SDAG-REAL16:       ; %bb.0:
+; SDAG-REAL16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; SDAG-REAL16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; SDAG-REAL16-NEXT:    v_rsq_bf16_e32 v0.l, 0x42c8
 ; SDAG-REAL16-NEXT:    v_mov_b32_e32 v1, 0
@@ -66,6 +71,7 @@ define amdgpu_kernel void @rsq_bf16_constant_100(ptr addrspace(1) %out) #1 {
 ;
 ; SDAG-FAKE16-LABEL: rsq_bf16_constant_100:
 ; SDAG-FAKE16:       ; %bb.0:
+; SDAG-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; SDAG-FAKE16-NEXT:    v_rsq_bf16_e32 v0, 0x42c8
 ; SDAG-FAKE16-NEXT:    v_mov_b32_e32 v1, 0
@@ -80,10 +86,12 @@ define amdgpu_kernel void @rsq_bf16_constant_100(ptr addrspace(1) %out) #1 {
 define amdgpu_kernel void @rsq_undef_bf16(ptr addrspace(1) %out) #1 {
 ; SDAG-REAL16-LABEL: rsq_undef_bf16:
 ; SDAG-REAL16:       ; %bb.0:
+; SDAG-REAL16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; SDAG-REAL16-NEXT:    s_endpgm
 ;
 ; SDAG-FAKE16-LABEL: rsq_undef_bf16:
 ; SDAG-FAKE16:       ; %bb.0:
+; SDAG-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; SDAG-FAKE16-NEXT:    s_endpgm
   %rsq = call bfloat @llvm.amdgcn.rsq.bf16(bfloat undef)
   store bfloat %rsq, ptr addrspace(1) %out, align 2

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.wait.gfx1250.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.wait.gfx1250.ll
index 2173d07baa57e..b0574bf021014 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.wait.gfx1250.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.wait.gfx1250.ll
@@ -5,6 +5,7 @@
 define amdgpu_ps void @test_asynccnt() {
 ; GFX12-LABEL: test_asynccnt:
 ; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX12-NEXT:    s_wait_asynccnt 0x0
 ; GFX12-NEXT:    s_endpgm
   call void @llvm.amdgcn.s.wait.asynccnt(i16 0)
@@ -14,6 +15,7 @@ define amdgpu_ps void @test_asynccnt() {
 define amdgpu_ps void @test_tensorcnt() {
 ; GFX12-LABEL: test_tensorcnt:
 ; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX12-NEXT:    s_wait_tensorcnt 0x0
 ; GFX12-NEXT:    s_endpgm
   call void @llvm.amdgcn.s.wait.tensorcnt(i16 0)

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sendmsg.rtn.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sendmsg.rtn.ll
index 94210640db14e..4905c6d8aa81b 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sendmsg.rtn.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sendmsg.rtn.ll
@@ -25,6 +25,7 @@ define amdgpu_kernel void @test_get_doorbell(ptr addrspace(1) %out) {
 ;
 ; GFX1250-SDAG-LABEL: test_get_doorbell:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1250-SDAG-NEXT:    s_sendmsg_rtn_b32 s2, sendmsg(MSG_RTN_GET_DOORBELL)
 ; GFX1250-SDAG-NEXT:    s_wait_kmcnt 0x0
@@ -34,6 +35,7 @@ define amdgpu_kernel void @test_get_doorbell(ptr addrspace(1) %out) {
 ;
 ; GFX1250-GISEL-LABEL: test_get_doorbell:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1250-GISEL-NEXT:    s_sendmsg_rtn_b32 s2, sendmsg(MSG_RTN_GET_DOORBELL)
 ; GFX1250-GISEL-NEXT:    s_wait_kmcnt 0x0
@@ -66,6 +68,7 @@ define amdgpu_kernel void @test_get_ddid(ptr addrspace(1) %out) {
 ;
 ; GFX1250-SDAG-LABEL: test_get_ddid:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1250-SDAG-NEXT:    s_sendmsg_rtn_b32 s2, sendmsg(MSG_RTN_GET_DDID)
 ; GFX1250-SDAG-NEXT:    s_wait_kmcnt 0x0
@@ -75,6 +78,7 @@ define amdgpu_kernel void @test_get_ddid(ptr addrspace(1) %out) {
 ;
 ; GFX1250-GISEL-LABEL: test_get_ddid:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1250-GISEL-NEXT:    s_sendmsg_rtn_b32 s2, sendmsg(MSG_RTN_GET_DDID)
 ; GFX1250-GISEL-NEXT:    s_wait_kmcnt 0x0
@@ -99,6 +103,7 @@ define amdgpu_kernel void @test_get_tma(ptr addrspace(1) %out) {
 ;
 ; GFX1250-LABEL: test_get_tma:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1250-NEXT:    s_sendmsg_rtn_b64 s[2:3], sendmsg(MSG_RTN_GET_TMA)
 ; GFX1250-NEXT:    v_mov_b32_e32 v2, 0
@@ -124,6 +129,7 @@ define amdgpu_kernel void @test_get_realtime(ptr addrspace(1) %out) {
 ;
 ; GFX1250-LABEL: test_get_realtime:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1250-NEXT:    s_sendmsg_rtn_b64 s[2:3], sendmsg(MSG_RTN_GET_REALTIME)
 ; GFX1250-NEXT:    v_mov_b32_e32 v2, 0
@@ -157,6 +163,7 @@ define amdgpu_kernel void @test_savewave(ptr addrspace(1) %out) {
 ;
 ; GFX1250-SDAG-LABEL: test_savewave:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1250-SDAG-NEXT:    s_sendmsg_rtn_b32 s2, sendmsg(MSG_RTN_SAVE_WAVE)
 ; GFX1250-SDAG-NEXT:    s_wait_kmcnt 0x0
@@ -166,6 +173,7 @@ define amdgpu_kernel void @test_savewave(ptr addrspace(1) %out) {
 ;
 ; GFX1250-GISEL-LABEL: test_savewave:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1250-GISEL-NEXT:    s_sendmsg_rtn_b32 s2, sendmsg(MSG_RTN_SAVE_WAVE)
 ; GFX1250-GISEL-NEXT:    s_wait_kmcnt 0x0
@@ -190,6 +198,7 @@ define amdgpu_kernel void @test_get_tba(ptr addrspace(1) %out) {
 ;
 ; GFX1250-LABEL: test_get_tba:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1250-NEXT:    s_sendmsg_rtn_b64 s[2:3], sendmsg(MSG_RTN_GET_TBA)
 ; GFX1250-NEXT:    v_mov_b32_e32 v2, 0
@@ -223,6 +232,7 @@ define amdgpu_kernel void @test_get_0_i32(ptr addrspace(1) %out) {
 ;
 ; GFX1250-SDAG-LABEL: test_get_0_i32:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1250-SDAG-NEXT:    s_sendmsg_rtn_b32 s2, sendmsg(0, 0, 0)
 ; GFX1250-SDAG-NEXT:    s_wait_kmcnt 0x0
@@ -232,6 +242,7 @@ define amdgpu_kernel void @test_get_0_i32(ptr addrspace(1) %out) {
 ;
 ; GFX1250-GISEL-LABEL: test_get_0_i32:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1250-GISEL-NEXT:    s_sendmsg_rtn_b32 s2, sendmsg(0, 0, 0)
 ; GFX1250-GISEL-NEXT:    s_wait_kmcnt 0x0
@@ -256,6 +267,7 @@ define amdgpu_kernel void @test_get_99999_i64(ptr addrspace(1) %out) {
 ;
 ; GFX1250-LABEL: test_get_99999_i64:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1250-NEXT:    s_sendmsg_rtn_b64 s[2:3], 99999
 ; GFX1250-NEXT:    v_mov_b32_e32 v2, 0
@@ -281,6 +293,7 @@ define amdgpu_kernel void @test_get_136_i64(ptr addrspace(1) %out) {
 ;
 ; GFX1250-LABEL: test_get_136_i64:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1250-NEXT:    s_sendmsg_rtn_b64 s[2:3], sendmsg(MSG_RTN_GET_CLUSTER_BARRIER_STATE)
 ; GFX1250-NEXT:    v_mov_b32_e32 v2, 0

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.atomic.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.atomic.buffer.load.ll
index 1070e95cda783..47eafd53a9bd3 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.atomic.buffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.atomic.buffer.load.ll
@@ -34,6 +34,7 @@ define amdgpu_kernel void @struct_atomic_buffer_load_i32(<4 x i32> %addr, i32 %i
 ;
 ; GFX12-LABEL: struct_atomic_buffer_load_i32:
 ; GFX12:       ; %bb.0: ; %bb
+; GFX12-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    s_load_b32 s6, s[4:5], 0x34
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
@@ -85,6 +86,7 @@ define amdgpu_kernel void @struct_atomic_buffer_load_i32_const_idx(<4 x i32> %ad
 ;
 ; GFX12-LABEL: struct_atomic_buffer_load_i32_const_idx:
 ; GFX12:       ; %bb.0: ; %bb
+; GFX12-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX12-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX12-NEXT:    v_mov_b32_e32 v1, 15
@@ -137,6 +139,7 @@ define amdgpu_kernel void @struct_atomic_buffer_load_i32_off(<4 x i32> %addr, i3
 ;
 ; GFX12-LABEL: struct_atomic_buffer_load_i32_off:
 ; GFX12:       ; %bb.0: ; %bb
+; GFX12-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    s_load_b32 s6, s[4:5], 0x34
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
@@ -191,6 +194,7 @@ define amdgpu_kernel void @struct_atomic_buffer_load_i32_soff(<4 x i32> %addr, i
 ;
 ; GFX12-LABEL: struct_atomic_buffer_load_i32_soff:
 ; GFX12:       ; %bb.0: ; %bb
+; GFX12-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    s_load_b32 s6, s[4:5], 0x34
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
@@ -245,6 +249,7 @@ define amdgpu_kernel void @struct_atomic_buffer_load_i32_dlc(<4 x i32> %addr, i3
 ;
 ; GFX12-LABEL: struct_atomic_buffer_load_i32_dlc:
 ; GFX12:       ; %bb.0: ; %bb
+; GFX12-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    s_load_b32 s6, s[4:5], 0x34
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
@@ -299,6 +304,7 @@ define amdgpu_kernel void @struct_nonatomic_buffer_load_i32(<4 x i32> %addr, i32
 ;
 ; GFX12-LABEL: struct_nonatomic_buffer_load_i32:
 ; GFX12:       ; %bb.0: ; %bb
+; GFX12-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    s_load_b32 s6, s[4:5], 0x34
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
@@ -354,6 +360,7 @@ define amdgpu_kernel void @struct_atomic_buffer_load_i64(<4 x i32> %addr, i32 %i
 ;
 ; GFX12-SDAG-TRUE16-LABEL: struct_atomic_buffer_load_i64:
 ; GFX12-SDAG-TRUE16:       ; %bb.0: ; %bb
+; GFX12-SDAG-TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX12-SDAG-TRUE16-NEXT:    s_clause 0x1
 ; GFX12-SDAG-TRUE16-NEXT:    s_load_b32 s6, s[4:5], 0x34
 ; GFX12-SDAG-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
@@ -377,6 +384,7 @@ define amdgpu_kernel void @struct_atomic_buffer_load_i64(<4 x i32> %addr, i32 %i
 ;
 ; GFX12-GISEL-TRUE16-LABEL: struct_atomic_buffer_load_i64:
 ; GFX12-GISEL-TRUE16:       ; %bb.0: ; %bb
+; GFX12-GISEL-TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX12-GISEL-TRUE16-NEXT:    s_clause 0x1
 ; GFX12-GISEL-TRUE16-NEXT:    s_load_b32 s6, s[4:5], 0x34
 ; GFX12-GISEL-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
@@ -433,6 +441,7 @@ define amdgpu_kernel void @struct_atomic_buffer_load_v2i16(<4 x i32> %addr, i32
 ;
 ; GFX12-LABEL: struct_atomic_buffer_load_v2i16:
 ; GFX12:       ; %bb.0: ; %bb
+; GFX12-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    s_load_b32 s6, s[4:5], 0x34
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
@@ -559,6 +568,7 @@ define amdgpu_kernel void @struct_atomic_buffer_load_v4i16(<4 x i32> %addr, i32
 ;
 ; GFX12-SDAG-TRUE16-LABEL: struct_atomic_buffer_load_v4i16:
 ; GFX12-SDAG-TRUE16:       ; %bb.0: ; %bb
+; GFX12-SDAG-TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX12-SDAG-TRUE16-NEXT:    s_clause 0x1
 ; GFX12-SDAG-TRUE16-NEXT:    s_load_b32 s6, s[4:5], 0x34
 ; GFX12-SDAG-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
@@ -582,6 +592,7 @@ define amdgpu_kernel void @struct_atomic_buffer_load_v4i16(<4 x i32> %addr, i32
 ;
 ; GFX12-FAKE16-LABEL: struct_atomic_buffer_load_v4i16:
 ; GFX12-FAKE16:       ; %bb.0: ; %bb
+; GFX12-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX12-FAKE16-NEXT:    s_clause 0x1
 ; GFX12-FAKE16-NEXT:    s_load_b32 s6, s[4:5], 0x34
 ; GFX12-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
@@ -607,6 +618,7 @@ define amdgpu_kernel void @struct_atomic_buffer_load_v4i16(<4 x i32> %addr, i32
 ;
 ; GFX12-GISEL-TRUE16-LABEL: struct_atomic_buffer_load_v4i16:
 ; GFX12-GISEL-TRUE16:       ; %bb.0: ; %bb
+; GFX12-GISEL-TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX12-GISEL-TRUE16-NEXT:    s_clause 0x1
 ; GFX12-GISEL-TRUE16-NEXT:    s_load_b32 s6, s[4:5], 0x34
 ; GFX12-GISEL-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
@@ -664,6 +676,7 @@ define amdgpu_kernel void @struct_atomic_buffer_load_v4i32(<4 x i32> %addr, i32
 ;
 ; GFX12-LABEL: struct_atomic_buffer_load_v4i32:
 ; GFX12:       ; %bb.0: ; %bb
+; GFX12-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    s_load_b32 s6, s[4:5], 0x34
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
@@ -721,6 +734,7 @@ define amdgpu_kernel void @struct_atomic_buffer_load_ptr(<4 x i32> %addr, i32 %i
 ;
 ; GFX12-LABEL: struct_atomic_buffer_load_ptr:
 ; GFX12:       ; %bb.0: ; %bb
+; GFX12-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    s_load_b32 s6, s[4:5], 0x34
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.tfe.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.tfe.ll
index 9abbc064803da..71e51c4322056 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.tfe.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.tfe.ll
@@ -5,8 +5,8 @@
 ; RUN: llc -mcpu=gfx900 -mtriple=amdgcn-- < %s | FileCheck %s -check-prefixes=GFX910,GFX9
 ; RUN: llc -mcpu=gfx1010 -mtriple=amdgcn-- < %s | FileCheck %s -check-prefixes=GFX910,GFX10
 ; RUN: llc -mcpu=gfx1100 -mtriple=amdgcn-- < %s | FileCheck %s -check-prefix=GFX11
-; RUN: llc -mcpu=gfx1200 -mtriple=amdgcn-- < %s | FileCheck %s -check-prefix=GFX12
-; RUN: llc -mcpu=gfx1250 -mtriple=amdgcn-- < %s | FileCheck %s -check-prefix=GFX12
+; RUN: llc -mcpu=gfx1200 -mtriple=amdgcn-- < %s | FileCheck %s -check-prefix=GFX1200
+; RUN: llc -mcpu=gfx1250 -mtriple=amdgcn-- < %s | FileCheck %s -check-prefix=GFX1250
 
 define amdgpu_ps void @struct_buffer_load_i8_tfe(<4 x i32> inreg %rsrc, ptr addrspace(1) %data_addr, ptr addrspace(1) %tfe_addr) {
 ; GFX67-LABEL: struct_buffer_load_i8_tfe:
@@ -54,16 +54,28 @@ define amdgpu_ps void @struct_buffer_load_i8_tfe(<4 x i32> inreg %rsrc, ptr addr
 ; GFX11-NEXT:    global_store_b32 v[2:3], v5, off
 ; GFX11-NEXT:    s_endpgm
 ;
-; GFX12-LABEL: struct_buffer_load_i8_tfe:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    v_mov_b32_e32 v4, 0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_mov_b32_e32 v5, v4
-; GFX12-NEXT:    buffer_load_u8 v[4:5], v4, s[0:3], null idxen tfe
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    global_store_b8 v[0:1], v4, off
-; GFX12-NEXT:    global_store_b32 v[2:3], v5, off
-; GFX12-NEXT:    s_endpgm
+; GFX1200-LABEL: struct_buffer_load_i8_tfe:
+; GFX1200:       ; %bb.0:
+; GFX1200-NEXT:    v_mov_b32_e32 v4, 0
+; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1200-NEXT:    v_mov_b32_e32 v5, v4
+; GFX1200-NEXT:    buffer_load_u8 v[4:5], v4, s[0:3], null idxen tfe
+; GFX1200-NEXT:    s_wait_loadcnt 0x0
+; GFX1200-NEXT:    global_store_b8 v[0:1], v4, off
+; GFX1200-NEXT:    global_store_b32 v[2:3], v5, off
+; GFX1200-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: struct_buffer_load_i8_tfe:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, 0
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT:    v_mov_b32_e32 v5, v4
+; GFX1250-NEXT:    buffer_load_u8 v[4:5], v4, s[0:3], null idxen tfe
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    global_store_b8 v[0:1], v4, off
+; GFX1250-NEXT:    global_store_b32 v[2:3], v5, off
+; GFX1250-NEXT:    s_endpgm
   %res = call { i8, i32 } @llvm.amdgcn.struct.buffer.load.sl_i8i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0)
   %data = extractvalue { i8, i32 } %res, 0
   store i8 %data, ptr addrspace(1) %data_addr
@@ -118,16 +130,28 @@ define amdgpu_ps void @struct_buffer_load_i16_tfe(<4 x i32> inreg %rsrc, ptr add
 ; GFX11-NEXT:    global_store_b32 v[2:3], v5, off
 ; GFX11-NEXT:    s_endpgm
 ;
-; GFX12-LABEL: struct_buffer_load_i16_tfe:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    v_mov_b32_e32 v4, 0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_mov_b32_e32 v5, v4
-; GFX12-NEXT:    buffer_load_u16 v[4:5], v4, s[0:3], null idxen tfe
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    global_store_b16 v[0:1], v4, off
-; GFX12-NEXT:    global_store_b32 v[2:3], v5, off
-; GFX12-NEXT:    s_endpgm
+; GFX1200-LABEL: struct_buffer_load_i16_tfe:
+; GFX1200:       ; %bb.0:
+; GFX1200-NEXT:    v_mov_b32_e32 v4, 0
+; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1200-NEXT:    v_mov_b32_e32 v5, v4
+; GFX1200-NEXT:    buffer_load_u16 v[4:5], v4, s[0:3], null idxen tfe
+; GFX1200-NEXT:    s_wait_loadcnt 0x0
+; GFX1200-NEXT:    global_store_b16 v[0:1], v4, off
+; GFX1200-NEXT:    global_store_b32 v[2:3], v5, off
+; GFX1200-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: struct_buffer_load_i16_tfe:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, 0
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT:    v_mov_b32_e32 v5, v4
+; GFX1250-NEXT:    buffer_load_u16 v[4:5], v4, s[0:3], null idxen tfe
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    global_store_b16 v[0:1], v4, off
+; GFX1250-NEXT:    global_store_b32 v[2:3], v5, off
+; GFX1250-NEXT:    s_endpgm
   %res = call { i16, i32 } @llvm.amdgcn.struct.buffer.load.sl_i16i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0)
   %data = extractvalue { i16, i32 } %res, 0
   store i16 %data, ptr addrspace(1) %data_addr
@@ -182,16 +206,28 @@ define amdgpu_ps void @struct_buffer_load_f16_tfe(<4 x i32> inreg %rsrc, ptr add
 ; GFX11-NEXT:    global_store_b32 v[2:3], v5, off
 ; GFX11-NEXT:    s_endpgm
 ;
-; GFX12-LABEL: struct_buffer_load_f16_tfe:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    v_mov_b32_e32 v4, 0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_mov_b32_e32 v5, v4
-; GFX12-NEXT:    buffer_load_u16 v[4:5], v4, s[0:3], null idxen tfe
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    global_store_b16 v[0:1], v4, off
-; GFX12-NEXT:    global_store_b32 v[2:3], v5, off
-; GFX12-NEXT:    s_endpgm
+; GFX1200-LABEL: struct_buffer_load_f16_tfe:
+; GFX1200:       ; %bb.0:
+; GFX1200-NEXT:    v_mov_b32_e32 v4, 0
+; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1200-NEXT:    v_mov_b32_e32 v5, v4
+; GFX1200-NEXT:    buffer_load_u16 v[4:5], v4, s[0:3], null idxen tfe
+; GFX1200-NEXT:    s_wait_loadcnt 0x0
+; GFX1200-NEXT:    global_store_b16 v[0:1], v4, off
+; GFX1200-NEXT:    global_store_b32 v[2:3], v5, off
+; GFX1200-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: struct_buffer_load_f16_tfe:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, 0
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT:    v_mov_b32_e32 v5, v4
+; GFX1250-NEXT:    buffer_load_u16 v[4:5], v4, s[0:3], null idxen tfe
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    global_store_b16 v[0:1], v4, off
+; GFX1250-NEXT:    global_store_b32 v[2:3], v5, off
+; GFX1250-NEXT:    s_endpgm
   %res = call { half, i32 } @llvm.amdgcn.struct.buffer.load.sl_f16i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0)
   %data = extractvalue { half, i32 } %res, 0
   store half %data, ptr addrspace(1) %data_addr
@@ -246,16 +282,28 @@ define amdgpu_ps void @struct_buffer_load_i32_tfe(<4 x i32> inreg %rsrc, ptr add
 ; GFX11-NEXT:    global_store_b32 v[2:3], v5, off
 ; GFX11-NEXT:    s_endpgm
 ;
-; GFX12-LABEL: struct_buffer_load_i32_tfe:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    v_mov_b32_e32 v4, 0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_mov_b32_e32 v5, v4
-; GFX12-NEXT:    buffer_load_b32 v[4:5], v4, s[0:3], null idxen tfe
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    global_store_b32 v[0:1], v4, off
-; GFX12-NEXT:    global_store_b32 v[2:3], v5, off
-; GFX12-NEXT:    s_endpgm
+; GFX1200-LABEL: struct_buffer_load_i32_tfe:
+; GFX1200:       ; %bb.0:
+; GFX1200-NEXT:    v_mov_b32_e32 v4, 0
+; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1200-NEXT:    v_mov_b32_e32 v5, v4
+; GFX1200-NEXT:    buffer_load_b32 v[4:5], v4, s[0:3], null idxen tfe
+; GFX1200-NEXT:    s_wait_loadcnt 0x0
+; GFX1200-NEXT:    global_store_b32 v[0:1], v4, off
+; GFX1200-NEXT:    global_store_b32 v[2:3], v5, off
+; GFX1200-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: struct_buffer_load_i32_tfe:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, 0
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT:    v_mov_b32_e32 v5, v4
+; GFX1250-NEXT:    buffer_load_b32 v[4:5], v4, s[0:3], null idxen tfe
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    global_store_b32 v[0:1], v4, off
+; GFX1250-NEXT:    global_store_b32 v[2:3], v5, off
+; GFX1250-NEXT:    s_endpgm
   %res = call { i32, i32 } @llvm.amdgcn.struct.buffer.load.sl_i32i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0)
   %data = extractvalue { i32, i32 } %res, 0
   store i32 %data, ptr addrspace(1) %data_addr
@@ -330,16 +378,28 @@ define amdgpu_ps void @struct_buffer_load_v2i32_tfe(<4 x i32> inreg %rsrc, ptr a
 ; GFX11-NEXT:    global_store_b32 v[2:3], v6, off
 ; GFX11-NEXT:    s_endpgm
 ;
-; GFX12-LABEL: struct_buffer_load_v2i32_tfe:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    v_mov_b32_e32 v4, 0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v6, v4
-; GFX12-NEXT:    buffer_load_b64 v[4:6], v4, s[0:3], null idxen tfe
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    global_store_b64 v[0:1], v[4:5], off
-; GFX12-NEXT:    global_store_b32 v[2:3], v6, off
-; GFX12-NEXT:    s_endpgm
+; GFX1200-LABEL: struct_buffer_load_v2i32_tfe:
+; GFX1200:       ; %bb.0:
+; GFX1200-NEXT:    v_mov_b32_e32 v4, 0
+; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1200-NEXT:    v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v6, v4
+; GFX1200-NEXT:    buffer_load_b64 v[4:6], v4, s[0:3], null idxen tfe
+; GFX1200-NEXT:    s_wait_loadcnt 0x0
+; GFX1200-NEXT:    global_store_b64 v[0:1], v[4:5], off
+; GFX1200-NEXT:    global_store_b32 v[2:3], v6, off
+; GFX1200-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: struct_buffer_load_v2i32_tfe:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, 0
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT:    v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v6, v4
+; GFX1250-NEXT:    buffer_load_b64 v[4:6], v4, s[0:3], null idxen tfe
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    global_store_b64 v[0:1], v[4:5], off
+; GFX1250-NEXT:    global_store_b32 v[2:3], v6, off
+; GFX1250-NEXT:    s_endpgm
   %res = call { <2 x i32>, i32 } @llvm.amdgcn.struct.buffer.load.sl_v2i32i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0)
   %data = extractvalue { <2 x i32>, i32 } %res, 0
   store <2 x i32> %data, ptr addrspace(1) %data_addr
@@ -414,16 +474,28 @@ define amdgpu_ps void @struct_buffer_load_v2f32_tfe(<4 x i32> inreg %rsrc, ptr a
 ; GFX11-NEXT:    global_store_b32 v[2:3], v6, off
 ; GFX11-NEXT:    s_endpgm
 ;
-; GFX12-LABEL: struct_buffer_load_v2f32_tfe:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    v_mov_b32_e32 v4, 0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v6, v4
-; GFX12-NEXT:    buffer_load_b64 v[4:6], v4, s[0:3], null idxen tfe
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    global_store_b64 v[0:1], v[4:5], off
-; GFX12-NEXT:    global_store_b32 v[2:3], v6, off
-; GFX12-NEXT:    s_endpgm
+; GFX1200-LABEL: struct_buffer_load_v2f32_tfe:
+; GFX1200:       ; %bb.0:
+; GFX1200-NEXT:    v_mov_b32_e32 v4, 0
+; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1200-NEXT:    v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v6, v4
+; GFX1200-NEXT:    buffer_load_b64 v[4:6], v4, s[0:3], null idxen tfe
+; GFX1200-NEXT:    s_wait_loadcnt 0x0
+; GFX1200-NEXT:    global_store_b64 v[0:1], v[4:5], off
+; GFX1200-NEXT:    global_store_b32 v[2:3], v6, off
+; GFX1200-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: struct_buffer_load_v2f32_tfe:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, 0
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT:    v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v6, v4
+; GFX1250-NEXT:    buffer_load_b64 v[4:6], v4, s[0:3], null idxen tfe
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    global_store_b64 v[0:1], v[4:5], off
+; GFX1250-NEXT:    global_store_b32 v[2:3], v6, off
+; GFX1250-NEXT:    s_endpgm
   %res = call { <2 x float>, i32 } @llvm.amdgcn.struct.buffer.load.sl_v2f32i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0)
   %data = extractvalue { <2 x float>, i32 } %res, 0
   store <2 x float> %data, ptr addrspace(1) %data_addr
@@ -503,17 +575,30 @@ define amdgpu_ps void @struct_buffer_load_v3i32_tfe(<4 x i32> inreg %rsrc, ptr a
 ; GFX11-NEXT:    global_store_b32 v[2:3], v7, off
 ; GFX11-NEXT:    s_endpgm
 ;
-; GFX12-LABEL: struct_buffer_load_v3i32_tfe:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    v_mov_b32_e32 v4, 0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v6, v4
-; GFX12-NEXT:    v_mov_b32_e32 v7, v4
-; GFX12-NEXT:    buffer_load_b96 v[4:7], v4, s[0:3], null idxen tfe
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    global_store_b96 v[0:1], v[4:6], off
-; GFX12-NEXT:    global_store_b32 v[2:3], v7, off
-; GFX12-NEXT:    s_endpgm
+; GFX1200-LABEL: struct_buffer_load_v3i32_tfe:
+; GFX1200:       ; %bb.0:
+; GFX1200-NEXT:    v_mov_b32_e32 v4, 0
+; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1200-NEXT:    v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v6, v4
+; GFX1200-NEXT:    v_mov_b32_e32 v7, v4
+; GFX1200-NEXT:    buffer_load_b96 v[4:7], v4, s[0:3], null idxen tfe
+; GFX1200-NEXT:    s_wait_loadcnt 0x0
+; GFX1200-NEXT:    global_store_b96 v[0:1], v[4:6], off
+; GFX1200-NEXT:    global_store_b32 v[2:3], v7, off
+; GFX1200-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: struct_buffer_load_v3i32_tfe:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, 0
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT:    v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v6, v4
+; GFX1250-NEXT:    v_mov_b32_e32 v7, v4
+; GFX1250-NEXT:    buffer_load_b96 v[4:7], v4, s[0:3], null idxen tfe
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    global_store_b96 v[0:1], v[4:6], off
+; GFX1250-NEXT:    global_store_b32 v[2:3], v7, off
+; GFX1250-NEXT:    s_endpgm
   %res = call { <3 x i32>, i32 } @llvm.amdgcn.struct.buffer.load.sl_v3i32i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0)
   %data = extractvalue { <3 x i32>, i32 } %res, 0
   store <3 x i32> %data, ptr addrspace(1) %data_addr
@@ -593,17 +678,30 @@ define amdgpu_ps void @struct_buffer_load_v3f32_tfe(<4 x i32> inreg %rsrc, ptr a
 ; GFX11-NEXT:    global_store_b32 v[2:3], v7, off
 ; GFX11-NEXT:    s_endpgm
 ;
-; GFX12-LABEL: struct_buffer_load_v3f32_tfe:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    v_mov_b32_e32 v4, 0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v6, v4
-; GFX12-NEXT:    v_mov_b32_e32 v7, v4
-; GFX12-NEXT:    buffer_load_b96 v[4:7], v4, s[0:3], null idxen tfe
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    global_store_b96 v[0:1], v[4:6], off
-; GFX12-NEXT:    global_store_b32 v[2:3], v7, off
-; GFX12-NEXT:    s_endpgm
+; GFX1200-LABEL: struct_buffer_load_v3f32_tfe:
+; GFX1200:       ; %bb.0:
+; GFX1200-NEXT:    v_mov_b32_e32 v4, 0
+; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1200-NEXT:    v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v6, v4
+; GFX1200-NEXT:    v_mov_b32_e32 v7, v4
+; GFX1200-NEXT:    buffer_load_b96 v[4:7], v4, s[0:3], null idxen tfe
+; GFX1200-NEXT:    s_wait_loadcnt 0x0
+; GFX1200-NEXT:    global_store_b96 v[0:1], v[4:6], off
+; GFX1200-NEXT:    global_store_b32 v[2:3], v7, off
+; GFX1200-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: struct_buffer_load_v3f32_tfe:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, 0
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT:    v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v6, v4
+; GFX1250-NEXT:    v_mov_b32_e32 v7, v4
+; GFX1250-NEXT:    buffer_load_b96 v[4:7], v4, s[0:3], null idxen tfe
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    global_store_b96 v[0:1], v[4:6], off
+; GFX1250-NEXT:    global_store_b32 v[2:3], v7, off
+; GFX1250-NEXT:    s_endpgm
   %res = call { <3 x float>, i32 } @llvm.amdgcn.struct.buffer.load.sl_v3f32i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0)
   %data = extractvalue { <3 x float>, i32 } %res, 0
   store <3 x float> %data, ptr addrspace(1) %data_addr
@@ -670,17 +768,30 @@ define amdgpu_ps void @struct_buffer_load_v4i32_tfe(<4 x i32> inreg %rsrc, ptr a
 ; GFX11-NEXT:    global_store_b32 v[2:3], v8, off
 ; GFX11-NEXT:    s_endpgm
 ;
-; GFX12-LABEL: struct_buffer_load_v4i32_tfe:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    v_mov_b32_e32 v4, 0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v6, v4
-; GFX12-NEXT:    v_dual_mov_b32 v7, v4 :: v_dual_mov_b32 v8, v4
-; GFX12-NEXT:    buffer_load_b128 v[4:8], v4, s[0:3], null idxen tfe
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    global_store_b128 v[0:1], v[4:7], off
-; GFX12-NEXT:    global_store_b32 v[2:3], v8, off
-; GFX12-NEXT:    s_endpgm
+; GFX1200-LABEL: struct_buffer_load_v4i32_tfe:
+; GFX1200:       ; %bb.0:
+; GFX1200-NEXT:    v_mov_b32_e32 v4, 0
+; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1200-NEXT:    v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v6, v4
+; GFX1200-NEXT:    v_dual_mov_b32 v7, v4 :: v_dual_mov_b32 v8, v4
+; GFX1200-NEXT:    buffer_load_b128 v[4:8], v4, s[0:3], null idxen tfe
+; GFX1200-NEXT:    s_wait_loadcnt 0x0
+; GFX1200-NEXT:    global_store_b128 v[0:1], v[4:7], off
+; GFX1200-NEXT:    global_store_b32 v[2:3], v8, off
+; GFX1200-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: struct_buffer_load_v4i32_tfe:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, 0
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT:    v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v6, v4
+; GFX1250-NEXT:    v_dual_mov_b32 v7, v4 :: v_dual_mov_b32 v8, v4
+; GFX1250-NEXT:    buffer_load_b128 v[4:8], v4, s[0:3], null idxen tfe
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    global_store_b128 v[0:1], v[4:7], off
+; GFX1250-NEXT:    global_store_b32 v[2:3], v8, off
+; GFX1250-NEXT:    s_endpgm
   %res = call { <4 x i32>, i32 } @llvm.amdgcn.struct.buffer.load.sl_v4i32i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0)
   %data = extractvalue { <4 x i32>, i32 } %res, 0
   store <4 x i32> %data, ptr addrspace(1) %data_addr
@@ -747,17 +858,30 @@ define amdgpu_ps void @struct_buffer_load_v4f32_tfe(<4 x i32> inreg %rsrc, ptr a
 ; GFX11-NEXT:    global_store_b32 v[2:3], v8, off
 ; GFX11-NEXT:    s_endpgm
 ;
-; GFX12-LABEL: struct_buffer_load_v4f32_tfe:
-; GFX12:       ; %bb.0:
-; GFX12-NEXT:    v_mov_b32_e32 v4, 0
-; GFX12-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX12-NEXT:    v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v6, v4
-; GFX12-NEXT:    v_dual_mov_b32 v7, v4 :: v_dual_mov_b32 v8, v4
-; GFX12-NEXT:    buffer_load_b128 v[4:8], v4, s[0:3], null idxen tfe
-; GFX12-NEXT:    s_wait_loadcnt 0x0
-; GFX12-NEXT:    global_store_b128 v[0:1], v[4:7], off
-; GFX12-NEXT:    global_store_b32 v[2:3], v8, off
-; GFX12-NEXT:    s_endpgm
+; GFX1200-LABEL: struct_buffer_load_v4f32_tfe:
+; GFX1200:       ; %bb.0:
+; GFX1200-NEXT:    v_mov_b32_e32 v4, 0
+; GFX1200-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1200-NEXT:    v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v6, v4
+; GFX1200-NEXT:    v_dual_mov_b32 v7, v4 :: v_dual_mov_b32 v8, v4
+; GFX1200-NEXT:    buffer_load_b128 v[4:8], v4, s[0:3], null idxen tfe
+; GFX1200-NEXT:    s_wait_loadcnt 0x0
+; GFX1200-NEXT:    global_store_b128 v[0:1], v[4:7], off
+; GFX1200-NEXT:    global_store_b32 v[2:3], v8, off
+; GFX1200-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: struct_buffer_load_v4f32_tfe:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-NEXT:    v_mov_b32_e32 v4, 0
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT:    v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v6, v4
+; GFX1250-NEXT:    v_dual_mov_b32 v7, v4 :: v_dual_mov_b32 v8, v4
+; GFX1250-NEXT:    buffer_load_b128 v[4:8], v4, s[0:3], null idxen tfe
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    global_store_b128 v[0:1], v[4:7], off
+; GFX1250-NEXT:    global_store_b32 v[2:3], v8, off
+; GFX1250-NEXT:    s_endpgm
   %res = call { <4 x float>, i32 } @llvm.amdgcn.struct.buffer.load.sl_v4f32i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0)
   %data = extractvalue { <4 x float>, i32 } %res, 0
   store <4 x float> %data, ptr addrspace(1) %data_addr

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.ll
index 822016b23c952..4bde9db509f13 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.store.ll
@@ -26,6 +26,7 @@ define amdgpu_ps void @buffer_store(<4 x i32> inreg, <4 x float>, <4 x float>, <
 ;
 ; GFX12-LABEL: buffer_store:
 ; GFX12:       ; %bb.0: ; %main_body
+; GFX12-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX12-NEXT:    v_mov_b32_e32 v12, 0
 ; GFX12-NEXT:    s_clause 0x2
 ; GFX12-NEXT:    buffer_store_b128 v[0:3], v12, s[0:3], null idxen
@@ -54,6 +55,7 @@ define amdgpu_ps void @buffer_store_immoffs(<4 x i32> inreg, <4 x float>) {
 ;
 ; GFX12-LABEL: buffer_store_immoffs:
 ; GFX12:       ; %bb.0: ; %main_body
+; GFX12-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX12-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX12-NEXT:    buffer_store_b128 v[0:3], v4, s[0:3], null idxen offset:42
 ; GFX12-NEXT:    s_endpgm
@@ -75,6 +77,7 @@ define amdgpu_ps void @buffer_store_idx(<4 x i32> inreg, <4 x float>, i32) {
 ;
 ; GFX12-LABEL: buffer_store_idx:
 ; GFX12:       ; %bb.0: ; %main_body
+; GFX12-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX12-NEXT:    buffer_store_b128 v[0:3], v4, s[0:3], null idxen
 ; GFX12-NEXT:    s_endpgm
 main_body:
@@ -101,6 +104,7 @@ define amdgpu_ps void @buffer_store_ofs(<4 x i32> inreg, <4 x float>, i32) {
 ;
 ; GFX12-LABEL: buffer_store_ofs:
 ; GFX12:       ; %bb.0: ; %main_body
+; GFX12-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX12-NEXT:    v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v4, 0
 ; GFX12-NEXT:    buffer_store_b128 v[0:3], v[4:5], s[0:3], null idxen offen
 ; GFX12-NEXT:    s_endpgm
@@ -122,6 +126,7 @@ define amdgpu_ps void @buffer_store_both(<4 x i32> inreg, <4 x float>, i32, i32)
 ;
 ; GFX12-LABEL: buffer_store_both:
 ; GFX12:       ; %bb.0: ; %main_body
+; GFX12-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX12-NEXT:    buffer_store_b128 v[0:3], v[4:5], s[0:3], null idxen offen
 ; GFX12-NEXT:    s_endpgm
 main_body:
@@ -144,6 +149,7 @@ define amdgpu_ps void @buffer_store_both_reversed(<4 x i32> inreg, <4 x float>,
 ;
 ; GFX12-LABEL: buffer_store_both_reversed:
 ; GFX12:       ; %bb.0: ; %main_body
+; GFX12-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX12-NEXT:    v_dual_mov_b32 v6, v5 :: v_dual_mov_b32 v7, v4
 ; GFX12-NEXT:    buffer_store_b128 v[0:3], v[6:7], s[0:3], null idxen offen
 ; GFX12-NEXT:    s_endpgm
@@ -181,6 +187,7 @@ define amdgpu_ps void @buffer_store_wait(<4 x i32> inreg, <4 x float>, i32, i32,
 ;
 ; GFX12-LABEL: buffer_store_wait:
 ; GFX12:       ; %bb.0: ; %main_body
+; GFX12-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    buffer_store_b128 v[0:3], v4, s[0:3], null idxen
 ; GFX12-NEXT:    buffer_load_b128 v[0:3], v5, s[0:3], null idxen
@@ -207,6 +214,7 @@ define amdgpu_ps void @buffer_store_x1(<4 x i32> inreg %rsrc, float %data, i32 %
 ;
 ; GFX12-LABEL: buffer_store_x1:
 ; GFX12:       ; %bb.0: ; %main_body
+; GFX12-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX12-NEXT:    buffer_store_b32 v0, v1, s[0:3], null idxen
 ; GFX12-NEXT:    s_endpgm
 main_body:
@@ -227,6 +235,7 @@ define amdgpu_ps void @buffer_store_x2(<4 x i32> inreg %rsrc, <2 x float> %data,
 ;
 ; GFX12-LABEL: buffer_store_x2:
 ; GFX12:       ; %bb.0: ; %main_body
+; GFX12-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX12-NEXT:    buffer_store_b64 v[0:1], v2, s[0:3], null idxen
 ; GFX12-NEXT:    s_endpgm
 main_body:
@@ -254,6 +263,7 @@ define amdgpu_ps void @buffer_store_int(<4 x i32> inreg, <4 x i32>, <2 x i32>, i
 ;
 ; GFX12-LABEL: buffer_store_int:
 ; GFX12:       ; %bb.0: ; %main_body
+; GFX12-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX12-NEXT:    v_mov_b32_e32 v7, 0
 ; GFX12-NEXT:    s_clause 0x2
 ; GFX12-NEXT:    buffer_store_b128 v[0:3], v7, s[0:3], null idxen
@@ -282,6 +292,7 @@ define amdgpu_ps void @struct_buffer_store_byte(<4 x i32> inreg %rsrc, float %v1
 ;
 ; GFX12-LABEL: struct_buffer_store_byte:
 ; GFX12:       ; %bb.0: ; %main_body
+; GFX12-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX12-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX12-NEXT:    buffer_store_b8 v0, v1, s[0:3], null idxen
 ; GFX12-NEXT:    s_endpgm
@@ -313,12 +324,16 @@ define amdgpu_ps void @struct_buffer_store_f16(<4 x i32> inreg %rsrc, float %v1,
 ;
 ; GFX12-TRUE16-LABEL: struct_buffer_store_f16:
 ; GFX12-TRUE16:       ; %bb.0:
+; GFX12-TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX12-TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 2, 2), 0
 ; GFX12-TRUE16-NEXT:    v_cvt_f16_f32_e32 v0.l, v0
 ; GFX12-TRUE16-NEXT:    buffer_store_b16 v0, v1, s[0:3], null idxen
 ; GFX12-TRUE16-NEXT:    s_endpgm
 ;
 ; GFX12-FAKE16-LABEL: struct_buffer_store_f16:
 ; GFX12-FAKE16:       ; %bb.0:
+; GFX12-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX12-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 2, 2), 0
 ; GFX12-FAKE16-NEXT:    v_cvt_f16_f32_e32 v0, v0
 ; GFX12-FAKE16-NEXT:    buffer_store_b16 v0, v1, s[0:3], null idxen
 ; GFX12-FAKE16-NEXT:    s_endpgm
@@ -349,6 +364,7 @@ define amdgpu_ps void @struct_buffer_store_v2f16(<4 x i32> inreg %rsrc, <2 x hal
 ;
 ; GFX12-LABEL: struct_buffer_store_v2f16:
 ; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX12-NEXT:    buffer_store_b32 v0, v1, s[0:3], null idxen
 ; GFX12-NEXT:    s_endpgm
   call void @llvm.amdgcn.struct.buffer.store.v2f16(<2 x half> %v1, <4 x i32> %rsrc, i32 %index, i32 0, i32 0, i32 0)
@@ -381,6 +397,7 @@ define amdgpu_ps void @struct_buffer_store_v4f16(<4 x i32> inreg %rsrc, <4 x hal
 ;
 ; GFX12-LABEL: struct_buffer_store_v4f16:
 ; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX12-NEXT:    buffer_store_b64 v[0:1], v2, s[0:3], null idxen
 ; GFX12-NEXT:    s_endpgm
   call void @llvm.amdgcn.struct.buffer.store.v4f16(<4 x half> %v1, <4 x i32> %rsrc, i32 %index, i32 0, i32 0, i32 0)
@@ -402,6 +419,7 @@ define amdgpu_ps void @struct_buffer_store_i16(<4 x i32> inreg %rsrc, float %v1,
 ;
 ; GFX12-LABEL: struct_buffer_store_i16:
 ; GFX12:       ; %bb.0: ; %main_body
+; GFX12-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX12-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX12-NEXT:    buffer_store_b16 v0, v1, s[0:3], null idxen
 ; GFX12-NEXT:    s_endpgm
@@ -433,6 +451,7 @@ define amdgpu_ps void @struct_buffer_store_vif16(<4 x i32> inreg %rsrc, <2 x i16
 ;
 ; GFX12-LABEL: struct_buffer_store_vif16:
 ; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX12-NEXT:    buffer_store_b32 v0, v1, s[0:3], null idxen
 ; GFX12-NEXT:    s_endpgm
   call void @llvm.amdgcn.struct.buffer.store.v2i16(<2 x i16> %v1, <4 x i32> %rsrc, i32 %index, i32 0, i32 0, i32 0)
@@ -463,6 +482,7 @@ define amdgpu_ps void @struct_buffer_store_v4i16(<4 x i32> inreg %rsrc, <4 x i16
 ;
 ; GFX12-LABEL: struct_buffer_store_v4i16:
 ; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX12-NEXT:    buffer_store_b64 v[0:1], v2, s[0:3], null idxen
 ; GFX12-NEXT:    s_endpgm
   call void @llvm.amdgcn.struct.buffer.store.v4i16(<4 x i16> %v1, <4 x i32> %rsrc, i32 %index, i32 0, i32 0, i32 0)

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.atomic.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.atomic.buffer.load.ll
index 1e4b43d1f4fce..c2c8580de937c 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.atomic.buffer.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.atomic.buffer.load.ll
@@ -34,6 +34,7 @@ define amdgpu_kernel void @struct_ptr_atomic_buffer_load_i32(ptr addrspace(8) %p
 ;
 ; GFX12-LABEL: struct_ptr_atomic_buffer_load_i32:
 ; GFX12:       ; %bb.0: ; %bb
+; GFX12-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    s_load_b32 s6, s[4:5], 0x34
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
@@ -85,6 +86,7 @@ define amdgpu_kernel void @struct_ptr_atomic_buffer_load_i32_const_idx(ptr addrs
 ;
 ; GFX12-LABEL: struct_ptr_atomic_buffer_load_i32_const_idx:
 ; GFX12:       ; %bb.0: ; %bb
+; GFX12-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX12-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX12-NEXT:    v_mov_b32_e32 v1, 15
@@ -137,6 +139,7 @@ define amdgpu_kernel void @struct_ptr_atomic_buffer_load_i32_off(ptr addrspace(8
 ;
 ; GFX12-LABEL: struct_ptr_atomic_buffer_load_i32_off:
 ; GFX12:       ; %bb.0: ; %bb
+; GFX12-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    s_load_b32 s6, s[4:5], 0x34
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
@@ -191,6 +194,7 @@ define amdgpu_kernel void @struct_ptr_atomic_buffer_load_i32_soff(ptr addrspace(
 ;
 ; GFX12-LABEL: struct_ptr_atomic_buffer_load_i32_soff:
 ; GFX12:       ; %bb.0: ; %bb
+; GFX12-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    s_load_b32 s6, s[4:5], 0x34
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
@@ -245,6 +249,7 @@ define amdgpu_kernel void @struct_ptr_atomic_buffer_load_i32_dlc(ptr addrspace(8
 ;
 ; GFX12-LABEL: struct_ptr_atomic_buffer_load_i32_dlc:
 ; GFX12:       ; %bb.0: ; %bb
+; GFX12-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    s_load_b32 s6, s[4:5], 0x34
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
@@ -299,6 +304,7 @@ define amdgpu_kernel void @struct_ptr_nonatomic_buffer_load_i32(ptr addrspace(8)
 ;
 ; GFX12-LABEL: struct_ptr_nonatomic_buffer_load_i32:
 ; GFX12:       ; %bb.0: ; %bb
+; GFX12-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    s_load_b32 s6, s[4:5], 0x34
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
@@ -354,6 +360,7 @@ define amdgpu_kernel void @struct_ptr_atomic_buffer_load_i64(ptr addrspace(8) %p
 ;
 ; GFX12-SDAG-TRUE16-LABEL: struct_ptr_atomic_buffer_load_i64:
 ; GFX12-SDAG-TRUE16:       ; %bb.0: ; %bb
+; GFX12-SDAG-TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX12-SDAG-TRUE16-NEXT:    s_clause 0x1
 ; GFX12-SDAG-TRUE16-NEXT:    s_load_b32 s6, s[4:5], 0x34
 ; GFX12-SDAG-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
@@ -377,6 +384,7 @@ define amdgpu_kernel void @struct_ptr_atomic_buffer_load_i64(ptr addrspace(8) %p
 ;
 ; GFX12-GISEL-TRUE16-LABEL: struct_ptr_atomic_buffer_load_i64:
 ; GFX12-GISEL-TRUE16:       ; %bb.0: ; %bb
+; GFX12-GISEL-TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX12-GISEL-TRUE16-NEXT:    s_clause 0x1
 ; GFX12-GISEL-TRUE16-NEXT:    s_load_b32 s6, s[4:5], 0x34
 ; GFX12-GISEL-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
@@ -433,6 +441,7 @@ define amdgpu_kernel void @struct_ptr_atomic_buffer_load_v2i16(ptr addrspace(8)
 ;
 ; GFX12-LABEL: struct_ptr_atomic_buffer_load_v2i16:
 ; GFX12:       ; %bb.0: ; %bb
+; GFX12-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    s_load_b32 s6, s[4:5], 0x34
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
@@ -559,6 +568,7 @@ define amdgpu_kernel void @struct_ptr_atomic_buffer_load_v4i16(ptr addrspace(8)
 ;
 ; GFX12-SDAG-TRUE16-LABEL: struct_ptr_atomic_buffer_load_v4i16:
 ; GFX12-SDAG-TRUE16:       ; %bb.0: ; %bb
+; GFX12-SDAG-TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX12-SDAG-TRUE16-NEXT:    s_clause 0x1
 ; GFX12-SDAG-TRUE16-NEXT:    s_load_b32 s6, s[4:5], 0x34
 ; GFX12-SDAG-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
@@ -582,6 +592,7 @@ define amdgpu_kernel void @struct_ptr_atomic_buffer_load_v4i16(ptr addrspace(8)
 ;
 ; GFX12-FAKE16-LABEL: struct_ptr_atomic_buffer_load_v4i16:
 ; GFX12-FAKE16:       ; %bb.0: ; %bb
+; GFX12-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX12-FAKE16-NEXT:    s_clause 0x1
 ; GFX12-FAKE16-NEXT:    s_load_b32 s6, s[4:5], 0x34
 ; GFX12-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
@@ -607,6 +618,7 @@ define amdgpu_kernel void @struct_ptr_atomic_buffer_load_v4i16(ptr addrspace(8)
 ;
 ; GFX12-GISEL-TRUE16-LABEL: struct_ptr_atomic_buffer_load_v4i16:
 ; GFX12-GISEL-TRUE16:       ; %bb.0: ; %bb
+; GFX12-GISEL-TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX12-GISEL-TRUE16-NEXT:    s_clause 0x1
 ; GFX12-GISEL-TRUE16-NEXT:    s_load_b32 s6, s[4:5], 0x34
 ; GFX12-GISEL-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
@@ -664,6 +676,7 @@ define amdgpu_kernel void @struct_ptr_atomic_buffer_load_v4i32(ptr addrspace(8)
 ;
 ; GFX12-LABEL: struct_ptr_atomic_buffer_load_v4i32:
 ; GFX12:       ; %bb.0: ; %bb
+; GFX12-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    s_load_b32 s6, s[4:5], 0x34
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
@@ -721,6 +734,7 @@ define amdgpu_kernel void @struct_ptr_atomic_buffer_load_ptr(ptr addrspace(8) %p
 ;
 ; GFX12-LABEL: struct_ptr_atomic_buffer_load_ptr:
 ; GFX12:       ; %bb.0: ; %bb
+; GFX12-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX12-NEXT:    s_clause 0x1
 ; GFX12-NEXT:    s_load_b32 s6, s[4:5], 0x34
 ; GFX12-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tanh.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tanh.ll
index ba769eff08dee..42fa032c344f8 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tanh.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tanh.ll
@@ -13,6 +13,7 @@ declare bfloat @llvm.amdgcn.tanh.bf16(bfloat) #0
 define amdgpu_kernel void @tanh_f32(ptr addrspace(1) %out, float %src) #1 {
 ; SDAG-REAL16-LABEL: tanh_f32:
 ; SDAG-REAL16:       ; %bb.0:
+; SDAG-REAL16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; SDAG-REAL16-NEXT:    s_load_b96 s[0:2], s[4:5], 0x0
 ; SDAG-REAL16-NEXT:    v_mov_b32_e32 v1, 0
 ; SDAG-REAL16-NEXT:    s_wait_kmcnt 0x0
@@ -22,6 +23,7 @@ define amdgpu_kernel void @tanh_f32(ptr addrspace(1) %out, float %src) #1 {
 ;
 ; SDAG-FAKE16-LABEL: tanh_f32:
 ; SDAG-FAKE16:       ; %bb.0:
+; SDAG-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; SDAG-FAKE16-NEXT:    s_load_b96 s[0:2], s[4:5], 0x0
 ; SDAG-FAKE16-NEXT:    v_mov_b32_e32 v1, 0
 ; SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
@@ -37,6 +39,7 @@ define amdgpu_kernel void @tanh_f32(ptr addrspace(1) %out, float %src) #1 {
 define amdgpu_kernel void @tanh_f32_constant_4.0(ptr addrspace(1) %out) #1 {
 ; SDAG-REAL16-LABEL: tanh_f32_constant_4.0:
 ; SDAG-REAL16:       ; %bb.0:
+; SDAG-REAL16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; SDAG-REAL16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; SDAG-REAL16-NEXT:    v_tanh_f32_e32 v0, 4.0
 ; SDAG-REAL16-NEXT:    v_mov_b32_e32 v1, 0
@@ -46,6 +49,7 @@ define amdgpu_kernel void @tanh_f32_constant_4.0(ptr addrspace(1) %out) #1 {
 ;
 ; SDAG-FAKE16-LABEL: tanh_f32_constant_4.0:
 ; SDAG-FAKE16:       ; %bb.0:
+; SDAG-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; SDAG-FAKE16-NEXT:    v_tanh_f32_e32 v0, 4.0
 ; SDAG-FAKE16-NEXT:    v_mov_b32_e32 v1, 0
@@ -60,6 +64,7 @@ define amdgpu_kernel void @tanh_f32_constant_4.0(ptr addrspace(1) %out) #1 {
 define amdgpu_kernel void @tanh_f32_constant_100.0(ptr addrspace(1) %out) #1 {
 ; SDAG-REAL16-LABEL: tanh_f32_constant_100.0:
 ; SDAG-REAL16:       ; %bb.0:
+; SDAG-REAL16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; SDAG-REAL16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; SDAG-REAL16-NEXT:    v_tanh_f32_e32 v0, 0x42c80000
 ; SDAG-REAL16-NEXT:    v_mov_b32_e32 v1, 0
@@ -69,6 +74,7 @@ define amdgpu_kernel void @tanh_f32_constant_100.0(ptr addrspace(1) %out) #1 {
 ;
 ; SDAG-FAKE16-LABEL: tanh_f32_constant_100.0:
 ; SDAG-FAKE16:       ; %bb.0:
+; SDAG-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; SDAG-FAKE16-NEXT:    v_tanh_f32_e32 v0, 0x42c80000
 ; SDAG-FAKE16-NEXT:    v_mov_b32_e32 v1, 0
@@ -83,10 +89,12 @@ define amdgpu_kernel void @tanh_f32_constant_100.0(ptr addrspace(1) %out) #1 {
 define amdgpu_kernel void @tanh_undef_f32(ptr addrspace(1) %out) #1 {
 ; SDAG-REAL16-LABEL: tanh_undef_f32:
 ; SDAG-REAL16:       ; %bb.0:
+; SDAG-REAL16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; SDAG-REAL16-NEXT:    s_endpgm
 ;
 ; SDAG-FAKE16-LABEL: tanh_undef_f32:
 ; SDAG-FAKE16:       ; %bb.0:
+; SDAG-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; SDAG-FAKE16-NEXT:    s_endpgm
   %tanh = call float @llvm.amdgcn.tanh.f32(float undef)
   store float %tanh, ptr addrspace(1) %out, align 4
@@ -96,6 +104,7 @@ define amdgpu_kernel void @tanh_undef_f32(ptr addrspace(1) %out) #1 {
 define amdgpu_kernel void @tanh_f16(ptr addrspace(1) %out, half %src) #1 {
 ; SDAG-REAL16-LABEL: tanh_f16:
 ; SDAG-REAL16:       ; %bb.0:
+; SDAG-REAL16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; SDAG-REAL16-NEXT:    s_load_b96 s[0:2], s[4:5], 0x0
 ; SDAG-REAL16-NEXT:    v_mov_b32_e32 v1, 0
 ; SDAG-REAL16-NEXT:    s_wait_kmcnt 0x0
@@ -105,6 +114,7 @@ define amdgpu_kernel void @tanh_f16(ptr addrspace(1) %out, half %src) #1 {
 ;
 ; SDAG-FAKE16-LABEL: tanh_f16:
 ; SDAG-FAKE16:       ; %bb.0:
+; SDAG-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; SDAG-FAKE16-NEXT:    s_load_b96 s[0:2], s[4:5], 0x0
 ; SDAG-FAKE16-NEXT:    v_mov_b32_e32 v1, 0
 ; SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
@@ -119,6 +129,7 @@ define amdgpu_kernel void @tanh_f16(ptr addrspace(1) %out, half %src) #1 {
 define amdgpu_kernel void @tanh_f16_constant_4.0(ptr addrspace(1) %out) #1 {
 ; SDAG-REAL16-LABEL: tanh_f16_constant_4.0:
 ; SDAG-REAL16:       ; %bb.0:
+; SDAG-REAL16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; SDAG-REAL16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; SDAG-REAL16-NEXT:    v_tanh_f16_e32 v0.l, 4.0
 ; SDAG-REAL16-NEXT:    v_mov_b32_e32 v1, 0
@@ -128,6 +139,7 @@ define amdgpu_kernel void @tanh_f16_constant_4.0(ptr addrspace(1) %out) #1 {
 ;
 ; SDAG-FAKE16-LABEL: tanh_f16_constant_4.0:
 ; SDAG-FAKE16:       ; %bb.0:
+; SDAG-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; SDAG-FAKE16-NEXT:    v_tanh_f16_e32 v0, 4.0
 ; SDAG-FAKE16-NEXT:    v_mov_b32_e32 v1, 0
@@ -142,6 +154,7 @@ define amdgpu_kernel void @tanh_f16_constant_4.0(ptr addrspace(1) %out) #1 {
 define amdgpu_kernel void @tanh_f16_constant_100.0(ptr addrspace(1) %out) #1 {
 ; SDAG-REAL16-LABEL: tanh_f16_constant_100.0:
 ; SDAG-REAL16:       ; %bb.0:
+; SDAG-REAL16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; SDAG-REAL16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; SDAG-REAL16-NEXT:    v_tanh_f16_e32 v0.l, 0x5640
 ; SDAG-REAL16-NEXT:    v_mov_b32_e32 v1, 0
@@ -151,6 +164,7 @@ define amdgpu_kernel void @tanh_f16_constant_100.0(ptr addrspace(1) %out) #1 {
 ;
 ; SDAG-FAKE16-LABEL: tanh_f16_constant_100.0:
 ; SDAG-FAKE16:       ; %bb.0:
+; SDAG-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; SDAG-FAKE16-NEXT:    v_tanh_f16_e32 v0, 0x5640
 ; SDAG-FAKE16-NEXT:    v_mov_b32_e32 v1, 0
@@ -165,10 +179,12 @@ define amdgpu_kernel void @tanh_f16_constant_100.0(ptr addrspace(1) %out) #1 {
 define amdgpu_kernel void @tanh_undef_f16(ptr addrspace(1) %out) #1 {
 ; SDAG-REAL16-LABEL: tanh_undef_f16:
 ; SDAG-REAL16:       ; %bb.0:
+; SDAG-REAL16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; SDAG-REAL16-NEXT:    s_endpgm
 ;
 ; SDAG-FAKE16-LABEL: tanh_undef_f16:
 ; SDAG-FAKE16:       ; %bb.0:
+; SDAG-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; SDAG-FAKE16-NEXT:    s_endpgm
   %tanh = call half @llvm.amdgcn.tanh.f16(half undef)
   store half %tanh, ptr addrspace(1) %out, align 2
@@ -178,6 +194,7 @@ define amdgpu_kernel void @tanh_undef_f16(ptr addrspace(1) %out) #1 {
 define amdgpu_kernel void @tanh_bf16(ptr addrspace(1) %out, bfloat %src) #1 {
 ; SDAG-REAL16-LABEL: tanh_bf16:
 ; SDAG-REAL16:       ; %bb.0:
+; SDAG-REAL16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; SDAG-REAL16-NEXT:    s_load_b96 s[0:2], s[4:5], 0x0
 ; SDAG-REAL16-NEXT:    v_mov_b32_e32 v1, 0
 ; SDAG-REAL16-NEXT:    s_wait_kmcnt 0x0
@@ -187,6 +204,7 @@ define amdgpu_kernel void @tanh_bf16(ptr addrspace(1) %out, bfloat %src) #1 {
 ;
 ; SDAG-FAKE16-LABEL: tanh_bf16:
 ; SDAG-FAKE16:       ; %bb.0:
+; SDAG-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; SDAG-FAKE16-NEXT:    s_load_b96 s[0:2], s[4:5], 0x0
 ; SDAG-FAKE16-NEXT:    v_mov_b32_e32 v1, 0
 ; SDAG-FAKE16-NEXT:    s_wait_kmcnt 0x0
@@ -201,6 +219,7 @@ define amdgpu_kernel void @tanh_bf16(ptr addrspace(1) %out, bfloat %src) #1 {
 define amdgpu_kernel void @tanh_bf16_constant_4(ptr addrspace(1) %out) #1 {
 ; SDAG-REAL16-LABEL: tanh_bf16_constant_4:
 ; SDAG-REAL16:       ; %bb.0:
+; SDAG-REAL16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; SDAG-REAL16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; SDAG-REAL16-NEXT:    v_tanh_bf16_e32 v0.l, 4.0
 ; SDAG-REAL16-NEXT:    v_mov_b32_e32 v1, 0
@@ -210,6 +229,7 @@ define amdgpu_kernel void @tanh_bf16_constant_4(ptr addrspace(1) %out) #1 {
 ;
 ; SDAG-FAKE16-LABEL: tanh_bf16_constant_4:
 ; SDAG-FAKE16:       ; %bb.0:
+; SDAG-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; SDAG-FAKE16-NEXT:    v_tanh_bf16_e32 v0, 4.0
 ; SDAG-FAKE16-NEXT:    v_mov_b32_e32 v1, 0
@@ -224,6 +244,7 @@ define amdgpu_kernel void @tanh_bf16_constant_4(ptr addrspace(1) %out) #1 {
 define amdgpu_kernel void @tanh_bf16_constant_100(ptr addrspace(1) %out) #1 {
 ; SDAG-REAL16-LABEL: tanh_bf16_constant_100:
 ; SDAG-REAL16:       ; %bb.0:
+; SDAG-REAL16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; SDAG-REAL16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; SDAG-REAL16-NEXT:    v_tanh_bf16_e32 v0.l, 0x42c8
 ; SDAG-REAL16-NEXT:    v_mov_b32_e32 v1, 0
@@ -233,6 +254,7 @@ define amdgpu_kernel void @tanh_bf16_constant_100(ptr addrspace(1) %out) #1 {
 ;
 ; SDAG-FAKE16-LABEL: tanh_bf16_constant_100:
 ; SDAG-FAKE16:       ; %bb.0:
+; SDAG-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; SDAG-FAKE16-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; SDAG-FAKE16-NEXT:    v_tanh_bf16_e32 v0, 0x42c8
 ; SDAG-FAKE16-NEXT:    v_mov_b32_e32 v1, 0
@@ -247,10 +269,12 @@ define amdgpu_kernel void @tanh_bf16_constant_100(ptr addrspace(1) %out) #1 {
 define amdgpu_kernel void @tanh_undef_bf16(ptr addrspace(1) %out) #1 {
 ; SDAG-REAL16-LABEL: tanh_undef_bf16:
 ; SDAG-REAL16:       ; %bb.0:
+; SDAG-REAL16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; SDAG-REAL16-NEXT:    s_endpgm
 ;
 ; SDAG-FAKE16-LABEL: tanh_undef_bf16:
 ; SDAG-FAKE16:       ; %bb.0:
+; SDAG-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; SDAG-FAKE16-NEXT:    s_endpgm
   %tanh = call bfloat @llvm.amdgcn.tanh.bf16(bfloat undef)
   store bfloat %tanh, ptr addrspace(1) %out, align 2

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tensor.load.store.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tensor.load.store.ll
index d2712ac8e08a3..ab2c0f468c1c0 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tensor.load.store.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.tensor.load.store.ll
@@ -10,6 +10,7 @@ declare void @llvm.amdgcn.tensor.store.from.lds.d2(<4 x i32> %D0, <8 x i32> %D1,
 define amdgpu_ps void @tensor_load_to_lds(<4 x i32> inreg %D0, <8 x i32> inreg %D1, <4 x i32> inreg %D2, <4 x i32> inreg %D3) {
 ; GFX1250-LABEL: tensor_load_to_lds:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    tensor_load_to_lds s[0:3], s[4:11], s[12:15], s[16:19]
 ; GFX1250-NEXT:    s_endpgm
 entry:
@@ -20,6 +21,7 @@ entry:
 define amdgpu_ps void @tensor_load_to_lds_vector(<4 x i32> %D0, <8 x i32> %D1, <4 x i32> %D2, <4 x i32> %D3) {
 ; GFX1250-SDAG-LABEL: tensor_load_to_lds_vector:
 ; GFX1250-SDAG:       ; %bb.0: ; %entry
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_readfirstlane_b32 s0, v4
 ; GFX1250-SDAG-NEXT:    v_readfirstlane_b32 s1, v5
 ; GFX1250-SDAG-NEXT:    v_readfirstlane_b32 s2, v6
@@ -46,6 +48,7 @@ define amdgpu_ps void @tensor_load_to_lds_vector(<4 x i32> %D0, <8 x i32> %D1, <
 ;
 ; GFX1250-GISEL-LABEL: tensor_load_to_lds_vector:
 ; GFX1250-GISEL:       ; %bb.0: ; %entry
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX1250-GISEL-NEXT:    v_readfirstlane_b32 s9, v1
 ; GFX1250-GISEL-NEXT:    v_readfirstlane_b32 s10, v2
@@ -78,6 +81,7 @@ entry:
 define amdgpu_ps void @tensor_load_to_lds_d2(<4 x i32> inreg %D0, <8 x i32> inreg %D1) {
 ; GFX1250-LABEL: tensor_load_to_lds_d2:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    tensor_load_to_lds s[0:3], s[4:11] th:TH_LOAD_BYPASS scope:SCOPE_SYS
 ; GFX1250-NEXT:    s_endpgm
 entry:
@@ -88,6 +92,7 @@ entry:
 define amdgpu_ps void @tensor_load_to_lds_d2_vector(<4 x i32> %D0, <8 x i32> %D1) {
 ; GFX1250-SDAG-LABEL: tensor_load_to_lds_d2_vector:
 ; GFX1250-SDAG:       ; %bb.0: ; %entry
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_readfirstlane_b32 s0, v4
 ; GFX1250-SDAG-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX1250-SDAG-NEXT:    v_readfirstlane_b32 s9, v1
@@ -106,6 +111,7 @@ define amdgpu_ps void @tensor_load_to_lds_d2_vector(<4 x i32> %D0, <8 x i32> %D1
 ;
 ; GFX1250-GISEL-LABEL: tensor_load_to_lds_d2_vector:
 ; GFX1250-GISEL:       ; %bb.0: ; %entry
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX1250-GISEL-NEXT:    v_readfirstlane_b32 s9, v1
 ; GFX1250-GISEL-NEXT:    v_readfirstlane_b32 s10, v2
@@ -129,6 +135,7 @@ entry:
 define amdgpu_ps void @tensor_store_from_lds(<4 x i32> inreg %D0, <8 x i32> inreg %D1, <4 x i32> inreg %D2, <4 x i32> inreg %D3) {
 ; GFX1250-LABEL: tensor_store_from_lds:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    tensor_store_from_lds s[0:3], s[4:11], s[12:15], s[16:19] th:TH_STORE_NT_HT scope:SCOPE_DEV
 ; GFX1250-NEXT:    s_endpgm
 entry:
@@ -139,6 +146,7 @@ entry:
 define amdgpu_ps void @tensor_store_from_lds_vector(<4 x i32> %D0, <8 x i32> %D1, <4 x i32> %D2, <4 x i32> %D3) {
 ; GFX1250-SDAG-LABEL: tensor_store_from_lds_vector:
 ; GFX1250-SDAG:       ; %bb.0: ; %entry
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_readfirstlane_b32 s0, v4
 ; GFX1250-SDAG-NEXT:    v_readfirstlane_b32 s1, v5
 ; GFX1250-SDAG-NEXT:    v_readfirstlane_b32 s2, v6
@@ -165,6 +173,7 @@ define amdgpu_ps void @tensor_store_from_lds_vector(<4 x i32> %D0, <8 x i32> %D1
 ;
 ; GFX1250-GISEL-LABEL: tensor_store_from_lds_vector:
 ; GFX1250-GISEL:       ; %bb.0: ; %entry
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX1250-GISEL-NEXT:    v_readfirstlane_b32 s9, v1
 ; GFX1250-GISEL-NEXT:    v_readfirstlane_b32 s10, v2
@@ -196,6 +205,7 @@ entry:
 define amdgpu_ps void @tensor_store_from_lds_d2(<4 x i32> inreg %D0, <8 x i32> inreg %D1) {
 ; GFX1250-LABEL: tensor_store_from_lds_d2:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    tensor_store_from_lds s[0:3], s[4:11]
 ; GFX1250-NEXT:    s_endpgm
 entry:
@@ -206,6 +216,7 @@ entry:
 define amdgpu_ps void @tensor_store_from_lds_d2_vector(<4 x i32> %D0, <8 x i32> %D1) {
 ; GFX1250-SDAG-LABEL: tensor_store_from_lds_d2_vector:
 ; GFX1250-SDAG:       ; %bb.0: ; %entry
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_readfirstlane_b32 s0, v4
 ; GFX1250-SDAG-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX1250-SDAG-NEXT:    v_readfirstlane_b32 s9, v1
@@ -224,6 +235,7 @@ define amdgpu_ps void @tensor_store_from_lds_d2_vector(<4 x i32> %D0, <8 x i32>
 ;
 ; GFX1250-GISEL-LABEL: tensor_store_from_lds_d2_vector:
 ; GFX1250-GISEL:       ; %bb.0: ; %entry
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_readfirstlane_b32 s8, v0
 ; GFX1250-GISEL-NEXT:    v_readfirstlane_b32 s9, v1
 ; GFX1250-GISEL-NEXT:    v_readfirstlane_b32 s10, v2

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.id.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.id.ll
index c597693d5a5f9..c95db198da4e7 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.id.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wave.id.ll
@@ -29,6 +29,7 @@ define amdgpu_cs void @test_wave_id(ptr addrspace(1) %out) {
 ;
 ; GFX1250-LABEL: test_wave_id:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_bfe_u32 s0, ttmp8, 0x50019
 ; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1250-NEXT:    v_mov_b32_e32 v2, s0

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.gfx1250.w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.gfx1250.w32.ll
index b6c930dacf9ab..1150578a5ae92 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.gfx1250.w32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.gfx1250.w32.ll
@@ -5,6 +5,7 @@
 define amdgpu_ps void @test_wmma_f32_16x16x4_f32(<2 x float> %A, <2 x float> %B, <8 x float> %C, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f32_16x16x4_f32:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_f32_16x16x4_f32 v[4:11], v[0:1], v[2:3], v[4:11] matrix_b_reuse
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
@@ -13,6 +14,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x4_f32(<2 x float> %A, <2 x float> %B,
 ;
 ; GISEL-LABEL: test_wmma_f32_16x16x4_f32:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_f32_16x16x4_f32 v[4:11], v[0:1], v[2:3], v[4:11] matrix_b_reuse
 ; GISEL-NEXT:    s_clause 0x1
 ; GISEL-NEXT:    global_store_b128 v[12:13], v[4:7], off
@@ -27,6 +29,7 @@ bb:
 define amdgpu_ps void @test_wmma_f32_16x16x32_bf16(<16 x bfloat> %A, <16 x bfloat> %B, <8 x float> %C, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f32_16x16x32_bf16:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_f32_16x16x32_bf16 v[16:23], v[0:7], v[8:15], v[16:23] matrix_b_reuse
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[24:25], v[20:23], off offset:16
@@ -35,6 +38,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x32_bf16(<16 x bfloat> %A, <16 x bfloa
 ;
 ; GISEL-LABEL: test_wmma_f32_16x16x32_bf16:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_f32_16x16x32_bf16 v[16:23], v[0:7], v[8:15], v[16:23] matrix_b_reuse
 ; GISEL-NEXT:    s_clause 0x1
 ; GISEL-NEXT:    global_store_b128 v[24:25], v[20:23], off offset:16
@@ -49,12 +53,14 @@ bb:
 define amdgpu_ps void @test_wmma_bf16_16x16x32_bf16(<16 x bfloat> %A, <16 x bfloat> %B, <8 x bfloat> %C, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_bf16_16x16x32_bf16:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_bf16_16x16x32_bf16 v[16:19], v[0:7], v[8:15], v[16:19] matrix_b_reuse
 ; GFX1250-NEXT:    global_store_b128 v[20:21], v[16:19], off
 ; GFX1250-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: test_wmma_bf16_16x16x32_bf16:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_bf16_16x16x32_bf16 v[16:19], v[0:7], v[8:15], v[16:19] matrix_b_reuse
 ; GISEL-NEXT:    global_store_b128 v[20:21], v[16:19], off
 ; GISEL-NEXT:    s_endpgm
@@ -67,12 +73,14 @@ bb:
 define amdgpu_ps void @test_wmma_bf16f32_16x16x32_bf16(<16 x bfloat> %A, <16 x bfloat> %B, <8 x float> %C, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_bf16f32_16x16x32_bf16:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_bf16f32_16x16x32_bf16 v[26:29], v[0:7], v[8:15], v[16:23] matrix_b_reuse
 ; GFX1250-NEXT:    global_store_b128 v[24:25], v[26:29], off
 ; GFX1250-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: test_wmma_bf16f32_16x16x32_bf16:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_bf16f32_16x16x32_bf16 v[26:29], v[0:7], v[8:15], v[16:23] matrix_b_reuse
 ; GISEL-NEXT:    global_store_b128 v[24:25], v[26:29], off
 ; GISEL-NEXT:    s_endpgm
@@ -85,6 +93,7 @@ bb:
 define amdgpu_ps void @test_wmma_f32_16x16x64_fp8_fp8(<8 x i32> %A, <8 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f32_16x16x64_fp8_fp8:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_f32_16x16x64_fp8_fp8 v[16:23], v[0:7], v[8:15], v[16:23] matrix_b_reuse
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[24:25], v[20:23], off offset:16
@@ -93,6 +102,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x64_fp8_fp8(<8 x i32> %A, <8 x i32> %B
 ;
 ; GISEL-LABEL: test_wmma_f32_16x16x64_fp8_fp8:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_f32_16x16x64_fp8_fp8 v[16:23], v[0:7], v[8:15], v[16:23] matrix_b_reuse
 ; GISEL-NEXT:    s_clause 0x1
 ; GISEL-NEXT:    global_store_b128 v[24:25], v[16:19], off
@@ -107,6 +117,7 @@ bb:
 define amdgpu_ps void @test_wmma_f32_16x16x64_fp8_bf8(<8 x i32> %A, <8 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f32_16x16x64_fp8_bf8:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_f32_16x16x64_fp8_bf8 v[16:23], v[0:7], v[8:15], v[16:23] matrix_b_reuse
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[24:25], v[20:23], off offset:16
@@ -115,6 +126,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x64_fp8_bf8(<8 x i32> %A, <8 x i32> %B
 ;
 ; GISEL-LABEL: test_wmma_f32_16x16x64_fp8_bf8:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_f32_16x16x64_fp8_bf8 v[16:23], v[0:7], v[8:15], v[16:23] matrix_b_reuse
 ; GISEL-NEXT:    s_clause 0x1
 ; GISEL-NEXT:    global_store_b128 v[24:25], v[16:19], off
@@ -129,6 +141,7 @@ bb:
 define amdgpu_ps void @test_wmma_f32_16x16x64_bf8_fp8(<8 x i32> %A, <8 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f32_16x16x64_bf8_fp8:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_f32_16x16x64_bf8_fp8 v[16:23], v[0:7], v[8:15], v[16:23] matrix_b_reuse
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[24:25], v[20:23], off offset:16
@@ -137,6 +150,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x64_bf8_fp8(<8 x i32> %A, <8 x i32> %B
 ;
 ; GISEL-LABEL: test_wmma_f32_16x16x64_bf8_fp8:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_f32_16x16x64_bf8_fp8 v[16:23], v[0:7], v[8:15], v[16:23] matrix_b_reuse
 ; GISEL-NEXT:    s_clause 0x1
 ; GISEL-NEXT:    global_store_b128 v[24:25], v[16:19], off
@@ -151,6 +165,7 @@ bb:
 define amdgpu_ps void @test_wmma_f32_16x16x64_bf8_bf8(<8 x i32> %A, <8 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f32_16x16x64_bf8_bf8:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_f32_16x16x64_bf8_bf8 v[16:23], v[0:7], v[8:15], v[16:23] matrix_b_reuse
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[24:25], v[20:23], off offset:16
@@ -159,6 +174,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x64_bf8_bf8(<8 x i32> %A, <8 x i32> %B
 ;
 ; GISEL-LABEL: test_wmma_f32_16x16x64_bf8_bf8:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_f32_16x16x64_bf8_bf8 v[16:23], v[0:7], v[8:15], v[16:23] matrix_b_reuse
 ; GISEL-NEXT:    s_clause 0x1
 ; GISEL-NEXT:    global_store_b128 v[24:25], v[16:19], off
@@ -173,12 +189,14 @@ bb:
 define amdgpu_ps void @test_wmma_f16_16x16x64_fp8_fp8(<8 x i32> %A, <8 x i32> %B, <8 x half> %C, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f16_16x16x64_fp8_fp8:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_f16_16x16x64_fp8_fp8 v[16:19], v[0:7], v[8:15], v[16:19] matrix_b_reuse
 ; GFX1250-NEXT:    global_store_b128 v[20:21], v[16:19], off
 ; GFX1250-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: test_wmma_f16_16x16x64_fp8_fp8:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_f16_16x16x64_fp8_fp8 v[16:19], v[0:7], v[8:15], v[16:19] matrix_b_reuse
 ; GISEL-NEXT:    global_store_b128 v[20:21], v[16:19], off
 ; GISEL-NEXT:    s_endpgm
@@ -191,12 +209,14 @@ bb:
 define amdgpu_ps void @test_wmma_f16_16x16x64_fp8_bf8(<8 x i32> %A, <8 x i32> %B, <8 x half> %C, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f16_16x16x64_fp8_bf8:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_f16_16x16x64_fp8_bf8 v[16:19], v[0:7], v[8:15], v[16:19] matrix_b_reuse
 ; GFX1250-NEXT:    global_store_b128 v[20:21], v[16:19], off
 ; GFX1250-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: test_wmma_f16_16x16x64_fp8_bf8:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_f16_16x16x64_fp8_bf8 v[16:19], v[0:7], v[8:15], v[16:19] matrix_b_reuse
 ; GISEL-NEXT:    global_store_b128 v[20:21], v[16:19], off
 ; GISEL-NEXT:    s_endpgm
@@ -209,12 +229,14 @@ bb:
 define amdgpu_ps void @test_wmma_f16_16x16x64_bf8_fp8(<8 x i32> %A, <8 x i32> %B, <8 x half> %C, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f16_16x16x64_bf8_fp8:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_f16_16x16x64_bf8_fp8 v[16:19], v[0:7], v[8:15], v[16:19] matrix_b_reuse
 ; GFX1250-NEXT:    global_store_b128 v[20:21], v[16:19], off
 ; GFX1250-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: test_wmma_f16_16x16x64_bf8_fp8:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_f16_16x16x64_bf8_fp8 v[16:19], v[0:7], v[8:15], v[16:19] matrix_b_reuse
 ; GISEL-NEXT:    global_store_b128 v[20:21], v[16:19], off
 ; GISEL-NEXT:    s_endpgm
@@ -227,12 +249,14 @@ bb:
 define amdgpu_ps void @test_wmma_f16_16x16x64_bf8_bf8(<8 x i32> %A, <8 x i32> %B, <8 x half> %C, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f16_16x16x64_bf8_bf8:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_f16_16x16x64_bf8_bf8 v[16:19], v[0:7], v[8:15], v[16:19] matrix_b_reuse
 ; GFX1250-NEXT:    global_store_b128 v[20:21], v[16:19], off
 ; GFX1250-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: test_wmma_f16_16x16x64_bf8_bf8:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_f16_16x16x64_bf8_bf8 v[16:19], v[0:7], v[8:15], v[16:19] matrix_b_reuse
 ; GISEL-NEXT:    global_store_b128 v[20:21], v[16:19], off
 ; GISEL-NEXT:    s_endpgm
@@ -245,6 +269,7 @@ bb:
 define amdgpu_ps void @test_wmma_i32_16x16x64_iu8(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_i32_16x16x64_iu8:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_i32_16x16x64_iu8 v[16:23], v[0:7], v[8:15], v[16:23] matrix_b_reuse
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[24:25], v[20:23], off offset:16
@@ -253,6 +278,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x64_iu8(<8 x i32> %A, <8 x i32> %B, <8
 ;
 ; GISEL-LABEL: test_wmma_i32_16x16x64_iu8:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_i32_16x16x64_iu8 v[16:23], v[0:7], v[8:15], v[16:23] matrix_b_reuse
 ; GISEL-NEXT:    s_clause 0x1
 ; GISEL-NEXT:    global_store_b128 v[24:25], v[16:19], off
@@ -267,6 +293,7 @@ bb:
 define amdgpu_ps void @test_wmma_f32_16x16x32_f16(<16 x half> %A, <16 x half> %B, <8 x float> %C, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f32_16x16x32_f16:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_f32_16x16x32_f16 v[16:23], v[0:7], v[8:15], v[16:23] matrix_b_reuse
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[24:25], v[20:23], off offset:16
@@ -275,6 +302,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x32_f16(<16 x half> %A, <16 x half> %B
 ;
 ; GISEL-LABEL: test_wmma_f32_16x16x32_f16:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_f32_16x16x32_f16 v[16:23], v[0:7], v[8:15], v[16:23] matrix_b_reuse
 ; GISEL-NEXT:    s_clause 0x1
 ; GISEL-NEXT:    global_store_b128 v[24:25], v[16:19], off
@@ -289,12 +317,14 @@ bb:
 define amdgpu_ps void @test_wmma_f16_16x16x32_f16(<16 x half> %A, <16 x half> %B, <8 x half> %C, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f16_16x16x32_f16:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_f16_16x16x32_f16 v[16:19], v[0:7], v[8:15], v[16:19] matrix_b_reuse
 ; GFX1250-NEXT:    global_store_b128 v[20:21], v[16:19], off
 ; GFX1250-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: test_wmma_f16_16x16x32_f16:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_f16_16x16x32_f16 v[16:19], v[0:7], v[8:15], v[16:19] matrix_b_reuse
 ; GISEL-NEXT:    global_store_b128 v[20:21], v[16:19], off
 ; GISEL-NEXT:    s_endpgm
@@ -307,6 +337,7 @@ bb:
 define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[40:41], v[36:39], off offset:16
@@ -315,6 +346,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4(<16 x i32> %A, <16 x i32>
 ;
 ; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6
 ; GISEL-NEXT:    s_clause 0x1
 ; GISEL-NEXT:    global_store_b128 v[40:41], v[32:35], off
@@ -329,6 +361,7 @@ bb:
 define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_fp8_bf8(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp8_bf8:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39] matrix_b_fmt:MATRIX_FMT_BF8
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[40:41], v[36:39], off offset:16
@@ -337,6 +370,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_fp8_bf8(<16 x i32> %A, <16
 ;
 ; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp8_bf8:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39] matrix_b_fmt:MATRIX_FMT_BF8
 ; GISEL-NEXT:    s_clause 0x1
 ; GISEL-NEXT:    global_store_b128 v[40:41], v[32:35], off
@@ -351,6 +385,7 @@ bb:
 define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_fp8_fp6(<16 x i32> %A, <12 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp8_fp6:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_f32_16x16x128_f8f6f4 v[28:35], v[0:15], v[16:27], v[28:35] matrix_b_fmt:MATRIX_FMT_FP6
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[36:37], v[32:35], off offset:16
@@ -359,6 +394,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_fp8_fp6(<16 x i32> %A, <12
 ;
 ; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp8_fp6:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_f32_16x16x128_f8f6f4 v[28:35], v[0:15], v[16:27], v[28:35] matrix_b_fmt:MATRIX_FMT_FP6
 ; GISEL-NEXT:    s_clause 0x1
 ; GISEL-NEXT:    global_store_b128 v[36:37], v[28:31], off
@@ -373,6 +409,7 @@ bb:
 define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_fp8_bf6(<16 x i32> %A, <12 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp8_bf6:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_f32_16x16x128_f8f6f4 v[28:35], v[0:15], v[16:27], v[28:35] matrix_b_fmt:MATRIX_FMT_BF6
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[36:37], v[32:35], off offset:16
@@ -381,6 +418,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_fp8_bf6(<16 x i32> %A, <12
 ;
 ; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp8_bf6:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_f32_16x16x128_f8f6f4 v[28:35], v[0:15], v[16:27], v[28:35] matrix_b_fmt:MATRIX_FMT_BF6
 ; GISEL-NEXT:    s_clause 0x1
 ; GISEL-NEXT:    global_store_b128 v[36:37], v[28:31], off
@@ -395,6 +433,7 @@ bb:
 define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_fp8_fp4(<16 x i32> %A, <8 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp8_fp4:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_f32_16x16x128_f8f6f4 v[24:31], v[0:15], v[16:23], v[24:31] matrix_b_fmt:MATRIX_FMT_FP4
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[32:33], v[28:31], off offset:16
@@ -403,6 +442,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_fp8_fp4(<16 x i32> %A, <8
 ;
 ; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp8_fp4:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_f32_16x16x128_f8f6f4 v[24:31], v[0:15], v[16:23], v[24:31] matrix_b_fmt:MATRIX_FMT_FP4
 ; GISEL-NEXT:    s_clause 0x1
 ; GISEL-NEXT:    global_store_b128 v[32:33], v[24:27], off
@@ -417,6 +457,7 @@ bb:
 define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_bf8_fp8(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_bf8_fp8:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39] matrix_a_fmt:MATRIX_FMT_BF8
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[40:41], v[36:39], off offset:16
@@ -425,6 +466,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_bf8_fp8(<16 x i32> %A, <16
 ;
 ; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_bf8_fp8:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39] matrix_a_fmt:MATRIX_FMT_BF8
 ; GISEL-NEXT:    s_clause 0x1
 ; GISEL-NEXT:    global_store_b128 v[40:41], v[32:35], off
@@ -439,6 +481,7 @@ bb:
 define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_bf8_bf8(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_bf8_bf8:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_BF8
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[40:41], v[36:39], off offset:16
@@ -447,6 +490,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_bf8_bf8(<16 x i32> %A, <16
 ;
 ; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_bf8_bf8:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_BF8
 ; GISEL-NEXT:    s_clause 0x1
 ; GISEL-NEXT:    global_store_b128 v[40:41], v[32:35], off
@@ -461,6 +505,7 @@ bb:
 define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_bf8_fp6(<16 x i32> %A, <12 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_bf8_fp6:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_f32_16x16x128_f8f6f4 v[28:35], v[0:15], v[16:27], v[28:35] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[36:37], v[32:35], off offset:16
@@ -469,6 +514,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_bf8_fp6(<16 x i32> %A, <12
 ;
 ; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_bf8_fp6:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_f32_16x16x128_f8f6f4 v[28:35], v[0:15], v[16:27], v[28:35] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6
 ; GISEL-NEXT:    s_clause 0x1
 ; GISEL-NEXT:    global_store_b128 v[36:37], v[28:31], off
@@ -483,6 +529,7 @@ bb:
 define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_bf8_bf6(<16 x i32> %A, <12 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_bf8_bf6:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_f32_16x16x128_f8f6f4 v[28:35], v[0:15], v[16:27], v[28:35] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_BF6
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[36:37], v[32:35], off offset:16
@@ -491,6 +538,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_bf8_bf6(<16 x i32> %A, <12
 ;
 ; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_bf8_bf6:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_f32_16x16x128_f8f6f4 v[28:35], v[0:15], v[16:27], v[28:35] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_BF6
 ; GISEL-NEXT:    s_clause 0x1
 ; GISEL-NEXT:    global_store_b128 v[36:37], v[28:31], off
@@ -505,6 +553,7 @@ bb:
 define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_bf8_fp4(<16 x i32> %A, <8 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_bf8_fp4:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_f32_16x16x128_f8f6f4 v[24:31], v[0:15], v[16:23], v[24:31] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP4
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[32:33], v[28:31], off offset:16
@@ -513,6 +562,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_bf8_fp4(<16 x i32> %A, <8
 ;
 ; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_bf8_fp4:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_f32_16x16x128_f8f6f4 v[24:31], v[0:15], v[16:23], v[24:31] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP4
 ; GISEL-NEXT:    s_clause 0x1
 ; GISEL-NEXT:    global_store_b128 v[32:33], v[24:27], off
@@ -527,6 +577,7 @@ bb:
 define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_fp6_fp8(<12 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp6_fp8:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_f32_16x16x128_f8f6f4 v[28:35], v[0:11], v[12:27], v[28:35] matrix_a_fmt:MATRIX_FMT_FP6
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[36:37], v[32:35], off offset:16
@@ -535,6 +586,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_fp6_fp8(<12 x i32> %A, <16
 ;
 ; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp6_fp8:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_f32_16x16x128_f8f6f4 v[28:35], v[0:11], v[12:27], v[28:35] matrix_a_fmt:MATRIX_FMT_FP6
 ; GISEL-NEXT:    s_clause 0x1
 ; GISEL-NEXT:    global_store_b128 v[36:37], v[28:31], off
@@ -549,6 +601,7 @@ bb:
 define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_fp6_bf8(<12 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp6_bf8:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_f32_16x16x128_f8f6f4 v[28:35], v[0:11], v[12:27], v[28:35] matrix_a_fmt:MATRIX_FMT_FP6 matrix_b_fmt:MATRIX_FMT_BF8
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[36:37], v[32:35], off offset:16
@@ -557,6 +610,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_fp6_bf8(<12 x i32> %A, <16
 ;
 ; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp6_bf8:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_f32_16x16x128_f8f6f4 v[28:35], v[0:11], v[12:27], v[28:35] matrix_a_fmt:MATRIX_FMT_FP6 matrix_b_fmt:MATRIX_FMT_BF8
 ; GISEL-NEXT:    s_clause 0x1
 ; GISEL-NEXT:    global_store_b128 v[36:37], v[28:31], off
@@ -571,6 +625,7 @@ bb:
 define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_fp6_fp6(<12 x i32> %A, <12 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp6_fp6:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_f32_16x16x128_f8f6f4 v[24:31], v[0:11], v[12:23], v[24:31] matrix_a_fmt:MATRIX_FMT_FP6 matrix_b_fmt:MATRIX_FMT_FP6
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[32:33], v[28:31], off offset:16
@@ -579,6 +634,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_fp6_fp6(<12 x i32> %A, <12
 ;
 ; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp6_fp6:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_f32_16x16x128_f8f6f4 v[24:31], v[0:11], v[12:23], v[24:31] matrix_a_fmt:MATRIX_FMT_FP6 matrix_b_fmt:MATRIX_FMT_FP6
 ; GISEL-NEXT:    s_clause 0x1
 ; GISEL-NEXT:    global_store_b128 v[32:33], v[24:27], off
@@ -593,6 +649,7 @@ bb:
 define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_fp6_bf6(<12 x i32> %A, <12 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp6_bf6:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_f32_16x16x128_f8f6f4 v[24:31], v[0:11], v[12:23], v[24:31] matrix_a_fmt:MATRIX_FMT_FP6 matrix_b_fmt:MATRIX_FMT_FP4
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[32:33], v[28:31], off offset:16
@@ -601,6 +658,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_fp6_bf6(<12 x i32> %A, <12
 ;
 ; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp6_bf6:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_f32_16x16x128_f8f6f4 v[24:31], v[0:11], v[12:23], v[24:31] matrix_a_fmt:MATRIX_FMT_FP6 matrix_b_fmt:MATRIX_FMT_FP4
 ; GISEL-NEXT:    s_clause 0x1
 ; GISEL-NEXT:    global_store_b128 v[32:33], v[24:27], off
@@ -615,6 +673,7 @@ bb:
 define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_fp6_fp4(<12 x i32> %A, <8 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp6_fp4:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_f32_16x16x128_f8f6f4 v[20:27], v[0:11], v[12:19], v[20:27] matrix_a_fmt:MATRIX_FMT_FP6 matrix_b_fmt:MATRIX_FMT_FP4
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[28:29], v[24:27], off offset:16
@@ -623,6 +682,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_fp6_fp4(<12 x i32> %A, <8
 ;
 ; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp6_fp4:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_f32_16x16x128_f8f6f4 v[20:27], v[0:11], v[12:19], v[20:27] matrix_a_fmt:MATRIX_FMT_FP6 matrix_b_fmt:MATRIX_FMT_FP4
 ; GISEL-NEXT:    s_clause 0x1
 ; GISEL-NEXT:    global_store_b128 v[28:29], v[20:23], off
@@ -637,6 +697,7 @@ bb:
 define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_bf6_fp8(<12 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_bf6_fp8:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_f32_16x16x128_f8f6f4 v[28:35], v[0:11], v[12:27], v[28:35] matrix_a_fmt:MATRIX_FMT_BF6
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[36:37], v[32:35], off offset:16
@@ -645,6 +706,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_bf6_fp8(<12 x i32> %A, <16
 ;
 ; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_bf6_fp8:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_f32_16x16x128_f8f6f4 v[28:35], v[0:11], v[12:27], v[28:35] matrix_a_fmt:MATRIX_FMT_BF6
 ; GISEL-NEXT:    s_clause 0x1
 ; GISEL-NEXT:    global_store_b128 v[36:37], v[28:31], off
@@ -659,6 +721,7 @@ bb:
 define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_bf6_bf8(<12 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_bf6_bf8:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_f32_16x16x128_f8f6f4 v[28:35], v[0:11], v[12:27], v[28:35] matrix_a_fmt:MATRIX_FMT_BF6 matrix_b_fmt:MATRIX_FMT_BF8
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[36:37], v[32:35], off offset:16
@@ -667,6 +730,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_bf6_bf8(<12 x i32> %A, <16
 ;
 ; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_bf6_bf8:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_f32_16x16x128_f8f6f4 v[28:35], v[0:11], v[12:27], v[28:35] matrix_a_fmt:MATRIX_FMT_BF6 matrix_b_fmt:MATRIX_FMT_BF8
 ; GISEL-NEXT:    s_clause 0x1
 ; GISEL-NEXT:    global_store_b128 v[36:37], v[28:31], off
@@ -681,6 +745,7 @@ bb:
 define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_bf6_fp6(<12 x i32> %A, <12 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_bf6_fp6:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_f32_16x16x128_f8f6f4 v[24:31], v[0:11], v[12:23], v[24:31] matrix_a_fmt:MATRIX_FMT_BF6 matrix_b_fmt:MATRIX_FMT_FP6
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[32:33], v[28:31], off offset:16
@@ -689,6 +754,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_bf6_fp6(<12 x i32> %A, <12
 ;
 ; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_bf6_fp6:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_f32_16x16x128_f8f6f4 v[24:31], v[0:11], v[12:23], v[24:31] matrix_a_fmt:MATRIX_FMT_BF6 matrix_b_fmt:MATRIX_FMT_FP6
 ; GISEL-NEXT:    s_clause 0x1
 ; GISEL-NEXT:    global_store_b128 v[32:33], v[24:27], off
@@ -703,6 +769,7 @@ bb:
 define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_bf6_bf6(<12 x i32> %A, <12 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_bf6_bf6:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_f32_16x16x128_f8f6f4 v[24:31], v[0:11], v[12:23], v[24:31] matrix_a_fmt:MATRIX_FMT_BF6 matrix_b_fmt:MATRIX_FMT_FP4
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[32:33], v[28:31], off offset:16
@@ -711,6 +778,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_bf6_bf6(<12 x i32> %A, <12
 ;
 ; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_bf6_bf6:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_f32_16x16x128_f8f6f4 v[24:31], v[0:11], v[12:23], v[24:31] matrix_a_fmt:MATRIX_FMT_BF6 matrix_b_fmt:MATRIX_FMT_FP4
 ; GISEL-NEXT:    s_clause 0x1
 ; GISEL-NEXT:    global_store_b128 v[32:33], v[24:27], off
@@ -725,6 +793,7 @@ bb:
 define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_bf6_fp4(<12 x i32> %A, <8 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_bf6_fp4:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_f32_16x16x128_f8f6f4 v[20:27], v[0:11], v[12:19], v[20:27] matrix_a_fmt:MATRIX_FMT_BF6 matrix_b_fmt:MATRIX_FMT_FP4
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[28:29], v[24:27], off offset:16
@@ -733,6 +802,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_bf6_fp4(<12 x i32> %A, <8
 ;
 ; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_bf6_fp4:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_f32_16x16x128_f8f6f4 v[20:27], v[0:11], v[12:19], v[20:27] matrix_a_fmt:MATRIX_FMT_BF6 matrix_b_fmt:MATRIX_FMT_FP4
 ; GISEL-NEXT:    s_clause 0x1
 ; GISEL-NEXT:    global_store_b128 v[28:29], v[20:23], off
@@ -747,6 +817,7 @@ bb:
 define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_fp4_fp8(<8 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp4_fp8:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_f32_16x16x128_f8f6f4 v[24:31], v[0:7], v[8:23], v[24:31] matrix_a_fmt:MATRIX_FMT_FP4
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[32:33], v[28:31], off offset:16
@@ -755,6 +826,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_fp4_fp8(<8 x i32> %A, <16
 ;
 ; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp4_fp8:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_f32_16x16x128_f8f6f4 v[24:31], v[0:7], v[8:23], v[24:31] matrix_a_fmt:MATRIX_FMT_FP4
 ; GISEL-NEXT:    s_clause 0x1
 ; GISEL-NEXT:    global_store_b128 v[32:33], v[24:27], off
@@ -769,6 +841,7 @@ bb:
 define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_fp4_bf8(<8 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp4_bf8:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_f32_16x16x128_f8f6f4 v[24:31], v[0:7], v[8:23], v[24:31] matrix_a_fmt:MATRIX_FMT_FP4 matrix_b_fmt:MATRIX_FMT_BF8
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[32:33], v[28:31], off offset:16
@@ -777,6 +850,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_fp4_bf8(<8 x i32> %A, <16
 ;
 ; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp4_bf8:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_f32_16x16x128_f8f6f4 v[24:31], v[0:7], v[8:23], v[24:31] matrix_a_fmt:MATRIX_FMT_FP4 matrix_b_fmt:MATRIX_FMT_BF8
 ; GISEL-NEXT:    s_clause 0x1
 ; GISEL-NEXT:    global_store_b128 v[32:33], v[24:27], off
@@ -791,6 +865,7 @@ bb:
 define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_fp4_fp6(<8 x i32> %A, <12 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp4_fp6:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_f32_16x16x128_f8f6f4 v[20:27], v[0:7], v[8:19], v[20:27] matrix_a_fmt:MATRIX_FMT_FP4 matrix_b_fmt:MATRIX_FMT_FP6
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[28:29], v[24:27], off offset:16
@@ -799,6 +874,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_fp4_fp6(<8 x i32> %A, <12
 ;
 ; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp4_fp6:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_f32_16x16x128_f8f6f4 v[20:27], v[0:7], v[8:19], v[20:27] matrix_a_fmt:MATRIX_FMT_FP4 matrix_b_fmt:MATRIX_FMT_FP6
 ; GISEL-NEXT:    s_clause 0x1
 ; GISEL-NEXT:    global_store_b128 v[28:29], v[20:23], off
@@ -813,6 +889,7 @@ bb:
 define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_fp4_bf6(<8 x i32> %A, <12 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp4_bf6:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_f32_16x16x128_f8f6f4 v[20:27], v[0:7], v[8:19], v[20:27] matrix_a_fmt:MATRIX_FMT_FP4 matrix_b_fmt:MATRIX_FMT_FP4
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[28:29], v[24:27], off offset:16
@@ -821,6 +898,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_fp4_bf6(<8 x i32> %A, <12
 ;
 ; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp4_bf6:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_f32_16x16x128_f8f6f4 v[20:27], v[0:7], v[8:19], v[20:27] matrix_a_fmt:MATRIX_FMT_FP4 matrix_b_fmt:MATRIX_FMT_FP4
 ; GISEL-NEXT:    s_clause 0x1
 ; GISEL-NEXT:    global_store_b128 v[28:29], v[20:23], off
@@ -835,6 +913,7 @@ bb:
 define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_fp4_fp4(<8 x i32> %A, <8 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp4_fp4:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_f32_16x16x128_f8f6f4 v[16:23], v[0:7], v[8:15], v[16:23] matrix_a_fmt:MATRIX_FMT_FP4 matrix_b_fmt:MATRIX_FMT_FP4
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[24:25], v[20:23], off offset:16
@@ -843,6 +922,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_fp4_fp4(<8 x i32> %A, <8 x
 ;
 ; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_fp4_fp4:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_f32_16x16x128_f8f6f4 v[16:23], v[0:7], v[8:15], v[16:23] matrix_a_fmt:MATRIX_FMT_FP4 matrix_b_fmt:MATRIX_FMT_FP4
 ; GISEL-NEXT:    s_clause 0x1
 ; GISEL-NEXT:    global_store_b128 v[24:25], v[16:19], off
@@ -857,6 +937,7 @@ bb:
 define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, i32 %scale_src0, i32 %scale_src1, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_scale_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39], v40, v41 matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[42:43], v[36:39], off offset:16
@@ -865,6 +946,7 @@ define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4(<16 x i32> %A, <16 x
 ;
 ; GISEL-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_scale_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39], v40, v41 matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1
 ; GISEL-NEXT:    s_clause 0x1
 ; GISEL-NEXT:    global_store_b128 v[42:43], v[32:35], off
@@ -879,6 +961,7 @@ bb:
 define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_ss(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, i32 inreg %scale_src0, i32 inreg %scale_src1, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_ss:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_scale_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39], s0, s1 matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 matrix_a_reuse
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[40:41], v[36:39], off offset:16
@@ -887,6 +970,7 @@ define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_ss(<16 x i32> %A, <1
 ;
 ; GISEL-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_ss:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_scale_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39], s0, s1 matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 matrix_a_reuse
 ; GISEL-NEXT:    s_clause 0x1
 ; GISEL-NEXT:    global_store_b128 v[40:41], v[32:35], off
@@ -901,6 +985,7 @@ bb:
 define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_si_scale(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, i32 inreg %scale_src0, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_si_scale:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v42, 0x64
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-NEXT:    v_wmma_scale_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39], s0, v42 matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 matrix_b_reuse
@@ -911,6 +996,7 @@ define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_si_scale(<16 x i32>
 ;
 ; GISEL-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_si_scale:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_mov_b32_e32 v42, 0x64
 ; GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-NEXT:    v_wmma_scale_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39], s0, v42 matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 matrix_b_reuse
@@ -927,6 +1013,7 @@ bb:
 define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_fp8_bf8(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, i32 %scale_src0, i32 %scale_src1, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_fp8_bf8:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_scale_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39], v40, v41 matrix_b_fmt:MATRIX_FMT_BF8
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[42:43], v[36:39], off offset:16
@@ -935,6 +1022,7 @@ define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_fp8_bf8(<16 x i32> %
 ;
 ; GISEL-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_fp8_bf8:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_scale_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39], v40, v41 matrix_b_fmt:MATRIX_FMT_BF8
 ; GISEL-NEXT:    s_clause 0x1
 ; GISEL-NEXT:    global_store_b128 v[42:43], v[32:35], off
@@ -949,6 +1037,7 @@ bb:
 define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_fp8_fp6(<16 x i32> %A, <12 x i32> %B, <8 x float> %C, i32 %scale_src0, i32 %scale_src1, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_fp8_fp6:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_scale_f32_16x16x128_f8f6f4 v[28:35], v[0:15], v[16:27], v[28:35], v36, v37 matrix_b_fmt:MATRIX_FMT_FP6
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[38:39], v[32:35], off offset:16
@@ -957,6 +1046,7 @@ define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_fp8_fp6(<16 x i32> %
 ;
 ; GISEL-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_fp8_fp6:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_scale_f32_16x16x128_f8f6f4 v[28:35], v[0:15], v[16:27], v[28:35], v36, v37 matrix_b_fmt:MATRIX_FMT_FP6
 ; GISEL-NEXT:    s_clause 0x1
 ; GISEL-NEXT:    global_store_b128 v[38:39], v[28:31], off
@@ -971,6 +1061,7 @@ bb:
 define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_fp8_bf6(<16 x i32> %A, <12 x i32> %B, <8 x float> %C, i32 %scale_src0, i32 %scale_src1, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_fp8_bf6:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_scale_f32_16x16x128_f8f6f4 v[28:35], v[0:15], v[16:27], v[28:35], v36, v37 matrix_b_fmt:MATRIX_FMT_BF6
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[38:39], v[32:35], off offset:16
@@ -979,6 +1070,7 @@ define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_fp8_bf6(<16 x i32> %
 ;
 ; GISEL-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_fp8_bf6:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_scale_f32_16x16x128_f8f6f4 v[28:35], v[0:15], v[16:27], v[28:35], v36, v37 matrix_b_fmt:MATRIX_FMT_BF6
 ; GISEL-NEXT:    s_clause 0x1
 ; GISEL-NEXT:    global_store_b128 v[38:39], v[28:31], off
@@ -993,6 +1085,7 @@ bb:
 define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_fp8_fp4(<16 x i32> %A, <8 x i32> %B, <8 x float> %C, i32 %scale_src0, i32 %scale_src1, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_fp8_fp4:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_scale_f32_16x16x128_f8f6f4 v[24:31], v[0:15], v[16:23], v[24:31], v32, v33 matrix_b_fmt:MATRIX_FMT_FP4
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[34:35], v[28:31], off offset:16
@@ -1001,6 +1094,7 @@ define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_fp8_fp4(<16 x i32> %
 ;
 ; GISEL-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_fp8_fp4:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_scale_f32_16x16x128_f8f6f4 v[24:31], v[0:15], v[16:23], v[24:31], v32, v33 matrix_b_fmt:MATRIX_FMT_FP4
 ; GISEL-NEXT:    s_clause 0x1
 ; GISEL-NEXT:    global_store_b128 v[34:35], v[24:27], off
@@ -1015,6 +1109,7 @@ bb:
 define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_bf8_fp8(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, i32 %scale_src0, i32 %scale_src1, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_bf8_fp8:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_scale_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39], v40, v41 matrix_a_fmt:MATRIX_FMT_BF8
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[42:43], v[36:39], off offset:16
@@ -1023,6 +1118,7 @@ define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_bf8_fp8(<16 x i32> %
 ;
 ; GISEL-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_bf8_fp8:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_scale_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39], v40, v41 matrix_a_fmt:MATRIX_FMT_BF8
 ; GISEL-NEXT:    s_clause 0x1
 ; GISEL-NEXT:    global_store_b128 v[42:43], v[32:35], off
@@ -1037,6 +1133,7 @@ bb:
 define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_bf8_bf8(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, i32 %scale_src0, i32 %scale_src1, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_bf8_bf8:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_scale_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39], v40, v41 matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_BF8
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[42:43], v[36:39], off offset:16
@@ -1045,6 +1142,7 @@ define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_bf8_bf8(<16 x i32> %
 ;
 ; GISEL-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_bf8_bf8:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_scale_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39], v40, v41 matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_BF8
 ; GISEL-NEXT:    s_clause 0x1
 ; GISEL-NEXT:    global_store_b128 v[42:43], v[32:35], off
@@ -1059,6 +1157,7 @@ bb:
 define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_bf8_fp6(<16 x i32> %A, <12 x i32> %B, <8 x float> %C, i32 %scale_src0, i32 %scale_src1, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_bf8_fp6:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_scale_f32_16x16x128_f8f6f4 v[28:35], v[0:15], v[16:27], v[28:35], v36, v37 matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[38:39], v[32:35], off offset:16
@@ -1067,6 +1166,7 @@ define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_bf8_fp6(<16 x i32> %
 ;
 ; GISEL-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_bf8_fp6:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_scale_f32_16x16x128_f8f6f4 v[28:35], v[0:15], v[16:27], v[28:35], v36, v37 matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6
 ; GISEL-NEXT:    s_clause 0x1
 ; GISEL-NEXT:    global_store_b128 v[38:39], v[28:31], off
@@ -1081,6 +1181,7 @@ bb:
 define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_bf8_bf6(<16 x i32> %A, <12 x i32> %B, <8 x float> %C, i32 %scale_src0, i32 %scale_src1, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_bf8_bf6:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_scale_f32_16x16x128_f8f6f4 v[28:35], v[0:15], v[16:27], v[28:35], v36, v37 matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_BF6
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[38:39], v[32:35], off offset:16
@@ -1089,6 +1190,7 @@ define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_bf8_bf6(<16 x i32> %
 ;
 ; GISEL-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_bf8_bf6:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_scale_f32_16x16x128_f8f6f4 v[28:35], v[0:15], v[16:27], v[28:35], v36, v37 matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_BF6
 ; GISEL-NEXT:    s_clause 0x1
 ; GISEL-NEXT:    global_store_b128 v[38:39], v[28:31], off
@@ -1103,6 +1205,7 @@ bb:
 define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_bf8_fp4(<16 x i32> %A, <8 x i32> %B, <8 x float> %C, i32 %scale_src0, i32 %scale_src1, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_bf8_fp4:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_scale_f32_16x16x128_f8f6f4 v[24:31], v[0:15], v[16:23], v[24:31], v32, v33 matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP4
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[34:35], v[28:31], off offset:16
@@ -1111,6 +1214,7 @@ define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_bf8_fp4(<16 x i32> %
 ;
 ; GISEL-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_bf8_fp4:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_scale_f32_16x16x128_f8f6f4 v[24:31], v[0:15], v[16:23], v[24:31], v32, v33 matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP4
 ; GISEL-NEXT:    s_clause 0x1
 ; GISEL-NEXT:    global_store_b128 v[34:35], v[24:27], off
@@ -1125,6 +1229,7 @@ bb:
 define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_fp6_fp8(<12 x i32> %A, <16 x i32> %B, <8 x float> %C, i32 %scale_src0, i32 %scale_src1, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_fp6_fp8:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_scale_f32_16x16x128_f8f6f4 v[28:35], v[0:11], v[12:27], v[28:35], v36, v37 matrix_a_fmt:MATRIX_FMT_FP6
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[38:39], v[32:35], off offset:16
@@ -1133,6 +1238,7 @@ define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_fp6_fp8(<12 x i32> %
 ;
 ; GISEL-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_fp6_fp8:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_scale_f32_16x16x128_f8f6f4 v[28:35], v[0:11], v[12:27], v[28:35], v36, v37 matrix_a_fmt:MATRIX_FMT_FP6
 ; GISEL-NEXT:    s_clause 0x1
 ; GISEL-NEXT:    global_store_b128 v[38:39], v[28:31], off
@@ -1147,6 +1253,7 @@ bb:
 define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_fp6_bf8(<12 x i32> %A, <16 x i32> %B, <8 x float> %C, i32 %scale_src0, i32 %scale_src1, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_fp6_bf8:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_scale_f32_16x16x128_f8f6f4 v[28:35], v[0:11], v[12:27], v[28:35], v36, v37 matrix_a_fmt:MATRIX_FMT_FP6 matrix_b_fmt:MATRIX_FMT_BF8
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[38:39], v[32:35], off offset:16
@@ -1155,6 +1262,7 @@ define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_fp6_bf8(<12 x i32> %
 ;
 ; GISEL-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_fp6_bf8:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_scale_f32_16x16x128_f8f6f4 v[28:35], v[0:11], v[12:27], v[28:35], v36, v37 matrix_a_fmt:MATRIX_FMT_FP6 matrix_b_fmt:MATRIX_FMT_BF8
 ; GISEL-NEXT:    s_clause 0x1
 ; GISEL-NEXT:    global_store_b128 v[38:39], v[28:31], off
@@ -1169,6 +1277,7 @@ bb:
 define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_fp6_fp6(<12 x i32> %A, <12 x i32> %B, <8 x float> %C, i32 %scale_src0, i32 %scale_src1, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_fp6_fp6:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_scale_f32_16x16x128_f8f6f4 v[24:31], v[0:11], v[12:23], v[24:31], v32, v33 matrix_a_fmt:MATRIX_FMT_FP6 matrix_b_fmt:MATRIX_FMT_FP6
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[34:35], v[28:31], off offset:16
@@ -1177,6 +1286,7 @@ define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_fp6_fp6(<12 x i32> %
 ;
 ; GISEL-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_fp6_fp6:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_scale_f32_16x16x128_f8f6f4 v[24:31], v[0:11], v[12:23], v[24:31], v32, v33 matrix_a_fmt:MATRIX_FMT_FP6 matrix_b_fmt:MATRIX_FMT_FP6
 ; GISEL-NEXT:    s_clause 0x1
 ; GISEL-NEXT:    global_store_b128 v[34:35], v[24:27], off
@@ -1191,6 +1301,7 @@ bb:
 define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_fp6_bf6(<12 x i32> %A, <12 x i32> %B, <8 x float> %C, i32 %scale_src0, i32 %scale_src1, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_fp6_bf6:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_scale_f32_16x16x128_f8f6f4 v[24:31], v[0:11], v[12:23], v[24:31], v32, v33 matrix_a_fmt:MATRIX_FMT_FP6 matrix_b_fmt:MATRIX_FMT_FP4
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[34:35], v[28:31], off offset:16
@@ -1199,6 +1310,7 @@ define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_fp6_bf6(<12 x i32> %
 ;
 ; GISEL-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_fp6_bf6:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_scale_f32_16x16x128_f8f6f4 v[24:31], v[0:11], v[12:23], v[24:31], v32, v33 matrix_a_fmt:MATRIX_FMT_FP6 matrix_b_fmt:MATRIX_FMT_FP4
 ; GISEL-NEXT:    s_clause 0x1
 ; GISEL-NEXT:    global_store_b128 v[34:35], v[24:27], off
@@ -1213,6 +1325,7 @@ bb:
 define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_fp6_fp4(<12 x i32> %A, <8 x i32> %B, <8 x float> %C, i32 %scale_src0, i32 %scale_src1, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_fp6_fp4:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_scale_f32_16x16x128_f8f6f4 v[20:27], v[0:11], v[12:19], v[20:27], v28, v29 matrix_a_fmt:MATRIX_FMT_FP6 matrix_b_fmt:MATRIX_FMT_FP4
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[30:31], v[24:27], off offset:16
@@ -1221,6 +1334,7 @@ define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_fp6_fp4(<12 x i32> %
 ;
 ; GISEL-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_fp6_fp4:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_scale_f32_16x16x128_f8f6f4 v[20:27], v[0:11], v[12:19], v[20:27], v28, v29 matrix_a_fmt:MATRIX_FMT_FP6 matrix_b_fmt:MATRIX_FMT_FP4
 ; GISEL-NEXT:    s_clause 0x1
 ; GISEL-NEXT:    global_store_b128 v[30:31], v[20:23], off
@@ -1235,6 +1349,7 @@ bb:
 define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_bf6_fp8(<12 x i32> %A, <16 x i32> %B, <8 x float> %C, i32 %scale_src0, i32 %scale_src1, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_bf6_fp8:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_scale_f32_16x16x128_f8f6f4 v[28:35], v[0:11], v[12:27], v[28:35], v36, v37 matrix_a_fmt:MATRIX_FMT_BF6
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[38:39], v[32:35], off offset:16
@@ -1243,6 +1358,7 @@ define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_bf6_fp8(<12 x i32> %
 ;
 ; GISEL-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_bf6_fp8:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_scale_f32_16x16x128_f8f6f4 v[28:35], v[0:11], v[12:27], v[28:35], v36, v37 matrix_a_fmt:MATRIX_FMT_BF6
 ; GISEL-NEXT:    s_clause 0x1
 ; GISEL-NEXT:    global_store_b128 v[38:39], v[28:31], off
@@ -1257,6 +1373,7 @@ bb:
 define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_bf6_bf8(<12 x i32> %A, <16 x i32> %B, <8 x float> %C, i32 %scale_src0, i32 %scale_src1, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_bf6_bf8:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_scale_f32_16x16x128_f8f6f4 v[28:35], v[0:11], v[12:27], v[28:35], v36, v37 matrix_a_fmt:MATRIX_FMT_BF6 matrix_b_fmt:MATRIX_FMT_BF8
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[38:39], v[32:35], off offset:16
@@ -1265,6 +1382,7 @@ define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_bf6_bf8(<12 x i32> %
 ;
 ; GISEL-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_bf6_bf8:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_scale_f32_16x16x128_f8f6f4 v[28:35], v[0:11], v[12:27], v[28:35], v36, v37 matrix_a_fmt:MATRIX_FMT_BF6 matrix_b_fmt:MATRIX_FMT_BF8
 ; GISEL-NEXT:    s_clause 0x1
 ; GISEL-NEXT:    global_store_b128 v[38:39], v[28:31], off
@@ -1279,6 +1397,7 @@ bb:
 define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_bf6_fp6(<12 x i32> %A, <12 x i32> %B, <8 x float> %C, i32 %scale_src0, i32 %scale_src1, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_bf6_fp6:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_scale_f32_16x16x128_f8f6f4 v[24:31], v[0:11], v[12:23], v[24:31], v32, v33 matrix_a_fmt:MATRIX_FMT_BF6 matrix_b_fmt:MATRIX_FMT_FP6
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[34:35], v[28:31], off offset:16
@@ -1287,6 +1406,7 @@ define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_bf6_fp6(<12 x i32> %
 ;
 ; GISEL-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_bf6_fp6:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_scale_f32_16x16x128_f8f6f4 v[24:31], v[0:11], v[12:23], v[24:31], v32, v33 matrix_a_fmt:MATRIX_FMT_BF6 matrix_b_fmt:MATRIX_FMT_FP6
 ; GISEL-NEXT:    s_clause 0x1
 ; GISEL-NEXT:    global_store_b128 v[34:35], v[24:27], off
@@ -1301,6 +1421,7 @@ bb:
 define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_bf6_bf6(<12 x i32> %A, <12 x i32> %B, <8 x float> %C, i32 %scale_src0, i32 %scale_src1, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_bf6_bf6:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_scale_f32_16x16x128_f8f6f4 v[24:31], v[0:11], v[12:23], v[24:31], v32, v33 matrix_a_fmt:MATRIX_FMT_BF6 matrix_b_fmt:MATRIX_FMT_FP4
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[34:35], v[28:31], off offset:16
@@ -1309,6 +1430,7 @@ define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_bf6_bf6(<12 x i32> %
 ;
 ; GISEL-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_bf6_bf6:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_scale_f32_16x16x128_f8f6f4 v[24:31], v[0:11], v[12:23], v[24:31], v32, v33 matrix_a_fmt:MATRIX_FMT_BF6 matrix_b_fmt:MATRIX_FMT_FP4
 ; GISEL-NEXT:    s_clause 0x1
 ; GISEL-NEXT:    global_store_b128 v[34:35], v[24:27], off
@@ -1323,6 +1445,7 @@ bb:
 define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_bf6_fp4(<12 x i32> %A, <8 x i32> %B, <8 x float> %C, i32 %scale_src0, i32 %scale_src1, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_bf6_fp4:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_scale_f32_16x16x128_f8f6f4 v[20:27], v[0:11], v[12:19], v[20:27], v28, v29 matrix_a_fmt:MATRIX_FMT_BF6 matrix_b_fmt:MATRIX_FMT_FP4
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[30:31], v[24:27], off offset:16
@@ -1331,6 +1454,7 @@ define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_bf6_fp4(<12 x i32> %
 ;
 ; GISEL-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_bf6_fp4:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_scale_f32_16x16x128_f8f6f4 v[20:27], v[0:11], v[12:19], v[20:27], v28, v29 matrix_a_fmt:MATRIX_FMT_BF6 matrix_b_fmt:MATRIX_FMT_FP4
 ; GISEL-NEXT:    s_clause 0x1
 ; GISEL-NEXT:    global_store_b128 v[30:31], v[20:23], off
@@ -1345,6 +1469,7 @@ bb:
 define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_fp4_fp8(<8 x i32> %A, <16 x i32> %B, <8 x float> %C, i32 %scale_src0, i32 %scale_src1, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_fp4_fp8:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_scale_f32_16x16x128_f8f6f4 v[24:31], v[0:7], v[8:23], v[24:31], v32, v33 matrix_a_fmt:MATRIX_FMT_FP4
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[34:35], v[28:31], off offset:16
@@ -1353,6 +1478,7 @@ define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_fp4_fp8(<8 x i32> %A
 ;
 ; GISEL-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_fp4_fp8:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_scale_f32_16x16x128_f8f6f4 v[24:31], v[0:7], v[8:23], v[24:31], v32, v33 matrix_a_fmt:MATRIX_FMT_FP4
 ; GISEL-NEXT:    s_clause 0x1
 ; GISEL-NEXT:    global_store_b128 v[34:35], v[24:27], off
@@ -1367,6 +1493,7 @@ bb:
 define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_fp4_bf8(<8 x i32> %A, <16 x i32> %B, <8 x float> %C, i32 %scale_src0, i32 %scale_src1, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_fp4_bf8:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_scale_f32_16x16x128_f8f6f4 v[24:31], v[0:7], v[8:23], v[24:31], v32, v33 matrix_a_fmt:MATRIX_FMT_FP4 matrix_b_fmt:MATRIX_FMT_BF8
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[34:35], v[28:31], off offset:16
@@ -1375,6 +1502,7 @@ define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_fp4_bf8(<8 x i32> %A
 ;
 ; GISEL-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_fp4_bf8:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_scale_f32_16x16x128_f8f6f4 v[24:31], v[0:7], v[8:23], v[24:31], v32, v33 matrix_a_fmt:MATRIX_FMT_FP4 matrix_b_fmt:MATRIX_FMT_BF8
 ; GISEL-NEXT:    s_clause 0x1
 ; GISEL-NEXT:    global_store_b128 v[34:35], v[24:27], off
@@ -1389,6 +1517,7 @@ bb:
 define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_fp4_fp6(<8 x i32> %A, <12 x i32> %B, <8 x float> %C, i32 %scale_src0, i32 %scale_src1, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_fp4_fp6:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_scale_f32_16x16x128_f8f6f4 v[20:27], v[0:7], v[8:19], v[20:27], v28, v29 matrix_a_fmt:MATRIX_FMT_FP4 matrix_b_fmt:MATRIX_FMT_FP6
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[30:31], v[24:27], off offset:16
@@ -1397,6 +1526,7 @@ define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_fp4_fp6(<8 x i32> %A
 ;
 ; GISEL-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_fp4_fp6:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_scale_f32_16x16x128_f8f6f4 v[20:27], v[0:7], v[8:19], v[20:27], v28, v29 matrix_a_fmt:MATRIX_FMT_FP4 matrix_b_fmt:MATRIX_FMT_FP6
 ; GISEL-NEXT:    s_clause 0x1
 ; GISEL-NEXT:    global_store_b128 v[30:31], v[20:23], off
@@ -1411,6 +1541,7 @@ bb:
 define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_fp4_bf6(<8 x i32> %A, <12 x i32> %B, <8 x float> %C, i32 %scale_src0, i32 %scale_src1, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_fp4_bf6:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_scale_f32_16x16x128_f8f6f4 v[20:27], v[0:7], v[8:19], v[20:27], v28, v29 matrix_a_fmt:MATRIX_FMT_FP4 matrix_b_fmt:MATRIX_FMT_FP4
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[30:31], v[24:27], off offset:16
@@ -1419,6 +1550,7 @@ define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_fp4_bf6(<8 x i32> %A
 ;
 ; GISEL-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_fp4_bf6:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_scale_f32_16x16x128_f8f6f4 v[20:27], v[0:7], v[8:19], v[20:27], v28, v29 matrix_a_fmt:MATRIX_FMT_FP4 matrix_b_fmt:MATRIX_FMT_FP4
 ; GISEL-NEXT:    s_clause 0x1
 ; GISEL-NEXT:    global_store_b128 v[30:31], v[20:23], off
@@ -1433,6 +1565,7 @@ bb:
 define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_fp4_fp4(<8 x i32> %A, <8 x i32> %B, <8 x float> %C, i32 %scale_src0, i32 %scale_src1, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_fp4_fp4:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_scale_f32_16x16x128_f8f6f4 v[16:23], v[0:7], v[8:15], v[16:23], v24, v25 matrix_a_fmt:MATRIX_FMT_FP4 matrix_b_fmt:MATRIX_FMT_FP4
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[26:27], v[20:23], off offset:16
@@ -1441,6 +1574,7 @@ define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_fp4_fp4(<8 x i32> %A
 ;
 ; GISEL-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_fp4_fp4:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_scale_f32_16x16x128_f8f6f4 v[16:23], v[0:7], v[8:15], v[16:23], v24, v25 matrix_a_fmt:MATRIX_FMT_FP4 matrix_b_fmt:MATRIX_FMT_FP4
 ; GISEL-NEXT:    s_clause 0x1
 ; GISEL-NEXT:    global_store_b128 v[26:27], v[16:19], off
@@ -1455,6 +1589,7 @@ bb:
 define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, i64 %scale_src0, i64 %scale_src1, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_scale16_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39], v[40:41], v[42:43] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[44:45], v[36:39], off offset:16
@@ -1463,6 +1598,7 @@ define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4(<16 x i32> %A, <16
 ;
 ; GISEL-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_scale16_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39], v[40:41], v[42:43] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1
 ; GISEL-NEXT:    s_clause 0x1
 ; GISEL-NEXT:    global_store_b128 v[44:45], v[32:35], off
@@ -1477,6 +1613,7 @@ bb:
 define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_ss(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, i64 inreg %scale_src0, i64 inreg %scale_src1, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_ss:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_scale16_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39], s[0:1], s[2:3] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 matrix_a_reuse
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[40:41], v[36:39], off offset:16
@@ -1485,6 +1622,7 @@ define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_ss(<16 x i32> %A,
 ;
 ; GISEL-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_ss:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_scale16_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39], s[0:1], s[2:3] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 matrix_a_reuse
 ; GISEL-NEXT:    s_clause 0x1
 ; GISEL-NEXT:    global_store_b128 v[40:41], v[32:35], off
@@ -1499,6 +1637,7 @@ bb:
 define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_si_scale(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, i64 inreg %scale_src0, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_si_scale:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b64_e32 v[42:43], 0x64
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-NEXT:    v_wmma_scale16_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39], s[0:1], v[42:43] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 matrix_b_reuse
@@ -1509,6 +1648,7 @@ define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_si_scale(<16 x i32
 ;
 ; GISEL-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_si_scale:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_mov_b64_e32 v[42:43], 0x64
 ; GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-NEXT:    v_wmma_scale16_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39], s[0:1], v[42:43] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 matrix_b_reuse
@@ -1525,6 +1665,7 @@ bb:
 define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_fp8_bf8(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, i64 %scale_src0, i64 %scale_src1, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_fp8_bf8:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_scale16_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39], v[40:41], v[42:43] matrix_b_fmt:MATRIX_FMT_BF8
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[44:45], v[36:39], off offset:16
@@ -1533,6 +1674,7 @@ define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_fp8_bf8(<16 x i32>
 ;
 ; GISEL-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_fp8_bf8:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_scale16_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39], v[40:41], v[42:43] matrix_b_fmt:MATRIX_FMT_BF8
 ; GISEL-NEXT:    s_clause 0x1
 ; GISEL-NEXT:    global_store_b128 v[44:45], v[32:35], off
@@ -1547,6 +1689,7 @@ bb:
 define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_fp8_fp6(<16 x i32> %A, <12 x i32> %B, <8 x float> %C, i64 %scale_src0, i64 %scale_src1, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_fp8_fp6:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_scale16_f32_16x16x128_f8f6f4 v[28:35], v[0:15], v[16:27], v[28:35], v[36:37], v[38:39] matrix_b_fmt:MATRIX_FMT_FP6
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[40:41], v[32:35], off offset:16
@@ -1555,6 +1698,7 @@ define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_fp8_fp6(<16 x i32>
 ;
 ; GISEL-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_fp8_fp6:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_scale16_f32_16x16x128_f8f6f4 v[28:35], v[0:15], v[16:27], v[28:35], v[36:37], v[38:39] matrix_b_fmt:MATRIX_FMT_FP6
 ; GISEL-NEXT:    s_clause 0x1
 ; GISEL-NEXT:    global_store_b128 v[40:41], v[28:31], off
@@ -1569,6 +1713,7 @@ bb:
 define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_fp8_bf6(<16 x i32> %A, <12 x i32> %B, <8 x float> %C, i64 %scale_src0, i64 %scale_src1, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_fp8_bf6:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_scale16_f32_16x16x128_f8f6f4 v[28:35], v[0:15], v[16:27], v[28:35], v[36:37], v[38:39] matrix_b_fmt:MATRIX_FMT_BF6
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[40:41], v[32:35], off offset:16
@@ -1577,6 +1722,7 @@ define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_fp8_bf6(<16 x i32>
 ;
 ; GISEL-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_fp8_bf6:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_scale16_f32_16x16x128_f8f6f4 v[28:35], v[0:15], v[16:27], v[28:35], v[36:37], v[38:39] matrix_b_fmt:MATRIX_FMT_BF6
 ; GISEL-NEXT:    s_clause 0x1
 ; GISEL-NEXT:    global_store_b128 v[40:41], v[28:31], off
@@ -1591,6 +1737,7 @@ bb:
 define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_fp8_fp4(<16 x i32> %A, <8 x i32> %B, <8 x float> %C, i64 %scale_src0, i64 %scale_src1, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_fp8_fp4:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_scale16_f32_16x16x128_f8f6f4 v[24:31], v[0:15], v[16:23], v[24:31], v[32:33], v[34:35] matrix_b_fmt:MATRIX_FMT_FP4
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[36:37], v[28:31], off offset:16
@@ -1599,6 +1746,7 @@ define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_fp8_fp4(<16 x i32>
 ;
 ; GISEL-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_fp8_fp4:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_scale16_f32_16x16x128_f8f6f4 v[24:31], v[0:15], v[16:23], v[24:31], v[32:33], v[34:35] matrix_b_fmt:MATRIX_FMT_FP4
 ; GISEL-NEXT:    s_clause 0x1
 ; GISEL-NEXT:    global_store_b128 v[36:37], v[24:27], off
@@ -1613,6 +1761,7 @@ bb:
 define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_bf8_fp8(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, i64 %scale_src0, i64 %scale_src1, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_bf8_fp8:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_scale16_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39], v[40:41], v[42:43] matrix_a_fmt:MATRIX_FMT_BF8
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[44:45], v[36:39], off offset:16
@@ -1621,6 +1770,7 @@ define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_bf8_fp8(<16 x i32>
 ;
 ; GISEL-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_bf8_fp8:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_scale16_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39], v[40:41], v[42:43] matrix_a_fmt:MATRIX_FMT_BF8
 ; GISEL-NEXT:    s_clause 0x1
 ; GISEL-NEXT:    global_store_b128 v[44:45], v[32:35], off
@@ -1635,6 +1785,7 @@ bb:
 define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_bf8_bf8(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, i64 %scale_src0, i64 %scale_src1, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_bf8_bf8:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_scale16_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39], v[40:41], v[42:43] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_BF8
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[44:45], v[36:39], off offset:16
@@ -1643,6 +1794,7 @@ define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_bf8_bf8(<16 x i32>
 ;
 ; GISEL-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_bf8_bf8:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_scale16_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39], v[40:41], v[42:43] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_BF8
 ; GISEL-NEXT:    s_clause 0x1
 ; GISEL-NEXT:    global_store_b128 v[44:45], v[32:35], off
@@ -1657,6 +1809,7 @@ bb:
 define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_bf8_fp6(<16 x i32> %A, <12 x i32> %B, <8 x float> %C, i64 %scale_src0, i64 %scale_src1, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_bf8_fp6:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_scale16_f32_16x16x128_f8f6f4 v[28:35], v[0:15], v[16:27], v[28:35], v[36:37], v[38:39] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[40:41], v[32:35], off offset:16
@@ -1665,6 +1818,7 @@ define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_bf8_fp6(<16 x i32>
 ;
 ; GISEL-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_bf8_fp6:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_scale16_f32_16x16x128_f8f6f4 v[28:35], v[0:15], v[16:27], v[28:35], v[36:37], v[38:39] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP6
 ; GISEL-NEXT:    s_clause 0x1
 ; GISEL-NEXT:    global_store_b128 v[40:41], v[28:31], off
@@ -1679,6 +1833,7 @@ bb:
 define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_bf8_bf6(<16 x i32> %A, <12 x i32> %B, <8 x float> %C, i64 %scale_src0, i64 %scale_src1, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_bf8_bf6:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_scale16_f32_16x16x128_f8f6f4 v[28:35], v[0:15], v[16:27], v[28:35], v[36:37], v[38:39] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_BF6
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[40:41], v[32:35], off offset:16
@@ -1687,6 +1842,7 @@ define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_bf8_bf6(<16 x i32>
 ;
 ; GISEL-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_bf8_bf6:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_scale16_f32_16x16x128_f8f6f4 v[28:35], v[0:15], v[16:27], v[28:35], v[36:37], v[38:39] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_BF6
 ; GISEL-NEXT:    s_clause 0x1
 ; GISEL-NEXT:    global_store_b128 v[40:41], v[28:31], off
@@ -1701,6 +1857,7 @@ bb:
 define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_bf8_fp4(<16 x i32> %A, <8 x i32> %B, <8 x float> %C, i64 %scale_src0, i64 %scale_src1, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_bf8_fp4:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_scale16_f32_16x16x128_f8f6f4 v[24:31], v[0:15], v[16:23], v[24:31], v[32:33], v[34:35] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP4
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[36:37], v[28:31], off offset:16
@@ -1709,6 +1866,7 @@ define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_bf8_fp4(<16 x i32>
 ;
 ; GISEL-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_bf8_fp4:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_scale16_f32_16x16x128_f8f6f4 v[24:31], v[0:15], v[16:23], v[24:31], v[32:33], v[34:35] matrix_a_fmt:MATRIX_FMT_BF8 matrix_b_fmt:MATRIX_FMT_FP4
 ; GISEL-NEXT:    s_clause 0x1
 ; GISEL-NEXT:    global_store_b128 v[36:37], v[24:27], off
@@ -1723,6 +1881,7 @@ bb:
 define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_fp6_fp8(<12 x i32> %A, <16 x i32> %B, <8 x float> %C, i64 %scale_src0, i64 %scale_src1, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_fp6_fp8:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_scale16_f32_16x16x128_f8f6f4 v[28:35], v[0:11], v[12:27], v[28:35], v[36:37], v[38:39] matrix_a_fmt:MATRIX_FMT_FP6
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[40:41], v[32:35], off offset:16
@@ -1731,6 +1890,7 @@ define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_fp6_fp8(<12 x i32>
 ;
 ; GISEL-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_fp6_fp8:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_scale16_f32_16x16x128_f8f6f4 v[28:35], v[0:11], v[12:27], v[28:35], v[36:37], v[38:39] matrix_a_fmt:MATRIX_FMT_FP6
 ; GISEL-NEXT:    s_clause 0x1
 ; GISEL-NEXT:    global_store_b128 v[40:41], v[28:31], off
@@ -1745,6 +1905,7 @@ bb:
 define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_fp6_bf8(<12 x i32> %A, <16 x i32> %B, <8 x float> %C, i64 %scale_src0, i64 %scale_src1, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_fp6_bf8:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_scale16_f32_16x16x128_f8f6f4 v[28:35], v[0:11], v[12:27], v[28:35], v[36:37], v[38:39] matrix_a_fmt:MATRIX_FMT_FP6 matrix_b_fmt:MATRIX_FMT_BF8
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[40:41], v[32:35], off offset:16
@@ -1753,6 +1914,7 @@ define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_fp6_bf8(<12 x i32>
 ;
 ; GISEL-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_fp6_bf8:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_scale16_f32_16x16x128_f8f6f4 v[28:35], v[0:11], v[12:27], v[28:35], v[36:37], v[38:39] matrix_a_fmt:MATRIX_FMT_FP6 matrix_b_fmt:MATRIX_FMT_BF8
 ; GISEL-NEXT:    s_clause 0x1
 ; GISEL-NEXT:    global_store_b128 v[40:41], v[28:31], off
@@ -1767,6 +1929,7 @@ bb:
 define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_fp6_fp6(<12 x i32> %A, <12 x i32> %B, <8 x float> %C, i64 %scale_src0, i64 %scale_src1, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_fp6_fp6:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_scale16_f32_16x16x128_f8f6f4 v[24:31], v[0:11], v[12:23], v[24:31], v[32:33], v[34:35] matrix_a_fmt:MATRIX_FMT_FP6 matrix_b_fmt:MATRIX_FMT_FP6
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[36:37], v[28:31], off offset:16
@@ -1775,6 +1938,7 @@ define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_fp6_fp6(<12 x i32>
 ;
 ; GISEL-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_fp6_fp6:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_scale16_f32_16x16x128_f8f6f4 v[24:31], v[0:11], v[12:23], v[24:31], v[32:33], v[34:35] matrix_a_fmt:MATRIX_FMT_FP6 matrix_b_fmt:MATRIX_FMT_FP6
 ; GISEL-NEXT:    s_clause 0x1
 ; GISEL-NEXT:    global_store_b128 v[36:37], v[24:27], off
@@ -1789,6 +1953,7 @@ bb:
 define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_fp6_bf6(<12 x i32> %A, <12 x i32> %B, <8 x float> %C, i64 %scale_src0, i64 %scale_src1, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_fp6_bf6:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_scale16_f32_16x16x128_f8f6f4 v[24:31], v[0:11], v[12:23], v[24:31], v[32:33], v[34:35] matrix_a_fmt:MATRIX_FMT_FP6 matrix_b_fmt:MATRIX_FMT_FP4
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[36:37], v[28:31], off offset:16
@@ -1797,6 +1962,7 @@ define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_fp6_bf6(<12 x i32>
 ;
 ; GISEL-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_fp6_bf6:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_scale16_f32_16x16x128_f8f6f4 v[24:31], v[0:11], v[12:23], v[24:31], v[32:33], v[34:35] matrix_a_fmt:MATRIX_FMT_FP6 matrix_b_fmt:MATRIX_FMT_FP4
 ; GISEL-NEXT:    s_clause 0x1
 ; GISEL-NEXT:    global_store_b128 v[36:37], v[24:27], off
@@ -1811,6 +1977,7 @@ bb:
 define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_fp6_fp4(<12 x i32> %A, <8 x i32> %B, <8 x float> %C, i64 %scale_src0, i64 %scale_src1, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_fp6_fp4:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_scale16_f32_16x16x128_f8f6f4 v[20:27], v[0:11], v[12:19], v[20:27], v[28:29], v[30:31] matrix_a_fmt:MATRIX_FMT_FP6 matrix_b_fmt:MATRIX_FMT_FP4
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[32:33], v[24:27], off offset:16
@@ -1819,6 +1986,7 @@ define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_fp6_fp4(<12 x i32>
 ;
 ; GISEL-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_fp6_fp4:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_scale16_f32_16x16x128_f8f6f4 v[20:27], v[0:11], v[12:19], v[20:27], v[28:29], v[30:31] matrix_a_fmt:MATRIX_FMT_FP6 matrix_b_fmt:MATRIX_FMT_FP4
 ; GISEL-NEXT:    s_clause 0x1
 ; GISEL-NEXT:    global_store_b128 v[32:33], v[20:23], off
@@ -1833,6 +2001,7 @@ bb:
 define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_bf6_fp8(<12 x i32> %A, <16 x i32> %B, <8 x float> %C, i64 %scale_src0, i64 %scale_src1, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_bf6_fp8:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_scale16_f32_16x16x128_f8f6f4 v[28:35], v[0:11], v[12:27], v[28:35], v[36:37], v[38:39] matrix_a_fmt:MATRIX_FMT_BF6
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[40:41], v[32:35], off offset:16
@@ -1841,6 +2010,7 @@ define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_bf6_fp8(<12 x i32>
 ;
 ; GISEL-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_bf6_fp8:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_scale16_f32_16x16x128_f8f6f4 v[28:35], v[0:11], v[12:27], v[28:35], v[36:37], v[38:39] matrix_a_fmt:MATRIX_FMT_BF6
 ; GISEL-NEXT:    s_clause 0x1
 ; GISEL-NEXT:    global_store_b128 v[40:41], v[28:31], off
@@ -1855,6 +2025,7 @@ bb:
 define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_bf6_bf8(<12 x i32> %A, <16 x i32> %B, <8 x float> %C, i64 %scale_src0, i64 %scale_src1, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_bf6_bf8:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_scale16_f32_16x16x128_f8f6f4 v[28:35], v[0:11], v[12:27], v[28:35], v[36:37], v[38:39] matrix_a_fmt:MATRIX_FMT_BF6 matrix_b_fmt:MATRIX_FMT_BF8
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[40:41], v[32:35], off offset:16
@@ -1863,6 +2034,7 @@ define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_bf6_bf8(<12 x i32>
 ;
 ; GISEL-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_bf6_bf8:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_scale16_f32_16x16x128_f8f6f4 v[28:35], v[0:11], v[12:27], v[28:35], v[36:37], v[38:39] matrix_a_fmt:MATRIX_FMT_BF6 matrix_b_fmt:MATRIX_FMT_BF8
 ; GISEL-NEXT:    s_clause 0x1
 ; GISEL-NEXT:    global_store_b128 v[40:41], v[28:31], off
@@ -1877,6 +2049,7 @@ bb:
 define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_bf6_fp6(<12 x i32> %A, <12 x i32> %B, <8 x float> %C, i64 %scale_src0, i64 %scale_src1, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_bf6_fp6:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_scale16_f32_16x16x128_f8f6f4 v[24:31], v[0:11], v[12:23], v[24:31], v[32:33], v[34:35] matrix_a_fmt:MATRIX_FMT_BF6 matrix_b_fmt:MATRIX_FMT_FP6
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[36:37], v[28:31], off offset:16
@@ -1885,6 +2058,7 @@ define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_bf6_fp6(<12 x i32>
 ;
 ; GISEL-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_bf6_fp6:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_scale16_f32_16x16x128_f8f6f4 v[24:31], v[0:11], v[12:23], v[24:31], v[32:33], v[34:35] matrix_a_fmt:MATRIX_FMT_BF6 matrix_b_fmt:MATRIX_FMT_FP6
 ; GISEL-NEXT:    s_clause 0x1
 ; GISEL-NEXT:    global_store_b128 v[36:37], v[24:27], off
@@ -1899,6 +2073,7 @@ bb:
 define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_bf6_bf6(<12 x i32> %A, <12 x i32> %B, <8 x float> %C, i64 %scale_src0, i64 %scale_src1, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_bf6_bf6:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_scale16_f32_16x16x128_f8f6f4 v[24:31], v[0:11], v[12:23], v[24:31], v[32:33], v[34:35] matrix_a_fmt:MATRIX_FMT_BF6 matrix_b_fmt:MATRIX_FMT_FP4
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[36:37], v[28:31], off offset:16
@@ -1907,6 +2082,7 @@ define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_bf6_bf6(<12 x i32>
 ;
 ; GISEL-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_bf6_bf6:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_scale16_f32_16x16x128_f8f6f4 v[24:31], v[0:11], v[12:23], v[24:31], v[32:33], v[34:35] matrix_a_fmt:MATRIX_FMT_BF6 matrix_b_fmt:MATRIX_FMT_FP4
 ; GISEL-NEXT:    s_clause 0x1
 ; GISEL-NEXT:    global_store_b128 v[36:37], v[24:27], off
@@ -1921,6 +2097,7 @@ bb:
 define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_bf6_fp4(<12 x i32> %A, <8 x i32> %B, <8 x float> %C, i64 %scale_src0, i64 %scale_src1, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_bf6_fp4:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_scale16_f32_16x16x128_f8f6f4 v[20:27], v[0:11], v[12:19], v[20:27], v[28:29], v[30:31] matrix_a_fmt:MATRIX_FMT_BF6 matrix_b_fmt:MATRIX_FMT_FP4
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[32:33], v[24:27], off offset:16
@@ -1929,6 +2106,7 @@ define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_bf6_fp4(<12 x i32>
 ;
 ; GISEL-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_bf6_fp4:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_scale16_f32_16x16x128_f8f6f4 v[20:27], v[0:11], v[12:19], v[20:27], v[28:29], v[30:31] matrix_a_fmt:MATRIX_FMT_BF6 matrix_b_fmt:MATRIX_FMT_FP4
 ; GISEL-NEXT:    s_clause 0x1
 ; GISEL-NEXT:    global_store_b128 v[32:33], v[20:23], off
@@ -1943,6 +2121,7 @@ bb:
 define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_fp4_fp8(<8 x i32> %A, <16 x i32> %B, <8 x float> %C, i64 %scale_src0, i64 %scale_src1, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_fp4_fp8:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_scale16_f32_16x16x128_f8f6f4 v[24:31], v[0:7], v[8:23], v[24:31], v[32:33], v[34:35] matrix_a_fmt:MATRIX_FMT_FP4
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[36:37], v[28:31], off offset:16
@@ -1951,6 +2130,7 @@ define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_fp4_fp8(<8 x i32>
 ;
 ; GISEL-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_fp4_fp8:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_scale16_f32_16x16x128_f8f6f4 v[24:31], v[0:7], v[8:23], v[24:31], v[32:33], v[34:35] matrix_a_fmt:MATRIX_FMT_FP4
 ; GISEL-NEXT:    s_clause 0x1
 ; GISEL-NEXT:    global_store_b128 v[36:37], v[24:27], off
@@ -1965,6 +2145,7 @@ bb:
 define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_fp4_bf8(<8 x i32> %A, <16 x i32> %B, <8 x float> %C, i64 %scale_src0, i64 %scale_src1, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_fp4_bf8:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_scale16_f32_16x16x128_f8f6f4 v[24:31], v[0:7], v[8:23], v[24:31], v[32:33], v[34:35] matrix_a_fmt:MATRIX_FMT_FP4 matrix_b_fmt:MATRIX_FMT_BF8
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[36:37], v[28:31], off offset:16
@@ -1973,6 +2154,7 @@ define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_fp4_bf8(<8 x i32>
 ;
 ; GISEL-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_fp4_bf8:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_scale16_f32_16x16x128_f8f6f4 v[24:31], v[0:7], v[8:23], v[24:31], v[32:33], v[34:35] matrix_a_fmt:MATRIX_FMT_FP4 matrix_b_fmt:MATRIX_FMT_BF8
 ; GISEL-NEXT:    s_clause 0x1
 ; GISEL-NEXT:    global_store_b128 v[36:37], v[24:27], off
@@ -1987,6 +2169,7 @@ bb:
 define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_fp4_fp6(<8 x i32> %A, <12 x i32> %B, <8 x float> %C, i64 %scale_src0, i64 %scale_src1, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_fp4_fp6:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_scale16_f32_16x16x128_f8f6f4 v[20:27], v[0:7], v[8:19], v[20:27], v[28:29], v[30:31] matrix_a_fmt:MATRIX_FMT_FP4 matrix_b_fmt:MATRIX_FMT_FP6
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[32:33], v[24:27], off offset:16
@@ -1995,6 +2178,7 @@ define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_fp4_fp6(<8 x i32>
 ;
 ; GISEL-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_fp4_fp6:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_scale16_f32_16x16x128_f8f6f4 v[20:27], v[0:7], v[8:19], v[20:27], v[28:29], v[30:31] matrix_a_fmt:MATRIX_FMT_FP4 matrix_b_fmt:MATRIX_FMT_FP6
 ; GISEL-NEXT:    s_clause 0x1
 ; GISEL-NEXT:    global_store_b128 v[32:33], v[20:23], off
@@ -2009,6 +2193,7 @@ bb:
 define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_fp4_bf6(<8 x i32> %A, <12 x i32> %B, <8 x float> %C, i64 %scale_src0, i64 %scale_src1, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_fp4_bf6:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_scale16_f32_16x16x128_f8f6f4 v[20:27], v[0:7], v[8:19], v[20:27], v[28:29], v[30:31] matrix_a_fmt:MATRIX_FMT_FP4 matrix_b_fmt:MATRIX_FMT_FP4
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[32:33], v[24:27], off offset:16
@@ -2017,6 +2202,7 @@ define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_fp4_bf6(<8 x i32>
 ;
 ; GISEL-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_fp4_bf6:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_scale16_f32_16x16x128_f8f6f4 v[20:27], v[0:7], v[8:19], v[20:27], v[28:29], v[30:31] matrix_a_fmt:MATRIX_FMT_FP4 matrix_b_fmt:MATRIX_FMT_FP4
 ; GISEL-NEXT:    s_clause 0x1
 ; GISEL-NEXT:    global_store_b128 v[32:33], v[20:23], off
@@ -2031,6 +2217,7 @@ bb:
 define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_fp4_fp4(<8 x i32> %A, <8 x i32> %B, <8 x float> %C, i64 %scale_src0, i64 %scale_src1, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_fp4_fp4:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_scale16_f32_16x16x128_f8f6f4 v[16:23], v[0:7], v[8:15], v[16:23], v[24:25], v[26:27] matrix_a_fmt:MATRIX_FMT_FP4 matrix_b_fmt:MATRIX_FMT_FP4
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[28:29], v[20:23], off offset:16
@@ -2039,6 +2226,7 @@ define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_fp4_fp4(<8 x i32>
 ;
 ; GISEL-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_fp4_fp4:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_scale16_f32_16x16x128_f8f6f4 v[16:23], v[0:7], v[8:15], v[16:23], v[24:25], v[26:27] matrix_a_fmt:MATRIX_FMT_FP4 matrix_b_fmt:MATRIX_FMT_FP4
 ; GISEL-NEXT:    s_clause 0x1
 ; GISEL-NEXT:    global_store_b128 v[28:29], v[16:19], off
@@ -2053,12 +2241,14 @@ bb:
 define amdgpu_ps void @test_wmma_f16_16x16x128_fp8_fp8(<16 x i32> %A, <16 x i32> %B, <8 x half> %C, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f16_16x16x128_fp8_fp8:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_f16_16x16x128_fp8_fp8 v[32:35], v[0:15], v[16:31], v[32:35] matrix_b_reuse
 ; GFX1250-NEXT:    global_store_b128 v[36:37], v[32:35], off
 ; GFX1250-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: test_wmma_f16_16x16x128_fp8_fp8:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_f16_16x16x128_fp8_fp8 v[32:35], v[0:15], v[16:31], v[32:35] matrix_b_reuse
 ; GISEL-NEXT:    global_store_b128 v[36:37], v[32:35], off
 ; GISEL-NEXT:    s_endpgm
@@ -2071,12 +2261,14 @@ bb:
 define amdgpu_ps void @test_wmma_f16_16x16x128_fp8_bf8(<16 x i32> %A, <16 x i32> %B, <8 x half> %C, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f16_16x16x128_fp8_bf8:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_f16_16x16x128_fp8_bf8 v[32:35], v[0:15], v[16:31], v[32:35] matrix_b_reuse
 ; GFX1250-NEXT:    global_store_b128 v[36:37], v[32:35], off
 ; GFX1250-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: test_wmma_f16_16x16x128_fp8_bf8:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_f16_16x16x128_fp8_bf8 v[32:35], v[0:15], v[16:31], v[32:35] matrix_b_reuse
 ; GISEL-NEXT:    global_store_b128 v[36:37], v[32:35], off
 ; GISEL-NEXT:    s_endpgm
@@ -2089,12 +2281,14 @@ bb:
 define amdgpu_ps void @test_wmma_f16_16x16x128_bf8_fp8(<16 x i32> %A, <16 x i32> %B, <8 x half> %C, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f16_16x16x128_bf8_fp8:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_f16_16x16x128_bf8_fp8 v[32:35], v[0:15], v[16:31], v[32:35] matrix_b_reuse
 ; GFX1250-NEXT:    global_store_b128 v[36:37], v[32:35], off
 ; GFX1250-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: test_wmma_f16_16x16x128_bf8_fp8:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_f16_16x16x128_bf8_fp8 v[32:35], v[0:15], v[16:31], v[32:35] matrix_b_reuse
 ; GISEL-NEXT:    global_store_b128 v[36:37], v[32:35], off
 ; GISEL-NEXT:    s_endpgm
@@ -2107,12 +2301,14 @@ bb:
 define amdgpu_ps void @test_wmma_f16_16x16x128_bf8_bf8(<16 x i32> %A, <16 x i32> %B, <8 x half> %C, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f16_16x16x128_bf8_bf8:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_f16_16x16x128_bf8_bf8 v[32:35], v[0:15], v[16:31], v[32:35] matrix_b_reuse
 ; GFX1250-NEXT:    global_store_b128 v[36:37], v[32:35], off
 ; GFX1250-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: test_wmma_f16_16x16x128_bf8_bf8:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_f16_16x16x128_bf8_bf8 v[32:35], v[0:15], v[16:31], v[32:35] matrix_b_reuse
 ; GISEL-NEXT:    global_store_b128 v[36:37], v[32:35], off
 ; GISEL-NEXT:    s_endpgm
@@ -2125,6 +2321,7 @@ bb:
 define amdgpu_ps void @test_wmma_f32_16x16x128_fp8_fp8(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f32_16x16x128_fp8_fp8:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_f32_16x16x128_fp8_fp8 v[32:39], v[0:15], v[16:31], v[32:39] matrix_b_reuse
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[40:41], v[36:39], off offset:16
@@ -2133,6 +2330,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x128_fp8_fp8(<16 x i32> %A, <16 x i32>
 ;
 ; GISEL-LABEL: test_wmma_f32_16x16x128_fp8_fp8:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_f32_16x16x128_fp8_fp8 v[32:39], v[0:15], v[16:31], v[32:39] matrix_b_reuse
 ; GISEL-NEXT:    s_clause 0x1
 ; GISEL-NEXT:    global_store_b128 v[40:41], v[32:35], off
@@ -2147,6 +2345,7 @@ bb:
 define amdgpu_ps void @test_wmma_f32_16x16x128_fp8_bf8(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f32_16x16x128_fp8_bf8:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_f32_16x16x128_fp8_bf8 v[32:39], v[0:15], v[16:31], v[32:39] matrix_b_reuse
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[40:41], v[36:39], off offset:16
@@ -2155,6 +2354,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x128_fp8_bf8(<16 x i32> %A, <16 x i32>
 ;
 ; GISEL-LABEL: test_wmma_f32_16x16x128_fp8_bf8:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_f32_16x16x128_fp8_bf8 v[32:39], v[0:15], v[16:31], v[32:39] matrix_b_reuse
 ; GISEL-NEXT:    s_clause 0x1
 ; GISEL-NEXT:    global_store_b128 v[40:41], v[32:35], off
@@ -2169,6 +2369,7 @@ bb:
 define amdgpu_ps void @test_wmma_f32_16x16x128_bf8_fp8(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f32_16x16x128_bf8_fp8:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_f32_16x16x128_bf8_fp8 v[32:39], v[0:15], v[16:31], v[32:39] matrix_b_reuse
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[40:41], v[36:39], off offset:16
@@ -2177,6 +2378,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x128_bf8_fp8(<16 x i32> %A, <16 x i32>
 ;
 ; GISEL-LABEL: test_wmma_f32_16x16x128_bf8_fp8:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_f32_16x16x128_bf8_fp8 v[32:39], v[0:15], v[16:31], v[32:39] matrix_b_reuse
 ; GISEL-NEXT:    s_clause 0x1
 ; GISEL-NEXT:    global_store_b128 v[40:41], v[32:35], off
@@ -2191,6 +2393,7 @@ bb:
 define amdgpu_ps void @test_wmma_f32_16x16x128_bf8_bf8(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f32_16x16x128_bf8_bf8:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_f32_16x16x128_bf8_bf8 v[32:39], v[0:15], v[16:31], v[32:39] matrix_b_reuse
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[40:41], v[36:39], off offset:16
@@ -2199,6 +2402,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x128_bf8_bf8(<16 x i32> %A, <16 x i32>
 ;
 ; GISEL-LABEL: test_wmma_f32_16x16x128_bf8_bf8:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_f32_16x16x128_bf8_bf8 v[32:39], v[0:15], v[16:31], v[32:39] matrix_b_reuse
 ; GISEL-NEXT:    s_clause 0x1
 ; GISEL-NEXT:    global_store_b128 v[40:41], v[32:35], off
@@ -2213,6 +2417,7 @@ bb:
 define amdgpu_ps void @test_wmma_f32_32x16x128_f4(<16 x i32> %A, <8 x i32> %B, <16 x float> %C, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f32_32x16x128_f4:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39]
 ; GFX1250-NEXT:    s_clause 0x3
 ; GFX1250-NEXT:    global_store_b128 v[40:41], v[36:39], off offset:48
@@ -2223,6 +2428,7 @@ define amdgpu_ps void @test_wmma_f32_32x16x128_f4(<16 x i32> %A, <8 x i32> %B, <
 ;
 ; GISEL-LABEL: test_wmma_f32_32x16x128_f4:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39]
 ; GISEL-NEXT:    s_clause 0x3
 ; GISEL-NEXT:    global_store_b128 v[40:41], v[24:27], off
@@ -2239,6 +2445,7 @@ bb:
 define amdgpu_ps void @test_wmma_scale_f32_32x16x128_f4(<16 x i32> %A, <8 x i32> %B, <16 x float> %C, i32 %scale_src0, i32 %scale_src1, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_scale_f32_32x16x128_f4:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_scale_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], v40, v41 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1
 ; GFX1250-NEXT:    s_clause 0x3
 ; GFX1250-NEXT:    global_store_b128 v[42:43], v[36:39], off offset:48
@@ -2249,6 +2456,7 @@ define amdgpu_ps void @test_wmma_scale_f32_32x16x128_f4(<16 x i32> %A, <8 x i32>
 ;
 ; GISEL-LABEL: test_wmma_scale_f32_32x16x128_f4:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_scale_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], v40, v41 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1
 ; GISEL-NEXT:    s_clause 0x3
 ; GISEL-NEXT:    global_store_b128 v[42:43], v[24:27], off
@@ -2265,6 +2473,7 @@ bb:
 define amdgpu_ps void @test_wmma_scale_f32_32x16x128_f4_ss(<16 x i32> %A, <8 x i32> %B, <16 x float> %C, i32 inreg %scale_src0, i32 inreg %scale_src1, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_scale_f32_32x16x128_f4_ss:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_scale_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], s0, s1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 matrix_a_reuse
 ; GFX1250-NEXT:    s_clause 0x3
 ; GFX1250-NEXT:    global_store_b128 v[40:41], v[36:39], off offset:48
@@ -2275,6 +2484,7 @@ define amdgpu_ps void @test_wmma_scale_f32_32x16x128_f4_ss(<16 x i32> %A, <8 x i
 ;
 ; GISEL-LABEL: test_wmma_scale_f32_32x16x128_f4_ss:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_scale_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], s0, s1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 matrix_a_reuse
 ; GISEL-NEXT:    s_clause 0x3
 ; GISEL-NEXT:    global_store_b128 v[40:41], v[24:27], off
@@ -2291,6 +2501,7 @@ bb:
 define amdgpu_ps void @test_wmma_scale_f32_32x16x128_f4_si_scale(<16 x i32> %A, <8 x i32> %B, <16 x float> %C, i32 inreg %scale_src0, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_scale_f32_32x16x128_f4_si_scale:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v42, 0x64
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-NEXT:    v_wmma_scale_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], s0, v42 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 matrix_b_reuse
@@ -2303,6 +2514,7 @@ define amdgpu_ps void @test_wmma_scale_f32_32x16x128_f4_si_scale(<16 x i32> %A,
 ;
 ; GISEL-LABEL: test_wmma_scale_f32_32x16x128_f4_si_scale:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_mov_b32_e32 v42, 0x64
 ; GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-NEXT:    v_wmma_scale_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], s0, v42 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 matrix_b_reuse
@@ -2321,6 +2533,7 @@ bb:
 define amdgpu_ps void @test_wmma_scale16_f32_32x16x128_f4(<16 x i32> %A, <8 x i32> %B, <16 x float> %C, i64 %scale_src0, i64 %scale_src1, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_scale16_f32_32x16x128_f4:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_scale16_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], v[40:41], v[42:43] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1
 ; GFX1250-NEXT:    s_clause 0x3
 ; GFX1250-NEXT:    global_store_b128 v[44:45], v[36:39], off offset:48
@@ -2331,6 +2544,7 @@ define amdgpu_ps void @test_wmma_scale16_f32_32x16x128_f4(<16 x i32> %A, <8 x i3
 ;
 ; GISEL-LABEL: test_wmma_scale16_f32_32x16x128_f4:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_scale16_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], v[40:41], v[42:43] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1
 ; GISEL-NEXT:    s_clause 0x3
 ; GISEL-NEXT:    global_store_b128 v[44:45], v[24:27], off
@@ -2347,6 +2561,7 @@ bb:
 define amdgpu_ps void @test_wmma_scale16_f32_32x16x128_f4_ss(<16 x i32> %A, <8 x i32> %B, <16 x float> %C, i64 inreg %scale_src0, i64 inreg %scale_src1, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_scale16_f32_32x16x128_f4_ss:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_scale16_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], s[0:1], s[2:3] matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 matrix_a_reuse
 ; GFX1250-NEXT:    s_clause 0x3
 ; GFX1250-NEXT:    global_store_b128 v[40:41], v[36:39], off offset:48
@@ -2357,6 +2572,7 @@ define amdgpu_ps void @test_wmma_scale16_f32_32x16x128_f4_ss(<16 x i32> %A, <8 x
 ;
 ; GISEL-LABEL: test_wmma_scale16_f32_32x16x128_f4_ss:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_scale16_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], s[0:1], s[2:3] matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E5M3 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E4M3 matrix_a_reuse
 ; GISEL-NEXT:    s_clause 0x3
 ; GISEL-NEXT:    global_store_b128 v[40:41], v[24:27], off
@@ -2373,6 +2589,7 @@ bb:
 define amdgpu_ps void @test_wmma_scale16_f32_32x16x128_f4_si_scale(<16 x i32> %A, <8 x i32> %B, <16 x float> %C, i64 inreg %scale_src0, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_scale16_f32_32x16x128_f4_si_scale:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b64_e32 v[42:43], 0x64
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-NEXT:    v_wmma_scale16_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], s[0:1], v[42:43] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 matrix_b_reuse
@@ -2385,6 +2602,7 @@ define amdgpu_ps void @test_wmma_scale16_f32_32x16x128_f4_si_scale(<16 x i32> %A
 ;
 ; GISEL-LABEL: test_wmma_scale16_f32_32x16x128_f4_si_scale:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_mov_b64_e32 v[42:43], 0x64
 ; GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-NEXT:    v_wmma_scale16_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], s[0:1], v[42:43] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_a_scale_fmt:MATRIX_SCALE_FMT_E4M3 matrix_b_scale_fmt:MATRIX_SCALE_FMT_E5M3 matrix_b_reuse
@@ -2403,6 +2621,7 @@ bb:
 define amdgpu_ps void @test_swmmac_f32_16x16x64_bf16(<16 x bfloat> %A, <32 x bfloat> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_swmmac_f32_16x16x64_bf16:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_swmmac_f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 matrix_b_reuse
 ; GFX1250-NEXT:    v_dual_mov_b32 v35, v34 :: v_dual_mov_b32 v34, v33
 ; GFX1250-NEXT:    s_clause 0x1
@@ -2412,6 +2631,7 @@ define amdgpu_ps void @test_swmmac_f32_16x16x64_bf16(<16 x bfloat> %A, <32 x bfl
 ;
 ; GISEL-LABEL: test_swmmac_f32_16x16x64_bf16:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_swmmac_f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 matrix_b_reuse
 ; GISEL-NEXT:    v_dual_mov_b32 v35, v34 :: v_dual_mov_b32 v34, v33
 ; GISEL-NEXT:    s_clause 0x1
@@ -2427,6 +2647,7 @@ bb:
 define amdgpu_ps void @test_swmmac_bf16_16x16x64_bf16(<16 x bfloat> %A, <32 x bfloat> %B, <8 x bfloat> %C, i16 %Index, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_swmmac_bf16_16x16x64_bf16:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_swmmac_bf16_16x16x64_bf16 v[24:27], v[0:7], v[8:23], v28 matrix_b_reuse
 ; GFX1250-NEXT:    v_dual_mov_b32 v31, v30 :: v_dual_mov_b32 v30, v29
 ; GFX1250-NEXT:    global_store_b128 v[30:31], v[24:27], off
@@ -2434,6 +2655,7 @@ define amdgpu_ps void @test_swmmac_bf16_16x16x64_bf16(<16 x bfloat> %A, <32 x bf
 ;
 ; GISEL-LABEL: test_swmmac_bf16_16x16x64_bf16:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_swmmac_bf16_16x16x64_bf16 v[24:27], v[0:7], v[8:23], v28 matrix_b_reuse
 ; GISEL-NEXT:    v_dual_mov_b32 v31, v30 :: v_dual_mov_b32 v30, v29
 ; GISEL-NEXT:    global_store_b128 v[30:31], v[24:27], off
@@ -2447,6 +2669,7 @@ bb:
 define amdgpu_ps void @test_swmmac_bf16f32_16x16x64_bf16(<16 x bfloat> %A, <32 x bfloat> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_swmmac_bf16f32_16x16x64_bf16:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_swmmac_bf16f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 matrix_b_reuse
 ; GFX1250-NEXT:    v_dual_mov_b32 v35, v34 :: v_dual_mov_b32 v34, v33
 ; GFX1250-NEXT:    s_clause 0x1
@@ -2456,6 +2679,7 @@ define amdgpu_ps void @test_swmmac_bf16f32_16x16x64_bf16(<16 x bfloat> %A, <32 x
 ;
 ; GISEL-LABEL: test_swmmac_bf16f32_16x16x64_bf16:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_swmmac_bf16f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 matrix_b_reuse
 ; GISEL-NEXT:    v_dual_mov_b32 v35, v34 :: v_dual_mov_b32 v34, v33
 ; GISEL-NEXT:    s_clause 0x1
@@ -2471,6 +2695,7 @@ bb:
 define amdgpu_ps void @test_swmmac_f32_16x16x128_fp8_fp8(<8 x i32> %A, <16 x i32> %B, <8 x float> %C, i64 %Index, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_swmmac_f32_16x16x128_fp8_fp8:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_swmmac_f32_16x16x128_fp8_fp8 v[24:31], v[0:7], v[8:23], v[32:33] matrix_b_reuse
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[34:35], v[28:31], off offset:16
@@ -2479,6 +2704,7 @@ define amdgpu_ps void @test_swmmac_f32_16x16x128_fp8_fp8(<8 x i32> %A, <16 x i32
 ;
 ; GISEL-LABEL: test_swmmac_f32_16x16x128_fp8_fp8:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_swmmac_f32_16x16x128_fp8_fp8 v[24:31], v[0:7], v[8:23], v[32:33] matrix_b_reuse
 ; GISEL-NEXT:    s_clause 0x1
 ; GISEL-NEXT:    global_store_b128 v[34:35], v[24:27], off
@@ -2493,6 +2719,7 @@ bb:
 define amdgpu_ps void @test_swmmac_f32_16x16x128_fp8_bf8(<8 x i32> %A, <16 x i32> %B, <8 x float> %C, i64 %Index, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_swmmac_f32_16x16x128_fp8_bf8:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_swmmac_f32_16x16x128_fp8_bf8 v[24:31], v[0:7], v[8:23], v[32:33] matrix_b_reuse
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[34:35], v[28:31], off offset:16
@@ -2501,6 +2728,7 @@ define amdgpu_ps void @test_swmmac_f32_16x16x128_fp8_bf8(<8 x i32> %A, <16 x i32
 ;
 ; GISEL-LABEL: test_swmmac_f32_16x16x128_fp8_bf8:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_swmmac_f32_16x16x128_fp8_bf8 v[24:31], v[0:7], v[8:23], v[32:33] matrix_b_reuse
 ; GISEL-NEXT:    s_clause 0x1
 ; GISEL-NEXT:    global_store_b128 v[34:35], v[24:27], off
@@ -2515,6 +2743,7 @@ bb:
 define amdgpu_ps void @test_swmmac_f32_16x16x128_bf8_fp8(<8 x i32> %A, <16 x i32> %B, <8 x float> %C, i64 %Index, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_swmmac_f32_16x16x128_bf8_fp8:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_swmmac_f32_16x16x128_bf8_fp8 v[24:31], v[0:7], v[8:23], v[32:33] matrix_b_reuse
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[34:35], v[28:31], off offset:16
@@ -2523,6 +2752,7 @@ define amdgpu_ps void @test_swmmac_f32_16x16x128_bf8_fp8(<8 x i32> %A, <16 x i32
 ;
 ; GISEL-LABEL: test_swmmac_f32_16x16x128_bf8_fp8:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_swmmac_f32_16x16x128_bf8_fp8 v[24:31], v[0:7], v[8:23], v[32:33] matrix_b_reuse
 ; GISEL-NEXT:    s_clause 0x1
 ; GISEL-NEXT:    global_store_b128 v[34:35], v[24:27], off
@@ -2537,6 +2767,7 @@ bb:
 define amdgpu_ps void @test_swmmac_f32_16x16x128_bf8_bf8(<8 x i32> %A, <16 x i32> %B, <8 x float> %C, i64 %Index, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_swmmac_f32_16x16x128_bf8_bf8:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_swmmac_f32_16x16x128_bf8_bf8 v[24:31], v[0:7], v[8:23], v[32:33] matrix_b_reuse
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[34:35], v[28:31], off offset:16
@@ -2545,6 +2776,7 @@ define amdgpu_ps void @test_swmmac_f32_16x16x128_bf8_bf8(<8 x i32> %A, <16 x i32
 ;
 ; GISEL-LABEL: test_swmmac_f32_16x16x128_bf8_bf8:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_swmmac_f32_16x16x128_bf8_bf8 v[24:31], v[0:7], v[8:23], v[32:33] matrix_b_reuse
 ; GISEL-NEXT:    s_clause 0x1
 ; GISEL-NEXT:    global_store_b128 v[34:35], v[24:27], off
@@ -2559,6 +2791,7 @@ bb:
 define amdgpu_ps void @test_swmmac_f16_16x16x128_fp8_fp8(<8 x i32> %A, <16 x i32> %B, <8 x half> %C, i16 %Index, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_swmmac_f16_16x16x128_fp8_fp8:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_dual_mov_b32 v31, v30 :: v_dual_mov_b32 v30, v29
 ; GFX1250-NEXT:    v_mov_b32_e32 v29, 0
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
@@ -2568,6 +2801,7 @@ define amdgpu_ps void @test_swmmac_f16_16x16x128_fp8_fp8(<8 x i32> %A, <16 x i32
 ;
 ; GISEL-LABEL: test_swmmac_f16_16x16x128_fp8_fp8:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_swmmac_f16_16x16x128_fp8_fp8 v[24:27], v[0:7], v[8:23], v[28:29] matrix_b_reuse
 ; GISEL-NEXT:    v_dual_mov_b32 v32, v29 :: v_dual_mov_b32 v33, v30
 ; GISEL-NEXT:    global_store_b128 v[32:33], v[24:27], off
@@ -2581,6 +2815,7 @@ bb:
 define amdgpu_ps void @test_swmmac_f16_16x16x128_fp8_bf8(<8 x i32> %A, <16 x i32> %B, <8 x half> %C, i16 %Index, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_swmmac_f16_16x16x128_fp8_bf8:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_dual_mov_b32 v31, v30 :: v_dual_mov_b32 v30, v29
 ; GFX1250-NEXT:    v_mov_b32_e32 v29, 0
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
@@ -2590,6 +2825,7 @@ define amdgpu_ps void @test_swmmac_f16_16x16x128_fp8_bf8(<8 x i32> %A, <16 x i32
 ;
 ; GISEL-LABEL: test_swmmac_f16_16x16x128_fp8_bf8:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_swmmac_f16_16x16x128_fp8_bf8 v[24:27], v[0:7], v[8:23], v[28:29] matrix_b_reuse
 ; GISEL-NEXT:    v_dual_mov_b32 v32, v29 :: v_dual_mov_b32 v33, v30
 ; GISEL-NEXT:    global_store_b128 v[32:33], v[24:27], off
@@ -2603,6 +2839,7 @@ bb:
 define amdgpu_ps void @test_swmmac_f16_16x16x128_bf8_fp8(<8 x i32> %A, <16 x i32> %B, <8 x half> %C, i16 %Index, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_swmmac_f16_16x16x128_bf8_fp8:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_dual_mov_b32 v31, v30 :: v_dual_mov_b32 v30, v29
 ; GFX1250-NEXT:    v_mov_b32_e32 v29, 0
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
@@ -2612,6 +2849,7 @@ define amdgpu_ps void @test_swmmac_f16_16x16x128_bf8_fp8(<8 x i32> %A, <16 x i32
 ;
 ; GISEL-LABEL: test_swmmac_f16_16x16x128_bf8_fp8:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_swmmac_f16_16x16x128_bf8_fp8 v[24:27], v[0:7], v[8:23], v[28:29] matrix_b_reuse
 ; GISEL-NEXT:    v_dual_mov_b32 v32, v29 :: v_dual_mov_b32 v33, v30
 ; GISEL-NEXT:    global_store_b128 v[32:33], v[24:27], off
@@ -2625,6 +2863,7 @@ bb:
 define amdgpu_ps void @test_swmmac_f16_16x16x128_bf8_bf8(<8 x i32> %A, <16 x i32> %B, <8 x half> %C, i16 %Index, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_swmmac_f16_16x16x128_bf8_bf8:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_dual_mov_b32 v31, v30 :: v_dual_mov_b32 v30, v29
 ; GFX1250-NEXT:    v_mov_b32_e32 v29, 0
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
@@ -2634,6 +2873,7 @@ define amdgpu_ps void @test_swmmac_f16_16x16x128_bf8_bf8(<8 x i32> %A, <16 x i32
 ;
 ; GISEL-LABEL: test_swmmac_f16_16x16x128_bf8_bf8:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_swmmac_f16_16x16x128_bf8_bf8 v[24:27], v[0:7], v[8:23], v[28:29] matrix_b_reuse
 ; GISEL-NEXT:    v_dual_mov_b32 v32, v29 :: v_dual_mov_b32 v33, v30
 ; GISEL-NEXT:    global_store_b128 v[32:33], v[24:27], off
@@ -2647,6 +2887,7 @@ bb:
 define amdgpu_ps void @test_swmmac_i32_16x16x128_iu8(<8 x i32> %A, <16 x i32> %B, <8 x i32> %C, i64 %Index, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_swmmac_i32_16x16x128_iu8:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_swmmac_i32_16x16x128_iu8 v[24:31], v[0:7], v[8:23], v[32:33] matrix_b_reuse
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[34:35], v[28:31], off offset:16
@@ -2655,6 +2896,7 @@ define amdgpu_ps void @test_swmmac_i32_16x16x128_iu8(<8 x i32> %A, <16 x i32> %B
 ;
 ; GISEL-LABEL: test_swmmac_i32_16x16x128_iu8:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_swmmac_i32_16x16x128_iu8 v[24:31], v[0:7], v[8:23], v[32:33] matrix_b_reuse
 ; GISEL-NEXT:    s_clause 0x1
 ; GISEL-NEXT:    global_store_b128 v[34:35], v[24:27], off
@@ -2669,6 +2911,7 @@ bb:
 define amdgpu_ps void @test_swmmac_f32_16x16x64_f16(<16 x half> %A, <32 x half> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_swmmac_f32_16x16x64_f16:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_swmmac_f32_16x16x64_f16 v[24:31], v[0:7], v[8:23], v32 matrix_b_reuse
 ; GFX1250-NEXT:    v_dual_mov_b32 v35, v34 :: v_dual_mov_b32 v34, v33
 ; GFX1250-NEXT:    s_clause 0x1
@@ -2678,6 +2921,7 @@ define amdgpu_ps void @test_swmmac_f32_16x16x64_f16(<16 x half> %A, <32 x half>
 ;
 ; GISEL-LABEL: test_swmmac_f32_16x16x64_f16:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_swmmac_f32_16x16x64_f16 v[24:31], v[0:7], v[8:23], v32 matrix_b_reuse
 ; GISEL-NEXT:    v_dual_mov_b32 v36, v33 :: v_dual_mov_b32 v37, v34
 ; GISEL-NEXT:    s_clause 0x1
@@ -2693,6 +2937,7 @@ bb:
 define amdgpu_ps void @test_swmmac_f16_16x16x64_f16(<16 x half> %A, <32 x half> %B, <8 x half> %C, i16 %Index, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_swmmac_f16_16x16x64_f16:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_swmmac_f16_16x16x64_f16 v[24:27], v[0:7], v[8:23], v28 matrix_b_reuse
 ; GFX1250-NEXT:    v_dual_mov_b32 v31, v30 :: v_dual_mov_b32 v30, v29
 ; GFX1250-NEXT:    global_store_b128 v[30:31], v[24:27], off
@@ -2700,6 +2945,7 @@ define amdgpu_ps void @test_swmmac_f16_16x16x64_f16(<16 x half> %A, <32 x half>
 ;
 ; GISEL-LABEL: test_swmmac_f16_16x16x64_f16:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_swmmac_f16_16x16x64_f16 v[24:27], v[0:7], v[8:23], v28 matrix_b_reuse
 ; GISEL-NEXT:    v_dual_mov_b32 v32, v29 :: v_dual_mov_b32 v33, v30
 ; GISEL-NEXT:    global_store_b128 v[32:33], v[24:27], off

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.imm.gfx1250.w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.imm.gfx1250.w32.ll
index dc477992db81e..037e26087eaa5 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.imm.gfx1250.w32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.imm.gfx1250.w32.ll
@@ -5,6 +5,7 @@
 define amdgpu_ps void @test_wmma_f32_16x16x4_f32(<2 x float> %A, <2 x float> %B, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f32_16x16x4_f32:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_f32_16x16x4_f32 v[6:13], v[0:1], v[2:3], 1.0
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[4:5], v[10:13], off offset:16
@@ -13,6 +14,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x4_f32(<2 x float> %A, <2 x float> %B,
 ;
 ; GISEL-LABEL: test_wmma_f32_16x16x4_f32:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_f32_16x16x4_f32 v[6:13], v[0:1], v[2:3], 1.0
 ; GISEL-NEXT:    s_clause 0x1
 ; GISEL-NEXT:    global_store_b128 v[4:5], v[6:9], off
@@ -27,6 +29,7 @@ bb:
 define amdgpu_ps void @test_wmma_f32_16x16x4_f32_non_splat(<2 x float> %A, <2 x float> %B, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f32_16x16x4_f32_non_splat:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_dual_mov_b32 v6, 1.0 :: v_dual_mov_b32 v8, 2.0
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
 ; GFX1250-NEXT:    v_dual_mov_b32 v7, v6 :: v_dual_mov_b32 v9, v6
@@ -40,6 +43,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x4_f32_non_splat(<2 x float> %A, <2 x
 ;
 ; GISEL-LABEL: test_wmma_f32_16x16x4_f32_non_splat:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    s_mov_b32 s0, 1.0
 ; GISEL-NEXT:    s_mov_b32 s2, 2.0
 ; GISEL-NEXT:    s_mov_b32 s6, s0
@@ -67,6 +71,7 @@ bb:
 define amdgpu_ps void @test_wmma_f32_16x16x4_f32_non_inlineable(<2 x float> %A, <2 x float> %B, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f32_16x16x4_f32_non_inlineable:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v6, 0x40400000
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
 ; GFX1250-NEXT:    v_dual_mov_b32 v7, v6 :: v_dual_mov_b32 v8, v6
@@ -81,6 +86,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x4_f32_non_inlineable(<2 x float> %A,
 ;
 ; GISEL-LABEL: test_wmma_f32_16x16x4_f32_non_inlineable:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    s_mov_b32 s0, 0x40400000
 ; GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GISEL-NEXT:    s_mov_b32 s6, s0
@@ -109,6 +115,7 @@ bb:
 define amdgpu_ps void @test_wmma_f32_16x16x32_bf16(<16 x bfloat> %A, <16 x bfloat> %B, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f32_16x16x32_bf16:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_f32_16x16x32_bf16 v[18:25], v[0:7], v[8:15], 1.0
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[16:17], v[22:25], off offset:16
@@ -117,6 +124,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x32_bf16(<16 x bfloat> %A, <16 x bfloa
 ;
 ; GISEL-LABEL: test_wmma_f32_16x16x32_bf16:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_f32_16x16x32_bf16 v[18:25], v[0:7], v[8:15], 1.0
 ; GISEL-NEXT:    s_clause 0x1
 ; GISEL-NEXT:    global_store_b128 v[16:17], v[22:25], off offset:16
@@ -131,6 +139,7 @@ bb:
 define amdgpu_ps void @test_wmma_f32_16x16x32_bf16_non_splat(<16 x bfloat> %A, <16 x bfloat> %B, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f32_16x16x32_bf16_non_splat:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_dual_mov_b32 v18, 1.0 :: v_dual_mov_b32 v20, 2.0
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
 ; GFX1250-NEXT:    v_dual_mov_b32 v19, v18 :: v_dual_mov_b32 v21, v18
@@ -144,6 +153,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x32_bf16_non_splat(<16 x bfloat> %A, <
 ;
 ; GISEL-LABEL: test_wmma_f32_16x16x32_bf16_non_splat:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_dual_mov_b32 v18, 1.0 :: v_dual_mov_b32 v20, 2.0
 ; GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
 ; GISEL-NEXT:    v_dual_mov_b32 v19, v18 :: v_dual_mov_b32 v21, v18
@@ -163,6 +173,7 @@ bb:
 define amdgpu_ps void @test_wmma_f32_16x16x32_bf16_non_inlineable(<16 x bfloat> %A, <16 x bfloat> %B, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f32_16x16x32_bf16_non_inlineable:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v18, 0x40400000
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
 ; GFX1250-NEXT:    v_dual_mov_b32 v19, v18 :: v_dual_mov_b32 v20, v18
@@ -177,6 +188,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x32_bf16_non_inlineable(<16 x bfloat>
 ;
 ; GISEL-LABEL: test_wmma_f32_16x16x32_bf16_non_inlineable:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_mov_b32_e32 v18, 0x40400000
 ; GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
 ; GISEL-NEXT:    v_dual_mov_b32 v19, v18 :: v_dual_mov_b32 v20, v18
@@ -197,12 +209,14 @@ bb:
 define amdgpu_ps void @test_wmma_bf16_16x16x32_bf16(<16 x bfloat> %A, <16 x bfloat> %B, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_bf16_16x16x32_bf16:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_bf16_16x16x32_bf16 v[18:21], v[0:7], v[8:15], 1.0
 ; GFX1250-NEXT:    global_store_b128 v[16:17], v[18:21], off
 ; GFX1250-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: test_wmma_bf16_16x16x32_bf16:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_bf16_16x16x32_bf16 v[18:21], v[0:7], v[8:15], 1.0
 ; GISEL-NEXT:    global_store_b128 v[16:17], v[18:21], off
 ; GISEL-NEXT:    s_endpgm
@@ -215,6 +229,7 @@ bb:
 define amdgpu_ps void @test_wmma_bf16_16x16x32_bf16_non_splat(<16 x bfloat> %A, <16 x bfloat> %B, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_bf16_16x16x32_bf16_non_splat:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_dual_mov_b32 v18, 0x3f803f80 :: v_dual_mov_b32 v19, 1.0
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX1250-NEXT:    v_dual_mov_b32 v20, v18 :: v_dual_mov_b32 v21, v18
@@ -224,6 +239,7 @@ define amdgpu_ps void @test_wmma_bf16_16x16x32_bf16_non_splat(<16 x bfloat> %A,
 ;
 ; GISEL-LABEL: test_wmma_bf16_16x16x32_bf16_non_splat:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_dual_mov_b32 v18, 0x3f803f80 :: v_dual_mov_b32 v19, 1.0
 ; GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GISEL-NEXT:    v_dual_mov_b32 v20, v18 :: v_dual_mov_b32 v21, v18
@@ -239,6 +255,7 @@ bb:
 define amdgpu_ps void @test_wmma_bf16_16x16x32_bf16_non_inlineable(<16 x bfloat> %A, <16 x bfloat> %B, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_bf16_16x16x32_bf16_non_inlineable:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v18, 0x3fc03fc0
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1250-NEXT:    v_dual_mov_b32 v19, v18 :: v_dual_mov_b32 v20, v18
@@ -249,6 +266,7 @@ define amdgpu_ps void @test_wmma_bf16_16x16x32_bf16_non_inlineable(<16 x bfloat>
 ;
 ; GISEL-LABEL: test_wmma_bf16_16x16x32_bf16_non_inlineable:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_mov_b32_e32 v18, 0x3fc03fc0
 ; GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GISEL-NEXT:    v_dual_mov_b32 v19, v18 :: v_dual_mov_b32 v20, v18
@@ -265,12 +283,14 @@ bb:
 define amdgpu_ps void @test_wmma_bf16f32_16x16x32_bf16(<16 x bfloat> %A, <16 x bfloat> %B, <8 x float> %C, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_bf16f32_16x16x32_bf16:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_bf16f32_16x16x32_bf16 v[18:21], v[0:7], v[8:15], 1.0
 ; GFX1250-NEXT:    global_store_b128 v[16:17], v[18:21], off
 ; GFX1250-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: test_wmma_bf16f32_16x16x32_bf16:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_bf16f32_16x16x32_bf16 v[18:21], v[0:7], v[8:15], 1.0
 ; GISEL-NEXT:    global_store_b128 v[16:17], v[18:21], off
 ; GISEL-NEXT:    s_endpgm
@@ -283,6 +303,7 @@ bb:
 define amdgpu_ps void @test_wmma_bf16f32_16x16x32_bf16_non_splat(<16 x bfloat> %A, <16 x bfloat> %B, <8 x float> %C, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_bf16f32_16x16x32_bf16_non_splat:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_dual_mov_b32 v18, 1.0 :: v_dual_mov_b32 v20, 2.0
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
 ; GFX1250-NEXT:    v_dual_mov_b32 v19, v18 :: v_dual_mov_b32 v21, v18
@@ -294,6 +315,7 @@ define amdgpu_ps void @test_wmma_bf16f32_16x16x32_bf16_non_splat(<16 x bfloat> %
 ;
 ; GISEL-LABEL: test_wmma_bf16f32_16x16x32_bf16_non_splat:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_dual_mov_b32 v18, 1.0 :: v_dual_mov_b32 v20, 2.0
 ; GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
 ; GISEL-NEXT:    v_dual_mov_b32 v19, v18 :: v_dual_mov_b32 v21, v18
@@ -311,6 +333,7 @@ bb:
 define amdgpu_ps void @test_wmma_bf16f32_16x16x32_bf16_non_inlinable(<16 x bfloat> %A, <16 x bfloat> %B, <8 x float> %C, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_bf16f32_16x16x32_bf16_non_inlinable:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v18, 0x40400000
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
 ; GFX1250-NEXT:    v_dual_mov_b32 v19, v18 :: v_dual_mov_b32 v20, v18
@@ -323,6 +346,7 @@ define amdgpu_ps void @test_wmma_bf16f32_16x16x32_bf16_non_inlinable(<16 x bfloa
 ;
 ; GISEL-LABEL: test_wmma_bf16f32_16x16x32_bf16_non_inlinable:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_mov_b32_e32 v18, 0x40400000
 ; GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
 ; GISEL-NEXT:    v_dual_mov_b32 v19, v18 :: v_dual_mov_b32 v20, v18
@@ -341,6 +365,7 @@ bb:
 define amdgpu_ps void @test_wmma_f32_16x16x64_fp8_fp8(<8 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f32_16x16x64_fp8_fp8:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_f32_16x16x64_fp8_fp8 v[18:25], v[0:7], v[8:15], 1.0
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[16:17], v[22:25], off offset:16
@@ -349,6 +374,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x64_fp8_fp8(<8 x i32> %A, <8 x i32> %B
 ;
 ; GISEL-LABEL: test_wmma_f32_16x16x64_fp8_fp8:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_f32_16x16x64_fp8_fp8 v[18:25], v[0:7], v[8:15], 1.0
 ; GISEL-NEXT:    s_clause 0x1
 ; GISEL-NEXT:    global_store_b128 v[16:17], v[18:21], off
@@ -363,6 +389,7 @@ bb:
 define amdgpu_ps void @test_wmma_f32_16x16x64_fp8_fp8_non_splat(<8 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f32_16x16x64_fp8_fp8_non_splat:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_dual_mov_b32 v18, 1.0 :: v_dual_mov_b32 v20, 2.0
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
 ; GFX1250-NEXT:    v_dual_mov_b32 v19, v18 :: v_dual_mov_b32 v21, v18
@@ -376,6 +403,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x64_fp8_fp8_non_splat(<8 x i32> %A, <8
 ;
 ; GISEL-LABEL: test_wmma_f32_16x16x64_fp8_fp8_non_splat:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    s_mov_b32 s0, 1.0
 ; GISEL-NEXT:    s_mov_b32 s2, 2.0
 ; GISEL-NEXT:    s_mov_b32 s6, s0
@@ -403,6 +431,7 @@ bb:
 define amdgpu_ps void @test_wmma_f32_16x16x64_fp8_fp8_non_inlineable(<8 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f32_16x16x64_fp8_fp8_non_inlineable:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v18, 0x40400000
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
 ; GFX1250-NEXT:    v_dual_mov_b32 v19, v18 :: v_dual_mov_b32 v20, v18
@@ -417,6 +446,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x64_fp8_fp8_non_inlineable(<8 x i32> %
 ;
 ; GISEL-LABEL: test_wmma_f32_16x16x64_fp8_fp8_non_inlineable:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    s_mov_b32 s0, 0x40400000
 ; GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GISEL-NEXT:    s_mov_b32 s6, s0
@@ -445,6 +475,7 @@ bb:
 define amdgpu_ps void @test_wmma_f32_16x16x64_fp8_bf8(<8 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f32_16x16x64_fp8_bf8:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_f32_16x16x64_fp8_bf8 v[18:25], v[0:7], v[8:15], 1.0
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[16:17], v[22:25], off offset:16
@@ -453,6 +484,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x64_fp8_bf8(<8 x i32> %A, <8 x i32> %B
 ;
 ; GISEL-LABEL: test_wmma_f32_16x16x64_fp8_bf8:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_f32_16x16x64_fp8_bf8 v[18:25], v[0:7], v[8:15], 1.0
 ; GISEL-NEXT:    s_clause 0x1
 ; GISEL-NEXT:    global_store_b128 v[16:17], v[18:21], off
@@ -467,6 +499,7 @@ bb:
 define amdgpu_ps void @test_wmma_f32_16x16x64_fp8_bf8_non_splat(<8 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f32_16x16x64_fp8_bf8_non_splat:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_dual_mov_b32 v18, 1.0 :: v_dual_mov_b32 v20, 2.0
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
 ; GFX1250-NEXT:    v_dual_mov_b32 v19, v18 :: v_dual_mov_b32 v21, v18
@@ -480,6 +513,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x64_fp8_bf8_non_splat(<8 x i32> %A, <8
 ;
 ; GISEL-LABEL: test_wmma_f32_16x16x64_fp8_bf8_non_splat:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    s_mov_b32 s0, 1.0
 ; GISEL-NEXT:    s_mov_b32 s2, 2.0
 ; GISEL-NEXT:    s_mov_b32 s6, s0
@@ -507,6 +541,7 @@ bb:
 define amdgpu_ps void @test_wmma_f32_16x16x64_fp8_bf8_non_inlineable(<8 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f32_16x16x64_fp8_bf8_non_inlineable:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v18, 0x40400000
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
 ; GFX1250-NEXT:    v_dual_mov_b32 v19, v18 :: v_dual_mov_b32 v20, v18
@@ -521,6 +556,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x64_fp8_bf8_non_inlineable(<8 x i32> %
 ;
 ; GISEL-LABEL: test_wmma_f32_16x16x64_fp8_bf8_non_inlineable:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    s_mov_b32 s0, 0x40400000
 ; GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GISEL-NEXT:    s_mov_b32 s6, s0
@@ -549,6 +585,7 @@ bb:
 define amdgpu_ps void @test_wmma_f32_16x16x64_bf8_fp8(<8 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f32_16x16x64_bf8_fp8:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_f32_16x16x64_bf8_fp8 v[18:25], v[0:7], v[8:15], 1.0
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[16:17], v[22:25], off offset:16
@@ -557,6 +594,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x64_bf8_fp8(<8 x i32> %A, <8 x i32> %B
 ;
 ; GISEL-LABEL: test_wmma_f32_16x16x64_bf8_fp8:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_f32_16x16x64_bf8_fp8 v[18:25], v[0:7], v[8:15], 1.0
 ; GISEL-NEXT:    s_clause 0x1
 ; GISEL-NEXT:    global_store_b128 v[16:17], v[18:21], off
@@ -571,6 +609,7 @@ bb:
 define amdgpu_ps void @test_wmma_f32_16x16x64_bf8_fp8_non_splat(<8 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f32_16x16x64_bf8_fp8_non_splat:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_dual_mov_b32 v18, 1.0 :: v_dual_mov_b32 v20, 2.0
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
 ; GFX1250-NEXT:    v_dual_mov_b32 v19, v18 :: v_dual_mov_b32 v21, v18
@@ -584,6 +623,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x64_bf8_fp8_non_splat(<8 x i32> %A, <8
 ;
 ; GISEL-LABEL: test_wmma_f32_16x16x64_bf8_fp8_non_splat:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    s_mov_b32 s0, 1.0
 ; GISEL-NEXT:    s_mov_b32 s2, 2.0
 ; GISEL-NEXT:    s_mov_b32 s6, s0
@@ -611,6 +651,7 @@ bb:
 define amdgpu_ps void @test_wmma_f32_16x16x64_bf8_fp8_non_inlineable(<8 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f32_16x16x64_bf8_fp8_non_inlineable:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v18, 0x40400000
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
 ; GFX1250-NEXT:    v_dual_mov_b32 v19, v18 :: v_dual_mov_b32 v20, v18
@@ -625,6 +666,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x64_bf8_fp8_non_inlineable(<8 x i32> %
 ;
 ; GISEL-LABEL: test_wmma_f32_16x16x64_bf8_fp8_non_inlineable:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    s_mov_b32 s0, 0x40400000
 ; GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GISEL-NEXT:    s_mov_b32 s6, s0
@@ -653,6 +695,7 @@ bb:
 define amdgpu_ps void @test_wmma_f32_16x16x64_bf8_bf8(<8 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f32_16x16x64_bf8_bf8:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_f32_16x16x64_bf8_bf8 v[18:25], v[0:7], v[8:15], 1.0
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[16:17], v[22:25], off offset:16
@@ -661,6 +704,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x64_bf8_bf8(<8 x i32> %A, <8 x i32> %B
 ;
 ; GISEL-LABEL: test_wmma_f32_16x16x64_bf8_bf8:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_f32_16x16x64_bf8_bf8 v[18:25], v[0:7], v[8:15], 1.0
 ; GISEL-NEXT:    s_clause 0x1
 ; GISEL-NEXT:    global_store_b128 v[16:17], v[18:21], off
@@ -675,6 +719,7 @@ bb:
 define amdgpu_ps void @test_wmma_f32_16x16x64_bf8_bf8_non_splat(<8 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f32_16x16x64_bf8_bf8_non_splat:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_dual_mov_b32 v18, 1.0 :: v_dual_mov_b32 v20, 2.0
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
 ; GFX1250-NEXT:    v_dual_mov_b32 v19, v18 :: v_dual_mov_b32 v21, v18
@@ -688,6 +733,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x64_bf8_bf8_non_splat(<8 x i32> %A, <8
 ;
 ; GISEL-LABEL: test_wmma_f32_16x16x64_bf8_bf8_non_splat:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    s_mov_b32 s0, 1.0
 ; GISEL-NEXT:    s_mov_b32 s2, 2.0
 ; GISEL-NEXT:    s_mov_b32 s6, s0
@@ -715,6 +761,7 @@ bb:
 define amdgpu_ps void @test_wmma_f32_16x16x64_bf8_bf8_non_inlineable(<8 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f32_16x16x64_bf8_bf8_non_inlineable:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v18, 0x40400000
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
 ; GFX1250-NEXT:    v_dual_mov_b32 v19, v18 :: v_dual_mov_b32 v20, v18
@@ -729,6 +776,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x64_bf8_bf8_non_inlineable(<8 x i32> %
 ;
 ; GISEL-LABEL: test_wmma_f32_16x16x64_bf8_bf8_non_inlineable:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    s_mov_b32 s0, 0x40400000
 ; GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GISEL-NEXT:    s_mov_b32 s6, s0
@@ -757,12 +805,14 @@ bb:
 define amdgpu_ps void @test_wmma_f16_16x16x64_fp8_fp8(<8 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f16_16x16x64_fp8_fp8:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_f16_16x16x64_fp8_fp8 v[18:21], v[0:7], v[8:15], 1.0
 ; GFX1250-NEXT:    global_store_b128 v[16:17], v[18:21], off
 ; GFX1250-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: test_wmma_f16_16x16x64_fp8_fp8:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_f16_16x16x64_fp8_fp8 v[18:21], v[0:7], v[8:15], 1.0
 ; GISEL-NEXT:    global_store_b128 v[16:17], v[18:21], off
 ; GISEL-NEXT:    s_endpgm
@@ -775,6 +825,7 @@ bb:
 define amdgpu_ps void @test_wmma_f16_16x16x64_fp8_fp8_non_splat(<8 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f16_16x16x64_fp8_fp8_non_splat:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v18, 0x3c003c00
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1250-NEXT:    v_dual_mov_b32 v19, 0x3c004000 :: v_dual_mov_b32 v20, v18
@@ -785,6 +836,7 @@ define amdgpu_ps void @test_wmma_f16_16x16x64_fp8_fp8_non_splat(<8 x i32> %A, <8
 ;
 ; GISEL-LABEL: test_wmma_f16_16x16x64_fp8_fp8_non_splat:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    s_mov_b32 s0, 0x3c003c00
 ; GISEL-NEXT:    s_mov_b32 s1, 0x3c004000
 ; GISEL-NEXT:    s_mov_b32 s2, s0
@@ -804,6 +856,7 @@ bb:
 define amdgpu_ps void @test_wmma_f16_16x16x64_fp8_fp8_non_inlineable(<8 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f16_16x16x64_fp8_fp8_non_inlineable:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v18, 0x42004200
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1250-NEXT:    v_dual_mov_b32 v19, v18 :: v_dual_mov_b32 v20, v18
@@ -814,6 +867,7 @@ define amdgpu_ps void @test_wmma_f16_16x16x64_fp8_fp8_non_inlineable(<8 x i32> %
 ;
 ; GISEL-LABEL: test_wmma_f16_16x16x64_fp8_fp8_non_inlineable:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    s_mov_b32 s0, 0x42004200
 ; GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
 ; GISEL-NEXT:    s_mov_b32 s2, s0
@@ -833,12 +887,14 @@ bb:
 define amdgpu_ps void @test_wmma_f16_16x16x64_fp8_bf8(<8 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f16_16x16x64_fp8_bf8:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_f16_16x16x64_fp8_bf8 v[18:21], v[0:7], v[8:15], 1.0 neg_hi:[0,0,1]
 ; GFX1250-NEXT:    global_store_b128 v[16:17], v[18:21], off
 ; GFX1250-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: test_wmma_f16_16x16x64_fp8_bf8:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_f16_16x16x64_fp8_bf8 v[18:21], v[0:7], v[8:15], 1.0 neg_hi:[0,0,1]
 ; GISEL-NEXT:    global_store_b128 v[16:17], v[18:21], off
 ; GISEL-NEXT:    s_endpgm
@@ -851,6 +907,7 @@ bb:
 define amdgpu_ps void @test_wmma_f16_16x16x64_fp8_bf8_non_splat(<8 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f16_16x16x64_fp8_bf8_non_splat:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v18, 0x3c003c00
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1250-NEXT:    v_dual_mov_b32 v19, 0x3c004000 :: v_dual_mov_b32 v20, v18
@@ -861,6 +918,7 @@ define amdgpu_ps void @test_wmma_f16_16x16x64_fp8_bf8_non_splat(<8 x i32> %A, <8
 ;
 ; GISEL-LABEL: test_wmma_f16_16x16x64_fp8_bf8_non_splat:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    s_mov_b32 s0, 0x3c003c00
 ; GISEL-NEXT:    s_mov_b32 s1, 0x3c004000
 ; GISEL-NEXT:    s_mov_b32 s2, s0
@@ -880,6 +938,7 @@ bb:
 define amdgpu_ps void @test_wmma_f16_16x16x64_fp8_bf8_non_inlineable(<8 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f16_16x16x64_fp8_bf8_non_inlineable:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v18, 0x42004200
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1250-NEXT:    v_dual_mov_b32 v19, v18 :: v_dual_mov_b32 v20, v18
@@ -890,6 +949,7 @@ define amdgpu_ps void @test_wmma_f16_16x16x64_fp8_bf8_non_inlineable(<8 x i32> %
 ;
 ; GISEL-LABEL: test_wmma_f16_16x16x64_fp8_bf8_non_inlineable:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    s_mov_b32 s0, 0x42004200
 ; GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
 ; GISEL-NEXT:    s_mov_b32 s2, s0
@@ -909,12 +969,14 @@ bb:
 define amdgpu_ps void @test_wmma_f16_16x16x64_bf8_fp8(<8 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f16_16x16x64_bf8_fp8:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_f16_16x16x64_bf8_fp8 v[18:21], v[0:7], v[8:15], 1.0
 ; GFX1250-NEXT:    global_store_b128 v[16:17], v[18:21], off
 ; GFX1250-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: test_wmma_f16_16x16x64_bf8_fp8:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_f16_16x16x64_bf8_fp8 v[18:21], v[0:7], v[8:15], 1.0
 ; GISEL-NEXT:    global_store_b128 v[16:17], v[18:21], off
 ; GISEL-NEXT:    s_endpgm
@@ -927,6 +989,7 @@ bb:
 define amdgpu_ps void @test_wmma_f16_16x16x64_bf8_fp8_non_splat(<8 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f16_16x16x64_bf8_fp8_non_splat:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v18, 0x3c003c00
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1250-NEXT:    v_dual_mov_b32 v19, 0x3c004000 :: v_dual_mov_b32 v20, v18
@@ -937,6 +1000,7 @@ define amdgpu_ps void @test_wmma_f16_16x16x64_bf8_fp8_non_splat(<8 x i32> %A, <8
 ;
 ; GISEL-LABEL: test_wmma_f16_16x16x64_bf8_fp8_non_splat:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    s_mov_b32 s0, 0x3c003c00
 ; GISEL-NEXT:    s_mov_b32 s1, 0x3c004000
 ; GISEL-NEXT:    s_mov_b32 s2, s0
@@ -956,6 +1020,7 @@ bb:
 define amdgpu_ps void @test_wmma_f16_16x16x64_bf8_fp8_non_inlineable(<8 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f16_16x16x64_bf8_fp8_non_inlineable:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v18, 0x42004200
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1250-NEXT:    v_dual_mov_b32 v19, v18 :: v_dual_mov_b32 v20, v18
@@ -966,6 +1031,7 @@ define amdgpu_ps void @test_wmma_f16_16x16x64_bf8_fp8_non_inlineable(<8 x i32> %
 ;
 ; GISEL-LABEL: test_wmma_f16_16x16x64_bf8_fp8_non_inlineable:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    s_mov_b32 s0, 0x42004200
 ; GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
 ; GISEL-NEXT:    s_mov_b32 s2, s0
@@ -985,12 +1051,14 @@ bb:
 define amdgpu_ps void @test_wmma_f16_16x16x64_bf8_bf8(<8 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f16_16x16x64_bf8_bf8:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_f16_16x16x64_bf8_bf8 v[18:21], v[0:7], v[8:15], 1.0
 ; GFX1250-NEXT:    global_store_b128 v[16:17], v[18:21], off
 ; GFX1250-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: test_wmma_f16_16x16x64_bf8_bf8:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_f16_16x16x64_bf8_bf8 v[18:21], v[0:7], v[8:15], 1.0
 ; GISEL-NEXT:    global_store_b128 v[16:17], v[18:21], off
 ; GISEL-NEXT:    s_endpgm
@@ -1003,6 +1071,7 @@ bb:
 define amdgpu_ps void @test_wmma_f16_16x16x64_bf8_bf8_non_splat(<8 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f16_16x16x64_bf8_bf8_non_splat:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v18, 0x3c003c00
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1250-NEXT:    v_dual_mov_b32 v19, 0x3c004000 :: v_dual_mov_b32 v20, v18
@@ -1013,6 +1082,7 @@ define amdgpu_ps void @test_wmma_f16_16x16x64_bf8_bf8_non_splat(<8 x i32> %A, <8
 ;
 ; GISEL-LABEL: test_wmma_f16_16x16x64_bf8_bf8_non_splat:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    s_mov_b32 s0, 0x3c003c00
 ; GISEL-NEXT:    s_mov_b32 s1, 0x3c004000
 ; GISEL-NEXT:    s_mov_b32 s2, s0
@@ -1032,6 +1102,7 @@ bb:
 define amdgpu_ps void @test_wmma_f16_16x16x64_bf8_bf8_non_inlineable(<8 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f16_16x16x64_bf8_bf8_non_inlineable:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v18, 0x42004200
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1250-NEXT:    v_dual_mov_b32 v19, v18 :: v_dual_mov_b32 v20, v18
@@ -1042,6 +1113,7 @@ define amdgpu_ps void @test_wmma_f16_16x16x64_bf8_bf8_non_inlineable(<8 x i32> %
 ;
 ; GISEL-LABEL: test_wmma_f16_16x16x64_bf8_bf8_non_inlineable:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    s_mov_b32 s0, 0x42004200
 ; GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
 ; GISEL-NEXT:    s_mov_b32 s2, s0
@@ -1061,6 +1133,7 @@ bb:
 define amdgpu_ps void @test_wmma_i32_16x16x64_iu8(<8 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_i32_16x16x64_iu8:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_i32_16x16x64_iu8 v[18:25], v[0:7], v[8:15], 1
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[16:17], v[22:25], off offset:16
@@ -1069,6 +1142,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x64_iu8(<8 x i32> %A, <8 x i32> %B, pt
 ;
 ; GISEL-LABEL: test_wmma_i32_16x16x64_iu8:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_i32_16x16x64_iu8 v[18:25], v[0:7], v[8:15], 1
 ; GISEL-NEXT:    s_clause 0x1
 ; GISEL-NEXT:    global_store_b128 v[16:17], v[18:21], off
@@ -1083,6 +1157,7 @@ bb:
 define amdgpu_ps void @test_wmma_i32_16x16x64_iu8_non_splat(<8 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_i32_16x16x64_iu8_non_splat:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_dual_mov_b32 v18, 1 :: v_dual_mov_b32 v20, 2
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
 ; GFX1250-NEXT:    v_dual_mov_b32 v19, v18 :: v_dual_mov_b32 v21, v18
@@ -1096,6 +1171,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x64_iu8_non_splat(<8 x i32> %A, <8 x i
 ;
 ; GISEL-LABEL: test_wmma_i32_16x16x64_iu8_non_splat:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    s_mov_b32 s0, 1
 ; GISEL-NEXT:    s_mov_b32 s2, 2
 ; GISEL-NEXT:    s_mov_b32 s6, s0
@@ -1123,6 +1199,7 @@ bb:
 define amdgpu_ps void @test_wmma_i32_16x16x64_iu8_non_inlineable(<8 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_i32_16x16x64_iu8_non_inlineable:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v18, 0x80
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
 ; GFX1250-NEXT:    v_dual_mov_b32 v19, v18 :: v_dual_mov_b32 v20, v18
@@ -1137,6 +1214,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x64_iu8_non_inlineable(<8 x i32> %A, <
 ;
 ; GISEL-LABEL: test_wmma_i32_16x16x64_iu8_non_inlineable:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    s_movk_i32 s0, 0x80
 ; GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GISEL-NEXT:    s_mov_b32 s6, s0
@@ -1165,6 +1243,7 @@ bb:
 define amdgpu_ps void @test_wmma_f32_16x16x32_f16(<16 x half> %A, <16 x half> %B, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f32_16x16x32_f16:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_f32_16x16x32_f16 v[18:25], v[0:7], v[8:15], 1.0
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[16:17], v[22:25], off offset:16
@@ -1173,6 +1252,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x32_f16(<16 x half> %A, <16 x half> %B
 ;
 ; GISEL-LABEL: test_wmma_f32_16x16x32_f16:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_f32_16x16x32_f16 v[18:25], v[0:7], v[8:15], 1.0
 ; GISEL-NEXT:    s_clause 0x1
 ; GISEL-NEXT:    global_store_b128 v[16:17], v[18:21], off
@@ -1187,6 +1267,7 @@ bb:
 define amdgpu_ps void @test_wmma_f32_16x16x32_f16_non_splat(<16 x half> %A, <16 x half> %B, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f32_16x16x32_f16_non_splat:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_dual_mov_b32 v18, 1.0 :: v_dual_mov_b32 v20, 2.0
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
 ; GFX1250-NEXT:    v_dual_mov_b32 v19, v18 :: v_dual_mov_b32 v21, v18
@@ -1200,6 +1281,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x32_f16_non_splat(<16 x half> %A, <16
 ;
 ; GISEL-LABEL: test_wmma_f32_16x16x32_f16_non_splat:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    s_mov_b32 s0, 1.0
 ; GISEL-NEXT:    s_mov_b32 s2, 2.0
 ; GISEL-NEXT:    s_mov_b32 s6, s0
@@ -1227,6 +1309,7 @@ bb:
 define amdgpu_ps void @test_wmma_f32_16x16x32_f16_non_inlineable(<16 x half> %A, <16 x half> %B, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f32_16x16x32_f16_non_inlineable:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v18, 0x40400000
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
 ; GFX1250-NEXT:    v_dual_mov_b32 v19, v18 :: v_dual_mov_b32 v20, v18
@@ -1241,6 +1324,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x32_f16_non_inlineable(<16 x half> %A,
 ;
 ; GISEL-LABEL: test_wmma_f32_16x16x32_f16_non_inlineable:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    s_mov_b32 s0, 0x40400000
 ; GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GISEL-NEXT:    s_mov_b32 s6, s0
@@ -1269,12 +1353,14 @@ bb:
 define amdgpu_ps void @test_wmma_f16_16x16x32_f16(<16 x half> %A, <16 x half> %B, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f16_16x16x32_f16:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_f16_16x16x32_f16 v[18:21], v[0:7], v[8:15], 1.0
 ; GFX1250-NEXT:    global_store_b128 v[16:17], v[18:21], off
 ; GFX1250-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: test_wmma_f16_16x16x32_f16:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_f16_16x16x32_f16 v[18:21], v[0:7], v[8:15], 1.0
 ; GISEL-NEXT:    global_store_b128 v[16:17], v[18:21], off
 ; GISEL-NEXT:    s_endpgm
@@ -1287,6 +1373,7 @@ bb:
 define amdgpu_ps void @test_wmma_f16_16x16x32_f16_non_splat(<16 x half> %A, <16 x half> %B, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f16_16x16x32_f16_non_splat:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v18, 0x3c003c00
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1250-NEXT:    v_dual_mov_b32 v19, 0x3c004000 :: v_dual_mov_b32 v20, v18
@@ -1297,6 +1384,7 @@ define amdgpu_ps void @test_wmma_f16_16x16x32_f16_non_splat(<16 x half> %A, <16
 ;
 ; GISEL-LABEL: test_wmma_f16_16x16x32_f16_non_splat:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    s_mov_b32 s0, 0x3c003c00
 ; GISEL-NEXT:    s_mov_b32 s1, 0x3c004000
 ; GISEL-NEXT:    s_mov_b32 s2, s0
@@ -1316,6 +1404,7 @@ bb:
 define amdgpu_ps void @test_wmma_f16_16x16x32_f16_non_inlineable(<16 x half> %A, <16 x half> %B, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f16_16x16x32_f16_non_inlineable:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v18, 0x42004200
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1250-NEXT:    v_dual_mov_b32 v19, v18 :: v_dual_mov_b32 v20, v18
@@ -1326,6 +1415,7 @@ define amdgpu_ps void @test_wmma_f16_16x16x32_f16_non_inlineable(<16 x half> %A,
 ;
 ; GISEL-LABEL: test_wmma_f16_16x16x32_f16_non_inlineable:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    s_mov_b32 s0, 0x42004200
 ; GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
 ; GISEL-NEXT:    s_mov_b32 s2, s0
@@ -1345,6 +1435,7 @@ bb:
 define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_f32_16x16x128_f8f6f4 v[34:41], v[0:15], v[16:31], 1.0
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[32:33], v[38:41], off offset:16
@@ -1353,6 +1444,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4(<16 x i32> %A, <16 x i32>
 ;
 ; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_f32_16x16x128_f8f6f4 v[34:41], v[0:15], v[16:31], 1.0
 ; GISEL-NEXT:    s_clause 0x1
 ; GISEL-NEXT:    global_store_b128 v[32:33], v[34:37], off
@@ -1367,6 +1459,7 @@ bb:
 define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_non_splat(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_non_splat:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_dual_mov_b32 v34, 1.0 :: v_dual_mov_b32 v35, 2.0
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
 ; GFX1250-NEXT:    v_dual_mov_b32 v36, v34 :: v_dual_mov_b32 v37, v34
@@ -1380,6 +1473,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_non_splat(<16 x i32> %A, <
 ;
 ; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_non_splat:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    s_mov_b32 s0, 1.0
 ; GISEL-NEXT:    s_mov_b32 s1, 2.0
 ; GISEL-NEXT:    s_mov_b32 s6, s0
@@ -1407,6 +1501,7 @@ bb:
 define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_non_inlineable(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_non_inlineable:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v34, 0x40400000
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
 ; GFX1250-NEXT:    v_dual_mov_b32 v35, v34 :: v_dual_mov_b32 v36, v34
@@ -1421,6 +1516,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_non_inlineable(<16 x i32>
 ;
 ; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_non_inlineable:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    s_mov_b32 s0, 0x40400000
 ; GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GISEL-NEXT:    s_mov_b32 s6, s0
@@ -1449,6 +1545,7 @@ bb:
 define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4(<16 x i32> %A, <16 x i32> %B, i32 inreg %scale_src0, i32 inreg %scale_src1, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_scale_f32_16x16x128_f8f6f4 v[34:41], v[0:15], v[16:31], 1.0, s0, s1 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[32:33], v[38:41], off offset:16
@@ -1457,6 +1554,7 @@ define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4(<16 x i32> %A, <16 x
 ;
 ; GISEL-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_scale_f32_16x16x128_f8f6f4 v[34:41], v[0:15], v[16:31], 1.0, s0, s1 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse
 ; GISEL-NEXT:    s_clause 0x1
 ; GISEL-NEXT:    global_store_b128 v[32:33], v[34:37], off
@@ -1471,6 +1569,7 @@ bb:
 define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_non_splat(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_non_splat:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_dual_mov_b32 v34, 1.0 :: v_dual_mov_b32 v35, 2.0
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
 ; GFX1250-NEXT:    v_dual_mov_b32 v36, v34 :: v_dual_mov_b32 v37, v34
@@ -1484,6 +1583,7 @@ define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_non_splat(<16 x i32>
 ;
 ; GISEL-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_non_splat:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    s_mov_b32 s0, 1.0
 ; GISEL-NEXT:    s_mov_b32 s1, 2.0
 ; GISEL-NEXT:    s_mov_b32 s6, s0
@@ -1511,6 +1611,7 @@ bb:
 define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_non_inlineable(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_non_inlineable:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v34, 0x40400000
 ; GFX1250-NEXT:    v_mov_b32_e32 v43, 0x64
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_1)
@@ -1526,6 +1627,7 @@ define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_non_inlineable(<16 x
 ;
 ; GISEL-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_non_inlineable:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    s_mov_b32 s0, 0x40400000
 ; GISEL-NEXT:    v_mov_b32_e32 v42, 0x64
 ; GISEL-NEXT:    s_mov_b32 s6, s0
@@ -1555,6 +1657,7 @@ bb:
 define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4(<16 x i32> %A, <16 x i32> %B, i64 inreg %scale_src0, i64 inreg %scale_src1, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_scale16_f32_16x16x128_f8f6f4 v[34:41], v[0:15], v[16:31], 1.0, s[0:1], s[2:3] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[32:33], v[38:41], off offset:16
@@ -1563,6 +1666,7 @@ define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4(<16 x i32> %A, <16
 ;
 ; GISEL-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_scale16_f32_16x16x128_f8f6f4 v[34:41], v[0:15], v[16:31], 1.0, s[0:1], s[2:3] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse
 ; GISEL-NEXT:    s_clause 0x1
 ; GISEL-NEXT:    global_store_b128 v[32:33], v[34:37], off
@@ -1577,6 +1681,7 @@ bb:
 define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_non_splat(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_non_splat:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_dual_mov_b32 v34, 1.0 :: v_dual_mov_b32 v35, 2.0
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
 ; GFX1250-NEXT:    v_dual_mov_b32 v36, v34 :: v_dual_mov_b32 v37, v34
@@ -1590,6 +1695,7 @@ define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_non_splat(<16 x i3
 ;
 ; GISEL-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_non_splat:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    s_mov_b32 s0, 1.0
 ; GISEL-NEXT:    s_mov_b32 s1, 2.0
 ; GISEL-NEXT:    s_mov_b32 s6, s0
@@ -1617,6 +1723,7 @@ bb:
 define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_non_inlineable(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_non_inlineable:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v34, 0x40400000
 ; GFX1250-NEXT:    v_mov_b64_e32 v[42:43], 0x65
 ; GFX1250-NEXT:    v_mov_b64_e32 v[44:45], 0x64
@@ -1633,6 +1740,7 @@ define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_non_inlineable(<16
 ;
 ; GISEL-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_non_inlineable:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    s_mov_b32 s0, 0x40400000
 ; GISEL-NEXT:    v_mov_b64_e32 v[42:43], 0x64
 ; GISEL-NEXT:    s_mov_b32 s6, s0
@@ -1662,12 +1770,14 @@ bb:
 define amdgpu_ps void @test_wmma_f16_16x16x128_fp8_fp8(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f16_16x16x128_fp8_fp8:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_f16_16x16x128_fp8_fp8 v[34:37], v[0:15], v[16:31], 1.0
 ; GFX1250-NEXT:    global_store_b128 v[32:33], v[34:37], off
 ; GFX1250-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: test_wmma_f16_16x16x128_fp8_fp8:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_f16_16x16x128_fp8_fp8 v[34:37], v[0:15], v[16:31], 1.0
 ; GISEL-NEXT:    global_store_b128 v[32:33], v[34:37], off
 ; GISEL-NEXT:    s_endpgm
@@ -1680,6 +1790,7 @@ bb:
 define amdgpu_ps void @test_wmma_f16_16x16x128_fp8_fp8_non_splat(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f16_16x16x128_fp8_fp8_non_splat:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v34, 0x3c003c00
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1250-NEXT:    v_dual_mov_b32 v35, 0x3c004000 :: v_dual_mov_b32 v36, v34
@@ -1690,6 +1801,7 @@ define amdgpu_ps void @test_wmma_f16_16x16x128_fp8_fp8_non_splat(<16 x i32> %A,
 ;
 ; GISEL-LABEL: test_wmma_f16_16x16x128_fp8_fp8_non_splat:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    s_mov_b32 s0, 0x3c003c00
 ; GISEL-NEXT:    s_mov_b32 s1, 0x3c004000
 ; GISEL-NEXT:    s_mov_b32 s2, s0
@@ -1709,6 +1821,7 @@ bb:
 define amdgpu_ps void @test_wmma_f16_16x16x128_fp8_fp8_non_inlineable(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f16_16x16x128_fp8_fp8_non_inlineable:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v34, 0x42004200
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1250-NEXT:    v_dual_mov_b32 v35, v34 :: v_dual_mov_b32 v36, v34
@@ -1719,6 +1832,7 @@ define amdgpu_ps void @test_wmma_f16_16x16x128_fp8_fp8_non_inlineable(<16 x i32>
 ;
 ; GISEL-LABEL: test_wmma_f16_16x16x128_fp8_fp8_non_inlineable:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    s_mov_b32 s0, 0x42004200
 ; GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
 ; GISEL-NEXT:    s_mov_b32 s2, s0
@@ -1738,12 +1852,14 @@ bb:
 define amdgpu_ps void @test_wmma_f16_16x16x128_fp8_bf8(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f16_16x16x128_fp8_bf8:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_f16_16x16x128_fp8_bf8 v[34:37], v[0:15], v[16:31], 1.0 neg_hi:[0,0,1]
 ; GFX1250-NEXT:    global_store_b128 v[32:33], v[34:37], off
 ; GFX1250-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: test_wmma_f16_16x16x128_fp8_bf8:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_f16_16x16x128_fp8_bf8 v[34:37], v[0:15], v[16:31], 1.0 neg_hi:[0,0,1]
 ; GISEL-NEXT:    global_store_b128 v[32:33], v[34:37], off
 ; GISEL-NEXT:    s_endpgm
@@ -1756,6 +1872,7 @@ bb:
 define amdgpu_ps void @test_wmma_f16_16x16x128_fp8_bf8_non_splat(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f16_16x16x128_fp8_bf8_non_splat:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v34, 0x3c003c00
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1250-NEXT:    v_dual_mov_b32 v35, 0x3c004000 :: v_dual_mov_b32 v36, v34
@@ -1766,6 +1883,7 @@ define amdgpu_ps void @test_wmma_f16_16x16x128_fp8_bf8_non_splat(<16 x i32> %A,
 ;
 ; GISEL-LABEL: test_wmma_f16_16x16x128_fp8_bf8_non_splat:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    s_mov_b32 s0, 0x3c003c00
 ; GISEL-NEXT:    s_mov_b32 s1, 0x3c004000
 ; GISEL-NEXT:    s_mov_b32 s2, s0
@@ -1785,6 +1903,7 @@ bb:
 define amdgpu_ps void @test_wmma_f16_16x16x128_fp8_bf8_non_inlineable(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f16_16x16x128_fp8_bf8_non_inlineable:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v34, 0x42004200
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1250-NEXT:    v_dual_mov_b32 v35, v34 :: v_dual_mov_b32 v36, v34
@@ -1795,6 +1914,7 @@ define amdgpu_ps void @test_wmma_f16_16x16x128_fp8_bf8_non_inlineable(<16 x i32>
 ;
 ; GISEL-LABEL: test_wmma_f16_16x16x128_fp8_bf8_non_inlineable:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    s_mov_b32 s0, 0x42004200
 ; GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
 ; GISEL-NEXT:    s_mov_b32 s2, s0
@@ -1814,12 +1934,14 @@ bb:
 define amdgpu_ps void @test_wmma_f16_16x16x128_bf8_fp8(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f16_16x16x128_bf8_fp8:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_f16_16x16x128_bf8_fp8 v[34:37], v[0:15], v[16:31], 1.0
 ; GFX1250-NEXT:    global_store_b128 v[32:33], v[34:37], off
 ; GFX1250-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: test_wmma_f16_16x16x128_bf8_fp8:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_f16_16x16x128_bf8_fp8 v[34:37], v[0:15], v[16:31], 1.0
 ; GISEL-NEXT:    global_store_b128 v[32:33], v[34:37], off
 ; GISEL-NEXT:    s_endpgm
@@ -1832,6 +1954,7 @@ bb:
 define amdgpu_ps void @test_wmma_f16_16x16x128_bf8_fp8_non_splat(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f16_16x16x128_bf8_fp8_non_splat:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v34, 0x3c003c00
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1250-NEXT:    v_dual_mov_b32 v35, 0x3c004000 :: v_dual_mov_b32 v36, v34
@@ -1842,6 +1965,7 @@ define amdgpu_ps void @test_wmma_f16_16x16x128_bf8_fp8_non_splat(<16 x i32> %A,
 ;
 ; GISEL-LABEL: test_wmma_f16_16x16x128_bf8_fp8_non_splat:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    s_mov_b32 s0, 0x3c003c00
 ; GISEL-NEXT:    s_mov_b32 s1, 0x3c004000
 ; GISEL-NEXT:    s_mov_b32 s2, s0
@@ -1861,6 +1985,7 @@ bb:
 define amdgpu_ps void @test_wmma_f16_16x16x128_bf8_fp8_non_inlineable(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f16_16x16x128_bf8_fp8_non_inlineable:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v34, 0x42004200
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1250-NEXT:    v_dual_mov_b32 v35, v34 :: v_dual_mov_b32 v36, v34
@@ -1871,6 +1996,7 @@ define amdgpu_ps void @test_wmma_f16_16x16x128_bf8_fp8_non_inlineable(<16 x i32>
 ;
 ; GISEL-LABEL: test_wmma_f16_16x16x128_bf8_fp8_non_inlineable:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    s_mov_b32 s0, 0x42004200
 ; GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
 ; GISEL-NEXT:    s_mov_b32 s2, s0
@@ -1890,12 +2016,14 @@ bb:
 define amdgpu_ps void @test_wmma_f16_16x16x128_bf8_bf8(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f16_16x16x128_bf8_bf8:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_f16_16x16x128_bf8_bf8 v[34:37], v[0:15], v[16:31], 1.0
 ; GFX1250-NEXT:    global_store_b128 v[32:33], v[34:37], off
 ; GFX1250-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: test_wmma_f16_16x16x128_bf8_bf8:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_f16_16x16x128_bf8_bf8 v[34:37], v[0:15], v[16:31], 1.0
 ; GISEL-NEXT:    global_store_b128 v[32:33], v[34:37], off
 ; GISEL-NEXT:    s_endpgm
@@ -1908,6 +2036,7 @@ bb:
 define amdgpu_ps void @test_wmma_f16_16x16x128_bf8_bf8_non_splat(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f16_16x16x128_bf8_bf8_non_splat:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v34, 0x3c003c00
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1250-NEXT:    v_dual_mov_b32 v35, 0x3c004000 :: v_dual_mov_b32 v36, v34
@@ -1918,6 +2047,7 @@ define amdgpu_ps void @test_wmma_f16_16x16x128_bf8_bf8_non_splat(<16 x i32> %A,
 ;
 ; GISEL-LABEL: test_wmma_f16_16x16x128_bf8_bf8_non_splat:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    s_mov_b32 s0, 0x3c003c00
 ; GISEL-NEXT:    s_mov_b32 s1, 0x3c004000
 ; GISEL-NEXT:    s_mov_b32 s2, s0
@@ -1937,6 +2067,7 @@ bb:
 define amdgpu_ps void @test_wmma_f16_16x16x128_bf8_bf8_non_inlineable(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f16_16x16x128_bf8_bf8_non_inlineable:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v34, 0x42004200
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX1250-NEXT:    v_dual_mov_b32 v35, v34 :: v_dual_mov_b32 v36, v34
@@ -1947,6 +2078,7 @@ define amdgpu_ps void @test_wmma_f16_16x16x128_bf8_bf8_non_inlineable(<16 x i32>
 ;
 ; GISEL-LABEL: test_wmma_f16_16x16x128_bf8_bf8_non_inlineable:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    s_mov_b32 s0, 0x42004200
 ; GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(VALU_DEP_1)
 ; GISEL-NEXT:    s_mov_b32 s2, s0
@@ -1966,6 +2098,7 @@ bb:
 define amdgpu_ps void @test_wmma_f32_16x16x128_fp8_fp8(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f32_16x16x128_fp8_fp8:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_f32_16x16x128_fp8_fp8 v[34:41], v[0:15], v[16:31], 1.0
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[32:33], v[38:41], off offset:16
@@ -1974,6 +2107,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x128_fp8_fp8(<16 x i32> %A, <16 x i32>
 ;
 ; GISEL-LABEL: test_wmma_f32_16x16x128_fp8_fp8:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_f32_16x16x128_fp8_fp8 v[34:41], v[0:15], v[16:31], 1.0
 ; GISEL-NEXT:    s_clause 0x1
 ; GISEL-NEXT:    global_store_b128 v[32:33], v[34:37], off
@@ -1988,6 +2122,7 @@ bb:
 define amdgpu_ps void @test_wmma_f32_16x16x128_fp8_fp8_non_splat(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f32_16x16x128_fp8_fp8_non_splat:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_dual_mov_b32 v34, 1.0 :: v_dual_mov_b32 v36, 2.0
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
 ; GFX1250-NEXT:    v_dual_mov_b32 v35, v34 :: v_dual_mov_b32 v37, v34
@@ -2001,6 +2136,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x128_fp8_fp8_non_splat(<16 x i32> %A,
 ;
 ; GISEL-LABEL: test_wmma_f32_16x16x128_fp8_fp8_non_splat:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    s_mov_b32 s0, 1.0
 ; GISEL-NEXT:    s_mov_b32 s2, 2.0
 ; GISEL-NEXT:    s_mov_b32 s6, s0
@@ -2028,6 +2164,7 @@ bb:
 define amdgpu_ps void @test_wmma_f32_16x16x128_fp8_fp8_non_inlineable(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f32_16x16x128_fp8_fp8_non_inlineable:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v34, 0x40400000
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
 ; GFX1250-NEXT:    v_dual_mov_b32 v35, v34 :: v_dual_mov_b32 v36, v34
@@ -2042,6 +2179,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x128_fp8_fp8_non_inlineable(<16 x i32>
 ;
 ; GISEL-LABEL: test_wmma_f32_16x16x128_fp8_fp8_non_inlineable:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    s_mov_b32 s0, 0x40400000
 ; GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GISEL-NEXT:    s_mov_b32 s6, s0
@@ -2070,6 +2208,7 @@ bb:
 define amdgpu_ps void @test_wmma_f32_16x16x128_fp8_bf8(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f32_16x16x128_fp8_bf8:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_f32_16x16x128_fp8_bf8 v[34:41], v[0:15], v[16:31], 1.0
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[32:33], v[38:41], off offset:16
@@ -2078,6 +2217,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x128_fp8_bf8(<16 x i32> %A, <16 x i32>
 ;
 ; GISEL-LABEL: test_wmma_f32_16x16x128_fp8_bf8:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_f32_16x16x128_fp8_bf8 v[34:41], v[0:15], v[16:31], 1.0
 ; GISEL-NEXT:    s_clause 0x1
 ; GISEL-NEXT:    global_store_b128 v[32:33], v[34:37], off
@@ -2092,6 +2232,7 @@ bb:
 define amdgpu_ps void @test_wmma_f32_16x16x128_fp8_bf8_non_splat(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f32_16x16x128_fp8_bf8_non_splat:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_dual_mov_b32 v34, 1.0 :: v_dual_mov_b32 v36, 2.0
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
 ; GFX1250-NEXT:    v_dual_mov_b32 v35, v34 :: v_dual_mov_b32 v37, v34
@@ -2105,6 +2246,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x128_fp8_bf8_non_splat(<16 x i32> %A,
 ;
 ; GISEL-LABEL: test_wmma_f32_16x16x128_fp8_bf8_non_splat:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    s_mov_b32 s0, 1.0
 ; GISEL-NEXT:    s_mov_b32 s2, 2.0
 ; GISEL-NEXT:    s_mov_b32 s6, s0
@@ -2132,6 +2274,7 @@ bb:
 define amdgpu_ps void @test_wmma_f32_16x16x128_fp8_bf8_non_inlineable(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f32_16x16x128_fp8_bf8_non_inlineable:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v34, 0x40400000
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
 ; GFX1250-NEXT:    v_dual_mov_b32 v35, v34 :: v_dual_mov_b32 v36, v34
@@ -2146,6 +2289,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x128_fp8_bf8_non_inlineable(<16 x i32>
 ;
 ; GISEL-LABEL: test_wmma_f32_16x16x128_fp8_bf8_non_inlineable:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    s_mov_b32 s0, 0x40400000
 ; GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GISEL-NEXT:    s_mov_b32 s6, s0
@@ -2174,6 +2318,7 @@ bb:
 define amdgpu_ps void @test_wmma_f32_16x16x128_bf8_fp8(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f32_16x16x128_bf8_fp8:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_f32_16x16x128_bf8_fp8 v[34:41], v[0:15], v[16:31], 1.0
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[32:33], v[38:41], off offset:16
@@ -2182,6 +2327,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x128_bf8_fp8(<16 x i32> %A, <16 x i32>
 ;
 ; GISEL-LABEL: test_wmma_f32_16x16x128_bf8_fp8:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_f32_16x16x128_bf8_fp8 v[34:41], v[0:15], v[16:31], 1.0
 ; GISEL-NEXT:    s_clause 0x1
 ; GISEL-NEXT:    global_store_b128 v[32:33], v[34:37], off
@@ -2196,6 +2342,7 @@ bb:
 define amdgpu_ps void @test_wmma_f32_16x16x128_bf8_fp8_non_splat(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f32_16x16x128_bf8_fp8_non_splat:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_dual_mov_b32 v34, 1.0 :: v_dual_mov_b32 v36, 2.0
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
 ; GFX1250-NEXT:    v_dual_mov_b32 v35, v34 :: v_dual_mov_b32 v37, v34
@@ -2209,6 +2356,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x128_bf8_fp8_non_splat(<16 x i32> %A,
 ;
 ; GISEL-LABEL: test_wmma_f32_16x16x128_bf8_fp8_non_splat:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    s_mov_b32 s0, 1.0
 ; GISEL-NEXT:    s_mov_b32 s2, 2.0
 ; GISEL-NEXT:    s_mov_b32 s6, s0
@@ -2236,6 +2384,7 @@ bb:
 define amdgpu_ps void @test_wmma_f32_16x16x128_bf8_fp8_non_inlineable(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f32_16x16x128_bf8_fp8_non_inlineable:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v34, 0x40400000
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
 ; GFX1250-NEXT:    v_dual_mov_b32 v35, v34 :: v_dual_mov_b32 v36, v34
@@ -2250,6 +2399,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x128_bf8_fp8_non_inlineable(<16 x i32>
 ;
 ; GISEL-LABEL: test_wmma_f32_16x16x128_bf8_fp8_non_inlineable:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    s_mov_b32 s0, 0x40400000
 ; GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GISEL-NEXT:    s_mov_b32 s6, s0
@@ -2278,6 +2428,7 @@ bb:
 define amdgpu_ps void @test_wmma_f32_16x16x128_bf8_bf8(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f32_16x16x128_bf8_bf8:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_f32_16x16x128_bf8_bf8 v[34:41], v[0:15], v[16:31], 1.0
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[32:33], v[38:41], off offset:16
@@ -2286,6 +2437,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x128_bf8_bf8(<16 x i32> %A, <16 x i32>
 ;
 ; GISEL-LABEL: test_wmma_f32_16x16x128_bf8_bf8:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_f32_16x16x128_bf8_bf8 v[34:41], v[0:15], v[16:31], 1.0
 ; GISEL-NEXT:    s_clause 0x1
 ; GISEL-NEXT:    global_store_b128 v[32:33], v[34:37], off
@@ -2300,6 +2452,7 @@ bb:
 define amdgpu_ps void @test_wmma_f32_16x16x128_bf8_bf8_non_splat(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f32_16x16x128_bf8_bf8_non_splat:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_dual_mov_b32 v34, 1.0 :: v_dual_mov_b32 v36, 2.0
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
 ; GFX1250-NEXT:    v_dual_mov_b32 v35, v34 :: v_dual_mov_b32 v37, v34
@@ -2313,6 +2466,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x128_bf8_bf8_non_splat(<16 x i32> %A,
 ;
 ; GISEL-LABEL: test_wmma_f32_16x16x128_bf8_bf8_non_splat:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    s_mov_b32 s0, 1.0
 ; GISEL-NEXT:    s_mov_b32 s2, 2.0
 ; GISEL-NEXT:    s_mov_b32 s6, s0
@@ -2340,6 +2494,7 @@ bb:
 define amdgpu_ps void @test_wmma_f32_16x16x128_bf8_bf8_non_inlineable(<16 x i32> %A, <16 x i32> %B, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f32_16x16x128_bf8_bf8_non_inlineable:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v34, 0x40400000
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
 ; GFX1250-NEXT:    v_dual_mov_b32 v35, v34 :: v_dual_mov_b32 v36, v34
@@ -2354,6 +2509,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x128_bf8_bf8_non_inlineable(<16 x i32>
 ;
 ; GISEL-LABEL: test_wmma_f32_16x16x128_bf8_bf8_non_inlineable:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    s_mov_b32 s0, 0x40400000
 ; GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GISEL-NEXT:    s_mov_b32 s6, s0
@@ -2382,6 +2538,7 @@ bb:
 define amdgpu_ps void @test_wmma_f32_32x16x128_f4(<16 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f32_32x16x128_f4:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_f32_32x16x128_f4 v[26:41], v[0:15], v[16:23], 1.0
 ; GFX1250-NEXT:    s_clause 0x3
 ; GFX1250-NEXT:    global_store_b128 v[24:25], v[38:41], off offset:48
@@ -2392,6 +2549,7 @@ define amdgpu_ps void @test_wmma_f32_32x16x128_f4(<16 x i32> %A, <8 x i32> %B, p
 ;
 ; GISEL-LABEL: test_wmma_f32_32x16x128_f4:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_f32_32x16x128_f4 v[26:41], v[0:15], v[16:23], 1.0
 ; GISEL-NEXT:    s_clause 0x3
 ; GISEL-NEXT:    global_store_b128 v[24:25], v[26:29], off
@@ -2408,6 +2566,7 @@ bb:
 define amdgpu_ps void @test_wmma_f32_32x16x128_f4_non_splat(<16 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f32_32x16x128_f4_non_splat:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_dual_mov_b32 v26, 1.0 :: v_dual_mov_b32 v28, 2.0
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-NEXT:    v_dual_mov_b32 v27, v26 :: v_dual_mov_b32 v29, v26
@@ -2428,6 +2587,7 @@ define amdgpu_ps void @test_wmma_f32_32x16x128_f4_non_splat(<16 x i32> %A, <8 x
 ;
 ; GISEL-LABEL: test_wmma_f32_32x16x128_f4_non_splat:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    s_mov_b32 s0, 1.0
 ; GISEL-NEXT:    s_mov_b32 s2, 2.0
 ; GISEL-NEXT:    s_mov_b32 s14, s0
@@ -2469,6 +2629,7 @@ bb:
 define amdgpu_ps void @test_wmma_f32_32x16x128_f4_non_inlineable(<16 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f32_32x16x128_f4_non_inlineable:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v26, 0x40400000
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-NEXT:    v_dual_mov_b32 v27, v26 :: v_dual_mov_b32 v28, v26
@@ -2490,6 +2651,7 @@ define amdgpu_ps void @test_wmma_f32_32x16x128_f4_non_inlineable(<16 x i32> %A,
 ;
 ; GISEL-LABEL: test_wmma_f32_32x16x128_f4_non_inlineable:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    s_mov_b32 s0, 0x40400000
 ; GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GISEL-NEXT:    s_mov_b32 s14, s0
@@ -2532,6 +2694,7 @@ bb:
 define amdgpu_ps void @test_wmma_scale_f32_32x16x128_f4(<16 x i32> %A, <8 x i32> %B, i32 inreg %scale_src0, i32 inreg %scale_src1, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_scale_f32_32x16x128_f4:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_scale_f32_32x16x128_f4 v[26:41], v[0:15], v[16:23], 1.0, s0, s1 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse
 ; GFX1250-NEXT:    s_clause 0x3
 ; GFX1250-NEXT:    global_store_b128 v[24:25], v[38:41], off offset:48
@@ -2542,6 +2705,7 @@ define amdgpu_ps void @test_wmma_scale_f32_32x16x128_f4(<16 x i32> %A, <8 x i32>
 ;
 ; GISEL-LABEL: test_wmma_scale_f32_32x16x128_f4:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_scale_f32_32x16x128_f4 v[26:41], v[0:15], v[16:23], 1.0, s0, s1 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse
 ; GISEL-NEXT:    s_clause 0x3
 ; GISEL-NEXT:    global_store_b128 v[24:25], v[26:29], off
@@ -2558,6 +2722,7 @@ bb:
 define amdgpu_ps void @test_wmma_scale_f32_32x16x128_f4_non_splat(<16 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_scale_f32_32x16x128_f4_non_splat:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_dual_mov_b32 v26, 1.0 :: v_dual_mov_b32 v27, 2.0
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-NEXT:    v_dual_mov_b32 v28, v26 :: v_dual_mov_b32 v29, v26
@@ -2578,6 +2743,7 @@ define amdgpu_ps void @test_wmma_scale_f32_32x16x128_f4_non_splat(<16 x i32> %A,
 ;
 ; GISEL-LABEL: test_wmma_scale_f32_32x16x128_f4_non_splat:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    s_mov_b32 s0, 1.0
 ; GISEL-NEXT:    s_mov_b32 s1, 2.0
 ; GISEL-NEXT:    s_mov_b32 s14, s0
@@ -2619,6 +2785,7 @@ bb:
 define amdgpu_ps void @test_wmma_scale_f32_32x16x128_f4_non_inlineable(<16 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_scale_f32_32x16x128_f4_non_inlineable:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v26, 0x40400000
 ; GFX1250-NEXT:    v_mov_b32_e32 v43, 0x64
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_2)
@@ -2641,6 +2808,7 @@ define amdgpu_ps void @test_wmma_scale_f32_32x16x128_f4_non_inlineable(<16 x i32
 ;
 ; GISEL-LABEL: test_wmma_scale_f32_32x16x128_f4_non_inlineable:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    s_mov_b32 s0, 0x40400000
 ; GISEL-NEXT:    v_mov_b32_e32 v42, 0x64
 ; GISEL-NEXT:    s_mov_b32 s14, s0
@@ -2684,6 +2852,7 @@ bb:
 define amdgpu_ps void @test_wmma_scale16_f32_32x16x128_f4(<16 x i32> %A, <8 x i32> %B, i64 inreg %scale_src0, i64 inreg %scale_src1, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_scale16_f32_32x16x128_f4:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_scale16_f32_32x16x128_f4 v[26:41], v[0:15], v[16:23], 1.0, s[0:1], s[2:3] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse
 ; GFX1250-NEXT:    s_clause 0x3
 ; GFX1250-NEXT:    global_store_b128 v[24:25], v[38:41], off offset:48
@@ -2694,6 +2863,7 @@ define amdgpu_ps void @test_wmma_scale16_f32_32x16x128_f4(<16 x i32> %A, <8 x i3
 ;
 ; GISEL-LABEL: test_wmma_scale16_f32_32x16x128_f4:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_scale16_f32_32x16x128_f4 v[26:41], v[0:15], v[16:23], 1.0, s[0:1], s[2:3] matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 matrix_a_reuse
 ; GISEL-NEXT:    s_clause 0x3
 ; GISEL-NEXT:    global_store_b128 v[24:25], v[26:29], off
@@ -2710,6 +2880,7 @@ bb:
 define amdgpu_ps void @test_wmma_scale16_f32_32x16x128_f4_non_splat(<16 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_scale16_f32_32x16x128_f4_non_splat:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_dual_mov_b32 v26, 1.0 :: v_dual_mov_b32 v27, 2.0
 ; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-NEXT:    v_dual_mov_b32 v28, v26 :: v_dual_mov_b32 v29, v26
@@ -2730,6 +2901,7 @@ define amdgpu_ps void @test_wmma_scale16_f32_32x16x128_f4_non_splat(<16 x i32> %
 ;
 ; GISEL-LABEL: test_wmma_scale16_f32_32x16x128_f4_non_splat:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    s_mov_b32 s0, 1.0
 ; GISEL-NEXT:    s_mov_b32 s1, 2.0
 ; GISEL-NEXT:    s_mov_b32 s14, s0
@@ -2771,6 +2943,7 @@ bb:
 define amdgpu_ps void @test_wmma_scale16_f32_32x16x128_f4_non_inlineable(<16 x i32> %A, <8 x i32> %B, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_scale16_f32_32x16x128_f4_non_inlineable:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v26, 0x40400000
 ; GFX1250-NEXT:    v_mov_b64_e32 v[42:43], 0x65
 ; GFX1250-NEXT:    v_mov_b64_e32 v[44:45], 0x64
@@ -2794,6 +2967,7 @@ define amdgpu_ps void @test_wmma_scale16_f32_32x16x128_f4_non_inlineable(<16 x i
 ;
 ; GISEL-LABEL: test_wmma_scale16_f32_32x16x128_f4_non_inlineable:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    s_mov_b32 s0, 0x40400000
 ; GISEL-NEXT:    v_mov_b64_e32 v[42:43], 0x64
 ; GISEL-NEXT:    s_mov_b32 s14, s0

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.imod.gfx1250.w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.imod.gfx1250.w32.ll
index 8f674f84206ff..eb7c15587654c 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.imod.gfx1250.w32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.imod.gfx1250.w32.ll
@@ -5,6 +5,7 @@
 define amdgpu_ps void @test_wmma_f32_16x16x4_f32_negA(<2 x float> %A, <2 x float> %B, <8 x float> %C, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f32_16x16x4_f32_negA:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_f32_16x16x4_f32 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0]
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
@@ -13,6 +14,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x4_f32_negA(<2 x float> %A, <2 x float
 ;
 ; GISEL-LABEL: test_wmma_f32_16x16x4_f32_negA:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_f32_16x16x4_f32 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0]
 ; GISEL-NEXT:    s_clause 0x1
 ; GISEL-NEXT:    global_store_b128 v[12:13], v[4:7], off
@@ -27,6 +29,7 @@ bb:
 define amdgpu_ps void @test_wmma_f32_16x16x4_f32_negB(<2 x float> %A, <2 x float> %B, <8 x float> %C, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f32_16x16x4_f32_negB:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_f32_16x16x4_f32 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0]
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
@@ -35,6 +38,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x4_f32_negB(<2 x float> %A, <2 x float
 ;
 ; GISEL-LABEL: test_wmma_f32_16x16x4_f32_negB:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_f32_16x16x4_f32 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0]
 ; GISEL-NEXT:    s_clause 0x1
 ; GISEL-NEXT:    global_store_b128 v[12:13], v[4:7], off
@@ -49,6 +53,7 @@ bb:
 define amdgpu_ps void @test_wmma_f32_16x16x4_f32_negC(<2 x float> %A, <2 x float> %B, <8 x float> %C, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f32_16x16x4_f32_negC:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_f32_16x16x4_f32 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1]
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
@@ -57,6 +62,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x4_f32_negC(<2 x float> %A, <2 x float
 ;
 ; GISEL-LABEL: test_wmma_f32_16x16x4_f32_negC:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_f32_16x16x4_f32 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1]
 ; GISEL-NEXT:    s_clause 0x1
 ; GISEL-NEXT:    global_store_b128 v[12:13], v[4:7], off
@@ -71,6 +77,7 @@ bb:
 define amdgpu_ps void @test_wmma_f32_16x16x4_f32_neg_absC(<2 x float> %A, <2 x float> %B, <8 x float> %C, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f32_16x16x4_f32_neg_absC:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_f32_16x16x4_f32 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] neg_hi:[0,0,1]
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
@@ -79,6 +86,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x4_f32_neg_absC(<2 x float> %A, <2 x f
 ;
 ; GISEL-LABEL: test_wmma_f32_16x16x4_f32_neg_absC:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_f32_16x16x4_f32 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] neg_hi:[0,0,1]
 ; GISEL-NEXT:    s_clause 0x1
 ; GISEL-NEXT:    global_store_b128 v[12:13], v[4:7], off
@@ -93,6 +101,7 @@ bb:
 define amdgpu_ps void @test_wmma_f32_16x16x4_f32_ignoreC(<2 x float> %A, <2 x float> %B, <8 x float> %C, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f32_16x16x4_f32_ignoreC:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_f32_16x16x4_f32 v[4:11], v[0:1], v[2:3], v[4:11]
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[12:13], v[8:11], off offset:16
@@ -101,6 +110,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x4_f32_ignoreC(<2 x float> %A, <2 x fl
 ;
 ; GISEL-LABEL: test_wmma_f32_16x16x4_f32_ignoreC:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_f32_16x16x4_f32 v[4:11], v[0:1], v[2:3], v[4:11]
 ; GISEL-NEXT:    s_clause 0x1
 ; GISEL-NEXT:    global_store_b128 v[12:13], v[4:7], off
@@ -115,6 +125,7 @@ bb:
 define amdgpu_ps void @test_wmma_f32_16x16x32_bf16_negA(<16 x bfloat> %A, <16 x bfloat> %B, <8 x float> %C, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f32_16x16x32_bf16_negA:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_f32_16x16x32_bf16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[1,0,0] neg_hi:[1,0,0]
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[24:25], v[20:23], off offset:16
@@ -123,6 +134,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x32_bf16_negA(<16 x bfloat> %A, <16 x
 ;
 ; GISEL-LABEL: test_wmma_f32_16x16x32_bf16_negA:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_f32_16x16x32_bf16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[1,0,0] neg_hi:[1,0,0]
 ; GISEL-NEXT:    s_clause 0x1
 ; GISEL-NEXT:    global_store_b128 v[24:25], v[20:23], off offset:16
@@ -137,6 +149,7 @@ bb:
 define amdgpu_ps void @test_wmma_f32_16x16x32_bf16_negB(<16 x bfloat> %A, <16 x bfloat> %B, <8 x float> %C, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f32_16x16x32_bf16_negB:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_f32_16x16x32_bf16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,1,0] neg_hi:[0,1,0]
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[24:25], v[20:23], off offset:16
@@ -145,6 +158,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x32_bf16_negB(<16 x bfloat> %A, <16 x
 ;
 ; GISEL-LABEL: test_wmma_f32_16x16x32_bf16_negB:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_f32_16x16x32_bf16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,1,0] neg_hi:[0,1,0]
 ; GISEL-NEXT:    s_clause 0x1
 ; GISEL-NEXT:    global_store_b128 v[24:25], v[20:23], off offset:16
@@ -159,6 +173,7 @@ bb:
 define amdgpu_ps void @test_wmma_f32_16x16x32_bf16_negC(<16 x bfloat> %A, <16 x bfloat> %B, <8 x float> %C, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f32_16x16x32_bf16_negC:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_f32_16x16x32_bf16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1]
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[24:25], v[20:23], off offset:16
@@ -167,6 +182,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x32_bf16_negC(<16 x bfloat> %A, <16 x
 ;
 ; GISEL-LABEL: test_wmma_f32_16x16x32_bf16_negC:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_f32_16x16x32_bf16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1]
 ; GISEL-NEXT:    s_clause 0x1
 ; GISEL-NEXT:    global_store_b128 v[24:25], v[20:23], off offset:16
@@ -181,6 +197,7 @@ bb:
 define amdgpu_ps void @test_wmma_f32_16x16x32_bf16_neg_absC(<16 x bfloat> %A, <16 x bfloat> %B, <8 x float> %C, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f32_16x16x32_bf16_neg_absC:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_f32_16x16x32_bf16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1] neg_hi:[0,0,1]
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[24:25], v[20:23], off offset:16
@@ -189,6 +206,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x32_bf16_neg_absC(<16 x bfloat> %A, <1
 ;
 ; GISEL-LABEL: test_wmma_f32_16x16x32_bf16_neg_absC:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_f32_16x16x32_bf16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1] neg_hi:[0,0,1]
 ; GISEL-NEXT:    s_clause 0x1
 ; GISEL-NEXT:    global_store_b128 v[24:25], v[20:23], off offset:16
@@ -203,6 +221,7 @@ bb:
 define amdgpu_ps void @test_wmma_f32_16x16x32_bf16_ignoreC(<16 x bfloat> %A, <16 x bfloat> %B, <8 x float> %C, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f32_16x16x32_bf16_ignoreC:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_f32_16x16x32_bf16 v[16:23], v[0:7], v[8:15], v[16:23]
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[24:25], v[20:23], off offset:16
@@ -211,6 +230,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x32_bf16_ignoreC(<16 x bfloat> %A, <16
 ;
 ; GISEL-LABEL: test_wmma_f32_16x16x32_bf16_ignoreC:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_f32_16x16x32_bf16 v[16:23], v[0:7], v[8:15], v[16:23]
 ; GISEL-NEXT:    s_clause 0x1
 ; GISEL-NEXT:    global_store_b128 v[24:25], v[20:23], off offset:16
@@ -225,12 +245,14 @@ bb:
 define amdgpu_ps void @test_wmma_bf16_16x16x32_bf16_negA(<16 x bfloat> %A, <16 x bfloat> %B, <8 x bfloat> %C, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_bf16_16x16x32_bf16_negA:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_bf16_16x16x32_bf16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[1,0,0] neg_hi:[1,0,0]
 ; GFX1250-NEXT:    global_store_b128 v[20:21], v[16:19], off
 ; GFX1250-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: test_wmma_bf16_16x16x32_bf16_negA:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_bf16_16x16x32_bf16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[1,0,0] neg_hi:[1,0,0]
 ; GISEL-NEXT:    global_store_b128 v[20:21], v[16:19], off
 ; GISEL-NEXT:    s_endpgm
@@ -243,12 +265,14 @@ bb:
 define amdgpu_ps void @test_wmma_bf16_16x16x32_bf16_negB(<16 x bfloat> %A, <16 x bfloat> %B, <8 x bfloat> %C, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_bf16_16x16x32_bf16_negB:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_bf16_16x16x32_bf16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,1,0] neg_hi:[0,1,0]
 ; GFX1250-NEXT:    global_store_b128 v[20:21], v[16:19], off
 ; GFX1250-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: test_wmma_bf16_16x16x32_bf16_negB:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_bf16_16x16x32_bf16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,1,0] neg_hi:[0,1,0]
 ; GISEL-NEXT:    global_store_b128 v[20:21], v[16:19], off
 ; GISEL-NEXT:    s_endpgm
@@ -261,12 +285,14 @@ bb:
 define amdgpu_ps void @test_wmma_bf16_16x16x32_bf16_negC(<16 x bfloat> %A, <16 x bfloat> %B, <8 x bfloat> %C, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_bf16_16x16x32_bf16_negC:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_bf16_16x16x32_bf16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,0,1]
 ; GFX1250-NEXT:    global_store_b128 v[20:21], v[16:19], off
 ; GFX1250-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: test_wmma_bf16_16x16x32_bf16_negC:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_bf16_16x16x32_bf16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,0,1]
 ; GISEL-NEXT:    global_store_b128 v[20:21], v[16:19], off
 ; GISEL-NEXT:    s_endpgm
@@ -279,12 +305,14 @@ bb:
 define amdgpu_ps void @test_wmma_bf16_16x16x32_bf16_neg_absC(<16 x bfloat> %A, <16 x bfloat> %B, <8 x bfloat> %C, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_bf16_16x16x32_bf16_neg_absC:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_bf16_16x16x32_bf16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,0,1] neg_hi:[0,0,1]
 ; GFX1250-NEXT:    global_store_b128 v[20:21], v[16:19], off
 ; GFX1250-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: test_wmma_bf16_16x16x32_bf16_neg_absC:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_bf16_16x16x32_bf16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,0,1] neg_hi:[0,0,1]
 ; GISEL-NEXT:    global_store_b128 v[20:21], v[16:19], off
 ; GISEL-NEXT:    s_endpgm
@@ -297,12 +325,14 @@ bb:
 define amdgpu_ps void @test_wmma_bf16_16x16x32_bf16_ignoreC(<16 x bfloat> %A, <16 x bfloat> %B, <8 x bfloat> %C, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_bf16_16x16x32_bf16_ignoreC:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_bf16_16x16x32_bf16 v[16:19], v[0:7], v[8:15], v[16:19]
 ; GFX1250-NEXT:    global_store_b128 v[20:21], v[16:19], off
 ; GFX1250-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: test_wmma_bf16_16x16x32_bf16_ignoreC:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_bf16_16x16x32_bf16 v[16:19], v[0:7], v[8:15], v[16:19]
 ; GISEL-NEXT:    global_store_b128 v[20:21], v[16:19], off
 ; GISEL-NEXT:    s_endpgm
@@ -315,12 +345,14 @@ bb:
 define amdgpu_ps void @test_wmma_bf16f32_16x16x32_bf16_negA(<16 x bfloat> %A, <16 x bfloat> %B, <8 x float> %C, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_bf16f32_16x16x32_bf16_negA:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_bf16f32_16x16x32_bf16 v[26:29], v[0:7], v[8:15], v[16:23] neg_lo:[1,0,0] neg_hi:[1,0,0]
 ; GFX1250-NEXT:    global_store_b128 v[24:25], v[26:29], off
 ; GFX1250-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: test_wmma_bf16f32_16x16x32_bf16_negA:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_bf16f32_16x16x32_bf16 v[26:29], v[0:7], v[8:15], v[16:23] neg_lo:[1,0,0] neg_hi:[1,0,0]
 ; GISEL-NEXT:    global_store_b128 v[24:25], v[26:29], off
 ; GISEL-NEXT:    s_endpgm
@@ -333,12 +365,14 @@ bb:
 define amdgpu_ps void @test_wmma_bf16f32_16x16x32_bf16_negB(<16 x bfloat> %A, <16 x bfloat> %B, <8 x float> %C, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_bf16f32_16x16x32_bf16_negB:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_bf16f32_16x16x32_bf16 v[26:29], v[0:7], v[8:15], v[16:23] neg_lo:[0,1,0] neg_hi:[0,1,0]
 ; GFX1250-NEXT:    global_store_b128 v[24:25], v[26:29], off
 ; GFX1250-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: test_wmma_bf16f32_16x16x32_bf16_negB:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_bf16f32_16x16x32_bf16 v[26:29], v[0:7], v[8:15], v[16:23] neg_lo:[0,1,0] neg_hi:[0,1,0]
 ; GISEL-NEXT:    global_store_b128 v[24:25], v[26:29], off
 ; GISEL-NEXT:    s_endpgm
@@ -351,12 +385,14 @@ bb:
 define amdgpu_ps void @test_wmma_bf16f32_16x16x32_bf16_negC(<16 x bfloat> %A, <16 x bfloat> %B, <8 x float> %C, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_bf16f32_16x16x32_bf16_negC:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_bf16f32_16x16x32_bf16 v[26:29], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1]
 ; GFX1250-NEXT:    global_store_b128 v[24:25], v[26:29], off
 ; GFX1250-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: test_wmma_bf16f32_16x16x32_bf16_negC:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_bf16f32_16x16x32_bf16 v[26:29], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1]
 ; GISEL-NEXT:    global_store_b128 v[24:25], v[26:29], off
 ; GISEL-NEXT:    s_endpgm
@@ -369,12 +405,14 @@ bb:
 define amdgpu_ps void @test_wmma_bf16f32_16x16x32_bf16_neg_absC(<16 x bfloat> %A, <16 x bfloat> %B, <8 x float> %C, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_bf16f32_16x16x32_bf16_neg_absC:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_bf16f32_16x16x32_bf16 v[26:29], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1] neg_hi:[0,0,1]
 ; GFX1250-NEXT:    global_store_b128 v[24:25], v[26:29], off
 ; GFX1250-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: test_wmma_bf16f32_16x16x32_bf16_neg_absC:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_bf16f32_16x16x32_bf16 v[26:29], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1] neg_hi:[0,0,1]
 ; GISEL-NEXT:    global_store_b128 v[24:25], v[26:29], off
 ; GISEL-NEXT:    s_endpgm
@@ -387,12 +425,14 @@ bb:
 define amdgpu_ps void @test_wmma_bf16f32_16x16x32_bf16_ignoreC(<16 x bfloat> %A, <16 x bfloat> %B, <8 x float> %C, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_bf16f32_16x16x32_bf16_ignoreC:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_bf16f32_16x16x32_bf16 v[26:29], v[0:7], v[8:15], v[16:23] neg_lo:[1,0,0] neg_hi:[1,0,0]
 ; GFX1250-NEXT:    global_store_b128 v[24:25], v[26:29], off
 ; GFX1250-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: test_wmma_bf16f32_16x16x32_bf16_ignoreC:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_bf16f32_16x16x32_bf16 v[26:29], v[0:7], v[8:15], v[16:23] neg_lo:[1,0,0] neg_hi:[1,0,0]
 ; GISEL-NEXT:    global_store_b128 v[24:25], v[26:29], off
 ; GISEL-NEXT:    s_endpgm
@@ -405,6 +445,7 @@ bb:
 define amdgpu_ps void @test_wmma_f32_16x16x64_fp8_fp8_negC(<8 x i32> %A, <8 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f32_16x16x64_fp8_fp8_negC:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_f32_16x16x64_fp8_fp8 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1]
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[24:25], v[20:23], off offset:16
@@ -413,6 +454,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x64_fp8_fp8_negC(<8 x i32> %A, <8 x i3
 ;
 ; GISEL-LABEL: test_wmma_f32_16x16x64_fp8_fp8_negC:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_f32_16x16x64_fp8_fp8 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1]
 ; GISEL-NEXT:    s_clause 0x1
 ; GISEL-NEXT:    global_store_b128 v[24:25], v[16:19], off
@@ -427,6 +469,7 @@ bb:
 define amdgpu_ps void @test_wmma_f32_16x16x64_fp8_fp8_neg_absC(<8 x i32> %A, <8 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f32_16x16x64_fp8_fp8_neg_absC:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_f32_16x16x64_fp8_fp8 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1] neg_hi:[0,0,1]
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[24:25], v[20:23], off offset:16
@@ -435,6 +478,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x64_fp8_fp8_neg_absC(<8 x i32> %A, <8
 ;
 ; GISEL-LABEL: test_wmma_f32_16x16x64_fp8_fp8_neg_absC:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_f32_16x16x64_fp8_fp8 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1] neg_hi:[0,0,1]
 ; GISEL-NEXT:    s_clause 0x1
 ; GISEL-NEXT:    global_store_b128 v[24:25], v[16:19], off
@@ -449,6 +493,7 @@ bb:
 define amdgpu_ps void @test_wmma_f32_16x16x64_fp8_fp8_ignoreC(<8 x i32> %A, <8 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f32_16x16x64_fp8_fp8_ignoreC:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_f32_16x16x64_fp8_fp8 v[16:23], v[0:7], v[8:15], v[16:23]
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[24:25], v[20:23], off offset:16
@@ -457,6 +502,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x64_fp8_fp8_ignoreC(<8 x i32> %A, <8 x
 ;
 ; GISEL-LABEL: test_wmma_f32_16x16x64_fp8_fp8_ignoreC:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_f32_16x16x64_fp8_fp8 v[16:23], v[0:7], v[8:15], v[16:23]
 ; GISEL-NEXT:    s_clause 0x1
 ; GISEL-NEXT:    global_store_b128 v[24:25], v[16:19], off
@@ -471,6 +517,7 @@ bb:
 define amdgpu_ps void @test_wmma_f32_16x16x64_fp8_bf8_negC(<8 x i32> %A, <8 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f32_16x16x64_fp8_bf8_negC:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_f32_16x16x64_fp8_bf8 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1]
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[24:25], v[20:23], off offset:16
@@ -479,6 +526,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x64_fp8_bf8_negC(<8 x i32> %A, <8 x i3
 ;
 ; GISEL-LABEL: test_wmma_f32_16x16x64_fp8_bf8_negC:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_f32_16x16x64_fp8_bf8 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1]
 ; GISEL-NEXT:    s_clause 0x1
 ; GISEL-NEXT:    global_store_b128 v[24:25], v[16:19], off
@@ -493,6 +541,7 @@ bb:
 define amdgpu_ps void @test_wmma_f32_16x16x64_fp8_bf8_neg_absC(<8 x i32> %A, <8 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f32_16x16x64_fp8_bf8_neg_absC:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_f32_16x16x64_fp8_bf8 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1] neg_hi:[0,0,1]
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[24:25], v[20:23], off offset:16
@@ -501,6 +550,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x64_fp8_bf8_neg_absC(<8 x i32> %A, <8
 ;
 ; GISEL-LABEL: test_wmma_f32_16x16x64_fp8_bf8_neg_absC:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_f32_16x16x64_fp8_bf8 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1] neg_hi:[0,0,1]
 ; GISEL-NEXT:    s_clause 0x1
 ; GISEL-NEXT:    global_store_b128 v[24:25], v[16:19], off
@@ -515,6 +565,7 @@ bb:
 define amdgpu_ps void @test_wmma_f32_16x16x64_fp8_bf8_ignoreC(<8 x i32> %A, <8 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f32_16x16x64_fp8_bf8_ignoreC:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_f32_16x16x64_fp8_bf8 v[16:23], v[0:7], v[8:15], v[16:23]
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[24:25], v[20:23], off offset:16
@@ -523,6 +574,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x64_fp8_bf8_ignoreC(<8 x i32> %A, <8 x
 ;
 ; GISEL-LABEL: test_wmma_f32_16x16x64_fp8_bf8_ignoreC:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_f32_16x16x64_fp8_bf8 v[16:23], v[0:7], v[8:15], v[16:23]
 ; GISEL-NEXT:    s_clause 0x1
 ; GISEL-NEXT:    global_store_b128 v[24:25], v[16:19], off
@@ -537,6 +589,7 @@ bb:
 define amdgpu_ps void @test_wmma_f32_16x16x64_bf8_fp8_negC(<8 x i32> %A, <8 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f32_16x16x64_bf8_fp8_negC:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_f32_16x16x64_bf8_fp8 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1]
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[24:25], v[20:23], off offset:16
@@ -545,6 +598,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x64_bf8_fp8_negC(<8 x i32> %A, <8 x i3
 ;
 ; GISEL-LABEL: test_wmma_f32_16x16x64_bf8_fp8_negC:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_f32_16x16x64_bf8_fp8 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1]
 ; GISEL-NEXT:    s_clause 0x1
 ; GISEL-NEXT:    global_store_b128 v[24:25], v[16:19], off
@@ -559,6 +613,7 @@ bb:
 define amdgpu_ps void @test_wmma_f32_16x16x64_bf8_fp8_neg_absC(<8 x i32> %A, <8 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f32_16x16x64_bf8_fp8_neg_absC:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_f32_16x16x64_bf8_fp8 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1] neg_hi:[0,0,1]
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[24:25], v[20:23], off offset:16
@@ -567,6 +622,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x64_bf8_fp8_neg_absC(<8 x i32> %A, <8
 ;
 ; GISEL-LABEL: test_wmma_f32_16x16x64_bf8_fp8_neg_absC:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_f32_16x16x64_bf8_fp8 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1] neg_hi:[0,0,1]
 ; GISEL-NEXT:    s_clause 0x1
 ; GISEL-NEXT:    global_store_b128 v[24:25], v[16:19], off
@@ -581,6 +637,7 @@ bb:
 define amdgpu_ps void @test_wmma_f32_16x16x64_bf8_fp8_ignoreC(<8 x i32> %A, <8 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f32_16x16x64_bf8_fp8_ignoreC:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_f32_16x16x64_bf8_fp8 v[16:23], v[0:7], v[8:15], v[16:23]
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[24:25], v[20:23], off offset:16
@@ -589,6 +646,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x64_bf8_fp8_ignoreC(<8 x i32> %A, <8 x
 ;
 ; GISEL-LABEL: test_wmma_f32_16x16x64_bf8_fp8_ignoreC:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_f32_16x16x64_bf8_fp8 v[16:23], v[0:7], v[8:15], v[16:23]
 ; GISEL-NEXT:    s_clause 0x1
 ; GISEL-NEXT:    global_store_b128 v[24:25], v[16:19], off
@@ -603,6 +661,7 @@ bb:
 define amdgpu_ps void @test_wmma_f32_16x16x64_bf8_bf8_negC(<8 x i32> %A, <8 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f32_16x16x64_bf8_bf8_negC:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_f32_16x16x64_bf8_bf8 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1]
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[24:25], v[20:23], off offset:16
@@ -611,6 +670,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x64_bf8_bf8_negC(<8 x i32> %A, <8 x i3
 ;
 ; GISEL-LABEL: test_wmma_f32_16x16x64_bf8_bf8_negC:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_f32_16x16x64_bf8_bf8 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1]
 ; GISEL-NEXT:    s_clause 0x1
 ; GISEL-NEXT:    global_store_b128 v[24:25], v[16:19], off
@@ -625,6 +685,7 @@ bb:
 define amdgpu_ps void @test_wmma_f32_16x16x64_bf8_bf8_neg_absC(<8 x i32> %A, <8 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f32_16x16x64_bf8_bf8_neg_absC:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_f32_16x16x64_bf8_bf8 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1] neg_hi:[0,0,1]
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[24:25], v[20:23], off offset:16
@@ -633,6 +694,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x64_bf8_bf8_neg_absC(<8 x i32> %A, <8
 ;
 ; GISEL-LABEL: test_wmma_f32_16x16x64_bf8_bf8_neg_absC:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_f32_16x16x64_bf8_bf8 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1] neg_hi:[0,0,1]
 ; GISEL-NEXT:    s_clause 0x1
 ; GISEL-NEXT:    global_store_b128 v[24:25], v[16:19], off
@@ -647,6 +709,7 @@ bb:
 define amdgpu_ps void @test_wmma_f32_16x16x64_bf8_bf8_ignoreC(<8 x i32> %A, <8 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f32_16x16x64_bf8_bf8_ignoreC:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_f32_16x16x64_bf8_bf8 v[16:23], v[0:7], v[8:15], v[16:23]
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[24:25], v[20:23], off offset:16
@@ -655,6 +718,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x64_bf8_bf8_ignoreC(<8 x i32> %A, <8 x
 ;
 ; GISEL-LABEL: test_wmma_f32_16x16x64_bf8_bf8_ignoreC:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_f32_16x16x64_bf8_bf8 v[16:23], v[0:7], v[8:15], v[16:23]
 ; GISEL-NEXT:    s_clause 0x1
 ; GISEL-NEXT:    global_store_b128 v[24:25], v[16:19], off
@@ -669,12 +733,14 @@ bb:
 define amdgpu_ps void @test_wmma_f16_16x16x64_fp8_fp8_negC(<8 x i32> %A, <8 x i32> %B, <8 x half> %C, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f16_16x16x64_fp8_fp8_negC:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_f16_16x16x64_fp8_fp8 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,0,1]
 ; GFX1250-NEXT:    global_store_b128 v[20:21], v[16:19], off
 ; GFX1250-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: test_wmma_f16_16x16x64_fp8_fp8_negC:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_f16_16x16x64_fp8_fp8 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,0,1]
 ; GISEL-NEXT:    global_store_b128 v[20:21], v[16:19], off
 ; GISEL-NEXT:    s_endpgm
@@ -687,12 +753,14 @@ bb:
 define amdgpu_ps void @test_wmma_f16_16x16x64_fp8_fp8_neg_absC(<8 x i32> %A, <8 x i32> %B, <8 x half> %C, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f16_16x16x64_fp8_fp8_neg_absC:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_f16_16x16x64_fp8_fp8 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,0,1] neg_hi:[0,0,1]
 ; GFX1250-NEXT:    global_store_b128 v[20:21], v[16:19], off
 ; GFX1250-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: test_wmma_f16_16x16x64_fp8_fp8_neg_absC:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_f16_16x16x64_fp8_fp8 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,0,1] neg_hi:[0,0,1]
 ; GISEL-NEXT:    global_store_b128 v[20:21], v[16:19], off
 ; GISEL-NEXT:    s_endpgm
@@ -705,12 +773,14 @@ bb:
 define amdgpu_ps void @test_wmma_f16_16x16x64_fp8_fp8_ignoreC(<8 x i32> %A, <8 x i32> %B, <8 x half> %C, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f16_16x16x64_fp8_fp8_ignoreC:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_f16_16x16x64_fp8_fp8 v[16:19], v[0:7], v[8:15], v[16:19]
 ; GFX1250-NEXT:    global_store_b128 v[20:21], v[16:19], off
 ; GFX1250-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: test_wmma_f16_16x16x64_fp8_fp8_ignoreC:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_f16_16x16x64_fp8_fp8 v[16:19], v[0:7], v[8:15], v[16:19]
 ; GISEL-NEXT:    global_store_b128 v[20:21], v[16:19], off
 ; GISEL-NEXT:    s_endpgm
@@ -723,12 +793,14 @@ bb:
 define amdgpu_ps void @test_wmma_f16_16x16x64_fp8_bf8_negC(<8 x i32> %A, <8 x i32> %B, <8 x half> %C, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f16_16x16x64_fp8_bf8_negC:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_f16_16x16x64_fp8_bf8 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,0,1]
 ; GFX1250-NEXT:    global_store_b128 v[20:21], v[16:19], off
 ; GFX1250-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: test_wmma_f16_16x16x64_fp8_bf8_negC:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_f16_16x16x64_fp8_bf8 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,0,1]
 ; GISEL-NEXT:    global_store_b128 v[20:21], v[16:19], off
 ; GISEL-NEXT:    s_endpgm
@@ -741,12 +813,14 @@ bb:
 define amdgpu_ps void @test_wmma_f16_16x16x64_fp8_bf8_neg_absC(<8 x i32> %A, <8 x i32> %B, <8 x half> %C, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f16_16x16x64_fp8_bf8_neg_absC:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_f16_16x16x64_fp8_bf8 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,0,1] neg_hi:[0,0,1]
 ; GFX1250-NEXT:    global_store_b128 v[20:21], v[16:19], off
 ; GFX1250-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: test_wmma_f16_16x16x64_fp8_bf8_neg_absC:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_f16_16x16x64_fp8_bf8 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,0,1] neg_hi:[0,0,1]
 ; GISEL-NEXT:    global_store_b128 v[20:21], v[16:19], off
 ; GISEL-NEXT:    s_endpgm
@@ -759,12 +833,14 @@ bb:
 define amdgpu_ps void @test_wmma_f16_16x16x64_fp8_bf8_ignoreC(<8 x i32> %A, <8 x i32> %B, <8 x half> %C, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f16_16x16x64_fp8_bf8_ignoreC:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_f16_16x16x64_fp8_bf8 v[16:19], v[0:7], v[8:15], v[16:19]
 ; GFX1250-NEXT:    global_store_b128 v[20:21], v[16:19], off
 ; GFX1250-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: test_wmma_f16_16x16x64_fp8_bf8_ignoreC:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_f16_16x16x64_fp8_bf8 v[16:19], v[0:7], v[8:15], v[16:19]
 ; GISEL-NEXT:    global_store_b128 v[20:21], v[16:19], off
 ; GISEL-NEXT:    s_endpgm
@@ -777,12 +853,14 @@ bb:
 define amdgpu_ps void @test_wmma_f16_16x16x64_bf8_fp8_negC(<8 x i32> %A, <8 x i32> %B, <8 x half> %C, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f16_16x16x64_bf8_fp8_negC:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_f16_16x16x64_bf8_fp8 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,0,1]
 ; GFX1250-NEXT:    global_store_b128 v[20:21], v[16:19], off
 ; GFX1250-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: test_wmma_f16_16x16x64_bf8_fp8_negC:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_f16_16x16x64_bf8_fp8 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,0,1]
 ; GISEL-NEXT:    global_store_b128 v[20:21], v[16:19], off
 ; GISEL-NEXT:    s_endpgm
@@ -795,12 +873,14 @@ bb:
 define amdgpu_ps void @test_wmma_f16_16x16x64_bf8_fp8_neg_absC(<8 x i32> %A, <8 x i32> %B, <8 x half> %C, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f16_16x16x64_bf8_fp8_neg_absC:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_f16_16x16x64_bf8_fp8 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,0,1] neg_hi:[0,0,1]
 ; GFX1250-NEXT:    global_store_b128 v[20:21], v[16:19], off
 ; GFX1250-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: test_wmma_f16_16x16x64_bf8_fp8_neg_absC:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_f16_16x16x64_bf8_fp8 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,0,1] neg_hi:[0,0,1]
 ; GISEL-NEXT:    global_store_b128 v[20:21], v[16:19], off
 ; GISEL-NEXT:    s_endpgm
@@ -813,12 +893,14 @@ bb:
 define amdgpu_ps void @test_wmma_f16_16x16x64_bf8_fp8_ignoreC(<8 x i32> %A, <8 x i32> %B, <8 x half> %C, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f16_16x16x64_bf8_fp8_ignoreC:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_f16_16x16x64_bf8_fp8 v[16:19], v[0:7], v[8:15], v[16:19]
 ; GFX1250-NEXT:    global_store_b128 v[20:21], v[16:19], off
 ; GFX1250-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: test_wmma_f16_16x16x64_bf8_fp8_ignoreC:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_f16_16x16x64_bf8_fp8 v[16:19], v[0:7], v[8:15], v[16:19]
 ; GISEL-NEXT:    global_store_b128 v[20:21], v[16:19], off
 ; GISEL-NEXT:    s_endpgm
@@ -831,12 +913,14 @@ bb:
 define amdgpu_ps void @test_wmma_f16_16x16x64_bf8_bf8_negC(<8 x i32> %A, <8 x i32> %B, <8 x half> %C, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f16_16x16x64_bf8_bf8_negC:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_f16_16x16x64_bf8_bf8 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,0,1]
 ; GFX1250-NEXT:    global_store_b128 v[20:21], v[16:19], off
 ; GFX1250-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: test_wmma_f16_16x16x64_bf8_bf8_negC:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_f16_16x16x64_bf8_bf8 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,0,1]
 ; GISEL-NEXT:    global_store_b128 v[20:21], v[16:19], off
 ; GISEL-NEXT:    s_endpgm
@@ -849,12 +933,14 @@ bb:
 define amdgpu_ps void @test_wmma_f16_16x16x64_bf8_bf8_neg_absC(<8 x i32> %A, <8 x i32> %B, <8 x half> %C, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f16_16x16x64_bf8_bf8_neg_absC:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_f16_16x16x64_bf8_bf8 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,0,1] neg_hi:[0,0,1]
 ; GFX1250-NEXT:    global_store_b128 v[20:21], v[16:19], off
 ; GFX1250-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: test_wmma_f16_16x16x64_bf8_bf8_neg_absC:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_f16_16x16x64_bf8_bf8 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,0,1] neg_hi:[0,0,1]
 ; GISEL-NEXT:    global_store_b128 v[20:21], v[16:19], off
 ; GISEL-NEXT:    s_endpgm
@@ -867,12 +953,14 @@ bb:
 define amdgpu_ps void @test_wmma_f16_16x16x64_bf8_bf8_ignoreC(<8 x i32> %A, <8 x i32> %B, <8 x half> %C, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f16_16x16x64_bf8_bf8_ignoreC:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_f16_16x16x64_bf8_bf8 v[16:19], v[0:7], v[8:15], v[16:19]
 ; GFX1250-NEXT:    global_store_b128 v[20:21], v[16:19], off
 ; GFX1250-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: test_wmma_f16_16x16x64_bf8_bf8_ignoreC:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_f16_16x16x64_bf8_bf8 v[16:19], v[0:7], v[8:15], v[16:19]
 ; GISEL-NEXT:    global_store_b128 v[20:21], v[16:19], off
 ; GISEL-NEXT:    s_endpgm
@@ -885,6 +973,7 @@ bb:
 define amdgpu_ps void @test_wmma_i32_16x16x64_iu8_signedA(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_i32_16x16x64_iu8_signedA:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_i32_16x16x64_iu8 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[1,0,0]
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[24:25], v[20:23], off offset:16
@@ -893,6 +982,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x64_iu8_signedA(<8 x i32> %A, <8 x i32
 ;
 ; GISEL-LABEL: test_wmma_i32_16x16x64_iu8_signedA:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_i32_16x16x64_iu8 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[1,0,0]
 ; GISEL-NEXT:    s_clause 0x1
 ; GISEL-NEXT:    global_store_b128 v[24:25], v[16:19], off
@@ -907,6 +997,7 @@ bb:
 define amdgpu_ps void @test_wmma_i32_16x16x64_iu8_signedB(<8 x i32> %A, <8 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_i32_16x16x64_iu8_signedB:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_i32_16x16x64_iu8 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,1,0]
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[24:25], v[20:23], off offset:16
@@ -915,6 +1006,7 @@ define amdgpu_ps void @test_wmma_i32_16x16x64_iu8_signedB(<8 x i32> %A, <8 x i32
 ;
 ; GISEL-LABEL: test_wmma_i32_16x16x64_iu8_signedB:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_i32_16x16x64_iu8 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,1,0]
 ; GISEL-NEXT:    s_clause 0x1
 ; GISEL-NEXT:    global_store_b128 v[24:25], v[16:19], off
@@ -929,6 +1021,7 @@ bb:
 define amdgpu_ps void @test_wmma_f32_16x16x32_f16_negA(<16 x half> %A, <16 x half> %B, <8 x float> %C, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f32_16x16x32_f16_negA:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_f32_16x16x32_f16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[1,0,0] neg_hi:[1,0,0]
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[24:25], v[20:23], off offset:16
@@ -937,6 +1030,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x32_f16_negA(<16 x half> %A, <16 x hal
 ;
 ; GISEL-LABEL: test_wmma_f32_16x16x32_f16_negA:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_f32_16x16x32_f16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[1,0,0] neg_hi:[1,0,0]
 ; GISEL-NEXT:    s_clause 0x1
 ; GISEL-NEXT:    global_store_b128 v[24:25], v[16:19], off
@@ -951,6 +1045,7 @@ bb:
 define amdgpu_ps void @test_wmma_f32_16x16x32_f16_negB(<16 x half> %A, <16 x half> %B, <8 x float> %C, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f32_16x16x32_f16_negB:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_f32_16x16x32_f16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,1,0] neg_hi:[0,1,0]
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[24:25], v[20:23], off offset:16
@@ -959,6 +1054,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x32_f16_negB(<16 x half> %A, <16 x hal
 ;
 ; GISEL-LABEL: test_wmma_f32_16x16x32_f16_negB:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_f32_16x16x32_f16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,1,0] neg_hi:[0,1,0]
 ; GISEL-NEXT:    s_clause 0x1
 ; GISEL-NEXT:    global_store_b128 v[24:25], v[16:19], off
@@ -973,6 +1069,7 @@ bb:
 define amdgpu_ps void @test_wmma_f32_16x16x32_f16_negC(<16 x half> %A, <16 x half> %B, <8 x float> %C, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f32_16x16x32_f16_negC:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_f32_16x16x32_f16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1]
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[24:25], v[20:23], off offset:16
@@ -981,6 +1078,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x32_f16_negC(<16 x half> %A, <16 x hal
 ;
 ; GISEL-LABEL: test_wmma_f32_16x16x32_f16_negC:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_f32_16x16x32_f16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1]
 ; GISEL-NEXT:    s_clause 0x1
 ; GISEL-NEXT:    global_store_b128 v[24:25], v[16:19], off
@@ -995,6 +1093,7 @@ bb:
 define amdgpu_ps void @test_wmma_f32_16x16x32_f16_neg_absC(<16 x half> %A, <16 x half> %B, <8 x float> %C, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f32_16x16x32_f16_neg_absC:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_f32_16x16x32_f16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1] neg_hi:[0,0,1]
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[24:25], v[20:23], off offset:16
@@ -1003,6 +1102,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x32_f16_neg_absC(<16 x half> %A, <16 x
 ;
 ; GISEL-LABEL: test_wmma_f32_16x16x32_f16_neg_absC:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_f32_16x16x32_f16 v[16:23], v[0:7], v[8:15], v[16:23] neg_lo:[0,0,1] neg_hi:[0,0,1]
 ; GISEL-NEXT:    s_clause 0x1
 ; GISEL-NEXT:    global_store_b128 v[24:25], v[16:19], off
@@ -1017,6 +1117,7 @@ bb:
 define amdgpu_ps void @test_wmma_f32_16x16x32_f16_ignoreC(<16 x half> %A, <16 x half> %B, <8 x float> %C, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f32_16x16x32_f16_ignoreC:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_f32_16x16x32_f16 v[16:23], v[0:7], v[8:15], v[16:23]
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[24:25], v[20:23], off offset:16
@@ -1025,6 +1126,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x32_f16_ignoreC(<16 x half> %A, <16 x
 ;
 ; GISEL-LABEL: test_wmma_f32_16x16x32_f16_ignoreC:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_f32_16x16x32_f16 v[16:23], v[0:7], v[8:15], v[16:23]
 ; GISEL-NEXT:    s_clause 0x1
 ; GISEL-NEXT:    global_store_b128 v[24:25], v[16:19], off
@@ -1039,12 +1141,14 @@ bb:
 define amdgpu_ps void @test_wmma_f16_16x16x32_f16_negA(<16 x half> %A, <16 x half> %B, <8 x half> %C, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f16_16x16x32_f16_negA:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_f16_16x16x32_f16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[1,0,0] neg_hi:[1,0,0]
 ; GFX1250-NEXT:    global_store_b128 v[20:21], v[16:19], off
 ; GFX1250-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: test_wmma_f16_16x16x32_f16_negA:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_f16_16x16x32_f16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[1,0,0] neg_hi:[1,0,0]
 ; GISEL-NEXT:    global_store_b128 v[20:21], v[16:19], off
 ; GISEL-NEXT:    s_endpgm
@@ -1057,12 +1161,14 @@ bb:
 define amdgpu_ps void @test_wmma_f16_16x16x32_f16_negB(<16 x half> %A, <16 x half> %B, <8 x half> %C, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f16_16x16x32_f16_negB:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_f16_16x16x32_f16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,1,0] neg_hi:[0,1,0]
 ; GFX1250-NEXT:    global_store_b128 v[20:21], v[16:19], off
 ; GFX1250-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: test_wmma_f16_16x16x32_f16_negB:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_f16_16x16x32_f16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,1,0] neg_hi:[0,1,0]
 ; GISEL-NEXT:    global_store_b128 v[20:21], v[16:19], off
 ; GISEL-NEXT:    s_endpgm
@@ -1075,12 +1181,14 @@ bb:
 define amdgpu_ps void @test_wmma_f16_16x16x32_f16_negC(<16 x half> %A, <16 x half> %B, <8 x half> %C, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f16_16x16x32_f16_negC:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_f16_16x16x32_f16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,0,1]
 ; GFX1250-NEXT:    global_store_b128 v[20:21], v[16:19], off
 ; GFX1250-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: test_wmma_f16_16x16x32_f16_negC:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_f16_16x16x32_f16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,0,1]
 ; GISEL-NEXT:    global_store_b128 v[20:21], v[16:19], off
 ; GISEL-NEXT:    s_endpgm
@@ -1093,12 +1201,14 @@ bb:
 define amdgpu_ps void @test_wmma_f16_16x16x32_f16_neg_absC(<16 x half> %A, <16 x half> %B, <8 x half> %C, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f16_16x16x32_f16_neg_absC:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_f16_16x16x32_f16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,0,1] neg_hi:[0,0,1]
 ; GFX1250-NEXT:    global_store_b128 v[20:21], v[16:19], off
 ; GFX1250-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: test_wmma_f16_16x16x32_f16_neg_absC:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_f16_16x16x32_f16 v[16:19], v[0:7], v[8:15], v[16:19] neg_lo:[0,0,1] neg_hi:[0,0,1]
 ; GISEL-NEXT:    global_store_b128 v[20:21], v[16:19], off
 ; GISEL-NEXT:    s_endpgm
@@ -1111,12 +1221,14 @@ bb:
 define amdgpu_ps void @test_wmma_f16_16x16x32_f16_ignoreC(<16 x half> %A, <16 x half> %B, <8 x half> %C, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f16_16x16x32_f16_ignoreC:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_f16_16x16x32_f16 v[16:19], v[0:7], v[8:15], v[16:19]
 ; GFX1250-NEXT:    global_store_b128 v[20:21], v[16:19], off
 ; GFX1250-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: test_wmma_f16_16x16x32_f16_ignoreC:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_f16_16x16x32_f16 v[16:19], v[0:7], v[8:15], v[16:19]
 ; GISEL-NEXT:    global_store_b128 v[20:21], v[16:19], off
 ; GISEL-NEXT:    s_endpgm
@@ -1129,6 +1241,7 @@ bb:
 define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_negC(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_negC:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39] neg_lo:[0,0,1]
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[40:41], v[36:39], off offset:16
@@ -1137,6 +1250,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_negC(<16 x i32> %A, <16 x
 ;
 ; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_negC:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39] neg_lo:[0,0,1]
 ; GISEL-NEXT:    s_clause 0x1
 ; GISEL-NEXT:    global_store_b128 v[40:41], v[32:35], off
@@ -1151,6 +1265,7 @@ bb:
 define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_neg_absC(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_neg_absC:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39] neg_lo:[0,0,1] neg_hi:[0,0,1]
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[40:41], v[36:39], off offset:16
@@ -1159,6 +1274,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_neg_absC(<16 x i32> %A, <1
 ;
 ; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_neg_absC:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39] neg_lo:[0,0,1] neg_hi:[0,0,1]
 ; GISEL-NEXT:    s_clause 0x1
 ; GISEL-NEXT:    global_store_b128 v[40:41], v[32:35], off
@@ -1173,6 +1289,7 @@ bb:
 define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_ignoreC(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f32_16x16x128_f8f6f4_ignoreC:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39]
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[40:41], v[36:39], off offset:16
@@ -1181,6 +1298,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x128_f8f6f4_ignoreC(<16 x i32> %A, <16
 ;
 ; GISEL-LABEL: test_wmma_f32_16x16x128_f8f6f4_ignoreC:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39]
 ; GISEL-NEXT:    s_clause 0x1
 ; GISEL-NEXT:    global_store_b128 v[40:41], v[32:35], off
@@ -1195,6 +1313,7 @@ bb:
 define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_negC(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_negC:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_scale_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39], 2, 4 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 neg_lo:[0,0,1]
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[40:41], v[36:39], off offset:16
@@ -1203,6 +1322,7 @@ define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_negC(<16 x i32> %A,
 ;
 ; GISEL-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_negC:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_scale_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39], 2, 4 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 neg_lo:[0,0,1]
 ; GISEL-NEXT:    s_clause 0x1
 ; GISEL-NEXT:    global_store_b128 v[40:41], v[32:35], off
@@ -1217,6 +1337,7 @@ bb:
 define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_neg_absC(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_neg_absC:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_scale_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39], 2, 4 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 neg_lo:[0,0,1] neg_hi:[0,0,1]
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[40:41], v[36:39], off offset:16
@@ -1225,6 +1346,7 @@ define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_neg_absC(<16 x i32>
 ;
 ; GISEL-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_neg_absC:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_scale_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39], 2, 4 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 neg_lo:[0,0,1] neg_hi:[0,0,1]
 ; GISEL-NEXT:    s_clause 0x1
 ; GISEL-NEXT:    global_store_b128 v[40:41], v[32:35], off
@@ -1239,6 +1361,7 @@ bb:
 define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_ignoreC(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_ignoreC:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_scale_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39], 2, 4 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[40:41], v[36:39], off offset:16
@@ -1247,6 +1370,7 @@ define amdgpu_ps void @test_wmma_scale_f32_16x16x128_f8f6f4_ignoreC(<16 x i32> %
 ;
 ; GISEL-LABEL: test_wmma_scale_f32_16x16x128_f8f6f4_ignoreC:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_scale_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39], 2, 4 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1
 ; GISEL-NEXT:    s_clause 0x1
 ; GISEL-NEXT:    global_store_b128 v[40:41], v[32:35], off
@@ -1261,6 +1385,7 @@ bb:
 define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_negC(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_negC:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_scale16_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39], 2, 4 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 neg_lo:[0,0,1]
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[40:41], v[36:39], off offset:16
@@ -1269,6 +1394,7 @@ define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_negC(<16 x i32> %A
 ;
 ; GISEL-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_negC:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_scale16_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39], 2, 4 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 neg_lo:[0,0,1]
 ; GISEL-NEXT:    s_clause 0x1
 ; GISEL-NEXT:    global_store_b128 v[40:41], v[32:35], off
@@ -1283,6 +1409,7 @@ bb:
 define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_neg_absC(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_neg_absC:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_scale16_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39], 2, 4 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 neg_lo:[0,0,1] neg_hi:[0,0,1]
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[40:41], v[36:39], off offset:16
@@ -1291,6 +1418,7 @@ define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_neg_absC(<16 x i32
 ;
 ; GISEL-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_neg_absC:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_scale16_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39], 2, 4 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 neg_lo:[0,0,1] neg_hi:[0,0,1]
 ; GISEL-NEXT:    s_clause 0x1
 ; GISEL-NEXT:    global_store_b128 v[40:41], v[32:35], off
@@ -1305,6 +1433,7 @@ bb:
 define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_ignoreC(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_ignoreC:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_scale16_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39], 2, 4 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[40:41], v[36:39], off offset:16
@@ -1313,6 +1442,7 @@ define amdgpu_ps void @test_wmma_scale16_f32_16x16x128_f8f6f4_ignoreC(<16 x i32>
 ;
 ; GISEL-LABEL: test_wmma_scale16_f32_16x16x128_f8f6f4_ignoreC:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_scale16_f32_16x16x128_f8f6f4 v[32:39], v[0:15], v[16:31], v[32:39], 2, 4 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1
 ; GISEL-NEXT:    s_clause 0x1
 ; GISEL-NEXT:    global_store_b128 v[40:41], v[32:35], off
@@ -1327,12 +1457,14 @@ bb:
 define amdgpu_ps void @test_wmma_f16_16x16x128_fp8_fp8_negC(<16 x i32> %A, <16 x i32> %B, <8 x half> %C, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f16_16x16x128_fp8_fp8_negC:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_f16_16x16x128_fp8_fp8 v[32:35], v[0:15], v[16:31], v[32:35] neg_lo:[0,0,1]
 ; GFX1250-NEXT:    global_store_b128 v[36:37], v[32:35], off
 ; GFX1250-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: test_wmma_f16_16x16x128_fp8_fp8_negC:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_f16_16x16x128_fp8_fp8 v[32:35], v[0:15], v[16:31], v[32:35] neg_lo:[0,0,1]
 ; GISEL-NEXT:    global_store_b128 v[36:37], v[32:35], off
 ; GISEL-NEXT:    s_endpgm
@@ -1345,12 +1477,14 @@ bb:
 define amdgpu_ps void @test_wmma_f16_16x16x128_fp8_fp8_neg_absC(<16 x i32> %A, <16 x i32> %B, <8 x half> %C, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f16_16x16x128_fp8_fp8_neg_absC:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_f16_16x16x128_fp8_fp8 v[32:35], v[0:15], v[16:31], v[32:35] neg_lo:[0,0,1] neg_hi:[0,0,1]
 ; GFX1250-NEXT:    global_store_b128 v[36:37], v[32:35], off
 ; GFX1250-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: test_wmma_f16_16x16x128_fp8_fp8_neg_absC:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_f16_16x16x128_fp8_fp8 v[32:35], v[0:15], v[16:31], v[32:35] neg_lo:[0,0,1] neg_hi:[0,0,1]
 ; GISEL-NEXT:    global_store_b128 v[36:37], v[32:35], off
 ; GISEL-NEXT:    s_endpgm
@@ -1363,12 +1497,14 @@ bb:
 define amdgpu_ps void @test_wmma_f16_16x16x128_fp8_fp8_ignoreC(<16 x i32> %A, <16 x i32> %B, <8 x half> %C, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f16_16x16x128_fp8_fp8_ignoreC:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_f16_16x16x128_fp8_fp8 v[32:35], v[0:15], v[16:31], v[32:35]
 ; GFX1250-NEXT:    global_store_b128 v[36:37], v[32:35], off
 ; GFX1250-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: test_wmma_f16_16x16x128_fp8_fp8_ignoreC:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_f16_16x16x128_fp8_fp8 v[32:35], v[0:15], v[16:31], v[32:35]
 ; GISEL-NEXT:    global_store_b128 v[36:37], v[32:35], off
 ; GISEL-NEXT:    s_endpgm
@@ -1381,12 +1517,14 @@ bb:
 define amdgpu_ps void @test_wmma_f16_16x16x128_fp8_bf8_negC(<16 x i32> %A, <16 x i32> %B, <8 x half> %C, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f16_16x16x128_fp8_bf8_negC:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_f16_16x16x128_fp8_bf8 v[32:35], v[0:15], v[16:31], v[32:35] neg_lo:[0,0,1]
 ; GFX1250-NEXT:    global_store_b128 v[36:37], v[32:35], off
 ; GFX1250-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: test_wmma_f16_16x16x128_fp8_bf8_negC:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_f16_16x16x128_fp8_bf8 v[32:35], v[0:15], v[16:31], v[32:35] neg_lo:[0,0,1]
 ; GISEL-NEXT:    global_store_b128 v[36:37], v[32:35], off
 ; GISEL-NEXT:    s_endpgm
@@ -1399,12 +1537,14 @@ bb:
 define amdgpu_ps void @test_wmma_f16_16x16x128_fp8_bf8_neg_absC(<16 x i32> %A, <16 x i32> %B, <8 x half> %C, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f16_16x16x128_fp8_bf8_neg_absC:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_f16_16x16x128_fp8_bf8 v[32:35], v[0:15], v[16:31], v[32:35] neg_lo:[0,0,1] neg_hi:[0,0,1]
 ; GFX1250-NEXT:    global_store_b128 v[36:37], v[32:35], off
 ; GFX1250-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: test_wmma_f16_16x16x128_fp8_bf8_neg_absC:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_f16_16x16x128_fp8_bf8 v[32:35], v[0:15], v[16:31], v[32:35] neg_lo:[0,0,1] neg_hi:[0,0,1]
 ; GISEL-NEXT:    global_store_b128 v[36:37], v[32:35], off
 ; GISEL-NEXT:    s_endpgm
@@ -1417,12 +1557,14 @@ bb:
 define amdgpu_ps void @test_wmma_f16_16x16x128_fp8_bf8_ignoreC(<16 x i32> %A, <16 x i32> %B, <8 x half> %C, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f16_16x16x128_fp8_bf8_ignoreC:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_f16_16x16x128_fp8_bf8 v[32:35], v[0:15], v[16:31], v[32:35]
 ; GFX1250-NEXT:    global_store_b128 v[36:37], v[32:35], off
 ; GFX1250-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: test_wmma_f16_16x16x128_fp8_bf8_ignoreC:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_f16_16x16x128_fp8_bf8 v[32:35], v[0:15], v[16:31], v[32:35]
 ; GISEL-NEXT:    global_store_b128 v[36:37], v[32:35], off
 ; GISEL-NEXT:    s_endpgm
@@ -1435,12 +1577,14 @@ bb:
 define amdgpu_ps void @test_wmma_f16_16x16x128_bf8_fp8_negC(<16 x i32> %A, <16 x i32> %B, <8 x half> %C, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f16_16x16x128_bf8_fp8_negC:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_f16_16x16x128_bf8_fp8 v[32:35], v[0:15], v[16:31], v[32:35] neg_lo:[0,0,1]
 ; GFX1250-NEXT:    global_store_b128 v[36:37], v[32:35], off
 ; GFX1250-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: test_wmma_f16_16x16x128_bf8_fp8_negC:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_f16_16x16x128_bf8_fp8 v[32:35], v[0:15], v[16:31], v[32:35] neg_lo:[0,0,1]
 ; GISEL-NEXT:    global_store_b128 v[36:37], v[32:35], off
 ; GISEL-NEXT:    s_endpgm
@@ -1453,12 +1597,14 @@ bb:
 define amdgpu_ps void @test_wmma_f16_16x16x128_bf8_fp8_neg_absC(<16 x i32> %A, <16 x i32> %B, <8 x half> %C, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f16_16x16x128_bf8_fp8_neg_absC:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_f16_16x16x128_bf8_fp8 v[32:35], v[0:15], v[16:31], v[32:35] neg_lo:[0,0,1] neg_hi:[0,0,1]
 ; GFX1250-NEXT:    global_store_b128 v[36:37], v[32:35], off
 ; GFX1250-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: test_wmma_f16_16x16x128_bf8_fp8_neg_absC:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_f16_16x16x128_bf8_fp8 v[32:35], v[0:15], v[16:31], v[32:35] neg_lo:[0,0,1] neg_hi:[0,0,1]
 ; GISEL-NEXT:    global_store_b128 v[36:37], v[32:35], off
 ; GISEL-NEXT:    s_endpgm
@@ -1471,12 +1617,14 @@ bb:
 define amdgpu_ps void @test_wmma_f16_16x16x128_bf8_fp8_ignoreC(<16 x i32> %A, <16 x i32> %B, <8 x half> %C, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f16_16x16x128_bf8_fp8_ignoreC:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_f16_16x16x128_bf8_fp8 v[32:35], v[0:15], v[16:31], v[32:35]
 ; GFX1250-NEXT:    global_store_b128 v[36:37], v[32:35], off
 ; GFX1250-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: test_wmma_f16_16x16x128_bf8_fp8_ignoreC:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_f16_16x16x128_bf8_fp8 v[32:35], v[0:15], v[16:31], v[32:35]
 ; GISEL-NEXT:    global_store_b128 v[36:37], v[32:35], off
 ; GISEL-NEXT:    s_endpgm
@@ -1489,12 +1637,14 @@ bb:
 define amdgpu_ps void @test_wmma_f16_16x16x128_bf8_bf8_negC(<16 x i32> %A, <16 x i32> %B, <8 x half> %C, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f16_16x16x128_bf8_bf8_negC:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_f16_16x16x128_bf8_bf8 v[32:35], v[0:15], v[16:31], v[32:35] neg_lo:[0,0,1]
 ; GFX1250-NEXT:    global_store_b128 v[36:37], v[32:35], off
 ; GFX1250-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: test_wmma_f16_16x16x128_bf8_bf8_negC:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_f16_16x16x128_bf8_bf8 v[32:35], v[0:15], v[16:31], v[32:35] neg_lo:[0,0,1]
 ; GISEL-NEXT:    global_store_b128 v[36:37], v[32:35], off
 ; GISEL-NEXT:    s_endpgm
@@ -1507,12 +1657,14 @@ bb:
 define amdgpu_ps void @test_wmma_f16_16x16x128_bf8_bf8_neg_absC(<16 x i32> %A, <16 x i32> %B, <8 x half> %C, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f16_16x16x128_bf8_bf8_neg_absC:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_f16_16x16x128_bf8_bf8 v[32:35], v[0:15], v[16:31], v[32:35] neg_lo:[0,0,1] neg_hi:[0,0,1]
 ; GFX1250-NEXT:    global_store_b128 v[36:37], v[32:35], off
 ; GFX1250-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: test_wmma_f16_16x16x128_bf8_bf8_neg_absC:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_f16_16x16x128_bf8_bf8 v[32:35], v[0:15], v[16:31], v[32:35] neg_lo:[0,0,1] neg_hi:[0,0,1]
 ; GISEL-NEXT:    global_store_b128 v[36:37], v[32:35], off
 ; GISEL-NEXT:    s_endpgm
@@ -1525,12 +1677,14 @@ bb:
 define amdgpu_ps void @test_wmma_f16_16x16x128_bf8_bf8_ignoreC(<16 x i32> %A, <16 x i32> %B, <8 x half> %C, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f16_16x16x128_bf8_bf8_ignoreC:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_f16_16x16x128_bf8_bf8 v[32:35], v[0:15], v[16:31], v[32:35]
 ; GFX1250-NEXT:    global_store_b128 v[36:37], v[32:35], off
 ; GFX1250-NEXT:    s_endpgm
 ;
 ; GISEL-LABEL: test_wmma_f16_16x16x128_bf8_bf8_ignoreC:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_f16_16x16x128_bf8_bf8 v[32:35], v[0:15], v[16:31], v[32:35]
 ; GISEL-NEXT:    global_store_b128 v[36:37], v[32:35], off
 ; GISEL-NEXT:    s_endpgm
@@ -1543,6 +1697,7 @@ bb:
 define amdgpu_ps void @test_wmma_f32_16x16x128_fp8_fp8_negC(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f32_16x16x128_fp8_fp8_negC:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_f32_16x16x128_fp8_fp8 v[32:39], v[0:15], v[16:31], v[32:39] neg_lo:[0,0,1]
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[40:41], v[36:39], off offset:16
@@ -1551,6 +1706,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x128_fp8_fp8_negC(<16 x i32> %A, <16 x
 ;
 ; GISEL-LABEL: test_wmma_f32_16x16x128_fp8_fp8_negC:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_f32_16x16x128_fp8_fp8 v[32:39], v[0:15], v[16:31], v[32:39] neg_lo:[0,0,1]
 ; GISEL-NEXT:    s_clause 0x1
 ; GISEL-NEXT:    global_store_b128 v[40:41], v[32:35], off
@@ -1565,6 +1721,7 @@ bb:
 define amdgpu_ps void @test_wmma_f32_16x16x128_fp8_fp8_neg_absC(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f32_16x16x128_fp8_fp8_neg_absC:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_f32_16x16x128_fp8_fp8 v[32:39], v[0:15], v[16:31], v[32:39] neg_lo:[0,0,1] neg_hi:[0,0,1]
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[40:41], v[36:39], off offset:16
@@ -1573,6 +1730,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x128_fp8_fp8_neg_absC(<16 x i32> %A, <
 ;
 ; GISEL-LABEL: test_wmma_f32_16x16x128_fp8_fp8_neg_absC:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_f32_16x16x128_fp8_fp8 v[32:39], v[0:15], v[16:31], v[32:39] neg_lo:[0,0,1] neg_hi:[0,0,1]
 ; GISEL-NEXT:    s_clause 0x1
 ; GISEL-NEXT:    global_store_b128 v[40:41], v[32:35], off
@@ -1587,6 +1745,7 @@ bb:
 define amdgpu_ps void @test_wmma_f32_16x16x128_fp8_fp8_ignoreC(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f32_16x16x128_fp8_fp8_ignoreC:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_f32_16x16x128_fp8_fp8 v[32:39], v[0:15], v[16:31], v[32:39]
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[40:41], v[36:39], off offset:16
@@ -1595,6 +1754,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x128_fp8_fp8_ignoreC(<16 x i32> %A, <1
 ;
 ; GISEL-LABEL: test_wmma_f32_16x16x128_fp8_fp8_ignoreC:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_f32_16x16x128_fp8_fp8 v[32:39], v[0:15], v[16:31], v[32:39]
 ; GISEL-NEXT:    s_clause 0x1
 ; GISEL-NEXT:    global_store_b128 v[40:41], v[32:35], off
@@ -1609,6 +1769,7 @@ bb:
 define amdgpu_ps void @test_wmma_f32_16x16x128_fp8_bf8_negC(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f32_16x16x128_fp8_bf8_negC:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_f32_16x16x128_fp8_bf8 v[32:39], v[0:15], v[16:31], v[32:39] neg_lo:[0,0,1]
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[40:41], v[36:39], off offset:16
@@ -1617,6 +1778,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x128_fp8_bf8_negC(<16 x i32> %A, <16 x
 ;
 ; GISEL-LABEL: test_wmma_f32_16x16x128_fp8_bf8_negC:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_f32_16x16x128_fp8_bf8 v[32:39], v[0:15], v[16:31], v[32:39] neg_lo:[0,0,1]
 ; GISEL-NEXT:    s_clause 0x1
 ; GISEL-NEXT:    global_store_b128 v[40:41], v[32:35], off
@@ -1631,6 +1793,7 @@ bb:
 define amdgpu_ps void @test_wmma_f32_16x16x128_fp8_bf8_neg_absC(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f32_16x16x128_fp8_bf8_neg_absC:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_f32_16x16x128_fp8_bf8 v[32:39], v[0:15], v[16:31], v[32:39] neg_lo:[0,0,1] neg_hi:[0,0,1]
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[40:41], v[36:39], off offset:16
@@ -1639,6 +1802,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x128_fp8_bf8_neg_absC(<16 x i32> %A, <
 ;
 ; GISEL-LABEL: test_wmma_f32_16x16x128_fp8_bf8_neg_absC:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_f32_16x16x128_fp8_bf8 v[32:39], v[0:15], v[16:31], v[32:39] neg_lo:[0,0,1] neg_hi:[0,0,1]
 ; GISEL-NEXT:    s_clause 0x1
 ; GISEL-NEXT:    global_store_b128 v[40:41], v[32:35], off
@@ -1653,6 +1817,7 @@ bb:
 define amdgpu_ps void @test_wmma_f32_16x16x128_fp8_bf8_ignoreC(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f32_16x16x128_fp8_bf8_ignoreC:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_f32_16x16x128_fp8_bf8 v[32:39], v[0:15], v[16:31], v[32:39]
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[40:41], v[36:39], off offset:16
@@ -1661,6 +1826,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x128_fp8_bf8_ignoreC(<16 x i32> %A, <1
 ;
 ; GISEL-LABEL: test_wmma_f32_16x16x128_fp8_bf8_ignoreC:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_f32_16x16x128_fp8_bf8 v[32:39], v[0:15], v[16:31], v[32:39]
 ; GISEL-NEXT:    s_clause 0x1
 ; GISEL-NEXT:    global_store_b128 v[40:41], v[32:35], off
@@ -1675,6 +1841,7 @@ bb:
 define amdgpu_ps void @test_wmma_f32_16x16x128_bf8_fp8_negC(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f32_16x16x128_bf8_fp8_negC:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_f32_16x16x128_bf8_fp8 v[32:39], v[0:15], v[16:31], v[32:39] neg_lo:[0,0,1]
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[40:41], v[36:39], off offset:16
@@ -1683,6 +1850,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x128_bf8_fp8_negC(<16 x i32> %A, <16 x
 ;
 ; GISEL-LABEL: test_wmma_f32_16x16x128_bf8_fp8_negC:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_f32_16x16x128_bf8_fp8 v[32:39], v[0:15], v[16:31], v[32:39] neg_lo:[0,0,1]
 ; GISEL-NEXT:    s_clause 0x1
 ; GISEL-NEXT:    global_store_b128 v[40:41], v[32:35], off
@@ -1697,6 +1865,7 @@ bb:
 define amdgpu_ps void @test_wmma_f32_16x16x128_bf8_fp8_neg_absC(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f32_16x16x128_bf8_fp8_neg_absC:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_f32_16x16x128_bf8_fp8 v[32:39], v[0:15], v[16:31], v[32:39] neg_lo:[0,0,1] neg_hi:[0,0,1]
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[40:41], v[36:39], off offset:16
@@ -1705,6 +1874,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x128_bf8_fp8_neg_absC(<16 x i32> %A, <
 ;
 ; GISEL-LABEL: test_wmma_f32_16x16x128_bf8_fp8_neg_absC:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_f32_16x16x128_bf8_fp8 v[32:39], v[0:15], v[16:31], v[32:39] neg_lo:[0,0,1] neg_hi:[0,0,1]
 ; GISEL-NEXT:    s_clause 0x1
 ; GISEL-NEXT:    global_store_b128 v[40:41], v[32:35], off
@@ -1719,6 +1889,7 @@ bb:
 define amdgpu_ps void @test_wmma_f32_16x16x128_bf8_fp8_ignoreC(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f32_16x16x128_bf8_fp8_ignoreC:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_f32_16x16x128_bf8_fp8 v[32:39], v[0:15], v[16:31], v[32:39]
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[40:41], v[36:39], off offset:16
@@ -1727,6 +1898,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x128_bf8_fp8_ignoreC(<16 x i32> %A, <1
 ;
 ; GISEL-LABEL: test_wmma_f32_16x16x128_bf8_fp8_ignoreC:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_f32_16x16x128_bf8_fp8 v[32:39], v[0:15], v[16:31], v[32:39]
 ; GISEL-NEXT:    s_clause 0x1
 ; GISEL-NEXT:    global_store_b128 v[40:41], v[32:35], off
@@ -1741,6 +1913,7 @@ bb:
 define amdgpu_ps void @test_wmma_f32_16x16x128_bf8_bf8_negC(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f32_16x16x128_bf8_bf8_negC:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_f32_16x16x128_bf8_bf8 v[32:39], v[0:15], v[16:31], v[32:39] neg_lo:[0,0,1]
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[40:41], v[36:39], off offset:16
@@ -1749,6 +1922,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x128_bf8_bf8_negC(<16 x i32> %A, <16 x
 ;
 ; GISEL-LABEL: test_wmma_f32_16x16x128_bf8_bf8_negC:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_f32_16x16x128_bf8_bf8 v[32:39], v[0:15], v[16:31], v[32:39] neg_lo:[0,0,1]
 ; GISEL-NEXT:    s_clause 0x1
 ; GISEL-NEXT:    global_store_b128 v[40:41], v[32:35], off
@@ -1763,6 +1937,7 @@ bb:
 define amdgpu_ps void @test_wmma_f32_16x16x128_bf8_bf8_neg_absC(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f32_16x16x128_bf8_bf8_neg_absC:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_f32_16x16x128_bf8_bf8 v[32:39], v[0:15], v[16:31], v[32:39] neg_lo:[0,0,1] neg_hi:[0,0,1]
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[40:41], v[36:39], off offset:16
@@ -1771,6 +1946,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x128_bf8_bf8_neg_absC(<16 x i32> %A, <
 ;
 ; GISEL-LABEL: test_wmma_f32_16x16x128_bf8_bf8_neg_absC:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_f32_16x16x128_bf8_bf8 v[32:39], v[0:15], v[16:31], v[32:39] neg_lo:[0,0,1] neg_hi:[0,0,1]
 ; GISEL-NEXT:    s_clause 0x1
 ; GISEL-NEXT:    global_store_b128 v[40:41], v[32:35], off
@@ -1785,6 +1961,7 @@ bb:
 define amdgpu_ps void @test_wmma_f32_16x16x128_bf8_bf8_ignoreC(<16 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f32_16x16x128_bf8_bf8_ignoreC:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_f32_16x16x128_bf8_bf8 v[32:39], v[0:15], v[16:31], v[32:39]
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[40:41], v[36:39], off offset:16
@@ -1793,6 +1970,7 @@ define amdgpu_ps void @test_wmma_f32_16x16x128_bf8_bf8_ignoreC(<16 x i32> %A, <1
 ;
 ; GISEL-LABEL: test_wmma_f32_16x16x128_bf8_bf8_ignoreC:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_f32_16x16x128_bf8_bf8 v[32:39], v[0:15], v[16:31], v[32:39]
 ; GISEL-NEXT:    s_clause 0x1
 ; GISEL-NEXT:    global_store_b128 v[40:41], v[32:35], off
@@ -1807,6 +1985,7 @@ bb:
 define amdgpu_ps void @test_wmma_f32_32x16x128_f4_negC(<16 x i32> %A, <8 x i32> %B, <16 x float> %C, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f32_32x16x128_f4_negC:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39] neg_lo:[0,0,1]
 ; GFX1250-NEXT:    s_clause 0x3
 ; GFX1250-NEXT:    global_store_b128 v[40:41], v[36:39], off offset:48
@@ -1817,6 +1996,7 @@ define amdgpu_ps void @test_wmma_f32_32x16x128_f4_negC(<16 x i32> %A, <8 x i32>
 ;
 ; GISEL-LABEL: test_wmma_f32_32x16x128_f4_negC:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39] neg_lo:[0,0,1]
 ; GISEL-NEXT:    s_clause 0x3
 ; GISEL-NEXT:    global_store_b128 v[40:41], v[24:27], off
@@ -1833,6 +2013,7 @@ bb:
 define amdgpu_ps void @test_wmma_f32_32x16x128_f4_neg_absC(<16 x i32> %A, <8 x i32> %B, <16 x float> %C, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f32_32x16x128_f4_neg_absC:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39] neg_lo:[0,0,1] neg_hi:[0,0,1]
 ; GFX1250-NEXT:    s_clause 0x3
 ; GFX1250-NEXT:    global_store_b128 v[40:41], v[36:39], off offset:48
@@ -1843,6 +2024,7 @@ define amdgpu_ps void @test_wmma_f32_32x16x128_f4_neg_absC(<16 x i32> %A, <8 x i
 ;
 ; GISEL-LABEL: test_wmma_f32_32x16x128_f4_neg_absC:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39] neg_lo:[0,0,1] neg_hi:[0,0,1]
 ; GISEL-NEXT:    s_clause 0x3
 ; GISEL-NEXT:    global_store_b128 v[40:41], v[24:27], off
@@ -1859,6 +2041,7 @@ bb:
 define amdgpu_ps void @test_wmma_f32_32x16x128_f4_ignoreC(<16 x i32> %A, <8 x i32> %B, <16 x float> %C, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_f32_32x16x128_f4_ignoreC:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39]
 ; GFX1250-NEXT:    s_clause 0x3
 ; GFX1250-NEXT:    global_store_b128 v[40:41], v[36:39], off offset:48
@@ -1869,6 +2052,7 @@ define amdgpu_ps void @test_wmma_f32_32x16x128_f4_ignoreC(<16 x i32> %A, <8 x i3
 ;
 ; GISEL-LABEL: test_wmma_f32_32x16x128_f4_ignoreC:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39]
 ; GISEL-NEXT:    s_clause 0x3
 ; GISEL-NEXT:    global_store_b128 v[40:41], v[24:27], off
@@ -1885,6 +2069,7 @@ bb:
 define amdgpu_ps void @test_wmma_scale_f32_32x16x128_f4_negC(<16 x i32> %A, <8 x i32> %B, <16 x float> %C, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_scale_f32_32x16x128_f4_negC:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_scale_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], 2, 4 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 neg_lo:[0,0,1]
 ; GFX1250-NEXT:    s_clause 0x3
 ; GFX1250-NEXT:    global_store_b128 v[40:41], v[36:39], off offset:48
@@ -1895,6 +2080,7 @@ define amdgpu_ps void @test_wmma_scale_f32_32x16x128_f4_negC(<16 x i32> %A, <8 x
 ;
 ; GISEL-LABEL: test_wmma_scale_f32_32x16x128_f4_negC:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_scale_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], 2, 4 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 neg_lo:[0,0,1]
 ; GISEL-NEXT:    s_clause 0x3
 ; GISEL-NEXT:    global_store_b128 v[40:41], v[24:27], off
@@ -1911,6 +2097,7 @@ bb:
 define amdgpu_ps void @test_wmma_scale_f32_32x16x128_f4_neg_absC(<16 x i32> %A, <8 x i32> %B, <16 x float> %C, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_scale_f32_32x16x128_f4_neg_absC:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_scale_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], 2, 4 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 neg_lo:[0,0,1] neg_hi:[0,0,1]
 ; GFX1250-NEXT:    s_clause 0x3
 ; GFX1250-NEXT:    global_store_b128 v[40:41], v[36:39], off offset:48
@@ -1921,6 +2108,7 @@ define amdgpu_ps void @test_wmma_scale_f32_32x16x128_f4_neg_absC(<16 x i32> %A,
 ;
 ; GISEL-LABEL: test_wmma_scale_f32_32x16x128_f4_neg_absC:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_scale_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], 2, 4 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 neg_lo:[0,0,1] neg_hi:[0,0,1]
 ; GISEL-NEXT:    s_clause 0x3
 ; GISEL-NEXT:    global_store_b128 v[40:41], v[24:27], off
@@ -1937,6 +2125,7 @@ bb:
 define amdgpu_ps void @test_wmma_scale_f32_32x16x128_f4_ignoreC(<16 x i32> %A, <8 x i32> %B, <16 x float> %C, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_scale_f32_32x16x128_f4_ignoreC:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_scale_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], 2, 4 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1
 ; GFX1250-NEXT:    s_clause 0x3
 ; GFX1250-NEXT:    global_store_b128 v[40:41], v[36:39], off offset:48
@@ -1947,6 +2136,7 @@ define amdgpu_ps void @test_wmma_scale_f32_32x16x128_f4_ignoreC(<16 x i32> %A, <
 ;
 ; GISEL-LABEL: test_wmma_scale_f32_32x16x128_f4_ignoreC:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_scale_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], 2, 4 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1
 ; GISEL-NEXT:    s_clause 0x3
 ; GISEL-NEXT:    global_store_b128 v[40:41], v[24:27], off
@@ -1963,6 +2153,7 @@ bb:
 define amdgpu_ps void @test_wmma_scale16_f32_32x16x128_f4_negC(<16 x i32> %A, <8 x i32> %B, <16 x float> %C, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_scale16_f32_32x16x128_f4_negC:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_scale16_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], 2, 4 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 neg_lo:[0,0,1]
 ; GFX1250-NEXT:    s_clause 0x3
 ; GFX1250-NEXT:    global_store_b128 v[40:41], v[36:39], off offset:48
@@ -1973,6 +2164,7 @@ define amdgpu_ps void @test_wmma_scale16_f32_32x16x128_f4_negC(<16 x i32> %A, <8
 ;
 ; GISEL-LABEL: test_wmma_scale16_f32_32x16x128_f4_negC:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_scale16_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], 2, 4 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 neg_lo:[0,0,1]
 ; GISEL-NEXT:    s_clause 0x3
 ; GISEL-NEXT:    global_store_b128 v[40:41], v[24:27], off
@@ -1989,6 +2181,7 @@ bb:
 define amdgpu_ps void @test_wmma_scale16_f32_32x16x128_f4_neg_absC(<16 x i32> %A, <8 x i32> %B, <16 x float> %C, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_scale16_f32_32x16x128_f4_neg_absC:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_scale16_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], 2, 4 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 neg_lo:[0,0,1] neg_hi:[0,0,1]
 ; GFX1250-NEXT:    s_clause 0x3
 ; GFX1250-NEXT:    global_store_b128 v[40:41], v[36:39], off offset:48
@@ -1999,6 +2192,7 @@ define amdgpu_ps void @test_wmma_scale16_f32_32x16x128_f4_neg_absC(<16 x i32> %A
 ;
 ; GISEL-LABEL: test_wmma_scale16_f32_32x16x128_f4_neg_absC:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_scale16_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], 2, 4 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1 neg_lo:[0,0,1] neg_hi:[0,0,1]
 ; GISEL-NEXT:    s_clause 0x3
 ; GISEL-NEXT:    global_store_b128 v[40:41], v[24:27], off
@@ -2015,6 +2209,7 @@ bb:
 define amdgpu_ps void @test_wmma_scale16_f32_32x16x128_f4_ignoreC(<16 x i32> %A, <8 x i32> %B, <16 x float> %C, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_wmma_scale16_f32_32x16x128_f4_ignoreC:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_wmma_scale16_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], 2, 4 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1
 ; GFX1250-NEXT:    s_clause 0x3
 ; GFX1250-NEXT:    global_store_b128 v[40:41], v[36:39], off offset:48
@@ -2025,6 +2220,7 @@ define amdgpu_ps void @test_wmma_scale16_f32_32x16x128_f4_ignoreC(<16 x i32> %A,
 ;
 ; GISEL-LABEL: test_wmma_scale16_f32_32x16x128_f4_ignoreC:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_wmma_scale16_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], 2, 4 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1
 ; GISEL-NEXT:    s_clause 0x3
 ; GISEL-NEXT:    global_store_b128 v[40:41], v[24:27], off
@@ -2041,6 +2237,7 @@ bb:
 define amdgpu_ps void @test_swmmac_f32_16x16x64_bf16_negA(<16 x bfloat> %A, <32 x bfloat> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_swmmac_f32_16x16x64_bf16_negA:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_swmmac_f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 neg_lo:[1,0,0] neg_hi:[1,0,0]
 ; GFX1250-NEXT:    v_dual_mov_b32 v35, v34 :: v_dual_mov_b32 v34, v33
 ; GFX1250-NEXT:    s_clause 0x1
@@ -2050,6 +2247,7 @@ define amdgpu_ps void @test_swmmac_f32_16x16x64_bf16_negA(<16 x bfloat> %A, <32
 ;
 ; GISEL-LABEL: test_swmmac_f32_16x16x64_bf16_negA:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_swmmac_f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 neg_lo:[1,0,0] neg_hi:[1,0,0]
 ; GISEL-NEXT:    v_dual_mov_b32 v35, v34 :: v_dual_mov_b32 v34, v33
 ; GISEL-NEXT:    s_clause 0x1
@@ -2065,6 +2263,7 @@ bb:
 define amdgpu_ps void @test_swmmac_f32_16x16x64_bf16_negB(<16 x bfloat> %A, <32 x bfloat> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_swmmac_f32_16x16x64_bf16_negB:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_swmmac_f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 neg_lo:[0,1,0] neg_hi:[0,1,0]
 ; GFX1250-NEXT:    v_dual_mov_b32 v35, v34 :: v_dual_mov_b32 v34, v33
 ; GFX1250-NEXT:    s_clause 0x1
@@ -2074,6 +2273,7 @@ define amdgpu_ps void @test_swmmac_f32_16x16x64_bf16_negB(<16 x bfloat> %A, <32
 ;
 ; GISEL-LABEL: test_swmmac_f32_16x16x64_bf16_negB:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_swmmac_f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 neg_lo:[0,1,0] neg_hi:[0,1,0]
 ; GISEL-NEXT:    v_dual_mov_b32 v35, v34 :: v_dual_mov_b32 v34, v33
 ; GISEL-NEXT:    s_clause 0x1
@@ -2089,6 +2289,7 @@ bb:
 define amdgpu_ps void @test_swmmac_bf16_16x16x64_bf16_negA(<16 x bfloat> %A, <32 x bfloat> %B, <8 x bfloat> %C, i16 %Index, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_swmmac_bf16_16x16x64_bf16_negA:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_swmmac_bf16_16x16x64_bf16 v[24:27], v[0:7], v[8:23], v28 neg_lo:[1,0,0] neg_hi:[1,0,0]
 ; GFX1250-NEXT:    v_dual_mov_b32 v31, v30 :: v_dual_mov_b32 v30, v29
 ; GFX1250-NEXT:    global_store_b128 v[30:31], v[24:27], off
@@ -2096,6 +2297,7 @@ define amdgpu_ps void @test_swmmac_bf16_16x16x64_bf16_negA(<16 x bfloat> %A, <32
 ;
 ; GISEL-LABEL: test_swmmac_bf16_16x16x64_bf16_negA:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_swmmac_bf16_16x16x64_bf16 v[24:27], v[0:7], v[8:23], v28 neg_lo:[1,0,0] neg_hi:[1,0,0]
 ; GISEL-NEXT:    v_dual_mov_b32 v31, v30 :: v_dual_mov_b32 v30, v29
 ; GISEL-NEXT:    global_store_b128 v[30:31], v[24:27], off
@@ -2109,6 +2311,7 @@ bb:
 define amdgpu_ps void @test_swmmac_bf16_16x16x64_bf16_negB(<16 x bfloat> %A, <32 x bfloat> %B, <8 x bfloat> %C, i16 %Index, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_swmmac_bf16_16x16x64_bf16_negB:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_swmmac_bf16_16x16x64_bf16 v[24:27], v[0:7], v[8:23], v28 neg_lo:[0,1,0] neg_hi:[0,1,0]
 ; GFX1250-NEXT:    v_dual_mov_b32 v31, v30 :: v_dual_mov_b32 v30, v29
 ; GFX1250-NEXT:    global_store_b128 v[30:31], v[24:27], off
@@ -2116,6 +2319,7 @@ define amdgpu_ps void @test_swmmac_bf16_16x16x64_bf16_negB(<16 x bfloat> %A, <32
 ;
 ; GISEL-LABEL: test_swmmac_bf16_16x16x64_bf16_negB:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_swmmac_bf16_16x16x64_bf16 v[24:27], v[0:7], v[8:23], v28 neg_lo:[0,1,0] neg_hi:[0,1,0]
 ; GISEL-NEXT:    v_dual_mov_b32 v31, v30 :: v_dual_mov_b32 v30, v29
 ; GISEL-NEXT:    global_store_b128 v[30:31], v[24:27], off
@@ -2129,6 +2333,7 @@ bb:
 define amdgpu_ps void @test_swmmac_bf16f32_16x16x64_bf16_negA(<16 x bfloat> %A, <32 x bfloat> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_swmmac_bf16f32_16x16x64_bf16_negA:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_swmmac_bf16f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 neg_lo:[1,0,0] neg_hi:[1,0,0]
 ; GFX1250-NEXT:    v_dual_mov_b32 v35, v34 :: v_dual_mov_b32 v34, v33
 ; GFX1250-NEXT:    s_clause 0x1
@@ -2138,6 +2343,7 @@ define amdgpu_ps void @test_swmmac_bf16f32_16x16x64_bf16_negA(<16 x bfloat> %A,
 ;
 ; GISEL-LABEL: test_swmmac_bf16f32_16x16x64_bf16_negA:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_swmmac_bf16f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 neg_lo:[1,0,0] neg_hi:[1,0,0]
 ; GISEL-NEXT:    v_dual_mov_b32 v35, v34 :: v_dual_mov_b32 v34, v33
 ; GISEL-NEXT:    s_clause 0x1
@@ -2153,6 +2359,7 @@ bb:
 define amdgpu_ps void @test_swmmac_bf16f32_16x16x64_bf16_negB(<16 x bfloat> %A, <32 x bfloat> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_swmmac_bf16f32_16x16x64_bf16_negB:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_swmmac_bf16f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 neg_lo:[0,1,0] neg_hi:[0,1,0]
 ; GFX1250-NEXT:    v_dual_mov_b32 v35, v34 :: v_dual_mov_b32 v34, v33
 ; GFX1250-NEXT:    s_clause 0x1
@@ -2162,6 +2369,7 @@ define amdgpu_ps void @test_swmmac_bf16f32_16x16x64_bf16_negB(<16 x bfloat> %A,
 ;
 ; GISEL-LABEL: test_swmmac_bf16f32_16x16x64_bf16_negB:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_swmmac_bf16f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 neg_lo:[0,1,0] neg_hi:[0,1,0]
 ; GISEL-NEXT:    v_dual_mov_b32 v35, v34 :: v_dual_mov_b32 v34, v33
 ; GISEL-NEXT:    s_clause 0x1
@@ -2177,6 +2385,7 @@ bb:
 define amdgpu_ps void @test_swmmac_i32_16x16x128_iu8_signedA(<8 x i32> %A, <16 x i32> %B, <8 x i32> %C, i64 %Index, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_swmmac_i32_16x16x128_iu8_signedA:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_swmmac_i32_16x16x128_iu8 v[24:31], v[0:7], v[8:23], v[32:33] neg_lo:[1,0,0]
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[34:35], v[28:31], off offset:16
@@ -2185,6 +2394,7 @@ define amdgpu_ps void @test_swmmac_i32_16x16x128_iu8_signedA(<8 x i32> %A, <16 x
 ;
 ; GISEL-LABEL: test_swmmac_i32_16x16x128_iu8_signedA:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_swmmac_i32_16x16x128_iu8 v[24:31], v[0:7], v[8:23], v[32:33] neg_lo:[1,0,0]
 ; GISEL-NEXT:    s_clause 0x1
 ; GISEL-NEXT:    global_store_b128 v[34:35], v[24:27], off
@@ -2199,6 +2409,7 @@ bb:
 define amdgpu_ps void @test_swmmac_i32_16x16x128_iu8_signedB(<8 x i32> %A, <16 x i32> %B, <8 x i32> %C, i64 %Index, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_swmmac_i32_16x16x128_iu8_signedB:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_swmmac_i32_16x16x128_iu8 v[24:31], v[0:7], v[8:23], v[32:33] neg_lo:[0,1,0]
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b128 v[34:35], v[28:31], off offset:16
@@ -2207,6 +2418,7 @@ define amdgpu_ps void @test_swmmac_i32_16x16x128_iu8_signedB(<8 x i32> %A, <16 x
 ;
 ; GISEL-LABEL: test_swmmac_i32_16x16x128_iu8_signedB:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_swmmac_i32_16x16x128_iu8 v[24:31], v[0:7], v[8:23], v[32:33] neg_lo:[0,1,0]
 ; GISEL-NEXT:    s_clause 0x1
 ; GISEL-NEXT:    global_store_b128 v[34:35], v[24:27], off
@@ -2221,6 +2433,7 @@ bb:
 define amdgpu_ps void @test_swmmac_f32_16x16x64_f16_negA(<16 x half> %A, <32 x half> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_swmmac_f32_16x16x64_f16_negA:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_swmmac_f32_16x16x64_f16 v[24:31], v[0:7], v[8:23], v32 neg_lo:[1,0,0] neg_hi:[1,0,0]
 ; GFX1250-NEXT:    v_dual_mov_b32 v35, v34 :: v_dual_mov_b32 v34, v33
 ; GFX1250-NEXT:    s_clause 0x1
@@ -2230,6 +2443,7 @@ define amdgpu_ps void @test_swmmac_f32_16x16x64_f16_negA(<16 x half> %A, <32 x h
 ;
 ; GISEL-LABEL: test_swmmac_f32_16x16x64_f16_negA:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_swmmac_f32_16x16x64_f16 v[24:31], v[0:7], v[8:23], v32 neg_lo:[1,0,0] neg_hi:[1,0,0]
 ; GISEL-NEXT:    v_dual_mov_b32 v36, v33 :: v_dual_mov_b32 v37, v34
 ; GISEL-NEXT:    s_clause 0x1
@@ -2245,6 +2459,7 @@ bb:
 define amdgpu_ps void @test_swmmac_f32_16x16x64_f16_negB(<16 x half> %A, <32 x half> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_swmmac_f32_16x16x64_f16_negB:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_swmmac_f32_16x16x64_f16 v[24:31], v[0:7], v[8:23], v32 neg_lo:[0,1,0] neg_hi:[0,1,0]
 ; GFX1250-NEXT:    v_dual_mov_b32 v35, v34 :: v_dual_mov_b32 v34, v33
 ; GFX1250-NEXT:    s_clause 0x1
@@ -2254,6 +2469,7 @@ define amdgpu_ps void @test_swmmac_f32_16x16x64_f16_negB(<16 x half> %A, <32 x h
 ;
 ; GISEL-LABEL: test_swmmac_f32_16x16x64_f16_negB:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_swmmac_f32_16x16x64_f16 v[24:31], v[0:7], v[8:23], v32 neg_lo:[0,1,0] neg_hi:[0,1,0]
 ; GISEL-NEXT:    v_dual_mov_b32 v36, v33 :: v_dual_mov_b32 v37, v34
 ; GISEL-NEXT:    s_clause 0x1
@@ -2269,6 +2485,7 @@ bb:
 define amdgpu_ps void @test_swmmac_f16_16x16x64_f16_negA(<16 x half> %A, <32 x half> %B, <8 x half> %C, i16 %Index, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_swmmac_f16_16x16x64_f16_negA:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_swmmac_f16_16x16x64_f16 v[24:27], v[0:7], v[8:23], v28 neg_lo:[1,0,0] neg_hi:[1,0,0]
 ; GFX1250-NEXT:    v_dual_mov_b32 v31, v30 :: v_dual_mov_b32 v30, v29
 ; GFX1250-NEXT:    global_store_b128 v[30:31], v[24:27], off
@@ -2276,6 +2493,7 @@ define amdgpu_ps void @test_swmmac_f16_16x16x64_f16_negA(<16 x half> %A, <32 x h
 ;
 ; GISEL-LABEL: test_swmmac_f16_16x16x64_f16_negA:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_swmmac_f16_16x16x64_f16 v[24:27], v[0:7], v[8:23], v28 neg_lo:[1,0,0] neg_hi:[1,0,0]
 ; GISEL-NEXT:    v_dual_mov_b32 v32, v29 :: v_dual_mov_b32 v33, v30
 ; GISEL-NEXT:    global_store_b128 v[32:33], v[24:27], off
@@ -2289,6 +2507,7 @@ bb:
 define amdgpu_ps void @test_swmmac_f16_16x16x64_f16_negB(<16 x half> %A, <32 x half> %B, <8 x half> %C, i16 %Index, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_swmmac_f16_16x16x64_f16_negB:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_swmmac_f16_16x16x64_f16 v[24:27], v[0:7], v[8:23], v28 neg_lo:[0,1,0] neg_hi:[0,1,0]
 ; GFX1250-NEXT:    v_dual_mov_b32 v31, v30 :: v_dual_mov_b32 v30, v29
 ; GFX1250-NEXT:    global_store_b128 v[30:31], v[24:27], off
@@ -2296,6 +2515,7 @@ define amdgpu_ps void @test_swmmac_f16_16x16x64_f16_negB(<16 x half> %A, <32 x h
 ;
 ; GISEL-LABEL: test_swmmac_f16_16x16x64_f16_negB:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_swmmac_f16_16x16x64_f16 v[24:27], v[0:7], v[8:23], v28 neg_lo:[0,1,0] neg_hi:[0,1,0]
 ; GISEL-NEXT:    v_dual_mov_b32 v32, v29 :: v_dual_mov_b32 v33, v30
 ; GISEL-NEXT:    global_store_b128 v[32:33], v[24:27], off

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.index.gfx1250.w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.index.gfx1250.w32.ll
index b8745e0ebf480..537c43d02909b 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.index.gfx1250.w32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.index.gfx1250.w32.ll
@@ -5,6 +5,7 @@
 define amdgpu_ps void @test_swmmac_f32_16x16x64_bf16(<16 x bfloat> %A, <32 x bfloat> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_swmmac_f32_16x16x64_bf16:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    global_load_b32 v32, v[32:33], off
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
 ; GFX1250-NEXT:    v_swmmac_f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 index_key:1
@@ -15,6 +16,7 @@ define amdgpu_ps void @test_swmmac_f32_16x16x64_bf16(<16 x bfloat> %A, <32 x bfl
 ;
 ; GISEL-LABEL: test_swmmac_f32_16x16x64_bf16:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    global_load_b32 v32, v[32:33], off
 ; GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GISEL-NEXT:    v_swmmac_f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 index_key:1
@@ -34,6 +36,7 @@ bb:
 define amdgpu_ps void @test_swmmac_bf16_16x16x64_bf16(<16 x bfloat> %A, <32 x bfloat> %B, <8 x bfloat> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_swmmac_bf16_16x16x64_bf16:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    global_load_b32 v28, v[28:29], off
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
 ; GFX1250-NEXT:    v_swmmac_bf16_16x16x64_bf16 v[24:27], v[0:7], v[8:23], v28 index_key:1
@@ -42,6 +45,7 @@ define amdgpu_ps void @test_swmmac_bf16_16x16x64_bf16(<16 x bfloat> %A, <32 x bf
 ;
 ; GISEL-LABEL: test_swmmac_bf16_16x16x64_bf16:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    global_load_b32 v28, v[28:29], off
 ; GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GISEL-NEXT:    v_swmmac_bf16_16x16x64_bf16 v[24:27], v[0:7], v[8:23], v28 index_key:1
@@ -59,6 +63,7 @@ bb:
 define amdgpu_ps void @test_swmmac_bf16f32_16x16x64_bf16(<16 x bfloat> %A, <32 x bfloat> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_swmmac_bf16f32_16x16x64_bf16:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    global_load_b32 v32, v[32:33], off
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
 ; GFX1250-NEXT:    v_swmmac_bf16f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 index_key:1
@@ -69,6 +74,7 @@ define amdgpu_ps void @test_swmmac_bf16f32_16x16x64_bf16(<16 x bfloat> %A, <32 x
 ;
 ; GISEL-LABEL: test_swmmac_bf16f32_16x16x64_bf16:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    global_load_b32 v32, v[32:33], off
 ; GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GISEL-NEXT:    v_swmmac_bf16f32_16x16x64_bf16 v[24:31], v[0:7], v[8:23], v32 index_key:1
@@ -88,6 +94,7 @@ bb:
 define amdgpu_ps void @test_swmmac_f32_16x16x128_fp8_fp8_i32_index(<8 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %IndexVecOutPtr, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_swmmac_f32_16x16x128_fp8_fp8_i32_index:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    global_load_b64 v[32:33], v[32:33], off
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
 ; GFX1250-NEXT:    v_swmmac_f32_16x16x128_fp8_fp8 v[24:31], v[0:7], v[8:23], v[32:33] index_key:1
@@ -99,6 +106,7 @@ define amdgpu_ps void @test_swmmac_f32_16x16x128_fp8_fp8_i32_index(<8 x i32> %A,
 ;
 ; GISEL-LABEL: test_swmmac_f32_16x16x128_fp8_fp8_i32_index:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    global_load_b64 v[32:33], v[32:33], off
 ; GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GISEL-NEXT:    v_swmmac_f32_16x16x128_fp8_fp8 v[24:31], v[0:7], v[8:23], v[32:33] index_key:1
@@ -120,6 +128,7 @@ bb:
 define amdgpu_ps void @test_swmmac_f32_16x16x128_fp8_fp8_i64_index(<8 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %IndexVecOutPtr, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_swmmac_f32_16x16x128_fp8_fp8_i64_index:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    global_load_b64 v[32:33], v[32:33], off
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
 ; GFX1250-NEXT:    v_swmmac_f32_16x16x128_fp8_fp8 v[24:31], v[0:7], v[8:23], v[32:33] index_key:1
@@ -131,6 +140,7 @@ define amdgpu_ps void @test_swmmac_f32_16x16x128_fp8_fp8_i64_index(<8 x i32> %A,
 ;
 ; GISEL-LABEL: test_swmmac_f32_16x16x128_fp8_fp8_i64_index:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    global_load_b64 v[32:33], v[32:33], off
 ; GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GISEL-NEXT:    v_swmmac_f32_16x16x128_fp8_fp8 v[24:31], v[0:7], v[8:23], v[32:33] index_key:1
@@ -151,6 +161,7 @@ bb:
 define amdgpu_ps void @test_swmmac_f32_16x16x128_fp8_bf8_i32_index(<8 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %IndexVecOutPtr, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_swmmac_f32_16x16x128_fp8_bf8_i32_index:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    global_load_b64 v[32:33], v[32:33], off
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
 ; GFX1250-NEXT:    v_swmmac_f32_16x16x128_fp8_bf8 v[24:31], v[0:7], v[8:23], v[32:33] index_key:1
@@ -162,6 +173,7 @@ define amdgpu_ps void @test_swmmac_f32_16x16x128_fp8_bf8_i32_index(<8 x i32> %A,
 ;
 ; GISEL-LABEL: test_swmmac_f32_16x16x128_fp8_bf8_i32_index:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    global_load_b64 v[32:33], v[32:33], off
 ; GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GISEL-NEXT:    v_swmmac_f32_16x16x128_fp8_bf8 v[24:31], v[0:7], v[8:23], v[32:33] index_key:1
@@ -183,6 +195,7 @@ bb:
 define amdgpu_ps void @test_swmmac_f32_16x16x128_fp8_bf8_i64_index(<8 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %IndexVecOutPtr, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_swmmac_f32_16x16x128_fp8_bf8_i64_index:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    global_load_b64 v[32:33], v[32:33], off
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
 ; GFX1250-NEXT:    v_swmmac_f32_16x16x128_fp8_bf8 v[24:31], v[0:7], v[8:23], v[32:33] index_key:1
@@ -194,6 +207,7 @@ define amdgpu_ps void @test_swmmac_f32_16x16x128_fp8_bf8_i64_index(<8 x i32> %A,
 ;
 ; GISEL-LABEL: test_swmmac_f32_16x16x128_fp8_bf8_i64_index:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    global_load_b64 v[32:33], v[32:33], off
 ; GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GISEL-NEXT:    v_swmmac_f32_16x16x128_fp8_bf8 v[24:31], v[0:7], v[8:23], v[32:33] index_key:1
@@ -214,6 +228,7 @@ bb:
 define amdgpu_ps void @test_swmmac_f32_16x16x128_bf8_fp8_i32_index(<8 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %IndexVecOutPtr, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_swmmac_f32_16x16x128_bf8_fp8_i32_index:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    global_load_b64 v[32:33], v[32:33], off
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
 ; GFX1250-NEXT:    v_swmmac_f32_16x16x128_bf8_fp8 v[24:31], v[0:7], v[8:23], v[32:33] index_key:1
@@ -225,6 +240,7 @@ define amdgpu_ps void @test_swmmac_f32_16x16x128_bf8_fp8_i32_index(<8 x i32> %A,
 ;
 ; GISEL-LABEL: test_swmmac_f32_16x16x128_bf8_fp8_i32_index:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    global_load_b64 v[32:33], v[32:33], off
 ; GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GISEL-NEXT:    v_swmmac_f32_16x16x128_bf8_fp8 v[24:31], v[0:7], v[8:23], v[32:33] index_key:1
@@ -246,6 +262,7 @@ bb:
 define amdgpu_ps void @test_swmmac_f32_16x16x128_bf8_fp8_i64_index(<8 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %IndexVecOutPtr, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_swmmac_f32_16x16x128_bf8_fp8_i64_index:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    global_load_b64 v[32:33], v[32:33], off
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
 ; GFX1250-NEXT:    v_swmmac_f32_16x16x128_bf8_fp8 v[24:31], v[0:7], v[8:23], v[32:33] index_key:1
@@ -257,6 +274,7 @@ define amdgpu_ps void @test_swmmac_f32_16x16x128_bf8_fp8_i64_index(<8 x i32> %A,
 ;
 ; GISEL-LABEL: test_swmmac_f32_16x16x128_bf8_fp8_i64_index:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    global_load_b64 v[32:33], v[32:33], off
 ; GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GISEL-NEXT:    v_swmmac_f32_16x16x128_bf8_fp8 v[24:31], v[0:7], v[8:23], v[32:33] index_key:1
@@ -277,6 +295,7 @@ bb:
 define amdgpu_ps void @test_swmmac_f32_16x16x128_bf8_bf8_i32_index(<8 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %IndexVecOutPtr, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_swmmac_f32_16x16x128_bf8_bf8_i32_index:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    global_load_b64 v[32:33], v[32:33], off
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
 ; GFX1250-NEXT:    v_swmmac_f32_16x16x128_bf8_bf8 v[24:31], v[0:7], v[8:23], v[32:33] index_key:1
@@ -288,6 +307,7 @@ define amdgpu_ps void @test_swmmac_f32_16x16x128_bf8_bf8_i32_index(<8 x i32> %A,
 ;
 ; GISEL-LABEL: test_swmmac_f32_16x16x128_bf8_bf8_i32_index:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    global_load_b64 v[32:33], v[32:33], off
 ; GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GISEL-NEXT:    v_swmmac_f32_16x16x128_bf8_bf8 v[24:31], v[0:7], v[8:23], v[32:33] index_key:1
@@ -309,6 +329,7 @@ bb:
 define amdgpu_ps void @test_swmmac_f32_16x16x128_bf8_bf8_i64_index(<8 x i32> %A, <16 x i32> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %IndexVecOutPtr, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_swmmac_f32_16x16x128_bf8_bf8_i64_index:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    global_load_b64 v[32:33], v[32:33], off
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
 ; GFX1250-NEXT:    v_swmmac_f32_16x16x128_bf8_bf8 v[24:31], v[0:7], v[8:23], v[32:33] index_key:1
@@ -320,6 +341,7 @@ define amdgpu_ps void @test_swmmac_f32_16x16x128_bf8_bf8_i64_index(<8 x i32> %A,
 ;
 ; GISEL-LABEL: test_swmmac_f32_16x16x128_bf8_bf8_i64_index:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    global_load_b64 v[32:33], v[32:33], off
 ; GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GISEL-NEXT:    v_swmmac_f32_16x16x128_bf8_bf8 v[24:31], v[0:7], v[8:23], v[32:33] index_key:1
@@ -340,6 +362,7 @@ bb:
 define amdgpu_ps void @test_swmmac_f16_16x16x128_fp8_fp8_i32_index(<8 x i32> %A, <16 x i32> %B, <8 x half> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %IndexVecOutPtr, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_swmmac_f16_16x16x128_fp8_fp8_i32_index:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    global_load_b64 v[28:29], v[28:29], off
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
 ; GFX1250-NEXT:    v_swmmac_f16_16x16x128_fp8_fp8 v[24:27], v[0:7], v[8:23], v[28:29] index_key:1
@@ -349,6 +372,7 @@ define amdgpu_ps void @test_swmmac_f16_16x16x128_fp8_fp8_i32_index(<8 x i32> %A,
 ;
 ; GISEL-LABEL: test_swmmac_f16_16x16x128_fp8_fp8_i32_index:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    global_load_b64 v[28:29], v[28:29], off
 ; GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GISEL-NEXT:    v_swmmac_f16_16x16x128_fp8_fp8 v[24:27], v[0:7], v[8:23], v[28:29] index_key:1
@@ -368,6 +392,7 @@ bb:
 define amdgpu_ps void @test_swmmac_f16_16x16x128_fp8_fp8_i64_index(<8 x i32> %A, <16 x i32> %B, <8 x half> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %IndexVecOutPtr, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_swmmac_f16_16x16x128_fp8_fp8_i64_index:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    global_load_b64 v[28:29], v[28:29], off
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
 ; GFX1250-NEXT:    v_swmmac_f16_16x16x128_fp8_fp8 v[24:27], v[0:7], v[8:23], v[28:29] index_key:1
@@ -377,6 +402,7 @@ define amdgpu_ps void @test_swmmac_f16_16x16x128_fp8_fp8_i64_index(<8 x i32> %A,
 ;
 ; GISEL-LABEL: test_swmmac_f16_16x16x128_fp8_fp8_i64_index:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    global_load_b64 v[28:29], v[28:29], off
 ; GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GISEL-NEXT:    v_swmmac_f16_16x16x128_fp8_fp8 v[24:27], v[0:7], v[8:23], v[28:29] index_key:1
@@ -395,6 +421,7 @@ bb:
 define amdgpu_ps void @test_swmmac_f16_16x16x128_fp8_bf8_i32_index(<8 x i32> %A, <16 x i32> %B, <8 x half> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %IndexVecOutPtr, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_swmmac_f16_16x16x128_fp8_bf8_i32_index:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    global_load_b64 v[28:29], v[28:29], off
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
 ; GFX1250-NEXT:    v_swmmac_f16_16x16x128_fp8_bf8 v[24:27], v[0:7], v[8:23], v[28:29] index_key:1
@@ -404,6 +431,7 @@ define amdgpu_ps void @test_swmmac_f16_16x16x128_fp8_bf8_i32_index(<8 x i32> %A,
 ;
 ; GISEL-LABEL: test_swmmac_f16_16x16x128_fp8_bf8_i32_index:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    global_load_b64 v[28:29], v[28:29], off
 ; GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GISEL-NEXT:    v_swmmac_f16_16x16x128_fp8_bf8 v[24:27], v[0:7], v[8:23], v[28:29] index_key:1
@@ -423,6 +451,7 @@ bb:
 define amdgpu_ps void @test_swmmac_f16_16x16x128_fp8_bf8_i64_index(<8 x i32> %A, <16 x i32> %B, <8 x half> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %IndexVecOutPtr, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_swmmac_f16_16x16x128_fp8_bf8_i64_index:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    global_load_b64 v[28:29], v[28:29], off
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
 ; GFX1250-NEXT:    v_swmmac_f16_16x16x128_fp8_bf8 v[24:27], v[0:7], v[8:23], v[28:29] index_key:1
@@ -432,6 +461,7 @@ define amdgpu_ps void @test_swmmac_f16_16x16x128_fp8_bf8_i64_index(<8 x i32> %A,
 ;
 ; GISEL-LABEL: test_swmmac_f16_16x16x128_fp8_bf8_i64_index:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    global_load_b64 v[28:29], v[28:29], off
 ; GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GISEL-NEXT:    v_swmmac_f16_16x16x128_fp8_bf8 v[24:27], v[0:7], v[8:23], v[28:29] index_key:1
@@ -450,6 +480,7 @@ bb:
 define amdgpu_ps void @test_swmmac_f16_16x16x128_bf8_fp8_i32_index(<8 x i32> %A, <16 x i32> %B, <8 x half> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %IndexVecOutPtr, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_swmmac_f16_16x16x128_bf8_fp8_i32_index:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    global_load_b64 v[28:29], v[28:29], off
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
 ; GFX1250-NEXT:    v_swmmac_f16_16x16x128_bf8_fp8 v[24:27], v[0:7], v[8:23], v[28:29] index_key:1
@@ -459,6 +490,7 @@ define amdgpu_ps void @test_swmmac_f16_16x16x128_bf8_fp8_i32_index(<8 x i32> %A,
 ;
 ; GISEL-LABEL: test_swmmac_f16_16x16x128_bf8_fp8_i32_index:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    global_load_b64 v[28:29], v[28:29], off
 ; GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GISEL-NEXT:    v_swmmac_f16_16x16x128_bf8_fp8 v[24:27], v[0:7], v[8:23], v[28:29] index_key:1
@@ -478,6 +510,7 @@ bb:
 define amdgpu_ps void @test_swmmac_f16_16x16x128_bf8_fp8_i64_index(<8 x i32> %A, <16 x i32> %B, <8 x half> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %IndexVecOutPtr, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_swmmac_f16_16x16x128_bf8_fp8_i64_index:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    global_load_b64 v[28:29], v[28:29], off
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
 ; GFX1250-NEXT:    v_swmmac_f16_16x16x128_bf8_fp8 v[24:27], v[0:7], v[8:23], v[28:29] index_key:1
@@ -487,6 +520,7 @@ define amdgpu_ps void @test_swmmac_f16_16x16x128_bf8_fp8_i64_index(<8 x i32> %A,
 ;
 ; GISEL-LABEL: test_swmmac_f16_16x16x128_bf8_fp8_i64_index:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    global_load_b64 v[28:29], v[28:29], off
 ; GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GISEL-NEXT:    v_swmmac_f16_16x16x128_bf8_fp8 v[24:27], v[0:7], v[8:23], v[28:29] index_key:1
@@ -505,6 +539,7 @@ bb:
 define amdgpu_ps void @test_swmmac_f16_16x16x128_bf8_bf8_i32_index(<8 x i32> %A, <16 x i32> %B, <8 x half> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %IndexVecOutPtr, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_swmmac_f16_16x16x128_bf8_bf8_i32_index:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    global_load_b64 v[28:29], v[28:29], off
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
 ; GFX1250-NEXT:    v_swmmac_f16_16x16x128_bf8_bf8 v[24:27], v[0:7], v[8:23], v[28:29] index_key:1
@@ -514,6 +549,7 @@ define amdgpu_ps void @test_swmmac_f16_16x16x128_bf8_bf8_i32_index(<8 x i32> %A,
 ;
 ; GISEL-LABEL: test_swmmac_f16_16x16x128_bf8_bf8_i32_index:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    global_load_b64 v[28:29], v[28:29], off
 ; GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GISEL-NEXT:    v_swmmac_f16_16x16x128_bf8_bf8 v[24:27], v[0:7], v[8:23], v[28:29] index_key:1
@@ -533,6 +569,7 @@ bb:
 define amdgpu_ps void @test_swmmac_f16_16x16x128_bf8_bf8_i64_index(<8 x i32> %A, <16 x i32> %B, <8 x half> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %IndexVecOutPtr, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_swmmac_f16_16x16x128_bf8_bf8_i64_index:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    global_load_b64 v[28:29], v[28:29], off
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
 ; GFX1250-NEXT:    v_swmmac_f16_16x16x128_bf8_bf8 v[24:27], v[0:7], v[8:23], v[28:29] index_key:1
@@ -542,6 +579,7 @@ define amdgpu_ps void @test_swmmac_f16_16x16x128_bf8_bf8_i64_index(<8 x i32> %A,
 ;
 ; GISEL-LABEL: test_swmmac_f16_16x16x128_bf8_bf8_i64_index:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    global_load_b64 v[28:29], v[28:29], off
 ; GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GISEL-NEXT:    v_swmmac_f16_16x16x128_bf8_bf8 v[24:27], v[0:7], v[8:23], v[28:29] index_key:1
@@ -560,6 +598,7 @@ bb:
 define amdgpu_ps void @test_swmmac_i32_16x16x128_iu8_i32_index(<8 x i32> %A, <16 x i32> %B, <8 x i32> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %IndexVecOutPtr, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_swmmac_i32_16x16x128_iu8_i32_index:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    global_load_b64 v[32:33], v[32:33], off
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
 ; GFX1250-NEXT:    v_swmmac_i32_16x16x128_iu8 v[24:31], v[0:7], v[8:23], v[32:33] index_key:1
@@ -571,6 +610,7 @@ define amdgpu_ps void @test_swmmac_i32_16x16x128_iu8_i32_index(<8 x i32> %A, <16
 ;
 ; GISEL-LABEL: test_swmmac_i32_16x16x128_iu8_i32_index:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    global_load_b64 v[32:33], v[32:33], off
 ; GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GISEL-NEXT:    v_swmmac_i32_16x16x128_iu8 v[24:31], v[0:7], v[8:23], v[32:33] index_key:1
@@ -592,6 +632,7 @@ bb:
 define amdgpu_ps void @test_swmmac_i32_16x16x128_iu8_i64_index(<8 x i32> %A, <16 x i32> %B, <8 x i32> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %IndexVecOutPtr, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_swmmac_i32_16x16x128_iu8_i64_index:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    global_load_b64 v[32:33], v[32:33], off
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
 ; GFX1250-NEXT:    v_swmmac_i32_16x16x128_iu8 v[24:31], v[0:7], v[8:23], v[32:33] index_key:1
@@ -603,6 +644,7 @@ define amdgpu_ps void @test_swmmac_i32_16x16x128_iu8_i64_index(<8 x i32> %A, <16
 ;
 ; GISEL-LABEL: test_swmmac_i32_16x16x128_iu8_i64_index:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    global_load_b64 v[32:33], v[32:33], off
 ; GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GISEL-NEXT:    v_swmmac_i32_16x16x128_iu8 v[24:31], v[0:7], v[8:23], v[32:33] index_key:1
@@ -623,6 +665,7 @@ bb:
 define amdgpu_ps void @test_swmmac_f32_16x16x64_f16(<16 x half> %A, <32 x half> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_swmmac_f32_16x16x64_f16:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    global_load_b32 v32, v[32:33], off
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
 ; GFX1250-NEXT:    v_swmmac_f32_16x16x64_f16 v[24:31], v[0:7], v[8:23], v32 index_key:1
@@ -633,6 +676,7 @@ define amdgpu_ps void @test_swmmac_f32_16x16x64_f16(<16 x half> %A, <32 x half>
 ;
 ; GISEL-LABEL: test_swmmac_f32_16x16x64_f16:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    global_load_b32 v32, v[32:33], off
 ; GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GISEL-NEXT:    v_swmmac_f32_16x16x64_f16 v[24:31], v[0:7], v[8:23], v32 index_key:1
@@ -652,6 +696,7 @@ bb:
 define amdgpu_ps void @test_swmmac_f16_16x16x64_f16(<16 x half> %A, <32 x half> %B, <8 x half> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_swmmac_f16_16x16x64_f16:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    global_load_b32 v28, v[28:29], off
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
 ; GFX1250-NEXT:    v_swmmac_f16_16x16x64_f16 v[24:27], v[0:7], v[8:23], v28 index_key:1
@@ -660,6 +705,7 @@ define amdgpu_ps void @test_swmmac_f16_16x16x64_f16(<16 x half> %A, <32 x half>
 ;
 ; GISEL-LABEL: test_swmmac_f16_16x16x64_f16:
 ; GISEL:       ; %bb.0: ; %bb
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    global_load_b32 v28, v[28:29], off
 ; GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GISEL-NEXT:    v_swmmac_f16_16x16x64_f16 v[24:27], v[0:7], v[8:23], v28 index_key:1

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.cos.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.cos.bf16.ll
index 0948530abdf1b..fe98261e2f96c 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.cos.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.cos.bf16.ll
@@ -9,10 +9,12 @@ declare bfloat @llvm.cos.bf16(bfloat) #0
 define amdgpu_kernel void @cos_bf16(ptr addrspace(1) %out, bfloat %src) #1 {
 ; GCN-LABEL: cos_bf16:
 ; GCN:       ; %bb.0:
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    s_load_b96 s[0:2], s[4:5], 0x0
 ; GCN-NEXT:    s_mov_b32 s3, 0x3e230000
 ; GCN-NEXT:    v_mov_b32_e32 v1, 0
 ; GCN-NEXT:    s_wait_kmcnt 0x0
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 2, 2), 0
 ; GCN-NEXT:    v_fma_mixlo_bf16 v0, s2, s3, 0 op_sel_hi:[1,0,0]
 ; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GCN-NEXT:    v_cos_bf16_e32 v0, v0
@@ -26,6 +28,7 @@ define amdgpu_kernel void @cos_bf16(ptr addrspace(1) %out, bfloat %src) #1 {
 define amdgpu_kernel void @cos_bf16_constant_4(ptr addrspace(1) %out) #1 {
 ; GCN-LABEL: cos_bf16_constant_4:
 ; GCN:       ; %bb.0:
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GCN-NEXT:    v_cos_bf16_e32 v0, 0x3f23
 ; GCN-NEXT:    v_mov_b32_e32 v1, 0
@@ -40,6 +43,7 @@ define amdgpu_kernel void @cos_bf16_constant_4(ptr addrspace(1) %out) #1 {
 define amdgpu_kernel void @cos_bf16_constant_100(ptr addrspace(1) %out) #1 {
 ; GCN-LABEL: cos_bf16_constant_100:
 ; GCN:       ; %bb.0:
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GCN-NEXT:    v_cos_bf16_e32 v0, 0x417f
 ; GCN-NEXT:    v_mov_b32_e32 v1, 0

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.prefetch.ll b/llvm/test/CodeGen/AMDGPU/llvm.prefetch.ll
index c265b05813ee7..a5a70bb811a65 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.prefetch.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.prefetch.ll
@@ -17,18 +17,25 @@
 define amdgpu_ps void @prefetch_data_sgpr(ptr addrspace(4) inreg %ptr) {
 ; GFX1250-LABEL: prefetch_data_sgpr:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    global_prefetch_b8 v0, s[0:1] scope:SCOPE_SYS
 ; GFX1250-NEXT:    s_endpgm
 ;
-; SPREFETCH-LABEL: prefetch_data_sgpr:
-; SPREFETCH:       ; %bb.0: ; %entry
-; SPREFETCH-NEXT:    s_prefetch_data s[0:1], 0x0, null, 0
-; SPREFETCH-NEXT:    s_endpgm
+; GFX1250-SPREFETCH-LABEL: prefetch_data_sgpr:
+; GFX1250-SPREFETCH:       ; %bb.0: ; %entry
+; GFX1250-SPREFETCH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-SPREFETCH-NEXT:    s_prefetch_data s[0:1], 0x0, null, 0
+; GFX1250-SPREFETCH-NEXT:    s_endpgm
 ;
 ; NOSPREFETCH-LABEL: prefetch_data_sgpr:
 ; NOSPREFETCH:       ; %bb.0: ; %entry
 ; NOSPREFETCH-NEXT:    s_endpgm
+;
+; GFX12-SPREFETCH-LABEL: prefetch_data_sgpr:
+; GFX12-SPREFETCH:       ; %bb.0: ; %entry
+; GFX12-SPREFETCH-NEXT:    s_prefetch_data s[0:1], 0x0, null, 0
+; GFX12-SPREFETCH-NEXT:    s_endpgm
 entry:
   tail call void @llvm.prefetch.p4(ptr addrspace(4) %ptr, i32 0, i32 0, i32 1)
   ret void
@@ -37,18 +44,25 @@ entry:
 define amdgpu_ps void @prefetch_data_sgpr_offset(ptr addrspace(4) inreg %ptr) {
 ; GFX1250-LABEL: prefetch_data_sgpr_offset:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    global_prefetch_b8 v0, s[0:1] offset:512 scope:SCOPE_SYS
 ; GFX1250-NEXT:    s_endpgm
 ;
-; SPREFETCH-LABEL: prefetch_data_sgpr_offset:
-; SPREFETCH:       ; %bb.0: ; %entry
-; SPREFETCH-NEXT:    s_prefetch_data s[0:1], 0x200, null, 0
-; SPREFETCH-NEXT:    s_endpgm
+; GFX1250-SPREFETCH-LABEL: prefetch_data_sgpr_offset:
+; GFX1250-SPREFETCH:       ; %bb.0: ; %entry
+; GFX1250-SPREFETCH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-SPREFETCH-NEXT:    s_prefetch_data s[0:1], 0x200, null, 0
+; GFX1250-SPREFETCH-NEXT:    s_endpgm
 ;
 ; NOSPREFETCH-LABEL: prefetch_data_sgpr_offset:
 ; NOSPREFETCH:       ; %bb.0: ; %entry
 ; NOSPREFETCH-NEXT:    s_endpgm
+;
+; GFX12-SPREFETCH-LABEL: prefetch_data_sgpr_offset:
+; GFX12-SPREFETCH:       ; %bb.0: ; %entry
+; GFX12-SPREFETCH-NEXT:    s_prefetch_data s[0:1], 0x200, null, 0
+; GFX12-SPREFETCH-NEXT:    s_endpgm
 entry:
   %gep = getelementptr float, ptr addrspace(4) %ptr, i32 128
   tail call void @llvm.prefetch.p4(ptr addrspace(4) %gep, i32 0, i32 0, i32 1)
@@ -60,18 +74,25 @@ entry:
 define amdgpu_ps void @prefetch_data_sgpr_max_offset(ptr addrspace(4) inreg %ptr) {
 ; GFX1250-LABEL: prefetch_data_sgpr_max_offset:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    global_prefetch_b8 v0, s[0:1] offset:8388607 scope:SCOPE_SYS
 ; GFX1250-NEXT:    s_endpgm
 ;
-; SPREFETCH-LABEL: prefetch_data_sgpr_max_offset:
-; SPREFETCH:       ; %bb.0: ; %entry
-; SPREFETCH-NEXT:    s_prefetch_data s[0:1], 0x7fffff, null, 0
-; SPREFETCH-NEXT:    s_endpgm
+; GFX1250-SPREFETCH-LABEL: prefetch_data_sgpr_max_offset:
+; GFX1250-SPREFETCH:       ; %bb.0: ; %entry
+; GFX1250-SPREFETCH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-SPREFETCH-NEXT:    s_prefetch_data s[0:1], 0x7fffff, null, 0
+; GFX1250-SPREFETCH-NEXT:    s_endpgm
 ;
 ; NOSPREFETCH-LABEL: prefetch_data_sgpr_max_offset:
 ; NOSPREFETCH:       ; %bb.0: ; %entry
 ; NOSPREFETCH-NEXT:    s_endpgm
+;
+; GFX12-SPREFETCH-LABEL: prefetch_data_sgpr_max_offset:
+; GFX12-SPREFETCH:       ; %bb.0: ; %entry
+; GFX12-SPREFETCH-NEXT:    s_prefetch_data s[0:1], 0x7fffff, null, 0
+; GFX12-SPREFETCH-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i8, ptr addrspace(4) %ptr, i32 8388607
   tail call void @llvm.prefetch.p4(ptr addrspace(4) %gep, i32 0, i32 0, i32 1)
@@ -81,12 +102,14 @@ entry:
 define amdgpu_ps void @prefetch_data_sgpr_min_offset(ptr addrspace(4) inreg %ptr) {
 ; GFX1250-LABEL: prefetch_data_sgpr_min_offset:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    global_prefetch_b8 v0, s[0:1] offset:-8388608 scope:SCOPE_SYS
 ; GFX1250-NEXT:    s_endpgm
 ;
 ; GFX1250-SPREFETCH-SDAG-LABEL: prefetch_data_sgpr_min_offset:
 ; GFX1250-SPREFETCH-SDAG:       ; %bb.0: ; %entry
+; GFX1250-SPREFETCH-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SPREFETCH-SDAG-NEXT:    s_mov_b64 s[2:3], 0xffffffffff800000
 ; GFX1250-SPREFETCH-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1250-SPREFETCH-SDAG-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[2:3]
@@ -108,6 +131,7 @@ define amdgpu_ps void @prefetch_data_sgpr_min_offset(ptr addrspace(4) inreg %ptr
 ;
 ; GFX1250-SPREFETCH-GISEL-LABEL: prefetch_data_sgpr_min_offset:
 ; GFX1250-SPREFETCH-GISEL:       ; %bb.0: ; %entry
+; GFX1250-SPREFETCH-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SPREFETCH-GISEL-NEXT:    s_add_co_u32 s0, s0, 0xff800000
 ; GFX1250-SPREFETCH-GISEL-NEXT:    s_add_co_ci_u32 s1, s1, -1
 ; GFX1250-SPREFETCH-GISEL-NEXT:    s_prefetch_data s[0:1], 0x0, null, 0
@@ -128,12 +152,14 @@ entry:
 define amdgpu_ps void @prefetch_data_sgpr_too_large_offset(ptr addrspace(4) inreg %ptr) {
 ; GFX1250-LABEL: prefetch_data_sgpr_too_large_offset:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0x800000
 ; GFX1250-NEXT:    global_prefetch_b8 v0, s[0:1] scope:SCOPE_SYS
 ; GFX1250-NEXT:    s_endpgm
 ;
 ; GFX1250-SPREFETCH-SDAG-LABEL: prefetch_data_sgpr_too_large_offset:
 ; GFX1250-SPREFETCH-SDAG:       ; %bb.0: ; %entry
+; GFX1250-SPREFETCH-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SPREFETCH-SDAG-NEXT:    s_add_nc_u64 s[0:1], s[0:1], 0x800000
 ; GFX1250-SPREFETCH-SDAG-NEXT:    s_prefetch_data s[0:1], 0x0, null, 0
 ; GFX1250-SPREFETCH-SDAG-NEXT:    s_endpgm
@@ -150,6 +176,7 @@ define amdgpu_ps void @prefetch_data_sgpr_too_large_offset(ptr addrspace(4) inre
 ;
 ; GFX1250-SPREFETCH-GISEL-LABEL: prefetch_data_sgpr_too_large_offset:
 ; GFX1250-SPREFETCH-GISEL:       ; %bb.0: ; %entry
+; GFX1250-SPREFETCH-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SPREFETCH-GISEL-NEXT:    s_add_co_u32 s0, s0, 0x800000
 ; GFX1250-SPREFETCH-GISEL-NEXT:    s_add_co_ci_u32 s1, s1, 0
 ; GFX1250-SPREFETCH-GISEL-NEXT:    s_prefetch_data s[0:1], 0x0, null, 0
@@ -172,11 +199,13 @@ entry:
 define amdgpu_ps void @prefetch_data_vgpr_global(ptr addrspace(1) %ptr) {
 ; GFX1250-LABEL: prefetch_data_vgpr_global:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    global_prefetch_b8 v[0:1], off scope:SCOPE_SYS
 ; GFX1250-NEXT:    s_endpgm
 ;
 ; GFX1250-SPREFETCH-LABEL: prefetch_data_vgpr_global:
 ; GFX1250-SPREFETCH:       ; %bb.0: ; %entry
+; GFX1250-SPREFETCH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SPREFETCH-NEXT:    global_prefetch_b8 v[0:1], off scope:SCOPE_SYS
 ; GFX1250-SPREFETCH-NEXT:    s_endpgm
 ;
@@ -195,11 +224,13 @@ entry:
 define amdgpu_ps void @prefetch_data_vgpr_flat(ptr %ptr) {
 ; GFX1250-LABEL: prefetch_data_vgpr_flat:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    flat_prefetch_b8 v[0:1] scope:SCOPE_SYS
 ; GFX1250-NEXT:    s_endpgm
 ;
 ; GFX1250-SPREFETCH-LABEL: prefetch_data_vgpr_flat:
 ; GFX1250-SPREFETCH:       ; %bb.0: ; %entry
+; GFX1250-SPREFETCH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SPREFETCH-NEXT:    flat_prefetch_b8 v[0:1] scope:SCOPE_SYS
 ; GFX1250-SPREFETCH-NEXT:    s_endpgm
 ;
@@ -218,11 +249,13 @@ entry:
 define amdgpu_ps void @prefetch_data_sgpr_vgpr_offset_global(ptr addrspace(1) inreg %ptr, i32 %offset) {
 ; GFX1250-LABEL: prefetch_data_sgpr_vgpr_offset_global:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    global_prefetch_b8 v0, s[0:1] scope:SCOPE_SYS
 ; GFX1250-NEXT:    s_endpgm
 ;
 ; GFX1250-SPREFETCH-LABEL: prefetch_data_sgpr_vgpr_offset_global:
 ; GFX1250-SPREFETCH:       ; %bb.0: ; %entry
+; GFX1250-SPREFETCH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SPREFETCH-NEXT:    global_prefetch_b8 v0, s[0:1] scope:SCOPE_SYS
 ; GFX1250-SPREFETCH-NEXT:    s_endpgm
 ;
@@ -248,11 +281,13 @@ entry:
 define amdgpu_ps void @prefetch_data_sgpr_vgpr_offset_flat(ptr inreg %ptr, i32 %offset) {
 ; GFX1250-LABEL: prefetch_data_sgpr_vgpr_offset_flat:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    flat_prefetch_b8 v0, s[0:1] offset:128 scope:SCOPE_SYS
 ; GFX1250-NEXT:    s_endpgm
 ;
 ; GFX1250-SPREFETCH-LABEL: prefetch_data_sgpr_vgpr_offset_flat:
 ; GFX1250-SPREFETCH:       ; %bb.0: ; %entry
+; GFX1250-SPREFETCH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SPREFETCH-NEXT:    flat_prefetch_b8 v0, s[0:1] offset:128 scope:SCOPE_SYS
 ; GFX1250-SPREFETCH-NEXT:    s_endpgm
 ;
@@ -279,18 +314,46 @@ entry:
 ; Check LDS and Scratch, we cannot prefetch it
 
 define amdgpu_ps void @prefetch_data_lds(ptr addrspace(3) inreg %ptr) {
-; GCN-LABEL: prefetch_data_lds:
-; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    s_endpgm
+; GFX1250-LABEL: prefetch_data_lds:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-NEXT:    s_endpgm
+;
+; GFX1250-SPREFETCH-LABEL: prefetch_data_lds:
+; GFX1250-SPREFETCH:       ; %bb.0: ; %entry
+; GFX1250-SPREFETCH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-SPREFETCH-NEXT:    s_endpgm
+;
+; NOSPREFETCH-LABEL: prefetch_data_lds:
+; NOSPREFETCH:       ; %bb.0: ; %entry
+; NOSPREFETCH-NEXT:    s_endpgm
+;
+; GFX12-SPREFETCH-LABEL: prefetch_data_lds:
+; GFX12-SPREFETCH:       ; %bb.0: ; %entry
+; GFX12-SPREFETCH-NEXT:    s_endpgm
 entry:
   tail call void @llvm.prefetch.p3(ptr addrspace(3) %ptr, i32 0, i32 0, i32 1)
   ret void
 }
 
 define amdgpu_ps void @prefetch_data_scratch(ptr addrspace(5) inreg %ptr) {
-; GCN-LABEL: prefetch_data_scratch:
-; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    s_endpgm
+; GFX1250-LABEL: prefetch_data_scratch:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-NEXT:    s_endpgm
+;
+; GFX1250-SPREFETCH-LABEL: prefetch_data_scratch:
+; GFX1250-SPREFETCH:       ; %bb.0: ; %entry
+; GFX1250-SPREFETCH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-SPREFETCH-NEXT:    s_endpgm
+;
+; NOSPREFETCH-LABEL: prefetch_data_scratch:
+; NOSPREFETCH:       ; %bb.0: ; %entry
+; NOSPREFETCH-NEXT:    s_endpgm
+;
+; GFX12-SPREFETCH-LABEL: prefetch_data_scratch:
+; GFX12-SPREFETCH:       ; %bb.0: ; %entry
+; GFX12-SPREFETCH-NEXT:    s_endpgm
 entry:
   tail call void @llvm.prefetch.p5(ptr addrspace(5) %ptr, i32 0, i32 0, i32 1)
   ret void
@@ -301,18 +364,25 @@ entry:
 define amdgpu_ps void @prefetch_data_sgpr_flat(ptr inreg %ptr) {
 ; GFX1250-LABEL: prefetch_data_sgpr_flat:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    flat_prefetch_b8 v0, s[0:1] scope:SCOPE_SYS
 ; GFX1250-NEXT:    s_endpgm
 ;
-; SPREFETCH-LABEL: prefetch_data_sgpr_flat:
-; SPREFETCH:       ; %bb.0: ; %entry
-; SPREFETCH-NEXT:    s_prefetch_data s[0:1], 0x0, null, 0
-; SPREFETCH-NEXT:    s_endpgm
+; GFX1250-SPREFETCH-LABEL: prefetch_data_sgpr_flat:
+; GFX1250-SPREFETCH:       ; %bb.0: ; %entry
+; GFX1250-SPREFETCH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-SPREFETCH-NEXT:    s_prefetch_data s[0:1], 0x0, null, 0
+; GFX1250-SPREFETCH-NEXT:    s_endpgm
 ;
 ; NOSPREFETCH-LABEL: prefetch_data_sgpr_flat:
 ; NOSPREFETCH:       ; %bb.0: ; %entry
 ; NOSPREFETCH-NEXT:    s_endpgm
+;
+; GFX12-SPREFETCH-LABEL: prefetch_data_sgpr_flat:
+; GFX12-SPREFETCH:       ; %bb.0: ; %entry
+; GFX12-SPREFETCH-NEXT:    s_prefetch_data s[0:1], 0x0, null, 0
+; GFX12-SPREFETCH-NEXT:    s_endpgm
 entry:
   tail call void @llvm.prefetch.pf(ptr %ptr, i32 0, i32 0, i32 1)
   ret void
@@ -321,18 +391,25 @@ entry:
 define amdgpu_ps void @prefetch_data_sgpr_global(ptr addrspace(1) inreg %ptr) {
 ; GFX1250-LABEL: prefetch_data_sgpr_global:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    global_prefetch_b8 v0, s[0:1] scope:SCOPE_SYS
 ; GFX1250-NEXT:    s_endpgm
 ;
-; SPREFETCH-LABEL: prefetch_data_sgpr_global:
-; SPREFETCH:       ; %bb.0: ; %entry
-; SPREFETCH-NEXT:    s_prefetch_data s[0:1], 0x0, null, 0
-; SPREFETCH-NEXT:    s_endpgm
+; GFX1250-SPREFETCH-LABEL: prefetch_data_sgpr_global:
+; GFX1250-SPREFETCH:       ; %bb.0: ; %entry
+; GFX1250-SPREFETCH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-SPREFETCH-NEXT:    s_prefetch_data s[0:1], 0x0, null, 0
+; GFX1250-SPREFETCH-NEXT:    s_endpgm
 ;
 ; NOSPREFETCH-LABEL: prefetch_data_sgpr_global:
 ; NOSPREFETCH:       ; %bb.0: ; %entry
 ; NOSPREFETCH-NEXT:    s_endpgm
+;
+; GFX12-SPREFETCH-LABEL: prefetch_data_sgpr_global:
+; GFX12-SPREFETCH:       ; %bb.0: ; %entry
+; GFX12-SPREFETCH-NEXT:    s_prefetch_data s[0:1], 0x0, null, 0
+; GFX12-SPREFETCH-NEXT:    s_endpgm
 entry:
   tail call void @llvm.prefetch.p1(ptr addrspace(1) %ptr, i32 0, i32 0, i32 1)
   ret void
@@ -341,17 +418,25 @@ entry:
 define amdgpu_ps void @prefetch_data_sgpr_constant_32bit(ptr addrspace(6) inreg %ptr) {
 ; GFX1250-LABEL: prefetch_data_sgpr_constant_32bit:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_endpgm
 ;
-; SPREFETCH-LABEL: prefetch_data_sgpr_constant_32bit:
-; SPREFETCH:       ; %bb.0: ; %entry
-; SPREFETCH-NEXT:    s_mov_b32 s1, 0
-; SPREFETCH-NEXT:    s_prefetch_data s[0:1], 0x0, null, 0
-; SPREFETCH-NEXT:    s_endpgm
+; GFX1250-SPREFETCH-LABEL: prefetch_data_sgpr_constant_32bit:
+; GFX1250-SPREFETCH:       ; %bb.0: ; %entry
+; GFX1250-SPREFETCH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-SPREFETCH-NEXT:    s_mov_b32 s1, 0
+; GFX1250-SPREFETCH-NEXT:    s_prefetch_data s[0:1], 0x0, null, 0
+; GFX1250-SPREFETCH-NEXT:    s_endpgm
 ;
 ; NOSPREFETCH-LABEL: prefetch_data_sgpr_constant_32bit:
 ; NOSPREFETCH:       ; %bb.0: ; %entry
 ; NOSPREFETCH-NEXT:    s_endpgm
+;
+; GFX12-SPREFETCH-LABEL: prefetch_data_sgpr_constant_32bit:
+; GFX12-SPREFETCH:       ; %bb.0: ; %entry
+; GFX12-SPREFETCH-NEXT:    s_mov_b32 s1, 0
+; GFX12-SPREFETCH-NEXT:    s_prefetch_data s[0:1], 0x0, null, 0
+; GFX12-SPREFETCH-NEXT:    s_endpgm
 entry:
   tail call void @llvm.prefetch.p6(ptr addrspace(6) %ptr, i32 0, i32 0, i32 1)
   ret void
@@ -362,16 +447,23 @@ entry:
 define amdgpu_ps void @prefetch_inst_sgpr(ptr addrspace(4) inreg %ptr) {
 ; GFX1250-LABEL: prefetch_inst_sgpr:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_endpgm
 ;
-; SPREFETCH-LABEL: prefetch_inst_sgpr:
-; SPREFETCH:       ; %bb.0: ; %entry
-; SPREFETCH-NEXT:    s_prefetch_inst s[0:1], 0x0, null, 0
-; SPREFETCH-NEXT:    s_endpgm
+; GFX1250-SPREFETCH-LABEL: prefetch_inst_sgpr:
+; GFX1250-SPREFETCH:       ; %bb.0: ; %entry
+; GFX1250-SPREFETCH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-SPREFETCH-NEXT:    s_prefetch_inst s[0:1], 0x0, null, 0
+; GFX1250-SPREFETCH-NEXT:    s_endpgm
 ;
 ; NOSPREFETCH-LABEL: prefetch_inst_sgpr:
 ; NOSPREFETCH:       ; %bb.0: ; %entry
 ; NOSPREFETCH-NEXT:    s_endpgm
+;
+; GFX12-SPREFETCH-LABEL: prefetch_inst_sgpr:
+; GFX12-SPREFETCH:       ; %bb.0: ; %entry
+; GFX12-SPREFETCH-NEXT:    s_prefetch_inst s[0:1], 0x0, null, 0
+; GFX12-SPREFETCH-NEXT:    s_endpgm
 entry:
   tail call void @llvm.prefetch.p4(ptr addrspace(4) %ptr, i32 0, i32 0, i32 0)
   ret void
@@ -380,16 +472,23 @@ entry:
 define amdgpu_ps void @prefetch_inst_sgpr_offset(ptr addrspace(4) inreg %ptr) {
 ; GFX1250-LABEL: prefetch_inst_sgpr_offset:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_endpgm
 ;
-; SPREFETCH-LABEL: prefetch_inst_sgpr_offset:
-; SPREFETCH:       ; %bb.0: ; %entry
-; SPREFETCH-NEXT:    s_prefetch_inst s[0:1], 0x80, null, 0
-; SPREFETCH-NEXT:    s_endpgm
+; GFX1250-SPREFETCH-LABEL: prefetch_inst_sgpr_offset:
+; GFX1250-SPREFETCH:       ; %bb.0: ; %entry
+; GFX1250-SPREFETCH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-SPREFETCH-NEXT:    s_prefetch_inst s[0:1], 0x80, null, 0
+; GFX1250-SPREFETCH-NEXT:    s_endpgm
 ;
 ; NOSPREFETCH-LABEL: prefetch_inst_sgpr_offset:
 ; NOSPREFETCH:       ; %bb.0: ; %entry
 ; NOSPREFETCH-NEXT:    s_endpgm
+;
+; GFX12-SPREFETCH-LABEL: prefetch_inst_sgpr_offset:
+; GFX12-SPREFETCH:       ; %bb.0: ; %entry
+; GFX12-SPREFETCH-NEXT:    s_prefetch_inst s[0:1], 0x80, null, 0
+; GFX12-SPREFETCH-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i8, ptr addrspace(4) %ptr, i32 128
   tail call void @llvm.prefetch.p4(ptr addrspace(4) %gep, i32 0, i32 0, i32 0)
@@ -401,16 +500,23 @@ entry:
 define amdgpu_ps void @prefetch_inst_sgpr_max_offset(ptr addrspace(4) inreg %ptr) {
 ; GFX1250-LABEL: prefetch_inst_sgpr_max_offset:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_endpgm
 ;
-; SPREFETCH-LABEL: prefetch_inst_sgpr_max_offset:
-; SPREFETCH:       ; %bb.0: ; %entry
-; SPREFETCH-NEXT:    s_prefetch_inst s[0:1], 0x7fffff, null, 0
-; SPREFETCH-NEXT:    s_endpgm
+; GFX1250-SPREFETCH-LABEL: prefetch_inst_sgpr_max_offset:
+; GFX1250-SPREFETCH:       ; %bb.0: ; %entry
+; GFX1250-SPREFETCH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-SPREFETCH-NEXT:    s_prefetch_inst s[0:1], 0x7fffff, null, 0
+; GFX1250-SPREFETCH-NEXT:    s_endpgm
 ;
 ; NOSPREFETCH-LABEL: prefetch_inst_sgpr_max_offset:
 ; NOSPREFETCH:       ; %bb.0: ; %entry
 ; NOSPREFETCH-NEXT:    s_endpgm
+;
+; GFX12-SPREFETCH-LABEL: prefetch_inst_sgpr_max_offset:
+; GFX12-SPREFETCH:       ; %bb.0: ; %entry
+; GFX12-SPREFETCH-NEXT:    s_prefetch_inst s[0:1], 0x7fffff, null, 0
+; GFX12-SPREFETCH-NEXT:    s_endpgm
 entry:
   %gep = getelementptr i8, ptr addrspace(4) %ptr, i32 8388607
   tail call void @llvm.prefetch.p4(ptr addrspace(4) %gep, i32 0, i32 0, i32 0)
@@ -420,10 +526,12 @@ entry:
 define amdgpu_ps void @prefetch_inst_sgpr_min_offset(ptr addrspace(4) inreg %ptr) {
 ; GFX1250-LABEL: prefetch_inst_sgpr_min_offset:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_endpgm
 ;
 ; GFX1250-SPREFETCH-SDAG-LABEL: prefetch_inst_sgpr_min_offset:
 ; GFX1250-SPREFETCH-SDAG:       ; %bb.0: ; %entry
+; GFX1250-SPREFETCH-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SPREFETCH-SDAG-NEXT:    s_mov_b64 s[2:3], 0xffffffffff800000
 ; GFX1250-SPREFETCH-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1250-SPREFETCH-SDAG-NEXT:    s_add_nc_u64 s[0:1], s[0:1], s[2:3]
@@ -445,6 +553,7 @@ define amdgpu_ps void @prefetch_inst_sgpr_min_offset(ptr addrspace(4) inreg %ptr
 ;
 ; GFX1250-SPREFETCH-GISEL-LABEL: prefetch_inst_sgpr_min_offset:
 ; GFX1250-SPREFETCH-GISEL:       ; %bb.0: ; %entry
+; GFX1250-SPREFETCH-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SPREFETCH-GISEL-NEXT:    s_add_co_u32 s0, s0, 0xff800000
 ; GFX1250-SPREFETCH-GISEL-NEXT:    s_add_co_ci_u32 s1, s1, -1
 ; GFX1250-SPREFETCH-GISEL-NEXT:    s_prefetch_inst s[0:1], 0x0, null, 0
@@ -465,10 +574,12 @@ entry:
 define amdgpu_ps void @prefetch_inst_sgpr_too_large_offset(ptr addrspace(4) inreg %ptr) {
 ; GFX1250-LABEL: prefetch_inst_sgpr_too_large_offset:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_endpgm
 ;
 ; GFX1250-SPREFETCH-SDAG-LABEL: prefetch_inst_sgpr_too_large_offset:
 ; GFX1250-SPREFETCH-SDAG:       ; %bb.0: ; %entry
+; GFX1250-SPREFETCH-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SPREFETCH-SDAG-NEXT:    s_add_nc_u64 s[0:1], s[0:1], 0x800000
 ; GFX1250-SPREFETCH-SDAG-NEXT:    s_prefetch_inst s[0:1], 0x0, null, 0
 ; GFX1250-SPREFETCH-SDAG-NEXT:    s_endpgm
@@ -485,6 +596,7 @@ define amdgpu_ps void @prefetch_inst_sgpr_too_large_offset(ptr addrspace(4) inre
 ;
 ; GFX1250-SPREFETCH-GISEL-LABEL: prefetch_inst_sgpr_too_large_offset:
 ; GFX1250-SPREFETCH-GISEL:       ; %bb.0: ; %entry
+; GFX1250-SPREFETCH-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SPREFETCH-GISEL-NEXT:    s_add_co_u32 s0, s0, 0x800000
 ; GFX1250-SPREFETCH-GISEL-NEXT:    s_add_co_ci_u32 s1, s1, 0
 ; GFX1250-SPREFETCH-GISEL-NEXT:    s_prefetch_inst s[0:1], 0x0, null, 0
@@ -507,11 +619,13 @@ entry:
 define amdgpu_ps void @prefetch_data_vgpr_flat_dev(ptr %ptr) {
 ; GFX1250-LABEL: prefetch_data_vgpr_flat_dev:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    flat_prefetch_b8 v[0:1] scope:SCOPE_DEV
 ; GFX1250-NEXT:    s_endpgm
 ;
 ; GFX1250-SPREFETCH-LABEL: prefetch_data_vgpr_flat_dev:
 ; GFX1250-SPREFETCH:       ; %bb.0: ; %entry
+; GFX1250-SPREFETCH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SPREFETCH-NEXT:    flat_prefetch_b8 v[0:1] scope:SCOPE_DEV
 ; GFX1250-SPREFETCH-NEXT:    s_endpgm
 ;
@@ -530,11 +644,13 @@ entry:
 define amdgpu_ps void @prefetch_data_vgpr_flat_se(ptr %ptr) {
 ; GFX1250-LABEL: prefetch_data_vgpr_flat_se:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    flat_prefetch_b8 v[0:1] scope:SCOPE_SE
 ; GFX1250-NEXT:    s_endpgm
 ;
 ; GFX1250-SPREFETCH-LABEL: prefetch_data_vgpr_flat_se:
 ; GFX1250-SPREFETCH:       ; %bb.0: ; %entry
+; GFX1250-SPREFETCH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SPREFETCH-NEXT:    flat_prefetch_b8 v[0:1] scope:SCOPE_SE
 ; GFX1250-SPREFETCH-NEXT:    s_endpgm
 ;
@@ -553,16 +669,19 @@ entry:
 define amdgpu_ps void @prefetch_data_vgpr_flat_cu(ptr %ptr) {
 ; GL2-ONLY-LABEL: prefetch_data_vgpr_flat_cu:
 ; GL2-ONLY:       ; %bb.0: ; %entry
+; GL2-ONLY-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GL2-ONLY-NEXT:    flat_prefetch_b8 v[0:1] scope:SCOPE_SE
 ; GL2-ONLY-NEXT:    s_endpgm
 ;
 ; GFX1250-SPREFETCH-LABEL: prefetch_data_vgpr_flat_cu:
 ; GFX1250-SPREFETCH:       ; %bb.0: ; %entry
+; GFX1250-SPREFETCH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SPREFETCH-NEXT:    flat_prefetch_b8 v[0:1] scope:SCOPE_SE
 ; GFX1250-SPREFETCH-NEXT:    s_endpgm
 ;
 ; SAFE-CU-LABEL: prefetch_data_vgpr_flat_cu:
 ; SAFE-CU:       ; %bb.0: ; %entry
+; SAFE-CU-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; SAFE-CU-NEXT:    flat_prefetch_b8 v[0:1]
 ; SAFE-CU-NEXT:    s_endpgm
 ;
@@ -583,11 +702,13 @@ entry:
 define amdgpu_ps void @prefetch_data_vgpr_flat_offset(ptr %ptr) {
 ; GFX1250-LABEL: prefetch_data_vgpr_flat_offset:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    flat_prefetch_b8 v[0:1] offset:512 scope:SCOPE_SYS
 ; GFX1250-NEXT:    s_endpgm
 ;
 ; GFX1250-SPREFETCH-LABEL: prefetch_data_vgpr_flat_offset:
 ; GFX1250-SPREFETCH:       ; %bb.0: ; %entry
+; GFX1250-SPREFETCH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SPREFETCH-NEXT:    flat_prefetch_b8 v[0:1] offset:512 scope:SCOPE_SYS
 ; GFX1250-SPREFETCH-NEXT:    s_endpgm
 ;
@@ -607,11 +728,13 @@ entry:
 define amdgpu_ps void @prefetch_data_vgpr_global_offset(ptr addrspace(1) %ptr) {
 ; GFX1250-LABEL: prefetch_data_vgpr_global_offset:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    global_prefetch_b8 v[0:1], off offset:512 scope:SCOPE_SYS
 ; GFX1250-NEXT:    s_endpgm
 ;
 ; GFX1250-SPREFETCH-LABEL: prefetch_data_vgpr_global_offset:
 ; GFX1250-SPREFETCH:       ; %bb.0: ; %entry
+; GFX1250-SPREFETCH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SPREFETCH-NEXT:    global_prefetch_b8 v[0:1], off offset:512 scope:SCOPE_SYS
 ; GFX1250-SPREFETCH-NEXT:    s_endpgm
 ;
@@ -631,11 +754,13 @@ entry:
 define amdgpu_ps void @prefetch_data_vgpr_global_saddr(ptr addrspace(1) inreg %ptr, i32 %voffset) {
 ; GFX1250-LABEL: prefetch_data_vgpr_global_saddr:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    global_prefetch_b8 v0, s[0:1] scope:SCOPE_SYS
 ; GFX1250-NEXT:    s_endpgm
 ;
 ; GFX1250-SPREFETCH-LABEL: prefetch_data_vgpr_global_saddr:
 ; GFX1250-SPREFETCH:       ; %bb.0: ; %entry
+; GFX1250-SPREFETCH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SPREFETCH-NEXT:    global_prefetch_b8 v0, s[0:1] scope:SCOPE_SYS
 ; GFX1250-SPREFETCH-NEXT:    s_endpgm
 ;
@@ -655,11 +780,13 @@ entry:
 define amdgpu_ps void @prefetch_data_vgpr_global_saddr_offset(ptr addrspace(1) inreg %ptr, i32 %voffset) {
 ; GFX1250-LABEL: prefetch_data_vgpr_global_saddr_offset:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    global_prefetch_b8 v0, s[0:1] offset:128 scope:SCOPE_SYS
 ; GFX1250-NEXT:    s_endpgm
 ;
 ; GFX1250-SPREFETCH-LABEL: prefetch_data_vgpr_global_saddr_offset:
 ; GFX1250-SPREFETCH:       ; %bb.0: ; %entry
+; GFX1250-SPREFETCH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SPREFETCH-NEXT:    global_prefetch_b8 v0, s[0:1] offset:128 scope:SCOPE_SYS
 ; GFX1250-SPREFETCH-NEXT:    s_endpgm
 ;
@@ -680,18 +807,46 @@ entry:
 ; Cannot prefetch I$ with flat or global instructions.
 
 define amdgpu_ps void @prefetch_inst_vgpr_global(ptr addrspace(1) %ptr) {
-; GCN-LABEL: prefetch_inst_vgpr_global:
-; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    s_endpgm
+; GFX1250-LABEL: prefetch_inst_vgpr_global:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-NEXT:    s_endpgm
+;
+; GFX1250-SPREFETCH-LABEL: prefetch_inst_vgpr_global:
+; GFX1250-SPREFETCH:       ; %bb.0: ; %entry
+; GFX1250-SPREFETCH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-SPREFETCH-NEXT:    s_endpgm
+;
+; NOSPREFETCH-LABEL: prefetch_inst_vgpr_global:
+; NOSPREFETCH:       ; %bb.0: ; %entry
+; NOSPREFETCH-NEXT:    s_endpgm
+;
+; GFX12-SPREFETCH-LABEL: prefetch_inst_vgpr_global:
+; GFX12-SPREFETCH:       ; %bb.0: ; %entry
+; GFX12-SPREFETCH-NEXT:    s_endpgm
 entry:
   tail call void @llvm.prefetch.p1(ptr addrspace(1) %ptr, i32 0, i32 0, i32 0)
   ret void
 }
 
 define amdgpu_ps void @prefetch_inst_vgpr_flat(ptr %ptr) {
-; GCN-LABEL: prefetch_inst_vgpr_flat:
-; GCN:       ; %bb.0: ; %entry
-; GCN-NEXT:    s_endpgm
+; GFX1250-LABEL: prefetch_inst_vgpr_flat:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-NEXT:    s_endpgm
+;
+; GFX1250-SPREFETCH-LABEL: prefetch_inst_vgpr_flat:
+; GFX1250-SPREFETCH:       ; %bb.0: ; %entry
+; GFX1250-SPREFETCH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-SPREFETCH-NEXT:    s_endpgm
+;
+; NOSPREFETCH-LABEL: prefetch_inst_vgpr_flat:
+; NOSPREFETCH:       ; %bb.0: ; %entry
+; NOSPREFETCH-NEXT:    s_endpgm
+;
+; GFX12-SPREFETCH-LABEL: prefetch_inst_vgpr_flat:
+; GFX12-SPREFETCH:       ; %bb.0: ; %entry
+; GFX12-SPREFETCH-NEXT:    s_endpgm
 entry:
   tail call void @llvm.prefetch.pf(ptr %ptr, i32 0, i32 0, i32 0)
   ret void
@@ -702,12 +857,14 @@ entry:
 define amdgpu_ps void @prefetch_data_sgpr_flat_force_vector(ptr inreg %ptr) {
 ; GFX1250-LABEL: prefetch_data_sgpr_flat_force_vector:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    flat_prefetch_b8 v0, s[0:1] scope:SCOPE_SYS
 ; GFX1250-NEXT:    s_endpgm
 ;
 ; GFX1250-SPREFETCH-LABEL: prefetch_data_sgpr_flat_force_vector:
 ; GFX1250-SPREFETCH:       ; %bb.0: ; %entry
+; GFX1250-SPREFETCH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SPREFETCH-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-SPREFETCH-NEXT:    flat_prefetch_b8 v0, s[0:1] scope:SCOPE_SYS
 ; GFX1250-SPREFETCH-NEXT:    s_endpgm
@@ -728,12 +885,14 @@ entry:
 define amdgpu_ps void @prefetch_data_sgpr_global_force_vector(ptr addrspace(1) inreg %ptr) {
 ; GFX1250-LABEL: prefetch_data_sgpr_global_force_vector:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    global_prefetch_b8 v0, s[0:1] scope:SCOPE_SYS
 ; GFX1250-NEXT:    s_endpgm
 ;
 ; GFX1250-SPREFETCH-LABEL: prefetch_data_sgpr_global_force_vector:
 ; GFX1250-SPREFETCH:       ; %bb.0: ; %entry
+; GFX1250-SPREFETCH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SPREFETCH-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-SPREFETCH-NEXT:    global_prefetch_b8 v0, s[0:1] scope:SCOPE_SYS
 ; GFX1250-SPREFETCH-NEXT:    s_endpgm
@@ -754,12 +913,14 @@ entry:
 define amdgpu_ps void @prefetch_data_sgpr_global_saddr_force_vector(ptr addrspace(1) inreg %ptr) {
 ; GFX1250-LABEL: prefetch_data_sgpr_global_saddr_force_vector:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    global_prefetch_b8 v0, s[0:1] offset:1024 scope:SCOPE_SYS
 ; GFX1250-NEXT:    s_endpgm
 ;
 ; GFX1250-SPREFETCH-LABEL: prefetch_data_sgpr_global_saddr_force_vector:
 ; GFX1250-SPREFETCH:       ; %bb.0: ; %entry
+; GFX1250-SPREFETCH-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SPREFETCH-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-SPREFETCH-NEXT:    global_prefetch_b8 v0, s[0:1] offset:1024 scope:SCOPE_SYS
 ; GFX1250-SPREFETCH-NEXT:    s_endpgm
@@ -784,3 +945,6 @@ declare void @llvm.prefetch.p3(ptr addrspace(3) nocapture readonly, i32, i32, i3
 declare void @llvm.prefetch.p4(ptr addrspace(4) nocapture readonly, i32, i32, i32)
 declare void @llvm.prefetch.p5(ptr addrspace(5) nocapture readonly, i32, i32, i32)
 declare void @llvm.prefetch.p6(ptr addrspace(6) nocapture readonly, i32, i32, i32)
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GCN: {{.*}}
+; SPREFETCH: {{.*}}

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.sin.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.sin.bf16.ll
index fe8ace52c06bf..c44b12eaa6686 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.sin.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.sin.bf16.ll
@@ -9,10 +9,12 @@ declare bfloat @llvm.sin.bf16(bfloat) #0
 define amdgpu_kernel void @sin_bf16(ptr addrspace(1) %out, bfloat %src) #1 {
 ; GCN-LABEL: sin_bf16:
 ; GCN:       ; %bb.0:
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    s_load_b96 s[0:2], s[4:5], 0x0
 ; GCN-NEXT:    s_mov_b32 s3, 0x3e230000
 ; GCN-NEXT:    v_mov_b32_e32 v1, 0
 ; GCN-NEXT:    s_wait_kmcnt 0x0
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 2, 2), 0
 ; GCN-NEXT:    v_fma_mixlo_bf16 v0, s2, s3, 0 op_sel_hi:[1,0,0]
 ; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GCN-NEXT:    v_sin_bf16_e32 v0, v0
@@ -26,6 +28,7 @@ define amdgpu_kernel void @sin_bf16(ptr addrspace(1) %out, bfloat %src) #1 {
 define amdgpu_kernel void @sin_bf16_constant_4(ptr addrspace(1) %out) #1 {
 ; GCN-LABEL: sin_bf16_constant_4:
 ; GCN:       ; %bb.0:
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GCN-NEXT:    v_sin_bf16_e32 v0, 0x3f23
 ; GCN-NEXT:    v_mov_b32_e32 v1, 0
@@ -40,6 +43,7 @@ define amdgpu_kernel void @sin_bf16_constant_4(ptr addrspace(1) %out) #1 {
 define amdgpu_kernel void @sin_bf16_constant_100(ptr addrspace(1) %out) #1 {
 ; GCN-LABEL: sin_bf16_constant_100:
 ; GCN:       ; %bb.0:
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GCN-NEXT:    v_sin_bf16_e32 v0, 0x417f
 ; GCN-NEXT:    v_mov_b32_e32 v1, 0

diff  --git a/llvm/test/CodeGen/AMDGPU/llvm.sqrt.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.sqrt.bf16.ll
index 818dff4924f40..bbbf4b66cf55b 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.sqrt.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.sqrt.bf16.ll
@@ -8,6 +8,7 @@ declare <2 x bfloat> @llvm.sqrt.v2bf16(<2 x bfloat> %a)
 define amdgpu_kernel void @sqrt_bf16(ptr addrspace(1) %r, ptr addrspace(1) %a) {
 ; GFX12-TRUE16-LABEL: sqrt_bf16:
 ; GFX12-TRUE16:       ; %bb.0: ; %entry
+; GFX12-TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX12-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
 ; GFX12-TRUE16-NEXT:    s_mov_b32 s6, -1
 ; GFX12-TRUE16-NEXT:    s_mov_b32 s7, 0x31016000
@@ -26,6 +27,7 @@ define amdgpu_kernel void @sqrt_bf16(ptr addrspace(1) %r, ptr addrspace(1) %a) {
 ;
 ; GFX12-FAKE16-LABEL: sqrt_bf16:
 ; GFX12-FAKE16:       ; %bb.0: ; %entry
+; GFX12-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX12-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
 ; GFX12-FAKE16-NEXT:    s_mov_b32 s6, -1
 ; GFX12-FAKE16-NEXT:    s_mov_b32 s7, 0x31016000
@@ -51,6 +53,7 @@ entry:
 define amdgpu_kernel void @sqrt_v2bf16(ptr addrspace(1) %r, ptr addrspace(1) %a) {
 ; GFX12-TRUE16-LABEL: sqrt_v2bf16:
 ; GFX12-TRUE16:       ; %bb.0: ; %entry
+; GFX12-TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX12-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
 ; GFX12-TRUE16-NEXT:    s_mov_b32 s6, -1
 ; GFX12-TRUE16-NEXT:    s_mov_b32 s7, 0x31016000
@@ -72,6 +75,7 @@ define amdgpu_kernel void @sqrt_v2bf16(ptr addrspace(1) %r, ptr addrspace(1) %a)
 ;
 ; GFX12-FAKE16-LABEL: sqrt_v2bf16:
 ; GFX12-FAKE16:       ; %bb.0: ; %entry
+; GFX12-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX12-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
 ; GFX12-FAKE16-NEXT:    s_mov_b32 s6, -1
 ; GFX12-FAKE16-NEXT:    s_mov_b32 s7, 0x31016000

diff  --git a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll
index 9fdc72f054f90..876940da7f575 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll
@@ -78,6 +78,7 @@ define amdgpu_kernel void @constant_load_i1(ptr addrspace(1) %out, ptr addrspace
 ;
 ; GFX1250-LABEL: constant_load_i1:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-NEXT:    s_load_u8 s2, s[2:3], 0x0
@@ -161,6 +162,7 @@ define amdgpu_kernel void @constant_load_v2i1(ptr addrspace(1) %out, ptr addrspa
 ;
 ; GFX1250-LABEL: constant_load_v2i1:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -241,6 +243,7 @@ define amdgpu_kernel void @constant_load_v3i1(ptr addrspace(1) %out, ptr addrspa
 ;
 ; GFX1250-LABEL: constant_load_v3i1:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -322,6 +325,7 @@ define amdgpu_kernel void @constant_load_v4i1(ptr addrspace(1) %out, ptr addrspa
 ;
 ; GFX1250-LABEL: constant_load_v4i1:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -403,6 +407,7 @@ define amdgpu_kernel void @constant_load_v8i1(ptr addrspace(1) %out, ptr addrspa
 ;
 ; GFX1250-LABEL: constant_load_v8i1:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -484,6 +489,7 @@ define amdgpu_kernel void @constant_load_v16i1(ptr addrspace(1) %out, ptr addrsp
 ;
 ; GFX1250-LABEL: constant_load_v16i1:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -549,6 +555,7 @@ define amdgpu_kernel void @constant_load_v32i1(ptr addrspace(1) %out, ptr addrsp
 ;
 ; GFX1250-LABEL: constant_load_v32i1:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[2:3], 0x0
@@ -617,6 +624,7 @@ define amdgpu_kernel void @constant_load_v64i1(ptr addrspace(1) %out, ptr addrsp
 ;
 ; GFX1250-LABEL: constant_load_v64i1:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1250-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -689,6 +697,7 @@ define amdgpu_kernel void @constant_zextload_i1_to_i32(ptr addrspace(1) %out, pt
 ;
 ; GFX1250-LABEL: constant_zextload_i1_to_i32:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-NEXT:    s_load_u8 s2, s[2:3], 0x0
@@ -766,6 +775,7 @@ define amdgpu_kernel void @constant_sextload_i1_to_i32(ptr addrspace(1) %out, pt
 ;
 ; GFX1250-LABEL: constant_sextload_i1_to_i32:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-NEXT:    s_load_u8 s2, s[2:3], 0x0
@@ -840,6 +850,7 @@ define amdgpu_kernel void @constant_zextload_v1i1_to_v1i32(ptr addrspace(1) %out
 ;
 ; GFX1250-LABEL: constant_zextload_v1i1_to_v1i32:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-NEXT:    s_load_u8 s2, s[2:3], 0x0
@@ -917,6 +928,7 @@ define amdgpu_kernel void @constant_sextload_v1i1_to_v1i32(ptr addrspace(1) %out
 ;
 ; GFX1250-LABEL: constant_sextload_v1i1_to_v1i32:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-NEXT:    s_load_u8 s2, s[2:3], 0x0
@@ -1002,6 +1014,7 @@ define amdgpu_kernel void @constant_zextload_v2i1_to_v2i32(ptr addrspace(1) %out
 ;
 ; GFX1250-LABEL: constant_zextload_v2i1_to_v2i32:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1250-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -1088,6 +1101,7 @@ define amdgpu_kernel void @constant_sextload_v2i1_to_v2i32(ptr addrspace(1) %out
 ;
 ; GFX1250-LABEL: constant_sextload_v2i1_to_v2i32:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-NEXT:    s_load_u8 s2, s[2:3], 0x0
@@ -1184,6 +1198,7 @@ define amdgpu_kernel void @constant_zextload_v3i1_to_v3i32(ptr addrspace(1) %out
 ;
 ; GFX1250-LABEL: constant_zextload_v3i1_to_v3i32:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1250-NEXT:    v_mov_b32_e32 v3, 0
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -1281,6 +1296,7 @@ define amdgpu_kernel void @constant_sextload_v3i1_to_v3i32(ptr addrspace(1) %out
 ;
 ; GFX1250-LABEL: constant_sextload_v3i1_to_v3i32:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-NEXT:    s_load_u8 s2, s[2:3], 0x0
@@ -1378,6 +1394,7 @@ define amdgpu_kernel void @constant_zextload_v4i1_to_v4i32(ptr addrspace(1) %out
 ;
 ; GFX1250-LABEL: constant_zextload_v4i1_to_v4i32:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1250-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -1480,6 +1497,7 @@ define amdgpu_kernel void @constant_sextload_v4i1_to_v4i32(ptr addrspace(1) %out
 ;
 ; GFX1250-LABEL: constant_sextload_v4i1_to_v4i32:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-NEXT:    s_load_u8 s2, s[2:3], 0x0
@@ -1621,6 +1639,7 @@ define amdgpu_kernel void @constant_zextload_v8i1_to_v8i32(ptr addrspace(1) %out
 ;
 ; GFX1250-LABEL: constant_zextload_v8i1_to_v8i32:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1250-NEXT:    v_mov_b32_e32 v8, 0
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -1767,6 +1786,7 @@ define amdgpu_kernel void @constant_sextload_v8i1_to_v8i32(ptr addrspace(1) %out
 ;
 ; GFX1250-LABEL: constant_sextload_v8i1_to_v8i32:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-NEXT:    s_load_u8 s2, s[2:3], 0x0
@@ -1989,6 +2009,7 @@ define amdgpu_kernel void @constant_zextload_v16i1_to_v16i32(ptr addrspace(1) %o
 ;
 ; GFX1250-LABEL: constant_zextload_v16i1_to_v16i32:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1250-NEXT:    v_mov_b32_e32 v16, 0
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -2222,6 +2243,7 @@ define amdgpu_kernel void @constant_sextload_v16i1_to_v16i32(ptr addrspace(1) %o
 ;
 ; GFX1250-LABEL: constant_sextload_v16i1_to_v16i32:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-NEXT:    s_load_u16 s2, s[2:3], 0x0
@@ -2629,6 +2651,7 @@ define amdgpu_kernel void @constant_zextload_v32i1_to_v32i32(ptr addrspace(1) %o
 ;
 ; GFX1250-LABEL: constant_zextload_v32i1_to_v32i32:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[2:3], 0x0
@@ -3100,6 +3123,7 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i32(ptr addrspace(1) %o
 ;
 ; GFX1250-LABEL: constant_sextload_v32i1_to_v32i32:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[2:3], 0x0
@@ -3870,6 +3894,7 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i32(ptr addrspace(1) %o
 ;
 ; GFX1250-LABEL: constant_zextload_v64i1_to_v64i32:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
@@ -4766,6 +4791,7 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i32(ptr addrspace(1) %o
 ;
 ; GFX1250-LABEL: constant_sextload_v64i1_to_v64i32:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
@@ -4970,6 +4996,7 @@ define amdgpu_kernel void @constant_zextload_i1_to_i64(ptr addrspace(1) %out, pt
 ;
 ; GFX1250-LABEL: constant_zextload_i1_to_i64:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-NEXT:    s_load_u8 s2, s[2:3], 0x0
@@ -5052,6 +5079,7 @@ define amdgpu_kernel void @constant_sextload_i1_to_i64(ptr addrspace(1) %out, pt
 ;
 ; GFX1250-LABEL: constant_sextload_i1_to_i64:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1250-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -5134,6 +5162,7 @@ define amdgpu_kernel void @constant_zextload_v1i1_to_v1i64(ptr addrspace(1) %out
 ;
 ; GFX1250-LABEL: constant_zextload_v1i1_to_v1i64:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-NEXT:    s_load_u8 s2, s[2:3], 0x0
@@ -5216,6 +5245,7 @@ define amdgpu_kernel void @constant_sextload_v1i1_to_v1i64(ptr addrspace(1) %out
 ;
 ; GFX1250-LABEL: constant_sextload_v1i1_to_v1i64:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1250-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -5311,6 +5341,7 @@ define amdgpu_kernel void @constant_zextload_v2i1_to_v2i64(ptr addrspace(1) %out
 ;
 ; GFX1250-LABEL: constant_zextload_v2i1_to_v2i64:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1250-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -5412,6 +5443,7 @@ define amdgpu_kernel void @constant_sextload_v2i1_to_v2i64(ptr addrspace(1) %out
 ;
 ; GFX1250-LABEL: constant_sextload_v2i1_to_v2i64:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1250-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -5530,6 +5562,7 @@ define amdgpu_kernel void @constant_zextload_v3i1_to_v3i64(ptr addrspace(1) %out
 ;
 ; GFX1250-LABEL: constant_zextload_v3i1_to_v3i64:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1250-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -5661,6 +5694,7 @@ define amdgpu_kernel void @constant_sextload_v3i1_to_v3i64(ptr addrspace(1) %out
 ;
 ; GFX1250-LABEL: constant_sextload_v3i1_to_v3i64:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1250-NEXT:    v_mov_b32_e32 v5, 0
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -5799,6 +5833,7 @@ define amdgpu_kernel void @constant_zextload_v4i1_to_v4i64(ptr addrspace(1) %out
 ;
 ; GFX1250-LABEL: constant_zextload_v4i1_to_v4i64:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1250-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -5948,6 +5983,7 @@ define amdgpu_kernel void @constant_sextload_v4i1_to_v4i64(ptr addrspace(1) %out
 ;
 ; GFX1250-LABEL: constant_sextload_v4i1_to_v4i64:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1250-NEXT:    v_mov_b32_e32 v9, 0
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -6135,6 +6171,7 @@ define amdgpu_kernel void @constant_zextload_v8i1_to_v8i64(ptr addrspace(1) %out
 ;
 ; GFX1250-LABEL: constant_zextload_v8i1_to_v8i64:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1250-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -6367,6 +6404,7 @@ define amdgpu_kernel void @constant_sextload_v8i1_to_v8i64(ptr addrspace(1) %out
 ;
 ; GFX1250-LABEL: constant_sextload_v8i1_to_v8i64:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1250-NEXT:    v_mov_b32_e32 v16, 0
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -6695,6 +6733,7 @@ define amdgpu_kernel void @constant_zextload_v16i1_to_v16i64(ptr addrspace(1) %o
 ;
 ; GFX1250-LABEL: constant_zextload_v16i1_to_v16i64:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1250-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -7100,6 +7139,7 @@ define amdgpu_kernel void @constant_sextload_v16i1_to_v16i64(ptr addrspace(1) %o
 ;
 ; GFX1250-LABEL: constant_sextload_v16i1_to_v16i64:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1250-NEXT:    v_mov_b32_e32 v32, 0
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -7698,6 +7738,7 @@ define amdgpu_kernel void @constant_zextload_v32i1_to_v32i64(ptr addrspace(1) %o
 ;
 ; GFX1250-LABEL: constant_zextload_v32i1_to_v32i64:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[2:3], 0x0
@@ -8495,6 +8536,7 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i64(ptr addrspace(1) %o
 ;
 ; GFX1250-LABEL: constant_sextload_v32i1_to_v32i64:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[2:3], 0x0
@@ -9657,6 +9699,7 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o
 ;
 ; GFX1250-LABEL: constant_zextload_v64i1_to_v64i64:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
@@ -11206,6 +11249,7 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o
 ;
 ; GFX1250-LABEL: constant_sextload_v64i1_to_v64i64:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-NEXT:    s_load_b64 s[10:11], s[2:3], 0x0

diff  --git a/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll
index 6f7ee70812264..8812dfd9ed280 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll
@@ -87,6 +87,7 @@ define amdgpu_kernel void @constant_load_i32(ptr addrspace(1) %out, ptr addrspac
 ;
 ; GFX1250-LABEL: constant_load_i32:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[2:3], 0x0
@@ -184,6 +185,7 @@ define amdgpu_kernel void @constant_load_v2i32(ptr addrspace(1) %out, ptr addrsp
 ;
 ; GFX1250-LABEL: constant_load_v2i32:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1250-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -293,6 +295,7 @@ define amdgpu_kernel void @constant_load_v3i32(ptr addrspace(1) %out, ptr addrsp
 ;
 ; GFX1250-LABEL: constant_load_v3i32:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-NEXT:    s_load_b96 s[4:6], s[2:3], 0x0
@@ -400,6 +403,7 @@ define amdgpu_kernel void @constant_load_v4i32(ptr addrspace(1) %out, ptr addrsp
 ;
 ; GFX1250-LABEL: constant_load_v4i32:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1250-NEXT:    v_mov_b32_e32 v4, 0
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -545,6 +549,7 @@ define amdgpu_kernel void @constant_load_v8i32(ptr addrspace(1) %out, ptr addrsp
 ;
 ; GFX1250-LABEL: constant_load_v8i32:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b128 s[8:11], s[4:5], 0x24
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-NEXT:    s_load_b256 s[0:7], s[10:11], 0x0
@@ -724,6 +729,7 @@ define amdgpu_kernel void @constant_load_v9i32(ptr addrspace(1) %out, ptr addrsp
 ;
 ; GFX1250-LABEL: constant_load_v9i32:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b128 s[8:11], s[4:5], 0x24
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-NEXT:    s_clause 0x1
@@ -912,6 +918,7 @@ define amdgpu_kernel void @constant_load_v10i32(ptr addrspace(1) %out, ptr addrs
 ;
 ; GFX1250-LABEL: constant_load_v10i32:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b128 s[8:11], s[4:5], 0x24
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-NEXT:    s_clause 0x1
@@ -1112,6 +1119,7 @@ define amdgpu_kernel void @constant_load_v11i32(ptr addrspace(1) %out, ptr addrs
 ;
 ; GFX1250-LABEL: constant_load_v11i32:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b128 s[8:11], s[4:5], 0x24
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-NEXT:    s_clause 0x1
@@ -1310,6 +1318,7 @@ define amdgpu_kernel void @constant_load_v12i32(ptr addrspace(1) %out, ptr addrs
 ;
 ; GFX1250-LABEL: constant_load_v12i32:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b128 s[8:11], s[4:5], 0x24
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-NEXT:    s_clause 0x1
@@ -1540,6 +1549,7 @@ define amdgpu_kernel void @constant_load_v16i32(ptr addrspace(1) %out, ptr addrs
 ;
 ; GFX1250-LABEL: constant_load_v16i32:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b128 s[16:19], s[4:5], 0x24
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-NEXT:    s_load_b512 s[0:15], s[18:19], 0x0
@@ -1648,6 +1658,7 @@ define amdgpu_kernel void @constant_zextload_i32_to_i64(ptr addrspace(1) %out, p
 ;
 ; GFX1250-LABEL: constant_zextload_i32_to_i64:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[2:3], 0x0
@@ -1752,6 +1763,7 @@ define amdgpu_kernel void @constant_sextload_i32_to_i64(ptr addrspace(1) %out, p
 ;
 ; GFX1250-LABEL: constant_sextload_i32_to_i64:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[2:3], 0x0
@@ -1851,6 +1863,7 @@ define amdgpu_kernel void @constant_zextload_v1i32_to_v1i64(ptr addrspace(1) %ou
 ;
 ; GFX1250-LABEL: constant_zextload_v1i32_to_v1i64:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[2:3], 0x0
@@ -1955,6 +1968,7 @@ define amdgpu_kernel void @constant_sextload_v1i32_to_v1i64(ptr addrspace(1) %ou
 ;
 ; GFX1250-LABEL: constant_sextload_v1i32_to_v1i64:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[2:3], 0x0
@@ -2067,6 +2081,7 @@ define amdgpu_kernel void @constant_zextload_v2i32_to_v2i64(ptr addrspace(1) %ou
 ;
 ; GFX1250-LABEL: constant_zextload_v2i32_to_v2i64:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
@@ -2192,6 +2207,7 @@ define amdgpu_kernel void @constant_sextload_v2i32_to_v2i64(ptr addrspace(1) %ou
 ;
 ; GFX1250-LABEL: constant_sextload_v2i32_to_v2i64:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-NEXT:    s_load_b64 s[2:3], s[2:3], 0x0
@@ -2338,6 +2354,7 @@ define amdgpu_kernel void @constant_zextload_v4i32_to_v4i64(ptr addrspace(1) %ou
 ;
 ; GFX1250-LABEL: constant_zextload_v4i32_to_v4i64:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-NEXT:    s_load_b128 s[4:7], s[2:3], 0x0
@@ -2515,6 +2532,7 @@ define amdgpu_kernel void @constant_sextload_v4i32_to_v4i64(ptr addrspace(1) %ou
 ;
 ; GFX1250-LABEL: constant_sextload_v4i32_to_v4i64:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-NEXT:    s_load_b128 s[4:7], s[2:3], 0x0
@@ -2735,6 +2753,7 @@ define amdgpu_kernel void @constant_zextload_v8i32_to_v8i64(ptr addrspace(1) %ou
 ;
 ; GFX1250-LABEL: constant_zextload_v8i32_to_v8i64:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b128 s[8:11], s[4:5], 0x24
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-NEXT:    s_load_b256 s[0:7], s[10:11], 0x0
@@ -3025,6 +3044,7 @@ define amdgpu_kernel void @constant_sextload_v8i32_to_v8i64(ptr addrspace(1) %ou
 ;
 ; GFX1250-LABEL: constant_sextload_v8i32_to_v8i64:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-NEXT:    s_load_b256 s[4:11], s[2:3], 0x0
@@ -3532,6 +3552,7 @@ define amdgpu_kernel void @constant_sextload_v16i32_to_v16i64(ptr addrspace(1) %
 ;
 ; GFX1250-LABEL: constant_sextload_v16i32_to_v16i64:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b128 s[16:19], s[4:5], 0x24
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-NEXT:    s_load_b512 s[0:15], s[18:19], 0x0
@@ -3928,6 +3949,7 @@ define amdgpu_kernel void @constant_zextload_v16i32_to_v16i64(ptr addrspace(1) %
 ;
 ; GFX1250-LABEL: constant_zextload_v16i32_to_v16i64:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b128 s[16:19], s[4:5], 0x24
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-NEXT:    s_load_b512 s[0:15], s[18:19], 0x0
@@ -4870,6 +4892,7 @@ define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(ptr addrspace(1) %
 ;
 ; GFX1250-LABEL: constant_sextload_v32i32_to_v32i64:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b128 s[36:39], s[4:5], 0x24
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-NEXT:    s_clause 0x1
@@ -5617,6 +5640,7 @@ define amdgpu_kernel void @constant_zextload_v32i32_to_v32i64(ptr addrspace(1) %
 ;
 ; GFX1250-LABEL: constant_zextload_v32i32_to_v32i64:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b128 s[36:39], s[4:5], 0x24
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-NEXT:    s_load_b512 s[16:31], s[38:39], 0x40
@@ -6048,6 +6072,7 @@ define amdgpu_kernel void @constant_load_v32i32(ptr addrspace(1) %out, ptr addrs
 ;
 ; GFX1250-LABEL: constant_load_v32i32:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b128 s[36:39], s[4:5], 0x24
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-NEXT:    s_clause 0x1

diff  --git a/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll b/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll
index 2ff69d234455f..5eb6bf6d8179a 100644
--- a/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll
+++ b/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll
@@ -66,6 +66,7 @@ define amdgpu_kernel void @copy_flat(ptr nocapture %d, ptr nocapture readonly %s
 ;
 ; GFX1250-LABEL: copy_flat:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s6, s[4:5], 0x34
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-NEXT:    s_cmp_eq_u32 s6, 0
@@ -161,6 +162,7 @@ define amdgpu_kernel void @copy_global(ptr addrspace(1) nocapture %d, ptr addrsp
 ;
 ; GFX1250-LABEL: copy_global:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s6, s[4:5], 0x34
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-NEXT:    s_cmp_eq_u32 s6, 0
@@ -258,6 +260,7 @@ define amdgpu_kernel void @copy_constant(ptr addrspace(1) nocapture %d, ptr addr
 ;
 ; GFX1250-LABEL: copy_constant:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s6, s[4:5], 0x34
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-NEXT:    s_cmp_eq_u32 s6, 0
@@ -355,6 +358,7 @@ define amdgpu_kernel void @copy_local(ptr addrspace(3) nocapture %d, ptr addrspa
 ;
 ; GFX1250-LABEL: copy_local:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b96 s[0:2], s[4:5], 0x24
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-NEXT:    s_cmp_eq_u32 s2, 0
@@ -481,6 +485,7 @@ define amdgpu_kernel void @copy_flat_divergent(ptr nocapture %d, ptr nocapture r
 ;
 ; GFX1250-LABEL: copy_flat_divergent:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x34
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-NEXT:    s_cmp_eq_u32 s2, 0
@@ -614,6 +619,7 @@ define amdgpu_kernel void @copy_global_divergent(ptr addrspace(1) nocapture %d,
 ;
 ; GFX1250-LABEL: copy_global_divergent:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x34
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-NEXT:    s_cmp_eq_u32 s0, 0

diff  --git a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics.ll b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics.ll
index 6b6658bd672de..48363d627839b 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics.ll
@@ -49,6 +49,7 @@ define amdgpu_cs void @_amdgpu_cs_main() {
 ;
 ; GFX1250-SDAG-LABEL: _amdgpu_cs_main:
 ; GFX1250-SDAG:       ; %bb.0: ; %.entry
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    s_bfe_u32 s0, ttmp6, 0x4000c
 ; GFX1250-SDAG-NEXT:    s_bfe_u32 s2, ttmp6, 0x40010
 ; GFX1250-SDAG-NEXT:    s_add_co_i32 s0, s0, 1
@@ -79,6 +80,7 @@ define amdgpu_cs void @_amdgpu_cs_main() {
 ;
 ; GFX1250-GISEL-LABEL: _amdgpu_cs_main:
 ; GFX1250-GISEL:       ; %bb.0: ; %.entry
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    s_bfe_u32 s0, ttmp6, 0x4000c
 ; GFX1250-GISEL-NEXT:    s_and_b32 s1, ttmp6, 15
 ; GFX1250-GISEL-NEXT:    s_add_co_i32 s0, s0, 1
@@ -164,6 +166,7 @@ define amdgpu_cs void @workgroup_id_no_clusters() "amdgpu-cluster-dims"="0,0,0"
 ;
 ; GFX1250-SDAG-LABEL: workgroup_id_no_clusters:
 ; GFX1250-SDAG:       ; %bb.0: ; %.entry
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    s_and_b32 s0, ttmp7, 0xffff
 ; GFX1250-SDAG-NEXT:    s_lshr_b32 s1, ttmp7, 16
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v0, ttmp9 :: v_dual_mov_b32 v1, s0
@@ -173,6 +176,7 @@ define amdgpu_cs void @workgroup_id_no_clusters() "amdgpu-cluster-dims"="0,0,0"
 ;
 ; GFX1250-GISEL-LABEL: workgroup_id_no_clusters:
 ; GFX1250-GISEL:       ; %bb.0: ; %.entry
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    s_mov_b32 s0, ttmp9
 ; GFX1250-GISEL-NEXT:    s_and_b32 s1, ttmp7, 0xffff
 ; GFX1250-GISEL-NEXT:    s_lshr_b32 s2, ttmp7, 16
@@ -234,6 +238,7 @@ define amdgpu_cs void @workgroup_id_optimized() "amdgpu-cluster-dims"="2,3,4" {
 ;
 ; GFX1250-SDAG-LABEL: workgroup_id_optimized:
 ; GFX1250-SDAG:       ; %bb.0: ; %.entry
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    s_lshr_b32 s1, ttmp7, 14
 ; GFX1250-SDAG-NEXT:    s_and_b32 s2, ttmp7, 0xffff
 ; GFX1250-SDAG-NEXT:    s_and_b32 s0, ttmp6, 15
@@ -252,6 +257,7 @@ define amdgpu_cs void @workgroup_id_optimized() "amdgpu-cluster-dims"="2,3,4" {
 ;
 ; GFX1250-GISEL-LABEL: workgroup_id_optimized:
 ; GFX1250-GISEL:       ; %bb.0: ; %.entry
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    s_and_b32 s1, ttmp7, 0xffff
 ; GFX1250-GISEL-NEXT:    s_and_b32 s0, ttmp6, 15
 ; GFX1250-GISEL-NEXT:    s_bfe_u32 s2, ttmp6, 0x40004
@@ -333,12 +339,14 @@ define amdgpu_cs void @caller() {
 ;
 ; GFX1250-SDAG-LABEL: caller:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_mov_b32 s32, 0
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    s_bfe_u32 s0, ttmp6, 0x4000c
 ; GFX1250-SDAG-NEXT:    s_and_b32 s1, ttmp6, 15
 ; GFX1250-SDAG-NEXT:    s_add_co_i32 s0, s0, 1
 ; GFX1250-SDAG-NEXT:    s_getreg_b32 s2, hwreg(HW_REG_IB_STS2, 6, 4)
 ; GFX1250-SDAG-NEXT:    s_mul_i32 s0, ttmp9, s0
-; GFX1250-SDAG-NEXT:    s_mov_b32 s32, 0
+; GFX1250-SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1250-SDAG-NEXT:    s_add_co_i32 s1, s1, s0
 ; GFX1250-SDAG-NEXT:    s_cmp_eq_u32 s2, 0
 ; GFX1250-SDAG-NEXT:    s_cselect_b32 s2, ttmp9, s1
@@ -349,12 +357,14 @@ define amdgpu_cs void @caller() {
 ;
 ; GFX1250-GISEL-LABEL: caller:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_mov_b32 s32, 0
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    s_bfe_u32 s0, ttmp6, 0x4000c
 ; GFX1250-GISEL-NEXT:    s_and_b32 s1, ttmp6, 15
 ; GFX1250-GISEL-NEXT:    s_add_co_i32 s0, s0, 1
 ; GFX1250-GISEL-NEXT:    s_getreg_b32 s2, hwreg(HW_REG_IB_STS2, 6, 4)
 ; GFX1250-GISEL-NEXT:    s_mul_i32 s0, ttmp9, s0
-; GFX1250-GISEL-NEXT:    s_mov_b32 s32, 0
+; GFX1250-GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1250-GISEL-NEXT:    s_add_co_i32 s1, s1, s0
 ; GFX1250-GISEL-NEXT:    s_cmp_eq_u32 s2, 0
 ; GFX1250-GISEL-NEXT:    s_cselect_b32 s2, ttmp9, s1

diff  --git a/llvm/test/CodeGen/AMDGPU/mad-mix-bf16.ll b/llvm/test/CodeGen/AMDGPU/mad-mix-bf16.ll
index c96ba754c0811..83625a59ed69f 100644
--- a/llvm/test/CodeGen/AMDGPU/mad-mix-bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad-mix-bf16.ll
@@ -595,6 +595,7 @@ define float @v_mad_mix_f32_bf16lo_cast_from_half_bf16lo_bf16lo(half %src0, bflo
 define amdgpu_kernel void @test_fma_mix_f32_bf16_src2_bf16lo(float %x, i32 %y, ptr addrspace(1) %out) {
 ; GFX1250-LABEL: test_fma_mix_f32_bf16_src2_bf16lo:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-NEXT:    v_fma_mix_f32_bf16 v0, s0, 0, s1 op_sel_hi:[0,0,1]

diff  --git a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll
index 87d52684e588c..66e3f9866268e 100644
--- a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll
@@ -1118,6 +1118,7 @@ define amdgpu_kernel void @mad_i64_i32_uniform(ptr addrspace(1) %out, i32 %arg0,
 ;
 ; GFX1250-LABEL: mad_i64_i32_uniform:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1250-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
@@ -2226,6 +2227,7 @@ define amdgpu_ps i64 @lshr_mad_i64_sgpr(i64 inreg %arg0) #0 {
 ;
 ; GFX1250-LABEL: lshr_mad_i64_sgpr:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_mov_b32 s3, 0
 ; GFX1250-NEXT:    s_mov_b32 s2, s1
 ; GFX1250-NEXT:    s_mov_b64 s[4:5], 0xffffffffffff1c18

diff  --git a/llvm/test/CodeGen/AMDGPU/mad_u64_u32.ll b/llvm/test/CodeGen/AMDGPU/mad_u64_u32.ll
index 1112be3aeac07..709679cb19184 100644
--- a/llvm/test/CodeGen/AMDGPU/mad_u64_u32.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad_u64_u32.ll
@@ -26,6 +26,7 @@ define amdgpu_ps float @mad_i32_vvv(i32 %a, i32 %b, i32 %c) {
 ;
 ; GFX1250-LABEL: mad_i32_vvv:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mad_u32 v0, v0, v1, v2
 ; GFX1250-NEXT:    ; return to shader part epilog
   %mul = mul i32 %a, %b
@@ -58,6 +59,7 @@ define amdgpu_ps float @mad_i32_sss(i32 inreg %a, i32 inreg %b, i32 inreg %c) {
 ;
 ; GFX1250-LABEL: mad_i32_sss:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_mul_i32 s0, s0, s1
 ; GFX1250-NEXT:    s_add_co_i32 s0, s0, s2
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, s0
@@ -88,6 +90,7 @@ define amdgpu_ps float @mad_i32_vvc(i32 %a, i32 %b) {
 ;
 ; GFX1250-LABEL: mad_i32_vvc:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mad_u32 v0, v0, v1, 42
 ; GFX1250-NEXT:    ; return to shader part epilog
   %mul = mul i32 %a, %b
@@ -118,6 +121,7 @@ define amdgpu_ps float @mad_i32_vvi(i32 %a, i32 %b) {
 ;
 ; GFX1250-LABEL: mad_i32_vvi:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mad_u32 v0, v0, v1, 0x12d687
 ; GFX1250-NEXT:    ; return to shader part epilog
   %mul = mul i32 %a, %b
@@ -148,6 +152,7 @@ define amdgpu_ps float @mad_i32_vvi_neg(i32 %a, i32 %b) {
 ;
 ; GFX1250-LABEL: mad_i32_vvi_neg:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mad_u32 v0, v0, v1, 0xffed2979
 ; GFX1250-NEXT:    ; return to shader part epilog
   %mul = mul i32 %a, %b
@@ -175,6 +180,7 @@ define amdgpu_ps float @mad_i32_vcv(i32 %a, i32 %c) {
 ;
 ; GFX1250-LABEL: mad_i32_vcv:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mad_u32 v0, v0, 42, v1
 ; GFX1250-NEXT:    ; return to shader part epilog
   %mul = mul i32 %a, 42
@@ -202,6 +208,7 @@ define amdgpu_ps float @mad_i32_vcc(i32 %a) {
 ;
 ; GFX1250-LABEL: mad_i32_vcc:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mad_u32 v0, v0, 42, 43
 ; GFX1250-NEXT:    ; return to shader part epilog
   %mul = mul i32 %a, 42
@@ -230,6 +237,7 @@ define amdgpu_ps float @mad_i32_vvs(i32 %a, i32 %b, i32 inreg %c) {
 ;
 ; GFX1250-LABEL: mad_i32_vvs:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mad_u32 v0, v0, v1, s0
 ; GFX1250-NEXT:    ; return to shader part epilog
   %mul = mul i32 %a, %b
@@ -257,6 +265,7 @@ define amdgpu_ps float @mad_i32_vsv(i32 %a, i32 inreg %b, i32 %c) {
 ;
 ; GFX1250-LABEL: mad_i32_vsv:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mad_u32 v0, v0, s0, v1
 ; GFX1250-NEXT:    ; return to shader part epilog
   %mul = mul i32 %a, %b
@@ -284,6 +293,7 @@ define amdgpu_ps float @mad_i32_svv(i32 inreg %a, i32 %b, i32 %c) {
 ;
 ; GFX1250-LABEL: mad_i32_svv:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mad_u32 v0, s0, v0, v1
 ; GFX1250-NEXT:    ; return to shader part epilog
   %mul = mul i32 %a, %b
@@ -314,6 +324,7 @@ define amdgpu_ps float @mad_i32_vss(i32 %a, i32 inreg %b, i32 inreg %c) {
 ;
 ; GFX1250-LABEL: mad_i32_vss:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mad_u32 v0, v0, s0, s1
 ; GFX1250-NEXT:    ; return to shader part epilog
   %mul = mul i32 %a, %b
@@ -344,6 +355,7 @@ define amdgpu_ps float @mad_i32_svs(i32 inreg %a, i32 %b, i32 inreg %c) {
 ;
 ; GFX1250-LABEL: mad_i32_svs:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mad_u32 v0, s0, v0, s1
 ; GFX1250-NEXT:    ; return to shader part epilog
   %mul = mul i32 %a, %b
@@ -372,6 +384,7 @@ define amdgpu_ps float @mad_i32_ssv(i32 inreg %a, i32 inreg %b, i32 %c) {
 ;
 ; GFX1250-LABEL: mad_i32_ssv:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mad_u32 v0, s0, s1, v0
 ; GFX1250-NEXT:    ; return to shader part epilog
   %mul = mul i32 %a, %b
@@ -407,6 +420,7 @@ define amdgpu_ps float @mad_i32_vvv_multiuse(i32 %a, i32 %b, i32 %c) {
 ;
 ; GFX1250-LABEL: mad_i32_vvv_multiuse:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mul_lo_u32 v1, v0, v1
 ; GFX1250-NEXT:    v_add_nc_u32_e32 v0, v1, v2
 ; GFX1250-NEXT:    flat_store_b32 v[0:1], v1

diff  --git a/llvm/test/CodeGen/AMDGPU/max.ll b/llvm/test/CodeGen/AMDGPU/max.ll
index c48e25f36e99f..ac6dd30283554 100644
--- a/llvm/test/CodeGen/AMDGPU/max.ll
+++ b/llvm/test/CodeGen/AMDGPU/max.ll
@@ -27,6 +27,7 @@ define amdgpu_kernel void @v_test_imax_sge_i32(ptr addrspace(1) %out, ptr addrsp
 ;
 ; GFX1250-LABEL: v_test_imax_sge_i32:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x34
 ; GFX1250-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX1250-NEXT:    v_mov_b32_e32 v1, 0
@@ -100,6 +101,7 @@ define amdgpu_kernel void @v_test_imax_sge_v4i32(ptr addrspace(1) %out, ptr addr
 ;
 ; GFX1250-LABEL: v_test_imax_sge_v4i32:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x34
 ; GFX1250-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX1250-NEXT:    v_mov_b32_e32 v4, 0
@@ -166,6 +168,7 @@ define amdgpu_kernel void @s_test_imax_sge_i32(ptr addrspace(1) %out, i32 %a, i3
 ;
 ; GFX1250-LABEL: s_test_imax_sge_i32:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -206,6 +209,7 @@ define amdgpu_kernel void @s_test_imax_sge_imm_i32(ptr addrspace(1) %out, i32 %a
 ;
 ; GFX1250-LABEL: s_test_imax_sge_imm_i32:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b96 s[0:2], s[4:5], 0x24
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -256,6 +260,7 @@ define amdgpu_kernel void @v_test_imax_sge_i8(ptr addrspace(1) %out, ptr addrspa
 ;
 ; GFX1250-LABEL: v_test_imax_sge_i8:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1250-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
@@ -327,6 +332,7 @@ define amdgpu_kernel void @s_test_imax_sgt_imm_i32(ptr addrspace(1) %out, i32 %a
 ;
 ; GFX1250-LABEL: s_test_imax_sgt_imm_i32:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b96 s[0:2], s[4:5], 0x24
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -369,6 +375,7 @@ define amdgpu_kernel void @s_test_imax_sgt_imm_v2i32(ptr addrspace(1) %out, <2 x
 ;
 ; GFX1250-LABEL: s_test_imax_sgt_imm_v2i32:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1250-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -421,6 +428,7 @@ define amdgpu_kernel void @v_test_imax_sgt_i32(ptr addrspace(1) %out, ptr addrsp
 ;
 ; GFX1250-LABEL: v_test_imax_sgt_i32:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x34
 ; GFX1250-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX1250-NEXT:    v_mov_b32_e32 v1, 0
@@ -481,6 +489,7 @@ define amdgpu_kernel void @s_test_imax_sgt_i32(ptr addrspace(1) %out, i32 %a, i3
 ;
 ; GFX1250-LABEL: s_test_imax_sgt_i32:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -530,6 +539,7 @@ define amdgpu_kernel void @v_test_umax_uge_i32(ptr addrspace(1) %out, ptr addrsp
 ;
 ; GFX1250-LABEL: v_test_umax_uge_i32:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x34
 ; GFX1250-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX1250-NEXT:    v_mov_b32_e32 v1, 0
@@ -590,6 +600,7 @@ define amdgpu_kernel void @s_test_umax_uge_i32(ptr addrspace(1) %out, i32 %a, i3
 ;
 ; GFX1250-LABEL: s_test_umax_uge_i32:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -636,6 +647,7 @@ define amdgpu_kernel void @s_test_umax_uge_v3i32(ptr addrspace(1) %out, <3 x i32
 ;
 ; GFX1250-LABEL: s_test_umax_uge_v3i32:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    s_load_b256 s[8:15], s[4:5], 0x34
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
@@ -696,6 +708,7 @@ define amdgpu_kernel void @v_test_umax_uge_i8(ptr addrspace(1) %out, ptr addrspa
 ;
 ; GFX1250-LABEL: v_test_umax_uge_i8:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1250-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
@@ -766,6 +779,7 @@ define amdgpu_kernel void @v_test_umax_ugt_i32(ptr addrspace(1) %out, ptr addrsp
 ;
 ; GFX1250-LABEL: v_test_umax_ugt_i32:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x34
 ; GFX1250-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX1250-NEXT:    v_mov_b32_e32 v1, 0
@@ -824,6 +838,7 @@ define amdgpu_kernel void @s_test_umax_ugt_i32(ptr addrspace(1) %out, i32 %a, i3
 ;
 ; GFX1250-LABEL: s_test_umax_ugt_i32:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -866,6 +881,7 @@ define amdgpu_kernel void @s_test_umax_ugt_imm_v2i32(ptr addrspace(1) %out, <2 x
 ;
 ; GFX1250-LABEL: s_test_umax_ugt_imm_v2i32:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1250-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -914,6 +930,7 @@ define amdgpu_kernel void @simplify_demanded_bits_test_umax_ugt_i16(ptr addrspac
 ;
 ; GFX1250-LABEL: simplify_demanded_bits_test_umax_ugt_i16:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_clause 0x2
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4c
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x70
@@ -977,6 +994,7 @@ define amdgpu_kernel void @simplify_demanded_bits_test_max_slt_i16(ptr addrspace
 ;
 ; GFX1250-LABEL: simplify_demanded_bits_test_max_slt_i16:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_clause 0x2
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4c
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x70
@@ -1039,6 +1057,7 @@ define amdgpu_kernel void @s_test_imax_sge_i16(ptr addrspace(1) %out, [8 x i32],
 ;
 ; GFX1250-LABEL: s_test_imax_sge_i16:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_clause 0x2
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x70
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x4c
@@ -1113,6 +1132,7 @@ define amdgpu_kernel void @test_umax_ugt_i64(ptr addrspace(1) %out, i64 %a, i64
 ;
 ; GFX1250-LABEL: test_umax_ugt_i64:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1250-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
@@ -1166,6 +1186,7 @@ define amdgpu_kernel void @test_umax_uge_i64(ptr addrspace(1) %out, i64 %a, i64
 ;
 ; GFX1250-LABEL: test_umax_uge_i64:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1250-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
@@ -1219,6 +1240,7 @@ define amdgpu_kernel void @test_imax_sgt_i64(ptr addrspace(1) %out, i64 %a, i64
 ;
 ; GFX1250-LABEL: test_imax_sgt_i64:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1250-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
@@ -1272,6 +1294,7 @@ define amdgpu_kernel void @test_imax_sge_i64(ptr addrspace(1) %out, i64 %a, i64
 ;
 ; GFX1250-LABEL: test_imax_sge_i64:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1250-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34

diff  --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-barriers.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-barriers.ll
index 282a7ae7ea2fd..434e71295aad2 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-barriers.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-barriers.ll
@@ -42,6 +42,7 @@ define amdgpu_kernel void @test_s_barrier() {
 ;
 ; GFX1250-LABEL: test_s_barrier:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_barrier_signal -1
 ; GFX1250-NEXT:    s_barrier_wait -1
 ; GFX1250-NEXT:    s_endpgm
@@ -101,6 +102,7 @@ define amdgpu_kernel void @test_s_barrier_workgroup_fence() {
 ;
 ; GFX1250-LABEL: test_s_barrier_workgroup_fence:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_wait_storecnt 0x0
 ; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    s_barrier_signal -1
@@ -163,6 +165,7 @@ define amdgpu_kernel void @test_s_barrier_agent_fence() {
 ;
 ; GFX1250-LABEL: test_s_barrier_agent_fence:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    global_wb scope:SCOPE_DEV
 ; GFX1250-NEXT:    s_wait_storecnt 0x0
 ; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0

diff  --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-global.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-global.ll
index 7efbff9c637c5..b07807aafbf1d 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-global.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-global.ll
@@ -82,6 +82,7 @@ define amdgpu_kernel void @workgroup_acquire_fence() {
 ;
 ; GFX1250-LABEL: workgroup_acquire_fence:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
 ; GFX1250-NEXT:    s_wait_storecnt 0x0
 ; GFX1250-NEXT:    s_endpgm
@@ -163,6 +164,7 @@ define amdgpu_kernel void @workgroup_release_fence() {
 ;
 ; GFX1250-LABEL: workgroup_release_fence:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
 ; GFX1250-NEXT:    s_wait_storecnt 0x0
 ; GFX1250-NEXT:    s_endpgm
@@ -249,6 +251,7 @@ define amdgpu_kernel void @workgroup_acq_rel_fence() {
 ;
 ; GFX1250-LABEL: workgroup_acq_rel_fence:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
 ; GFX1250-NEXT:    s_wait_storecnt 0x0
 ; GFX1250-NEXT:    s_endpgm
@@ -335,6 +338,7 @@ define amdgpu_kernel void @workgroup_seq_cst_fence() {
 ;
 ; GFX1250-LABEL: workgroup_seq_cst_fence:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
 ; GFX1250-NEXT:    s_wait_storecnt 0x0
 ; GFX1250-NEXT:    s_endpgm
@@ -411,6 +415,7 @@ define amdgpu_kernel void @workgroup_one_as_acquire_fence() {
 ;
 ; GFX1250-LABEL: workgroup_one_as_acquire_fence:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
 ; GFX1250-NEXT:    s_wait_storecnt 0x0
 ; GFX1250-NEXT:    s_endpgm
@@ -492,6 +497,7 @@ define amdgpu_kernel void @workgroup_one_as_release_fence() {
 ;
 ; GFX1250-LABEL: workgroup_one_as_release_fence:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
 ; GFX1250-NEXT:    s_wait_storecnt 0x0
 ; GFX1250-NEXT:    s_endpgm
@@ -578,6 +584,7 @@ define amdgpu_kernel void @workgroup_one_as_acq_rel_fence() {
 ;
 ; GFX1250-LABEL: workgroup_one_as_acq_rel_fence:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
 ; GFX1250-NEXT:    s_wait_storecnt 0x0
 ; GFX1250-NEXT:    s_endpgm
@@ -664,6 +671,7 @@ define amdgpu_kernel void @workgroup_one_as_seq_cst_fence() {
 ;
 ; GFX1250-LABEL: workgroup_one_as_seq_cst_fence:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
 ; GFX1250-NEXT:    s_wait_storecnt 0x0
 ; GFX1250-NEXT:    s_endpgm
@@ -762,6 +770,7 @@ define amdgpu_kernel void @agent_acquire_fence() {
 ;
 ; GFX1250-LABEL: agent_acquire_fence:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
 ; GFX1250-NEXT:    s_wait_storecnt 0x0
 ; GFX1250-NEXT:    global_inv scope:SCOPE_DEV
@@ -851,6 +860,7 @@ define amdgpu_kernel void @agent_release_fence() {
 ;
 ; GFX1250-LABEL: agent_release_fence:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    global_wb scope:SCOPE_DEV
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
 ; GFX1250-NEXT:    s_wait_storecnt 0x0
@@ -956,6 +966,7 @@ define amdgpu_kernel void @agent_acq_rel_fence() {
 ;
 ; GFX1250-LABEL: agent_acq_rel_fence:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    global_wb scope:SCOPE_DEV
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
 ; GFX1250-NEXT:    s_wait_storecnt 0x0
@@ -1062,6 +1073,7 @@ define amdgpu_kernel void @agent_seq_cst_fence() {
 ;
 ; GFX1250-LABEL: agent_seq_cst_fence:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    global_wb scope:SCOPE_DEV
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
 ; GFX1250-NEXT:    s_wait_storecnt 0x0
@@ -1162,6 +1174,7 @@ define amdgpu_kernel void @agent_one_as_acquire_fence() {
 ;
 ; GFX1250-LABEL: agent_one_as_acquire_fence:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
 ; GFX1250-NEXT:    s_wait_storecnt 0x0
 ; GFX1250-NEXT:    global_inv scope:SCOPE_DEV
@@ -1251,6 +1264,7 @@ define amdgpu_kernel void @agent_one_as_release_fence() {
 ;
 ; GFX1250-LABEL: agent_one_as_release_fence:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    global_wb scope:SCOPE_DEV
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
 ; GFX1250-NEXT:    s_wait_storecnt 0x0
@@ -1356,6 +1370,7 @@ define amdgpu_kernel void @agent_one_as_acq_rel_fence() {
 ;
 ; GFX1250-LABEL: agent_one_as_acq_rel_fence:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    global_wb scope:SCOPE_DEV
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
 ; GFX1250-NEXT:    s_wait_storecnt 0x0
@@ -1462,6 +1477,7 @@ define amdgpu_kernel void @agent_one_as_seq_cst_fence() {
 ;
 ; GFX1250-LABEL: agent_one_as_seq_cst_fence:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    global_wb scope:SCOPE_DEV
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
 ; GFX1250-NEXT:    s_wait_storecnt 0x0
@@ -1564,6 +1580,7 @@ define amdgpu_kernel void @system_acquire_fence() {
 ;
 ; GFX1250-LABEL: system_acquire_fence:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
 ; GFX1250-NEXT:    s_wait_storecnt 0x0
 ; GFX1250-NEXT:    global_inv scope:SCOPE_SYS
@@ -1657,6 +1674,7 @@ define amdgpu_kernel void @system_release_fence() {
 ;
 ; GFX1250-LABEL: system_release_fence:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    global_wb scope:SCOPE_SYS
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
 ; GFX1250-NEXT:    s_wait_storecnt 0x0
@@ -1768,6 +1786,7 @@ define amdgpu_kernel void @system_acq_rel_fence() {
 ;
 ; GFX1250-LABEL: system_acq_rel_fence:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    global_wb scope:SCOPE_SYS
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
 ; GFX1250-NEXT:    s_wait_storecnt 0x0
@@ -1880,6 +1899,7 @@ define amdgpu_kernel void @system_seq_cst_fence() {
 ;
 ; GFX1250-LABEL: system_seq_cst_fence:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    global_wb scope:SCOPE_SYS
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
 ; GFX1250-NEXT:    s_wait_storecnt 0x0
@@ -1982,6 +2002,7 @@ define amdgpu_kernel void @system_one_as_acquire_fence() {
 ;
 ; GFX1250-LABEL: system_one_as_acquire_fence:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
 ; GFX1250-NEXT:    s_wait_storecnt 0x0
 ; GFX1250-NEXT:    global_inv scope:SCOPE_SYS
@@ -2075,6 +2096,7 @@ define amdgpu_kernel void @system_one_as_release_fence() {
 ;
 ; GFX1250-LABEL: system_one_as_release_fence:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    global_wb scope:SCOPE_SYS
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
 ; GFX1250-NEXT:    s_wait_storecnt 0x0
@@ -2186,6 +2208,7 @@ define amdgpu_kernel void @system_one_as_acq_rel_fence() {
 ;
 ; GFX1250-LABEL: system_one_as_acq_rel_fence:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    global_wb scope:SCOPE_SYS
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
 ; GFX1250-NEXT:    s_wait_storecnt 0x0
@@ -2298,6 +2321,7 @@ define amdgpu_kernel void @system_one_as_seq_cst_fence() {
 ;
 ; GFX1250-LABEL: system_one_as_seq_cst_fence:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    global_wb scope:SCOPE_SYS
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
 ; GFX1250-NEXT:    s_wait_storecnt 0x0

diff  --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-local.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-local.ll
index 8b0b099999f06..022ec1d601bf5 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-local.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence-mmra-local.ll
@@ -80,6 +80,7 @@ define amdgpu_kernel void @workgroup_acquire_fence() {
 ;
 ; GFX1250-LABEL: workgroup_acquire_fence:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_wait_dscnt 0x0
 ; GFX1250-NEXT:    s_endpgm
 entry:
@@ -153,6 +154,7 @@ define amdgpu_kernel void @workgroup_release_fence() {
 ;
 ; GFX1250-LABEL: workgroup_release_fence:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_wait_dscnt 0x0
 ; GFX1250-NEXT:    s_endpgm
 entry:
@@ -226,6 +228,7 @@ define amdgpu_kernel void @workgroup_acq_rel_fence() {
 ;
 ; GFX1250-LABEL: workgroup_acq_rel_fence:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_wait_dscnt 0x0
 ; GFX1250-NEXT:    s_endpgm
 entry:
@@ -299,6 +302,7 @@ define amdgpu_kernel void @workgroup_seq_cst_fence() {
 ;
 ; GFX1250-LABEL: workgroup_seq_cst_fence:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_wait_dscnt 0x0
 ; GFX1250-NEXT:    s_endpgm
 entry:
@@ -361,6 +365,7 @@ define amdgpu_kernel void @workgroup_one_as_acquire_fence() {
 ;
 ; GFX1250-LABEL: workgroup_one_as_acquire_fence:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_endpgm
 entry:
   fence syncscope("workgroup-one-as") acquire, !mmra !{!"amdgpu-synchronize-as", !"local"}
@@ -422,6 +427,7 @@ define amdgpu_kernel void @workgroup_one_as_release_fence() {
 ;
 ; GFX1250-LABEL: workgroup_one_as_release_fence:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_endpgm
 entry:
   fence syncscope("workgroup-one-as") release, !mmra !{!"amdgpu-synchronize-as", !"local"}
@@ -483,6 +489,7 @@ define amdgpu_kernel void @workgroup_one_as_acq_rel_fence() {
 ;
 ; GFX1250-LABEL: workgroup_one_as_acq_rel_fence:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_endpgm
 entry:
   fence syncscope("workgroup-one-as") acq_rel, !mmra !{!"amdgpu-synchronize-as", !"local"}
@@ -544,6 +551,7 @@ define amdgpu_kernel void @workgroup_one_as_seq_cst_fence() {
 ;
 ; GFX1250-LABEL: workgroup_one_as_seq_cst_fence:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_endpgm
 entry:
   fence syncscope("workgroup-one-as") seq_cst, !mmra !{!"amdgpu-synchronize-as", !"local"}
@@ -616,6 +624,7 @@ define amdgpu_kernel void @agent_acquire_fence() {
 ;
 ; GFX1250-LABEL: agent_acquire_fence:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_wait_dscnt 0x0
 ; GFX1250-NEXT:    s_endpgm
 entry:
@@ -689,6 +698,7 @@ define amdgpu_kernel void @agent_release_fence() {
 ;
 ; GFX1250-LABEL: agent_release_fence:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_wait_dscnt 0x0
 ; GFX1250-NEXT:    s_endpgm
 entry:
@@ -762,6 +772,7 @@ define amdgpu_kernel void @agent_acq_rel_fence() {
 ;
 ; GFX1250-LABEL: agent_acq_rel_fence:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_wait_dscnt 0x0
 ; GFX1250-NEXT:    s_endpgm
 entry:
@@ -835,6 +846,7 @@ define amdgpu_kernel void @agent_seq_cst_fence() {
 ;
 ; GFX1250-LABEL: agent_seq_cst_fence:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_wait_dscnt 0x0
 ; GFX1250-NEXT:    s_endpgm
 entry:
@@ -897,6 +909,7 @@ define amdgpu_kernel void @agent_one_as_acquire_fence() {
 ;
 ; GFX1250-LABEL: agent_one_as_acquire_fence:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_endpgm
 entry:
   fence syncscope("agent-one-as") acquire, !mmra !{!"amdgpu-synchronize-as", !"local"}
@@ -958,6 +971,7 @@ define amdgpu_kernel void @agent_one_as_release_fence() {
 ;
 ; GFX1250-LABEL: agent_one_as_release_fence:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_endpgm
 entry:
   fence syncscope("agent-one-as") release, !mmra !{!"amdgpu-synchronize-as", !"local"}
@@ -1019,6 +1033,7 @@ define amdgpu_kernel void @agent_one_as_acq_rel_fence() {
 ;
 ; GFX1250-LABEL: agent_one_as_acq_rel_fence:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_endpgm
 entry:
   fence syncscope("agent-one-as") acq_rel, !mmra !{!"amdgpu-synchronize-as", !"local"}
@@ -1080,6 +1095,7 @@ define amdgpu_kernel void @agent_one_as_seq_cst_fence() {
 ;
 ; GFX1250-LABEL: agent_one_as_seq_cst_fence:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_endpgm
 entry:
   fence syncscope("agent-one-as") seq_cst, !mmra !{!"amdgpu-synchronize-as", !"local"}
@@ -1152,6 +1168,7 @@ define amdgpu_kernel void @system_acquire_fence() {
 ;
 ; GFX1250-LABEL: system_acquire_fence:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_wait_dscnt 0x0
 ; GFX1250-NEXT:    s_endpgm
 entry:
@@ -1225,6 +1242,7 @@ define amdgpu_kernel void @system_release_fence() {
 ;
 ; GFX1250-LABEL: system_release_fence:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_wait_dscnt 0x0
 ; GFX1250-NEXT:    s_endpgm
 entry:
@@ -1298,6 +1316,7 @@ define amdgpu_kernel void @system_acq_rel_fence() {
 ;
 ; GFX1250-LABEL: system_acq_rel_fence:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_wait_dscnt 0x0
 ; GFX1250-NEXT:    s_endpgm
 entry:
@@ -1371,6 +1390,7 @@ define amdgpu_kernel void @system_seq_cst_fence() {
 ;
 ; GFX1250-LABEL: system_seq_cst_fence:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_wait_dscnt 0x0
 ; GFX1250-NEXT:    s_endpgm
 entry:
@@ -1433,6 +1453,7 @@ define amdgpu_kernel void @system_one_as_acquire_fence() {
 ;
 ; GFX1250-LABEL: system_one_as_acquire_fence:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_endpgm
 entry:
   fence syncscope("one-as") acquire, !mmra !{!"amdgpu-synchronize-as", !"local"}
@@ -1494,6 +1515,7 @@ define amdgpu_kernel void @system_one_as_release_fence() {
 ;
 ; GFX1250-LABEL: system_one_as_release_fence:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_endpgm
 entry:
   fence syncscope("one-as") release, !mmra !{!"amdgpu-synchronize-as", !"local"}
@@ -1555,6 +1577,7 @@ define amdgpu_kernel void @system_one_as_acq_rel_fence() {
 ;
 ; GFX1250-LABEL: system_one_as_acq_rel_fence:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_endpgm
 entry:
   fence syncscope("one-as") acq_rel, !mmra !{!"amdgpu-synchronize-as", !"local"}
@@ -1616,6 +1639,7 @@ define amdgpu_kernel void @system_one_as_seq_cst_fence() {
 ;
 ; GFX1250-LABEL: system_one_as_seq_cst_fence:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_endpgm
 entry:
   fence syncscope("one-as") seq_cst, !mmra !{!"amdgpu-synchronize-as", !"local"}

diff  --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll
index 1cca64ad6d2b4..127788e5a4713 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll
@@ -69,6 +69,7 @@ define amdgpu_kernel void @singlethread_acquire_fence() {
 ;
 ; GFX1250-LABEL: singlethread_acquire_fence:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_endpgm
 entry:
   fence syncscope("singlethread") acquire
@@ -130,6 +131,7 @@ define amdgpu_kernel void @singlethread_release_fence() {
 ;
 ; GFX1250-LABEL: singlethread_release_fence:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_endpgm
 entry:
   fence syncscope("singlethread") release
@@ -191,6 +193,7 @@ define amdgpu_kernel void @singlethread_acq_rel_fence() {
 ;
 ; GFX1250-LABEL: singlethread_acq_rel_fence:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_endpgm
 entry:
   fence syncscope("singlethread") acq_rel
@@ -252,6 +255,7 @@ define amdgpu_kernel void @singlethread_seq_cst_fence() {
 ;
 ; GFX1250-LABEL: singlethread_seq_cst_fence:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_endpgm
 entry:
   fence syncscope("singlethread") seq_cst
@@ -313,6 +317,7 @@ define amdgpu_kernel void @singlethread_one_as_acquire_fence() {
 ;
 ; GFX1250-LABEL: singlethread_one_as_acquire_fence:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_endpgm
 entry:
   fence syncscope("singlethread-one-as") acquire
@@ -374,6 +379,7 @@ define amdgpu_kernel void @singlethread_one_as_release_fence() {
 ;
 ; GFX1250-LABEL: singlethread_one_as_release_fence:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_endpgm
 entry:
   fence syncscope("singlethread-one-as") release
@@ -435,6 +441,7 @@ define amdgpu_kernel void @singlethread_one_as_acq_rel_fence() {
 ;
 ; GFX1250-LABEL: singlethread_one_as_acq_rel_fence:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_endpgm
 entry:
   fence syncscope("singlethread-one-as") acq_rel
@@ -496,6 +503,7 @@ define amdgpu_kernel void @singlethread_one_as_seq_cst_fence() {
 ;
 ; GFX1250-LABEL: singlethread_one_as_seq_cst_fence:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_endpgm
 entry:
   fence syncscope("singlethread-one-as") seq_cst
@@ -557,6 +565,7 @@ define amdgpu_kernel void @wavefront_acquire_fence() {
 ;
 ; GFX1250-LABEL: wavefront_acquire_fence:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_endpgm
 entry:
   fence syncscope("wavefront") acquire
@@ -618,6 +627,7 @@ define amdgpu_kernel void @wavefront_release_fence() {
 ;
 ; GFX1250-LABEL: wavefront_release_fence:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_endpgm
 entry:
   fence syncscope("wavefront") release
@@ -679,6 +689,7 @@ define amdgpu_kernel void @wavefront_acq_rel_fence() {
 ;
 ; GFX1250-LABEL: wavefront_acq_rel_fence:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_endpgm
 entry:
   fence syncscope("wavefront") acq_rel
@@ -740,6 +751,7 @@ define amdgpu_kernel void @wavefront_seq_cst_fence() {
 ;
 ; GFX1250-LABEL: wavefront_seq_cst_fence:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_endpgm
 entry:
   fence syncscope("wavefront") seq_cst
@@ -801,6 +813,7 @@ define amdgpu_kernel void @wavefront_one_as_acquire_fence() {
 ;
 ; GFX1250-LABEL: wavefront_one_as_acquire_fence:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_endpgm
 entry:
   fence syncscope("wavefront-one-as") acquire
@@ -862,6 +875,7 @@ define amdgpu_kernel void @wavefront_one_as_release_fence() {
 ;
 ; GFX1250-LABEL: wavefront_one_as_release_fence:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_endpgm
 entry:
   fence syncscope("wavefront-one-as") release
@@ -923,6 +937,7 @@ define amdgpu_kernel void @wavefront_one_as_acq_rel_fence() {
 ;
 ; GFX1250-LABEL: wavefront_one_as_acq_rel_fence:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_endpgm
 entry:
   fence syncscope("wavefront-one-as") acq_rel
@@ -984,6 +999,7 @@ define amdgpu_kernel void @wavefront_one_as_seq_cst_fence() {
 ;
 ; GFX1250-LABEL: wavefront_one_as_seq_cst_fence:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_endpgm
 entry:
   fence syncscope("wavefront-one-as") seq_cst
@@ -1066,6 +1082,7 @@ define amdgpu_kernel void @workgroup_acquire_fence() {
 ;
 ; GFX1250-LABEL: workgroup_acquire_fence:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_wait_storecnt 0x0
 ; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    s_endpgm
@@ -1152,6 +1169,7 @@ define amdgpu_kernel void @workgroup_release_fence() {
 ;
 ; GFX1250-LABEL: workgroup_release_fence:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_wait_storecnt 0x0
 ; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    s_endpgm
@@ -1243,6 +1261,7 @@ define amdgpu_kernel void @workgroup_acq_rel_fence() {
 ;
 ; GFX1250-LABEL: workgroup_acq_rel_fence:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_wait_storecnt 0x0
 ; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    s_endpgm
@@ -1334,6 +1353,7 @@ define amdgpu_kernel void @workgroup_seq_cst_fence() {
 ;
 ; GFX1250-LABEL: workgroup_seq_cst_fence:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_wait_storecnt 0x0
 ; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    s_endpgm
@@ -1410,6 +1430,7 @@ define amdgpu_kernel void @workgroup_one_as_acquire_fence() {
 ;
 ; GFX1250-LABEL: workgroup_one_as_acquire_fence:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
 ; GFX1250-NEXT:    s_wait_storecnt 0x0
 ; GFX1250-NEXT:    s_endpgm
@@ -1491,6 +1512,7 @@ define amdgpu_kernel void @workgroup_one_as_release_fence() {
 ;
 ; GFX1250-LABEL: workgroup_one_as_release_fence:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
 ; GFX1250-NEXT:    s_wait_storecnt 0x0
 ; GFX1250-NEXT:    s_endpgm
@@ -1577,6 +1599,7 @@ define amdgpu_kernel void @workgroup_one_as_acq_rel_fence() {
 ;
 ; GFX1250-LABEL: workgroup_one_as_acq_rel_fence:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
 ; GFX1250-NEXT:    s_wait_storecnt 0x0
 ; GFX1250-NEXT:    s_endpgm
@@ -1663,6 +1686,7 @@ define amdgpu_kernel void @workgroup_one_as_seq_cst_fence() {
 ;
 ; GFX1250-LABEL: workgroup_one_as_seq_cst_fence:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
 ; GFX1250-NEXT:    s_wait_storecnt 0x0
 ; GFX1250-NEXT:    s_endpgm
@@ -1762,6 +1786,7 @@ define amdgpu_kernel void @cluster_acquire_fence() {
 ;
 ; GFX1250-LABEL: cluster_acquire_fence:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_wait_storecnt 0x0
 ; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    global_inv scope:SCOPE_SE
@@ -1851,6 +1876,7 @@ define amdgpu_kernel void @cluster_release_fence() {
 ;
 ; GFX1250-LABEL: cluster_release_fence:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_wait_storecnt 0x0
 ; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    s_endpgm
@@ -1955,6 +1981,7 @@ define amdgpu_kernel void @cluster_acq_rel_fence() {
 ;
 ; GFX1250-LABEL: cluster_acq_rel_fence:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_wait_storecnt 0x0
 ; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    global_inv scope:SCOPE_SE
@@ -2060,6 +2087,7 @@ define amdgpu_kernel void @cluster_seq_cst_fence() {
 ;
 ; GFX1250-LABEL: cluster_seq_cst_fence:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_wait_storecnt 0x0
 ; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    global_inv scope:SCOPE_SE
@@ -2159,6 +2187,7 @@ define amdgpu_kernel void @cluster_one_as_acquire_fence() {
 ;
 ; GFX1250-LABEL: cluster_one_as_acquire_fence:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
 ; GFX1250-NEXT:    s_wait_storecnt 0x0
 ; GFX1250-NEXT:    global_inv scope:SCOPE_SE
@@ -2248,6 +2277,7 @@ define amdgpu_kernel void @cluster_one_as_release_fence() {
 ;
 ; GFX1250-LABEL: cluster_one_as_release_fence:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
 ; GFX1250-NEXT:    s_wait_storecnt 0x0
 ; GFX1250-NEXT:    s_endpgm
@@ -2352,6 +2382,7 @@ define amdgpu_kernel void @cluster_one_as_acq_rel_fence() {
 ;
 ; GFX1250-LABEL: cluster_one_as_acq_rel_fence:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
 ; GFX1250-NEXT:    s_wait_storecnt 0x0
 ; GFX1250-NEXT:    global_inv scope:SCOPE_SE
@@ -2457,6 +2488,7 @@ define amdgpu_kernel void @cluster_one_as_seq_cst_fence() {
 ;
 ; GFX1250-LABEL: cluster_one_as_seq_cst_fence:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
 ; GFX1250-NEXT:    s_wait_storecnt 0x0
 ; GFX1250-NEXT:    global_inv scope:SCOPE_SE
@@ -2556,6 +2588,7 @@ define amdgpu_kernel void @agent_acquire_fence() {
 ;
 ; GFX1250-LABEL: agent_acquire_fence:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_wait_storecnt 0x0
 ; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    global_inv scope:SCOPE_DEV
@@ -2645,6 +2678,7 @@ define amdgpu_kernel void @agent_release_fence() {
 ;
 ; GFX1250-LABEL: agent_release_fence:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    global_wb scope:SCOPE_DEV
 ; GFX1250-NEXT:    s_wait_storecnt 0x0
 ; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
@@ -2750,6 +2784,7 @@ define amdgpu_kernel void @agent_acq_rel_fence() {
 ;
 ; GFX1250-LABEL: agent_acq_rel_fence:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    global_wb scope:SCOPE_DEV
 ; GFX1250-NEXT:    s_wait_storecnt 0x0
 ; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
@@ -2856,6 +2891,7 @@ define amdgpu_kernel void @agent_seq_cst_fence() {
 ;
 ; GFX1250-LABEL: agent_seq_cst_fence:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    global_wb scope:SCOPE_DEV
 ; GFX1250-NEXT:    s_wait_storecnt 0x0
 ; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
@@ -2956,6 +2992,7 @@ define amdgpu_kernel void @agent_one_as_acquire_fence() {
 ;
 ; GFX1250-LABEL: agent_one_as_acquire_fence:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
 ; GFX1250-NEXT:    s_wait_storecnt 0x0
 ; GFX1250-NEXT:    global_inv scope:SCOPE_DEV
@@ -3045,6 +3082,7 @@ define amdgpu_kernel void @agent_one_as_release_fence() {
 ;
 ; GFX1250-LABEL: agent_one_as_release_fence:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    global_wb scope:SCOPE_DEV
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
 ; GFX1250-NEXT:    s_wait_storecnt 0x0
@@ -3150,6 +3188,7 @@ define amdgpu_kernel void @agent_one_as_acq_rel_fence() {
 ;
 ; GFX1250-LABEL: agent_one_as_acq_rel_fence:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    global_wb scope:SCOPE_DEV
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
 ; GFX1250-NEXT:    s_wait_storecnt 0x0
@@ -3256,6 +3295,7 @@ define amdgpu_kernel void @agent_one_as_seq_cst_fence() {
 ;
 ; GFX1250-LABEL: agent_one_as_seq_cst_fence:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    global_wb scope:SCOPE_DEV
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
 ; GFX1250-NEXT:    s_wait_storecnt 0x0
@@ -3358,6 +3398,7 @@ define amdgpu_kernel void @system_acquire_fence() {
 ;
 ; GFX1250-LABEL: system_acquire_fence:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_wait_storecnt 0x0
 ; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-NEXT:    global_inv scope:SCOPE_SYS
@@ -3451,6 +3492,7 @@ define amdgpu_kernel void @system_release_fence() {
 ;
 ; GFX1250-LABEL: system_release_fence:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    global_wb scope:SCOPE_SYS
 ; GFX1250-NEXT:    s_wait_storecnt 0x0
 ; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
@@ -3562,6 +3604,7 @@ define amdgpu_kernel void @system_acq_rel_fence() {
 ;
 ; GFX1250-LABEL: system_acq_rel_fence:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    global_wb scope:SCOPE_SYS
 ; GFX1250-NEXT:    s_wait_storecnt 0x0
 ; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
@@ -3674,6 +3717,7 @@ define amdgpu_kernel void @system_seq_cst_fence() {
 ;
 ; GFX1250-LABEL: system_seq_cst_fence:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    global_wb scope:SCOPE_SYS
 ; GFX1250-NEXT:    s_wait_storecnt 0x0
 ; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
@@ -3776,6 +3820,7 @@ define amdgpu_kernel void @system_one_as_acquire_fence() {
 ;
 ; GFX1250-LABEL: system_one_as_acquire_fence:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
 ; GFX1250-NEXT:    s_wait_storecnt 0x0
 ; GFX1250-NEXT:    global_inv scope:SCOPE_SYS
@@ -3869,6 +3914,7 @@ define amdgpu_kernel void @system_one_as_release_fence() {
 ;
 ; GFX1250-LABEL: system_one_as_release_fence:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    global_wb scope:SCOPE_SYS
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
 ; GFX1250-NEXT:    s_wait_storecnt 0x0
@@ -3980,6 +4026,7 @@ define amdgpu_kernel void @system_one_as_acq_rel_fence() {
 ;
 ; GFX1250-LABEL: system_one_as_acq_rel_fence:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    global_wb scope:SCOPE_SYS
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
 ; GFX1250-NEXT:    s_wait_storecnt 0x0
@@ -4092,6 +4139,7 @@ define amdgpu_kernel void @system_one_as_seq_cst_fence() {
 ;
 ; GFX1250-LABEL: system_one_as_seq_cst_fence:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    global_wb scope:SCOPE_SYS
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
 ; GFX1250-NEXT:    s_wait_storecnt 0x0

diff  --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll
index bf516f8b91c91..01c60243c5163 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll
@@ -191,6 +191,7 @@ define amdgpu_kernel void @flat_agent_unordered_load(
 ;
 ; GFX1250-LABEL: flat_agent_unordered_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
@@ -384,6 +385,7 @@ define amdgpu_kernel void @flat_agent_monotonic_load(
 ;
 ; GFX1250-LABEL: flat_agent_monotonic_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
@@ -592,6 +594,7 @@ define amdgpu_kernel void @flat_agent_acquire_load(
 ;
 ; GFX1250-LABEL: flat_agent_acquire_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
@@ -823,6 +826,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_load(
 ;
 ; GFX1250-LABEL: flat_agent_seq_cst_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
@@ -987,6 +991,7 @@ define amdgpu_kernel void @flat_agent_unordered_store(
 ;
 ; GFX1250-LABEL: flat_agent_unordered_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
@@ -1147,6 +1152,7 @@ define amdgpu_kernel void @flat_agent_monotonic_store(
 ;
 ; GFX1250-LABEL: flat_agent_monotonic_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
@@ -1331,6 +1337,7 @@ define amdgpu_kernel void @flat_agent_release_store(
 ;
 ; GFX1250-LABEL: flat_agent_release_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
@@ -1518,6 +1525,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_store(
 ;
 ; GFX1250-LABEL: flat_agent_seq_cst_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
@@ -1681,6 +1689,7 @@ define amdgpu_kernel void @flat_agent_monotonic_atomicrmw(
 ;
 ; GFX1250-LABEL: flat_agent_monotonic_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -1872,6 +1881,7 @@ define amdgpu_kernel void @flat_agent_acquire_atomicrmw(
 ;
 ; GFX1250-LABEL: flat_agent_acquire_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -2058,6 +2068,7 @@ define amdgpu_kernel void @flat_agent_release_atomicrmw(
 ;
 ; GFX1250-LABEL: flat_agent_release_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -2276,6 +2287,7 @@ define amdgpu_kernel void @flat_agent_acq_rel_atomicrmw(
 ;
 ; GFX1250-LABEL: flat_agent_acq_rel_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -2496,6 +2508,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_atomicrmw(
 ;
 ; GFX1250-LABEL: flat_agent_seq_cst_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -2720,6 +2733,7 @@ define amdgpu_kernel void @flat_agent_acquire_ret_atomicrmw(
 ;
 ; GFX1250-LABEL: flat_agent_acquire_ret_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -2967,6 +2981,7 @@ define amdgpu_kernel void @flat_agent_acq_rel_ret_atomicrmw(
 ;
 ; GFX1250-LABEL: flat_agent_acq_rel_ret_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -3217,6 +3232,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_ret_atomicrmw(
 ;
 ; GFX1250-LABEL: flat_agent_seq_cst_ret_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -3472,6 +3488,7 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_agent_monotonic_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -3756,6 +3773,7 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_agent_acquire_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -4035,6 +4053,7 @@ define amdgpu_kernel void @flat_agent_release_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_agent_release_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -4346,6 +4365,7 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_agent_acq_rel_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -4659,6 +4679,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_agent_seq_cst_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -4948,6 +4969,7 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_agent_monotonic_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -5234,6 +5256,7 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_agent_acquire_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -5544,6 +5567,7 @@ define amdgpu_kernel void @flat_agent_release_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_agent_release_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -5857,6 +5881,7 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_agent_acq_rel_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -6170,6 +6195,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_agent_seq_cst_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -6483,6 +6509,7 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_agent_monotonic_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -6796,6 +6823,7 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_agent_acquire_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -7109,6 +7137,7 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_agent_release_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -7422,6 +7451,7 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_agent_acq_rel_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -7735,6 +7765,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_agent_seq_cst_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -8037,6 +8068,7 @@ define amdgpu_kernel void @flat_agent_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_agent_monotonic_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -8353,6 +8385,7 @@ define amdgpu_kernel void @flat_agent_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -8679,6 +8712,7 @@ define amdgpu_kernel void @flat_agent_release_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_agent_release_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -9022,6 +9056,7 @@ define amdgpu_kernel void @flat_agent_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -9366,6 +9401,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -9686,6 +9722,7 @@ define amdgpu_kernel void @flat_agent_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_agent_monotonic_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -10003,6 +10040,7 @@ define amdgpu_kernel void @flat_agent_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_agent_acquire_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -10344,6 +10382,7 @@ define amdgpu_kernel void @flat_agent_release_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_agent_release_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -10688,6 +10727,7 @@ define amdgpu_kernel void @flat_agent_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -11032,6 +11072,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -11376,6 +11417,7 @@ define amdgpu_kernel void @flat_agent_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_agent_monotonic_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -11720,6 +11762,7 @@ define amdgpu_kernel void @flat_agent_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_agent_acquire_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -12064,6 +12107,7 @@ define amdgpu_kernel void @flat_agent_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_agent_release_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -12408,6 +12452,7 @@ define amdgpu_kernel void @flat_agent_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_agent_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -12752,6 +12797,7 @@ define amdgpu_kernel void @flat_agent_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -12957,6 +13003,7 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_load(
 ;
 ; GFX1250-LABEL: flat_agent_one_as_unordered_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
@@ -13150,6 +13197,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_load(
 ;
 ; GFX1250-LABEL: flat_agent_one_as_monotonic_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
@@ -13368,6 +13416,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_load(
 ;
 ; GFX1250-LABEL: flat_agent_one_as_acquire_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
@@ -13610,6 +13659,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_load(
 ;
 ; GFX1250-LABEL: flat_agent_one_as_seq_cst_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
@@ -13775,6 +13825,7 @@ define amdgpu_kernel void @flat_agent_one_as_unordered_store(
 ;
 ; GFX1250-LABEL: flat_agent_one_as_unordered_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
@@ -13935,6 +13986,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_store(
 ;
 ; GFX1250-LABEL: flat_agent_one_as_monotonic_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
@@ -14119,6 +14171,7 @@ define amdgpu_kernel void @flat_agent_one_as_release_store(
 ;
 ; GFX1250-LABEL: flat_agent_one_as_release_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
@@ -14306,6 +14359,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_store(
 ;
 ; GFX1250-LABEL: flat_agent_one_as_seq_cst_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
@@ -14469,6 +14523,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_atomicrmw(
 ;
 ; GFX1250-LABEL: flat_agent_one_as_monotonic_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -14656,6 +14711,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_atomicrmw(
 ;
 ; GFX1250-LABEL: flat_agent_one_as_acquire_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -14842,6 +14898,7 @@ define amdgpu_kernel void @flat_agent_one_as_release_atomicrmw(
 ;
 ; GFX1250-LABEL: flat_agent_one_as_release_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -15056,6 +15113,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_atomicrmw(
 ;
 ; GFX1250-LABEL: flat_agent_one_as_acq_rel_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -15272,6 +15330,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_atomicrmw(
 ;
 ; GFX1250-LABEL: flat_agent_one_as_seq_cst_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -15506,6 +15565,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_ret_atomicrmw(
 ;
 ; GFX1250-LABEL: flat_agent_one_as_acquire_ret_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -15764,6 +15824,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_ret_atomicrmw(
 ;
 ; GFX1250-LABEL: flat_agent_one_as_acq_rel_ret_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -16025,6 +16086,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_ret_atomicrmw(
 ;
 ; GFX1250-LABEL: flat_agent_one_as_seq_cst_ret_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -16281,6 +16343,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -16561,6 +16624,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_agent_one_as_acquire_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -16840,6 +16904,7 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_agent_one_as_release_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -17147,6 +17212,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_agent_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -17456,6 +17522,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_agent_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -17741,6 +17808,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_agent_one_as_monotonic_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -18023,6 +18091,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_agent_one_as_acquire_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -18329,6 +18398,7 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_agent_one_as_release_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -18638,6 +18708,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_agent_one_as_acq_rel_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -18947,6 +19018,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_agent_one_as_seq_cst_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -19256,6 +19328,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_agent_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -19565,6 +19638,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_agent_one_as_acquire_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -19874,6 +19948,7 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_agent_one_as_release_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -20183,6 +20258,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_agent_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -20492,6 +20568,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_agent_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -20794,6 +20871,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_agent_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -21120,6 +21198,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -21447,6 +21526,7 @@ define amdgpu_kernel void @flat_agent_one_as_release_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_agent_one_as_release_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -21800,6 +21880,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -22155,6 +22236,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -22486,6 +22568,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_agent_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -22814,6 +22897,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -23166,6 +23250,7 @@ define amdgpu_kernel void @flat_agent_one_as_release_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_agent_one_as_release_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -23521,6 +23606,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -23876,6 +23962,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -24231,6 +24318,7 @@ define amdgpu_kernel void @flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -24586,6 +24674,7 @@ define amdgpu_kernel void @flat_agent_one_as_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_agent_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -24941,6 +25030,7 @@ define amdgpu_kernel void @flat_agent_one_as_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_agent_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -25296,6 +25386,7 @@ define amdgpu_kernel void @flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -25651,6 +25742,7 @@ define amdgpu_kernel void @flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8

diff  --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-cluster.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-cluster.ll
index b2b71c246c97b..c4d1bb8de3d8a 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-cluster.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-cluster.ll
@@ -191,6 +191,7 @@ define amdgpu_kernel void @flat_cluster_unordered_load(
 ;
 ; GFX1250-LABEL: flat_cluster_unordered_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
@@ -384,6 +385,7 @@ define amdgpu_kernel void @flat_cluster_monotonic_load(
 ;
 ; GFX1250-LABEL: flat_cluster_monotonic_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
@@ -592,6 +594,7 @@ define amdgpu_kernel void @flat_cluster_acquire_load(
 ;
 ; GFX1250-LABEL: flat_cluster_acquire_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
@@ -823,6 +826,7 @@ define amdgpu_kernel void @flat_cluster_seq_cst_load(
 ;
 ; GFX1250-LABEL: flat_cluster_seq_cst_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
@@ -987,6 +991,7 @@ define amdgpu_kernel void @flat_cluster_unordered_store(
 ;
 ; GFX1250-LABEL: flat_cluster_unordered_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
@@ -1147,6 +1152,7 @@ define amdgpu_kernel void @flat_cluster_monotonic_store(
 ;
 ; GFX1250-LABEL: flat_cluster_monotonic_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
@@ -1331,6 +1337,7 @@ define amdgpu_kernel void @flat_cluster_release_store(
 ;
 ; GFX1250-LABEL: flat_cluster_release_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
@@ -1517,6 +1524,7 @@ define amdgpu_kernel void @flat_cluster_seq_cst_store(
 ;
 ; GFX1250-LABEL: flat_cluster_seq_cst_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
@@ -1679,6 +1687,7 @@ define amdgpu_kernel void @flat_cluster_monotonic_atomicrmw(
 ;
 ; GFX1250-LABEL: flat_cluster_monotonic_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -1870,6 +1879,7 @@ define amdgpu_kernel void @flat_cluster_acquire_atomicrmw(
 ;
 ; GFX1250-LABEL: flat_cluster_acquire_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -2056,6 +2066,7 @@ define amdgpu_kernel void @flat_cluster_release_atomicrmw(
 ;
 ; GFX1250-LABEL: flat_cluster_release_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -2273,6 +2284,7 @@ define amdgpu_kernel void @flat_cluster_acq_rel_atomicrmw(
 ;
 ; GFX1250-LABEL: flat_cluster_acq_rel_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -2492,6 +2504,7 @@ define amdgpu_kernel void @flat_cluster_seq_cst_atomicrmw(
 ;
 ; GFX1250-LABEL: flat_cluster_seq_cst_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -2715,6 +2728,7 @@ define amdgpu_kernel void @flat_cluster_acquire_ret_atomicrmw(
 ;
 ; GFX1250-LABEL: flat_cluster_acquire_ret_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -2962,6 +2976,7 @@ define amdgpu_kernel void @flat_cluster_acq_rel_ret_atomicrmw(
 ;
 ; GFX1250-LABEL: flat_cluster_acq_rel_ret_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -3211,6 +3226,7 @@ define amdgpu_kernel void @flat_cluster_seq_cst_ret_atomicrmw(
 ;
 ; GFX1250-LABEL: flat_cluster_seq_cst_ret_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -3465,6 +3481,7 @@ define amdgpu_kernel void @flat_cluster_monotonic_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_cluster_monotonic_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -3749,6 +3766,7 @@ define amdgpu_kernel void @flat_cluster_acquire_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_cluster_acquire_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -4028,6 +4046,7 @@ define amdgpu_kernel void @flat_cluster_release_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_cluster_release_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -4338,6 +4357,7 @@ define amdgpu_kernel void @flat_cluster_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_cluster_acq_rel_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -4650,6 +4670,7 @@ define amdgpu_kernel void @flat_cluster_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_cluster_seq_cst_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -4938,6 +4959,7 @@ define amdgpu_kernel void @flat_cluster_monotonic_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_cluster_monotonic_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -5224,6 +5246,7 @@ define amdgpu_kernel void @flat_cluster_acquire_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_cluster_acquire_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -5534,6 +5557,7 @@ define amdgpu_kernel void @flat_cluster_release_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_cluster_release_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -5846,6 +5870,7 @@ define amdgpu_kernel void @flat_cluster_acq_rel_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_cluster_acq_rel_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -6158,6 +6183,7 @@ define amdgpu_kernel void @flat_cluster_seq_cst_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_cluster_seq_cst_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -6470,6 +6496,7 @@ define amdgpu_kernel void @flat_cluster_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_cluster_monotonic_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -6782,6 +6809,7 @@ define amdgpu_kernel void @flat_cluster_acquire_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_cluster_acquire_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -7094,6 +7122,7 @@ define amdgpu_kernel void @flat_cluster_release_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_cluster_release_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -7406,6 +7435,7 @@ define amdgpu_kernel void @flat_cluster_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_cluster_acq_rel_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -7718,6 +7748,7 @@ define amdgpu_kernel void @flat_cluster_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_cluster_seq_cst_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -8019,6 +8050,7 @@ define amdgpu_kernel void @flat_cluster_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_cluster_monotonic_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -8335,6 +8367,7 @@ define amdgpu_kernel void @flat_cluster_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_cluster_acquire_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -8661,6 +8694,7 @@ define amdgpu_kernel void @flat_cluster_release_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_cluster_release_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -9003,6 +9037,7 @@ define amdgpu_kernel void @flat_cluster_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_cluster_acq_rel_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -9346,6 +9381,7 @@ define amdgpu_kernel void @flat_cluster_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_cluster_seq_cst_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -9665,6 +9701,7 @@ define amdgpu_kernel void @flat_cluster_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_cluster_monotonic_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -9982,6 +10019,7 @@ define amdgpu_kernel void @flat_cluster_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_cluster_acquire_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -10323,6 +10361,7 @@ define amdgpu_kernel void @flat_cluster_release_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_cluster_release_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -10666,6 +10705,7 @@ define amdgpu_kernel void @flat_cluster_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_cluster_acq_rel_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -11009,6 +11049,7 @@ define amdgpu_kernel void @flat_cluster_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_cluster_seq_cst_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -11352,6 +11393,7 @@ define amdgpu_kernel void @flat_cluster_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_cluster_monotonic_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -11695,6 +11737,7 @@ define amdgpu_kernel void @flat_cluster_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_cluster_acquire_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -12038,6 +12081,7 @@ define amdgpu_kernel void @flat_cluster_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_cluster_release_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -12381,6 +12425,7 @@ define amdgpu_kernel void @flat_cluster_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_cluster_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -12724,6 +12769,7 @@ define amdgpu_kernel void @flat_cluster_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_cluster_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -12928,6 +12974,7 @@ define amdgpu_kernel void @flat_cluster_one_as_unordered_load(
 ;
 ; GFX1250-LABEL: flat_cluster_one_as_unordered_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
@@ -13121,6 +13168,7 @@ define amdgpu_kernel void @flat_cluster_one_as_monotonic_load(
 ;
 ; GFX1250-LABEL: flat_cluster_one_as_monotonic_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
@@ -13339,6 +13387,7 @@ define amdgpu_kernel void @flat_cluster_one_as_acquire_load(
 ;
 ; GFX1250-LABEL: flat_cluster_one_as_acquire_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
@@ -13581,6 +13630,7 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_load(
 ;
 ; GFX1250-LABEL: flat_cluster_one_as_seq_cst_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
@@ -13746,6 +13796,7 @@ define amdgpu_kernel void @flat_cluster_one_as_unordered_store(
 ;
 ; GFX1250-LABEL: flat_cluster_one_as_unordered_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
@@ -13906,6 +13957,7 @@ define amdgpu_kernel void @flat_cluster_one_as_monotonic_store(
 ;
 ; GFX1250-LABEL: flat_cluster_one_as_monotonic_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
@@ -14090,6 +14142,7 @@ define amdgpu_kernel void @flat_cluster_one_as_release_store(
 ;
 ; GFX1250-LABEL: flat_cluster_one_as_release_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
@@ -14276,6 +14329,7 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_store(
 ;
 ; GFX1250-LABEL: flat_cluster_one_as_seq_cst_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
@@ -14438,6 +14492,7 @@ define amdgpu_kernel void @flat_cluster_one_as_monotonic_atomicrmw(
 ;
 ; GFX1250-LABEL: flat_cluster_one_as_monotonic_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -14625,6 +14680,7 @@ define amdgpu_kernel void @flat_cluster_one_as_acquire_atomicrmw(
 ;
 ; GFX1250-LABEL: flat_cluster_one_as_acquire_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -14811,6 +14867,7 @@ define amdgpu_kernel void @flat_cluster_one_as_release_atomicrmw(
 ;
 ; GFX1250-LABEL: flat_cluster_one_as_release_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -15024,6 +15081,7 @@ define amdgpu_kernel void @flat_cluster_one_as_acq_rel_atomicrmw(
 ;
 ; GFX1250-LABEL: flat_cluster_one_as_acq_rel_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -15239,6 +15297,7 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_atomicrmw(
 ;
 ; GFX1250-LABEL: flat_cluster_one_as_seq_cst_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -15472,6 +15531,7 @@ define amdgpu_kernel void @flat_cluster_one_as_acquire_ret_atomicrmw(
 ;
 ; GFX1250-LABEL: flat_cluster_one_as_acquire_ret_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -15730,6 +15790,7 @@ define amdgpu_kernel void @flat_cluster_one_as_acq_rel_ret_atomicrmw(
 ;
 ; GFX1250-LABEL: flat_cluster_one_as_acq_rel_ret_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -15990,6 +16051,7 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_ret_atomicrmw(
 ;
 ; GFX1250-LABEL: flat_cluster_one_as_seq_cst_ret_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -16245,6 +16307,7 @@ define amdgpu_kernel void @flat_cluster_one_as_monotonic_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_cluster_one_as_monotonic_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -16525,6 +16588,7 @@ define amdgpu_kernel void @flat_cluster_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_cluster_one_as_acquire_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -16804,6 +16868,7 @@ define amdgpu_kernel void @flat_cluster_one_as_release_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_cluster_one_as_release_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -17110,6 +17175,7 @@ define amdgpu_kernel void @flat_cluster_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_cluster_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -17418,6 +17484,7 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_cluster_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -17702,6 +17769,7 @@ define amdgpu_kernel void @flat_cluster_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_cluster_one_as_monotonic_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -17984,6 +18052,7 @@ define amdgpu_kernel void @flat_cluster_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_cluster_one_as_acquire_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -18290,6 +18359,7 @@ define amdgpu_kernel void @flat_cluster_one_as_release_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_cluster_one_as_release_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -18598,6 +18668,7 @@ define amdgpu_kernel void @flat_cluster_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_cluster_one_as_acq_rel_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -18906,6 +18977,7 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_cluster_one_as_seq_cst_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -19214,6 +19286,7 @@ define amdgpu_kernel void @flat_cluster_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_cluster_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -19522,6 +19595,7 @@ define amdgpu_kernel void @flat_cluster_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_cluster_one_as_acquire_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -19830,6 +19904,7 @@ define amdgpu_kernel void @flat_cluster_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_cluster_one_as_release_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -20138,6 +20213,7 @@ define amdgpu_kernel void @flat_cluster_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_cluster_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -20446,6 +20522,7 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_cluster_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -20747,6 +20824,7 @@ define amdgpu_kernel void @flat_cluster_one_as_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_cluster_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -21073,6 +21151,7 @@ define amdgpu_kernel void @flat_cluster_one_as_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_cluster_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -21400,6 +21479,7 @@ define amdgpu_kernel void @flat_cluster_one_as_release_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_cluster_one_as_release_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -21752,6 +21832,7 @@ define amdgpu_kernel void @flat_cluster_one_as_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_cluster_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -22106,6 +22187,7 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_cluster_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -22436,6 +22518,7 @@ define amdgpu_kernel void @flat_cluster_one_as_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_cluster_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -22764,6 +22847,7 @@ define amdgpu_kernel void @flat_cluster_one_as_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_cluster_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -23116,6 +23200,7 @@ define amdgpu_kernel void @flat_cluster_one_as_release_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_cluster_one_as_release_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -23470,6 +23555,7 @@ define amdgpu_kernel void @flat_cluster_one_as_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_cluster_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -23824,6 +23910,7 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_cluster_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -24178,6 +24265,7 @@ define amdgpu_kernel void @flat_cluster_one_as_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_cluster_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -24532,6 +24620,7 @@ define amdgpu_kernel void @flat_cluster_one_as_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_cluster_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -24886,6 +24975,7 @@ define amdgpu_kernel void @flat_cluster_one_as_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_cluster_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -25240,6 +25330,7 @@ define amdgpu_kernel void @flat_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -25594,6 +25685,7 @@ define amdgpu_kernel void @flat_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8

diff  --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-lastuse.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-lastuse.ll
index bdde7c0975425..5de7d4325c8c8 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-lastuse.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-lastuse.ll
@@ -20,6 +20,7 @@ define amdgpu_kernel void @flat_last_use_load_0(ptr %in, ptr %out) {
 ;
 ; GFX1250-LABEL: flat_last_use_load_0:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
@@ -68,6 +69,7 @@ define amdgpu_kernel void @flat_last_use_load_1(ptr %in, ptr %out) {
 ;
 ; GFX1250-LABEL: flat_last_use_load_1:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX1250-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
@@ -108,6 +110,7 @@ define amdgpu_kernel void @flat_last_use_and_volatile_load(ptr %in, ptr %out) {
 ;
 ; GFX1250-LABEL: flat_last_use_and_volatile_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
@@ -139,6 +142,7 @@ define amdgpu_kernel void @flat_last_use_and_nontemporal_load(ptr %in, ptr %out)
 ;
 ; GFX1250-LABEL: flat_last_use_and_nontemporal_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8

diff  --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll
index a2a8ce75d7fb4..c4bd4e1cf4337 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll
@@ -191,6 +191,7 @@ define amdgpu_kernel void @flat_nontemporal_load_0(
 ;
 ; GFX1250-LABEL: flat_nontemporal_load_0:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
@@ -546,6 +547,7 @@ define amdgpu_kernel void @flat_nontemporal_load_1(
 ;
 ; GFX1250-LABEL: flat_nontemporal_load_1:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX1250-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
@@ -745,6 +747,7 @@ define amdgpu_kernel void @flat_nontemporal_store_0(
 ;
 ; GFX1250-LABEL: flat_nontemporal_store_0:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
@@ -1088,6 +1091,7 @@ define amdgpu_kernel void @flat_nontemporal_store_1(
 ;
 ; GFX1250-LABEL: flat_nontemporal_store_1:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1250-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
@@ -1300,6 +1304,7 @@ define amdgpu_kernel void @flat_nontemporal_volatile_load(
 ;
 ; GFX1250-LABEL: flat_nontemporal_volatile_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8

diff  --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll
index 7d357922ac307..aa314f74eb74c 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll
@@ -191,6 +191,7 @@ define amdgpu_kernel void @flat_singlethread_unordered_load(
 ;
 ; GFX1250-LABEL: flat_singlethread_unordered_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
@@ -384,6 +385,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_load(
 ;
 ; GFX1250-LABEL: flat_singlethread_monotonic_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
@@ -577,6 +579,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_load(
 ;
 ; GFX1250-LABEL: flat_singlethread_acquire_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
@@ -770,6 +773,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_load(
 ;
 ; GFX1250-LABEL: flat_singlethread_seq_cst_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
@@ -931,6 +935,7 @@ define amdgpu_kernel void @flat_singlethread_unordered_store(
 ;
 ; GFX1250-LABEL: flat_singlethread_unordered_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
@@ -1091,6 +1096,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_store(
 ;
 ; GFX1250-LABEL: flat_singlethread_monotonic_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
@@ -1251,6 +1257,7 @@ define amdgpu_kernel void @flat_singlethread_release_store(
 ;
 ; GFX1250-LABEL: flat_singlethread_release_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
@@ -1411,6 +1418,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_store(
 ;
 ; GFX1250-LABEL: flat_singlethread_seq_cst_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
@@ -1571,6 +1579,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_atomicrmw(
 ;
 ; GFX1250-LABEL: flat_singlethread_monotonic_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -1731,6 +1740,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_atomicrmw(
 ;
 ; GFX1250-LABEL: flat_singlethread_acquire_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -1891,6 +1901,7 @@ define amdgpu_kernel void @flat_singlethread_release_atomicrmw(
 ;
 ; GFX1250-LABEL: flat_singlethread_release_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -2051,6 +2062,7 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_atomicrmw(
 ;
 ; GFX1250-LABEL: flat_singlethread_acq_rel_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -2211,6 +2223,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_atomicrmw(
 ;
 ; GFX1250-LABEL: flat_singlethread_seq_cst_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -2415,6 +2428,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_ret_atomicrmw(
 ;
 ; GFX1250-LABEL: flat_singlethread_acquire_ret_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -2622,6 +2636,7 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_ret_atomicrmw(
 ;
 ; GFX1250-LABEL: flat_singlethread_acq_rel_ret_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -2829,6 +2844,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_ret_atomicrmw(
 ;
 ; GFX1250-LABEL: flat_singlethread_seq_cst_ret_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -3080,6 +3096,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -3333,6 +3350,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_singlethread_acquire_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -3586,6 +3604,7 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_singlethread_release_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -3839,6 +3858,7 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -4092,6 +4112,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -4345,6 +4366,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_singlethread_monotonic_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -4598,6 +4620,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_singlethread_acquire_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -4851,6 +4874,7 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_singlethread_release_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -5104,6 +5128,7 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -5357,6 +5382,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -5610,6 +5636,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -5863,6 +5890,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -6116,6 +6144,7 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_singlethread_release_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -6369,6 +6398,7 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -6622,6 +6652,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -6919,6 +6950,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -7220,6 +7252,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -7521,6 +7554,7 @@ define amdgpu_kernel void @flat_singlethread_release_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -7822,6 +7856,7 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -8123,6 +8158,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -8424,6 +8460,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -8725,6 +8762,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -9026,6 +9064,7 @@ define amdgpu_kernel void @flat_singlethread_release_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_singlethread_release_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -9327,6 +9366,7 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -9628,6 +9668,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -9929,6 +9970,7 @@ define amdgpu_kernel void @flat_singlethread_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -10230,6 +10272,7 @@ define amdgpu_kernel void @flat_singlethread_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -10531,6 +10574,7 @@ define amdgpu_kernel void @flat_singlethread_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -10832,6 +10876,7 @@ define amdgpu_kernel void @flat_singlethread_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -11133,6 +11178,7 @@ define amdgpu_kernel void @flat_singlethread_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -11334,6 +11380,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_load(
 ;
 ; GFX1250-LABEL: flat_singlethread_one_as_unordered_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
@@ -11527,6 +11574,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_load(
 ;
 ; GFX1250-LABEL: flat_singlethread_one_as_monotonic_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
@@ -11720,6 +11768,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_load(
 ;
 ; GFX1250-LABEL: flat_singlethread_one_as_acquire_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
@@ -11913,6 +11962,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_load(
 ;
 ; GFX1250-LABEL: flat_singlethread_one_as_seq_cst_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
@@ -12074,6 +12124,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_unordered_store(
 ;
 ; GFX1250-LABEL: flat_singlethread_one_as_unordered_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
@@ -12234,6 +12285,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_store(
 ;
 ; GFX1250-LABEL: flat_singlethread_one_as_monotonic_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
@@ -12394,6 +12446,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_store(
 ;
 ; GFX1250-LABEL: flat_singlethread_one_as_release_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
@@ -12554,6 +12607,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_store(
 ;
 ; GFX1250-LABEL: flat_singlethread_one_as_seq_cst_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
@@ -12714,6 +12768,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_atomicrmw(
 ;
 ; GFX1250-LABEL: flat_singlethread_one_as_monotonic_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -12874,6 +12929,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_atomicrmw(
 ;
 ; GFX1250-LABEL: flat_singlethread_one_as_acquire_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -13034,6 +13090,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_atomicrmw(
 ;
 ; GFX1250-LABEL: flat_singlethread_one_as_release_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -13194,6 +13251,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_atomicrmw(
 ;
 ; GFX1250-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -13354,6 +13412,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_atomicrmw(
 ;
 ; GFX1250-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -13558,6 +13617,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_ret_atomicrmw(
 ;
 ; GFX1250-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -13765,6 +13825,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_ret_atomicrmw(
 ;
 ; GFX1250-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -13972,6 +14033,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_ret_atomicrmw(
 ;
 ; GFX1250-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -14223,6 +14285,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -14476,6 +14539,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -14729,6 +14793,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -14982,6 +15047,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -15235,6 +15301,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -15488,6 +15555,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -15741,6 +15809,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -15994,6 +16063,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -16247,6 +16317,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -16500,6 +16571,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -16753,6 +16825,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -17006,6 +17079,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -17259,6 +17333,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -17512,6 +17587,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -17765,6 +17841,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -18062,6 +18139,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_monotonic_ret_cmpx
 ;
 ; GFX1250-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -18363,6 +18441,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_monotonic_ret_cmpxch
 ;
 ; GFX1250-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -18664,6 +18743,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_monotonic_ret_cmpxch
 ;
 ; GFX1250-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -18965,6 +19045,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxch
 ;
 ; GFX1250-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -19266,6 +19347,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxch
 ;
 ; GFX1250-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -19567,6 +19649,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_acquire_ret_cmpxch
 ;
 ; GFX1250-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -19868,6 +19951,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -20169,6 +20253,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -20470,6 +20555,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -20771,6 +20857,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -21072,6 +21159,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxch
 ;
 ; GFX1250-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -21373,6 +21461,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -21674,6 +21763,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -21975,6 +22065,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -22276,6 +22367,7 @@ define amdgpu_kernel void @flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8

diff  --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll
index d5b37650ae9cc..1ee6067ec1f2c 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll
@@ -191,6 +191,7 @@ define amdgpu_kernel void @flat_system_unordered_load(
 ;
 ; GFX1250-LABEL: flat_system_unordered_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
@@ -384,6 +385,7 @@ define amdgpu_kernel void @flat_system_monotonic_load(
 ;
 ; GFX1250-LABEL: flat_system_monotonic_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
@@ -594,6 +596,7 @@ define amdgpu_kernel void @flat_system_acquire_load(
 ;
 ; GFX1250-LABEL: flat_system_acquire_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
@@ -827,6 +830,7 @@ define amdgpu_kernel void @flat_system_seq_cst_load(
 ;
 ; GFX1250-LABEL: flat_system_seq_cst_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
@@ -991,6 +995,7 @@ define amdgpu_kernel void @flat_system_unordered_store(
 ;
 ; GFX1250-LABEL: flat_system_unordered_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
@@ -1151,6 +1156,7 @@ define amdgpu_kernel void @flat_system_monotonic_store(
 ;
 ; GFX1250-LABEL: flat_system_monotonic_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
@@ -1339,6 +1345,7 @@ define amdgpu_kernel void @flat_system_release_store(
 ;
 ; GFX1250-LABEL: flat_system_release_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
@@ -1530,6 +1537,7 @@ define amdgpu_kernel void @flat_system_seq_cst_store(
 ;
 ; GFX1250-LABEL: flat_system_seq_cst_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
@@ -1693,6 +1701,7 @@ define amdgpu_kernel void @flat_system_monotonic_atomicrmw(
 ;
 ; GFX1250-LABEL: flat_system_monotonic_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -1886,6 +1895,7 @@ define amdgpu_kernel void @flat_system_acquire_atomicrmw(
 ;
 ; GFX1250-LABEL: flat_system_acquire_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -2076,6 +2086,7 @@ define amdgpu_kernel void @flat_system_release_atomicrmw(
 ;
 ; GFX1250-LABEL: flat_system_release_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -2300,6 +2311,7 @@ define amdgpu_kernel void @flat_system_acq_rel_atomicrmw(
 ;
 ; GFX1250-LABEL: flat_system_acq_rel_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -2526,6 +2538,7 @@ define amdgpu_kernel void @flat_system_seq_cst_atomicrmw(
 ;
 ; GFX1250-LABEL: flat_system_seq_cst_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -2752,6 +2765,7 @@ define amdgpu_kernel void @flat_system_acquire_ret_atomicrmw(
 ;
 ; GFX1250-LABEL: flat_system_acquire_ret_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -3005,6 +3019,7 @@ define amdgpu_kernel void @flat_system_acq_rel_ret_atomicrmw(
 ;
 ; GFX1250-LABEL: flat_system_acq_rel_ret_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -3261,6 +3276,7 @@ define amdgpu_kernel void @flat_system_seq_cst_ret_atomicrmw(
 ;
 ; GFX1250-LABEL: flat_system_seq_cst_ret_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -3516,6 +3532,7 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_system_monotonic_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -3802,6 +3819,7 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_system_acquire_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -4085,6 +4103,7 @@ define amdgpu_kernel void @flat_system_release_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_system_release_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -4402,6 +4421,7 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_system_acq_rel_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -4721,6 +4741,7 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_system_seq_cst_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -5012,6 +5033,7 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_system_monotonic_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -5300,6 +5322,7 @@ define amdgpu_kernel void @flat_system_acquire_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_system_acquire_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -5616,6 +5639,7 @@ define amdgpu_kernel void @flat_system_release_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_system_release_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -5935,6 +5959,7 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_system_acq_rel_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -6254,6 +6279,7 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_system_seq_cst_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -6573,6 +6599,7 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_system_monotonic_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -6892,6 +6919,7 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_system_acquire_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -7211,6 +7239,7 @@ define amdgpu_kernel void @flat_system_release_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_system_release_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -7530,6 +7559,7 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_system_acq_rel_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -7849,6 +7879,7 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_system_seq_cst_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -8151,6 +8182,7 @@ define amdgpu_kernel void @flat_system_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -8469,6 +8501,7 @@ define amdgpu_kernel void @flat_system_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_system_acquire_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -8799,6 +8832,7 @@ define amdgpu_kernel void @flat_system_release_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_system_release_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -9148,6 +9182,7 @@ define amdgpu_kernel void @flat_system_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -9498,6 +9533,7 @@ define amdgpu_kernel void @flat_system_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -9820,6 +9856,7 @@ define amdgpu_kernel void @flat_system_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_system_monotonic_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -10139,6 +10176,7 @@ define amdgpu_kernel void @flat_system_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_system_acquire_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -10486,6 +10524,7 @@ define amdgpu_kernel void @flat_system_release_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_system_release_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -10836,6 +10875,7 @@ define amdgpu_kernel void @flat_system_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -11186,6 +11226,7 @@ define amdgpu_kernel void @flat_system_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -11536,6 +11577,7 @@ define amdgpu_kernel void @flat_system_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -11886,6 +11928,7 @@ define amdgpu_kernel void @flat_system_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -12236,6 +12279,7 @@ define amdgpu_kernel void @flat_system_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_system_release_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -12586,6 +12630,7 @@ define amdgpu_kernel void @flat_system_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -12936,6 +12981,7 @@ define amdgpu_kernel void @flat_system_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -13141,6 +13187,7 @@ define amdgpu_kernel void @flat_system_one_as_unordered_load(
 ;
 ; GFX1250-LABEL: flat_system_one_as_unordered_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
@@ -13334,6 +13381,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_load(
 ;
 ; GFX1250-LABEL: flat_system_one_as_monotonic_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
@@ -13554,6 +13602,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_load(
 ;
 ; GFX1250-LABEL: flat_system_one_as_acquire_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
@@ -13798,6 +13847,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_load(
 ;
 ; GFX1250-LABEL: flat_system_one_as_seq_cst_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
@@ -13963,6 +14013,7 @@ define amdgpu_kernel void @flat_system_one_as_unordered_store(
 ;
 ; GFX1250-LABEL: flat_system_one_as_unordered_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
@@ -14123,6 +14174,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_store(
 ;
 ; GFX1250-LABEL: flat_system_one_as_monotonic_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
@@ -14311,6 +14363,7 @@ define amdgpu_kernel void @flat_system_one_as_release_store(
 ;
 ; GFX1250-LABEL: flat_system_one_as_release_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
@@ -14502,6 +14555,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_store(
 ;
 ; GFX1250-LABEL: flat_system_one_as_seq_cst_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
@@ -14665,6 +14719,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_atomicrmw(
 ;
 ; GFX1250-LABEL: flat_system_one_as_monotonic_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -14854,6 +14909,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_atomicrmw(
 ;
 ; GFX1250-LABEL: flat_system_one_as_acquire_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -15044,6 +15100,7 @@ define amdgpu_kernel void @flat_system_one_as_release_atomicrmw(
 ;
 ; GFX1250-LABEL: flat_system_one_as_release_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -15264,6 +15321,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_atomicrmw(
 ;
 ; GFX1250-LABEL: flat_system_one_as_acq_rel_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -15486,6 +15544,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_atomicrmw(
 ;
 ; GFX1250-LABEL: flat_system_one_as_seq_cst_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -15722,6 +15781,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_ret_atomicrmw(
 ;
 ; GFX1250-LABEL: flat_system_one_as_acquire_ret_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -15986,6 +16046,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_ret_atomicrmw(
 ;
 ; GFX1250-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -16253,6 +16314,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_ret_atomicrmw(
 ;
 ; GFX1250-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -16509,6 +16571,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -16791,6 +16854,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_system_one_as_acquire_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -17074,6 +17138,7 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_system_one_as_release_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -17387,6 +17452,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_system_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -17702,6 +17768,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_system_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -17989,6 +18056,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_system_one_as_monotonic_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -18273,6 +18341,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_system_one_as_acquire_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -18585,6 +18654,7 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_system_one_as_release_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -18900,6 +18970,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_system_one_as_acq_rel_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -19215,6 +19286,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_system_one_as_seq_cst_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -19530,6 +19602,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_system_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -19845,6 +19918,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_system_one_as_acquire_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -20160,6 +20234,7 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_system_one_as_release_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -20475,6 +20550,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_system_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -20790,6 +20866,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_system_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -21092,6 +21169,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -21420,6 +21498,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -21751,6 +21830,7 @@ define amdgpu_kernel void @flat_system_one_as_release_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -22110,6 +22190,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -22471,6 +22552,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -22804,6 +22886,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -23134,6 +23217,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -23492,6 +23576,7 @@ define amdgpu_kernel void @flat_system_one_as_release_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -23853,6 +23938,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -24214,6 +24300,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -24575,6 +24662,7 @@ define amdgpu_kernel void @flat_system_one_as_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -24936,6 +25024,7 @@ define amdgpu_kernel void @flat_system_one_as_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_system_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -25297,6 +25386,7 @@ define amdgpu_kernel void @flat_system_one_as_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -25658,6 +25748,7 @@ define amdgpu_kernel void @flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -26019,6 +26110,7 @@ define amdgpu_kernel void @flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8

diff  --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll
index c59b0ee83e955..3c69363ad8d5b 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll
@@ -147,6 +147,7 @@ define amdgpu_kernel void @flat_nontemporal_load_0(
 ;
 ; GFX1250-LABEL: flat_nontemporal_load_0:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
@@ -414,6 +415,7 @@ define amdgpu_kernel void @flat_nontemporal_load_1(
 ;
 ; GFX1250-LABEL: flat_nontemporal_load_1:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX1250-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
@@ -577,6 +579,7 @@ define amdgpu_kernel void @flat_nontemporal_store_0(
 ;
 ; GFX1250-LABEL: flat_nontemporal_store_0:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
@@ -841,6 +844,7 @@ define amdgpu_kernel void @flat_nontemporal_store_1(
 ;
 ; GFX1250-LABEL: flat_nontemporal_store_1:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1250-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
@@ -996,6 +1000,7 @@ define amdgpu_kernel void @flat_volatile_workgroup_acquire_load(
 ;
 ; GFX1250-LABEL: flat_volatile_workgroup_acquire_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
@@ -1131,6 +1136,7 @@ define amdgpu_kernel void @flat_volatile_workgroup_release_store(
 ;
 ; GFX1250-LABEL: flat_volatile_workgroup_release_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0

diff  --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll
index b8e324ff5f458..f5f2f5b403a76 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll
@@ -191,6 +191,7 @@ define amdgpu_kernel void @flat_wavefront_unordered_load(
 ;
 ; GFX1250-LABEL: flat_wavefront_unordered_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
@@ -384,6 +385,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_load(
 ;
 ; GFX1250-LABEL: flat_wavefront_monotonic_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
@@ -577,6 +579,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_load(
 ;
 ; GFX1250-LABEL: flat_wavefront_acquire_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
@@ -770,6 +773,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_load(
 ;
 ; GFX1250-LABEL: flat_wavefront_seq_cst_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
@@ -931,6 +935,7 @@ define amdgpu_kernel void @flat_wavefront_unordered_store(
 ;
 ; GFX1250-LABEL: flat_wavefront_unordered_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
@@ -1091,6 +1096,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_store(
 ;
 ; GFX1250-LABEL: flat_wavefront_monotonic_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
@@ -1251,6 +1257,7 @@ define amdgpu_kernel void @flat_wavefront_release_store(
 ;
 ; GFX1250-LABEL: flat_wavefront_release_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
@@ -1411,6 +1418,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_store(
 ;
 ; GFX1250-LABEL: flat_wavefront_seq_cst_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
@@ -1571,6 +1579,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_atomicrmw(
 ;
 ; GFX1250-LABEL: flat_wavefront_monotonic_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -1731,6 +1740,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_atomicrmw(
 ;
 ; GFX1250-LABEL: flat_wavefront_acquire_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -1891,6 +1901,7 @@ define amdgpu_kernel void @flat_wavefront_release_atomicrmw(
 ;
 ; GFX1250-LABEL: flat_wavefront_release_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -2051,6 +2062,7 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_atomicrmw(
 ;
 ; GFX1250-LABEL: flat_wavefront_acq_rel_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -2211,6 +2223,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_atomicrmw(
 ;
 ; GFX1250-LABEL: flat_wavefront_seq_cst_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -2415,6 +2428,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_ret_atomicrmw(
 ;
 ; GFX1250-LABEL: flat_wavefront_acquire_ret_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -2622,6 +2636,7 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_ret_atomicrmw(
 ;
 ; GFX1250-LABEL: flat_wavefront_acq_rel_ret_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -2829,6 +2844,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_ret_atomicrmw(
 ;
 ; GFX1250-LABEL: flat_wavefront_seq_cst_ret_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -3080,6 +3096,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -3333,6 +3350,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_wavefront_acquire_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -3586,6 +3604,7 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_wavefront_release_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -3839,6 +3858,7 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -4092,6 +4112,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -4345,6 +4366,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_wavefront_monotonic_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -4598,6 +4620,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_wavefront_acquire_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -4851,6 +4874,7 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_wavefront_release_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -5104,6 +5128,7 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -5357,6 +5382,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -5610,6 +5636,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -5863,6 +5890,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -6116,6 +6144,7 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_wavefront_release_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -6369,6 +6398,7 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -6622,6 +6652,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -6919,6 +6950,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_wavefront_monotonic_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -7220,6 +7252,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -7521,6 +7554,7 @@ define amdgpu_kernel void @flat_wavefront_release_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_wavefront_release_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -7822,6 +7856,7 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -8123,6 +8158,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -8424,6 +8460,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_wavefront_monotonic_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -8725,6 +8762,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -9026,6 +9064,7 @@ define amdgpu_kernel void @flat_wavefront_release_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_wavefront_release_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -9327,6 +9366,7 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -9628,6 +9668,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -9929,6 +9970,7 @@ define amdgpu_kernel void @flat_wavefront_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_wavefront_monotonic_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -10230,6 +10272,7 @@ define amdgpu_kernel void @flat_wavefront_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_wavefront_acquire_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -10531,6 +10574,7 @@ define amdgpu_kernel void @flat_wavefront_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_wavefront_release_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -10832,6 +10876,7 @@ define amdgpu_kernel void @flat_wavefront_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_wavefront_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -11133,6 +11178,7 @@ define amdgpu_kernel void @flat_wavefront_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -11334,6 +11380,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_load(
 ;
 ; GFX1250-LABEL: flat_wavefront_one_as_unordered_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
@@ -11527,6 +11574,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_load(
 ;
 ; GFX1250-LABEL: flat_wavefront_one_as_monotonic_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
@@ -11720,6 +11768,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_load(
 ;
 ; GFX1250-LABEL: flat_wavefront_one_as_acquire_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
@@ -11913,6 +11962,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_load(
 ;
 ; GFX1250-LABEL: flat_wavefront_one_as_seq_cst_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
@@ -12074,6 +12124,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_unordered_store(
 ;
 ; GFX1250-LABEL: flat_wavefront_one_as_unordered_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
@@ -12234,6 +12285,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_store(
 ;
 ; GFX1250-LABEL: flat_wavefront_one_as_monotonic_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
@@ -12394,6 +12446,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_store(
 ;
 ; GFX1250-LABEL: flat_wavefront_one_as_release_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
@@ -12554,6 +12607,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_store(
 ;
 ; GFX1250-LABEL: flat_wavefront_one_as_seq_cst_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
@@ -12714,6 +12768,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_atomicrmw(
 ;
 ; GFX1250-LABEL: flat_wavefront_one_as_monotonic_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -12874,6 +12929,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_atomicrmw(
 ;
 ; GFX1250-LABEL: flat_wavefront_one_as_acquire_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -13034,6 +13090,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_atomicrmw(
 ;
 ; GFX1250-LABEL: flat_wavefront_one_as_release_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -13194,6 +13251,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_atomicrmw(
 ;
 ; GFX1250-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -13354,6 +13412,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_atomicrmw(
 ;
 ; GFX1250-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -13558,6 +13617,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_ret_atomicrmw(
 ;
 ; GFX1250-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -13765,6 +13825,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_ret_atomicrmw(
 ;
 ; GFX1250-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -13972,6 +14033,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_ret_atomicrmw(
 ;
 ; GFX1250-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -14223,6 +14285,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -14476,6 +14539,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -14729,6 +14793,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -14982,6 +15047,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -15235,6 +15301,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -15488,6 +15555,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -15741,6 +15809,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -15994,6 +16063,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -16247,6 +16317,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -16500,6 +16571,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -16753,6 +16825,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -17006,6 +17079,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -17259,6 +17333,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -17512,6 +17587,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -17765,6 +17841,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -18062,6 +18139,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg
 ;
 ; GFX1250-LABEL: flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -18363,6 +18441,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -18664,6 +18743,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -18965,6 +19045,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -19266,6 +19347,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -19567,6 +19649,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -19868,6 +19951,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -20169,6 +20253,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -20470,6 +20555,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -20771,6 +20857,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -21072,6 +21159,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -21373,6 +21461,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_wavefront_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -21674,6 +21763,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -21975,6 +22065,7 @@ define amdgpu_kernel void @flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8

diff  --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll
index d44e7fff2359f..11883bff6f269 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll
@@ -191,6 +191,7 @@ define amdgpu_kernel void @flat_workgroup_unordered_load(
 ;
 ; GFX1250-LABEL: flat_workgroup_unordered_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
@@ -384,6 +385,7 @@ define amdgpu_kernel void @flat_workgroup_monotonic_load(
 ;
 ; GFX1250-LABEL: flat_workgroup_monotonic_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
@@ -589,6 +591,7 @@ define amdgpu_kernel void @flat_workgroup_acquire_load(
 ;
 ; GFX1250-LABEL: flat_workgroup_acquire_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
@@ -813,6 +816,7 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_load(
 ;
 ; GFX1250-LABEL: flat_workgroup_seq_cst_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
@@ -976,6 +980,7 @@ define amdgpu_kernel void @flat_workgroup_unordered_store(
 ;
 ; GFX1250-LABEL: flat_workgroup_unordered_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
@@ -1136,6 +1141,7 @@ define amdgpu_kernel void @flat_workgroup_monotonic_store(
 ;
 ; GFX1250-LABEL: flat_workgroup_monotonic_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
@@ -1318,6 +1324,7 @@ define amdgpu_kernel void @flat_workgroup_release_store(
 ;
 ; GFX1250-LABEL: flat_workgroup_release_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
@@ -1502,6 +1509,7 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_store(
 ;
 ; GFX1250-LABEL: flat_workgroup_seq_cst_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
@@ -1664,6 +1672,7 @@ define amdgpu_kernel void @flat_workgroup_monotonic_atomicrmw(
 ;
 ; GFX1250-LABEL: flat_workgroup_monotonic_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -1843,6 +1852,7 @@ define amdgpu_kernel void @flat_workgroup_acquire_atomicrmw(
 ;
 ; GFX1250-LABEL: flat_workgroup_acquire_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -2026,6 +2036,7 @@ define amdgpu_kernel void @flat_workgroup_release_atomicrmw(
 ;
 ; GFX1250-LABEL: flat_workgroup_release_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -2231,6 +2242,7 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_atomicrmw(
 ;
 ; GFX1250-LABEL: flat_workgroup_acq_rel_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -2437,6 +2449,7 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_atomicrmw(
 ;
 ; GFX1250-LABEL: flat_workgroup_seq_cst_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -2656,6 +2669,7 @@ define amdgpu_kernel void @flat_workgroup_acquire_ret_atomicrmw(
 ;
 ; GFX1250-LABEL: flat_workgroup_acquire_ret_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -2894,6 +2908,7 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_ret_atomicrmw(
 ;
 ; GFX1250-LABEL: flat_workgroup_acq_rel_ret_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -3134,6 +3149,7 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_ret_atomicrmw(
 ;
 ; GFX1250-LABEL: flat_workgroup_seq_cst_ret_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -3387,6 +3403,7 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -3659,6 +3676,7 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_workgroup_acquire_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -3935,6 +3953,7 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_workgroup_release_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -4233,6 +4252,7 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_workgroup_acq_rel_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -4532,6 +4552,7 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_workgroup_seq_cst_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -4807,6 +4828,7 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_workgroup_monotonic_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -5080,6 +5102,7 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_workgroup_acquire_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -5377,6 +5400,7 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_workgroup_release_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -5676,6 +5700,7 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_workgroup_acq_rel_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -5975,6 +6000,7 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_workgroup_seq_cst_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -6274,6 +6300,7 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_workgroup_seq_cst_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -6574,6 +6601,7 @@ define amdgpu_kernel void @flat_workgroup_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -6887,6 +6915,7 @@ define amdgpu_kernel void @flat_workgroup_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -7210,6 +7239,7 @@ define amdgpu_kernel void @flat_workgroup_release_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -7544,6 +7574,7 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -7878,6 +7909,7 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -8193,6 +8225,7 @@ define amdgpu_kernel void @flat_workgroup_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -8506,6 +8539,7 @@ define amdgpu_kernel void @flat_workgroup_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -8838,6 +8872,7 @@ define amdgpu_kernel void @flat_workgroup_release_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_workgroup_release_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -9172,6 +9207,7 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -9506,6 +9542,7 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -9843,6 +9880,7 @@ define amdgpu_kernel void @flat_workgroup_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -10180,6 +10218,7 @@ define amdgpu_kernel void @flat_workgroup_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -10514,6 +10553,7 @@ define amdgpu_kernel void @flat_workgroup_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -10848,6 +10888,7 @@ define amdgpu_kernel void @flat_workgroup_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_workgroup_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -11182,6 +11223,7 @@ define amdgpu_kernel void @flat_workgroup_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -11385,6 +11427,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_load(
 ;
 ; GFX1250-LABEL: flat_workgroup_one_as_unordered_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
@@ -11578,6 +11621,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_load(
 ;
 ; GFX1250-LABEL: flat_workgroup_one_as_monotonic_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
@@ -11779,6 +11823,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_load(
 ;
 ; GFX1250-LABEL: flat_workgroup_one_as_acquire_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
@@ -12001,6 +12046,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_load(
 ;
 ; GFX1250-LABEL: flat_workgroup_one_as_seq_cst_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
@@ -12164,6 +12210,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_unordered_store(
 ;
 ; GFX1250-LABEL: flat_workgroup_one_as_unordered_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
@@ -12324,6 +12371,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_store(
 ;
 ; GFX1250-LABEL: flat_workgroup_one_as_monotonic_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
@@ -12502,6 +12550,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_store(
 ;
 ; GFX1250-LABEL: flat_workgroup_one_as_release_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
@@ -12682,6 +12731,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_store(
 ;
 ; GFX1250-LABEL: flat_workgroup_one_as_seq_cst_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
@@ -12844,6 +12894,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_atomicrmw(
 ;
 ; GFX1250-LABEL: flat_workgroup_one_as_monotonic_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -13014,6 +13065,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_atomicrmw(
 ;
 ; GFX1250-LABEL: flat_workgroup_one_as_acquire_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -13193,6 +13245,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_atomicrmw(
 ;
 ; GFX1250-LABEL: flat_workgroup_one_as_release_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -13386,6 +13439,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_atomicrmw(
 ;
 ; GFX1250-LABEL: flat_workgroup_one_as_acq_rel_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -13580,6 +13634,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_atomicrmw(
 ;
 ; GFX1250-LABEL: flat_workgroup_one_as_seq_cst_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -13795,6 +13850,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_ret_atomicrmw(
 ;
 ; GFX1250-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -14031,6 +14087,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_ret_atomicrmw(
 ;
 ; GFX1250-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -14269,6 +14326,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_ret_atomicrmw(
 ;
 ; GFX1250-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -14522,6 +14580,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -14785,6 +14844,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_workgroup_one_as_acquire_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -15057,6 +15117,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -15343,6 +15404,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_workgroup_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -15630,6 +15692,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_workgroup_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -15896,6 +15959,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_workgroup_one_as_monotonic_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -16160,6 +16224,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_workgroup_one_as_acquire_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -16445,6 +16510,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_workgroup_one_as_release_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -16732,6 +16798,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_workgroup_one_as_acq_rel_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -17019,6 +17086,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_workgroup_one_as_seq_cst_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -17303,6 +17371,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_workgroup_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -17587,6 +17656,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_workgroup_one_as_acquire_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -17874,6 +17944,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_workgroup_one_as_release_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -18161,6 +18232,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -18448,6 +18520,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -18748,6 +18821,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -19057,6 +19131,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -19376,6 +19451,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -19708,6 +19784,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -20040,6 +20117,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -20351,6 +20429,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -20660,6 +20739,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -20990,6 +21070,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -21322,6 +21403,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -21654,6 +21736,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -21983,6 +22066,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -22312,6 +22396,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -22644,6 +22729,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -22976,6 +23062,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -23308,6 +23395,7 @@ define amdgpu_kernel void @flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8

diff  --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll
index 493f985c84701..74fd60a259ef9 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll
@@ -194,6 +194,7 @@ define amdgpu_kernel void @global_agent_unordered_load(
 ;
 ; GFX1250-LABEL: global_agent_unordered_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
@@ -389,6 +390,7 @@ define amdgpu_kernel void @global_agent_monotonic_load(
 ;
 ; GFX1250-LABEL: global_agent_monotonic_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
@@ -600,6 +602,7 @@ define amdgpu_kernel void @global_agent_acquire_load(
 ;
 ; GFX1250-LABEL: global_agent_acquire_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
@@ -827,6 +830,7 @@ define amdgpu_kernel void @global_agent_seq_cst_load(
 ;
 ; GFX1250-LABEL: global_agent_seq_cst_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
@@ -998,6 +1002,7 @@ define amdgpu_kernel void @global_agent_unordered_store(
 ;
 ; GFX1250-LABEL: global_agent_unordered_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
@@ -1165,6 +1170,7 @@ define amdgpu_kernel void @global_agent_monotonic_store(
 ;
 ; GFX1250-LABEL: global_agent_monotonic_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
@@ -1357,6 +1363,7 @@ define amdgpu_kernel void @global_agent_release_store(
 ;
 ; GFX1250-LABEL: global_agent_release_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
@@ -1552,6 +1559,7 @@ define amdgpu_kernel void @global_agent_seq_cst_store(
 ;
 ; GFX1250-LABEL: global_agent_seq_cst_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
@@ -1720,6 +1728,7 @@ define amdgpu_kernel void @global_agent_monotonic_atomicrmw(
 ;
 ; GFX1250-LABEL: global_agent_monotonic_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -1914,6 +1923,7 @@ define amdgpu_kernel void @global_agent_acquire_atomicrmw(
 ;
 ; GFX1250-LABEL: global_agent_acquire_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -2106,6 +2116,7 @@ define amdgpu_kernel void @global_agent_release_atomicrmw(
 ;
 ; GFX1250-LABEL: global_agent_release_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -2328,6 +2339,7 @@ define amdgpu_kernel void @global_agent_acq_rel_atomicrmw(
 ;
 ; GFX1250-LABEL: global_agent_acq_rel_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -2552,6 +2564,7 @@ define amdgpu_kernel void @global_agent_seq_cst_atomicrmw(
 ;
 ; GFX1250-LABEL: global_agent_seq_cst_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -2766,6 +2779,7 @@ define amdgpu_kernel void @global_agent_acquire_ret_atomicrmw(
 ;
 ; GFX1250-LABEL: global_agent_acquire_ret_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -3004,6 +3018,7 @@ define amdgpu_kernel void @global_agent_acq_rel_ret_atomicrmw(
 ;
 ; GFX1250-LABEL: global_agent_acq_rel_ret_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -3245,6 +3260,7 @@ define amdgpu_kernel void @global_agent_seq_cst_ret_atomicrmw(
 ;
 ; GFX1250-LABEL: global_agent_seq_cst_ret_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -3481,6 +3497,7 @@ define amdgpu_kernel void @global_agent_monotonic_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: global_agent_monotonic_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -3744,6 +3761,7 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: global_agent_acquire_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -4005,6 +4023,7 @@ define amdgpu_kernel void @global_agent_release_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: global_agent_release_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -4296,6 +4315,7 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: global_agent_acq_rel_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -4589,6 +4609,7 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: global_agent_seq_cst_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -4857,6 +4878,7 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: global_agent_monotonic_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -5122,6 +5144,7 @@ define amdgpu_kernel void @global_agent_acquire_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: global_agent_acquire_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -5412,6 +5435,7 @@ define amdgpu_kernel void @global_agent_release_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: global_agent_release_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -5705,6 +5729,7 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: global_agent_acq_rel_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -5998,6 +6023,7 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: global_agent_seq_cst_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -6291,6 +6317,7 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: global_agent_monotonic_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -6584,6 +6611,7 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: global_agent_acquire_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -6877,6 +6905,7 @@ define amdgpu_kernel void @global_agent_release_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: global_agent_release_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -7170,6 +7199,7 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: global_agent_acq_rel_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -7463,6 +7493,7 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: global_agent_seq_cst_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -7732,6 +7763,7 @@ define amdgpu_kernel void @global_agent_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: global_agent_monotonic_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -8016,6 +8048,7 @@ define amdgpu_kernel void @global_agent_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: global_agent_acquire_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -8310,6 +8343,7 @@ define amdgpu_kernel void @global_agent_release_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: global_agent_release_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -8622,6 +8656,7 @@ define amdgpu_kernel void @global_agent_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: global_agent_acq_rel_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -8935,6 +8970,7 @@ define amdgpu_kernel void @global_agent_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: global_agent_seq_cst_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -9223,6 +9259,7 @@ define amdgpu_kernel void @global_agent_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: global_agent_monotonic_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -9508,6 +9545,7 @@ define amdgpu_kernel void @global_agent_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: global_agent_acquire_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -9818,6 +9856,7 @@ define amdgpu_kernel void @global_agent_release_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: global_agent_release_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -10131,6 +10170,7 @@ define amdgpu_kernel void @global_agent_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: global_agent_acq_rel_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -10444,6 +10484,7 @@ define amdgpu_kernel void @global_agent_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: global_agent_seq_cst_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -10757,6 +10798,7 @@ define amdgpu_kernel void @global_agent_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: global_agent_monotonic_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -11070,6 +11112,7 @@ define amdgpu_kernel void @global_agent_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: global_agent_acquire_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -11383,6 +11426,7 @@ define amdgpu_kernel void @global_agent_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: global_agent_release_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -11696,6 +11740,7 @@ define amdgpu_kernel void @global_agent_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: global_agent_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -12009,6 +12054,7 @@ define amdgpu_kernel void @global_agent_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: global_agent_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -12216,6 +12262,7 @@ define amdgpu_kernel void @global_agent_one_as_unordered_load(
 ;
 ; GFX1250-LABEL: global_agent_one_as_unordered_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
@@ -12411,6 +12458,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_load(
 ;
 ; GFX1250-LABEL: global_agent_one_as_monotonic_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
@@ -12622,6 +12670,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_load(
 ;
 ; GFX1250-LABEL: global_agent_one_as_acquire_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
@@ -12849,6 +12898,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_load(
 ;
 ; GFX1250-LABEL: global_agent_one_as_seq_cst_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
@@ -13020,6 +13070,7 @@ define amdgpu_kernel void @global_agent_one_as_unordered_store(
 ;
 ; GFX1250-LABEL: global_agent_one_as_unordered_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
@@ -13187,6 +13238,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_store(
 ;
 ; GFX1250-LABEL: global_agent_one_as_monotonic_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
@@ -13379,6 +13431,7 @@ define amdgpu_kernel void @global_agent_one_as_release_store(
 ;
 ; GFX1250-LABEL: global_agent_one_as_release_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
@@ -13574,6 +13627,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_store(
 ;
 ; GFX1250-LABEL: global_agent_one_as_seq_cst_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
@@ -13742,6 +13796,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_atomicrmw(
 ;
 ; GFX1250-LABEL: global_agent_one_as_monotonic_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -13936,6 +13991,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_atomicrmw(
 ;
 ; GFX1250-LABEL: global_agent_one_as_acquire_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -14128,6 +14184,7 @@ define amdgpu_kernel void @global_agent_one_as_release_atomicrmw(
 ;
 ; GFX1250-LABEL: global_agent_one_as_release_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -14350,6 +14407,7 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_atomicrmw(
 ;
 ; GFX1250-LABEL: global_agent_one_as_acq_rel_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -14574,6 +14632,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_atomicrmw(
 ;
 ; GFX1250-LABEL: global_agent_one_as_seq_cst_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -14788,6 +14847,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_ret_atomicrmw(
 ;
 ; GFX1250-LABEL: global_agent_one_as_acquire_ret_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -15026,6 +15086,7 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_ret_atomicrmw(
 ;
 ; GFX1250-LABEL: global_agent_one_as_acq_rel_ret_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -15267,6 +15328,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_ret_atomicrmw(
 ;
 ; GFX1250-LABEL: global_agent_one_as_seq_cst_ret_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -15503,6 +15565,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: global_agent_one_as_monotonic_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -15766,6 +15829,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: global_agent_one_as_acquire_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -16027,6 +16091,7 @@ define amdgpu_kernel void @global_agent_one_as_release_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: global_agent_one_as_release_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -16318,6 +16383,7 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: global_agent_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -16611,6 +16677,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: global_agent_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -16879,6 +16946,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: global_agent_one_as_monotonic_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -17144,6 +17212,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: global_agent_one_as_acquire_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -17434,6 +17503,7 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: global_agent_one_as_release_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -17727,6 +17797,7 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: global_agent_one_as_acq_rel_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -18020,6 +18091,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: global_agent_one_as_seq_cst_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -18313,6 +18385,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: global_agent_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -18606,6 +18679,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: global_agent_one_as_acquire_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -18899,6 +18973,7 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: global_agent_one_as_release_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -19192,6 +19267,7 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: global_agent_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -19485,6 +19561,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: global_agent_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -19754,6 +19831,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: global_agent_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -20038,6 +20116,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: global_agent_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -20348,6 +20427,7 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: global_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -20661,6 +20741,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: global_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -20949,6 +21030,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: global_agent_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -21234,6 +21316,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: global_agent_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -21544,6 +21627,7 @@ define amdgpu_kernel void @global_agent_one_as_release_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: global_agent_one_as_release_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -21857,6 +21941,7 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: global_agent_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -22170,6 +22255,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: global_agent_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -22483,6 +22569,7 @@ define amdgpu_kernel void @global_agent_one_as_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: global_agent_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -22796,6 +22883,7 @@ define amdgpu_kernel void @global_agent_one_as_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: global_agent_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -23109,6 +23197,7 @@ define amdgpu_kernel void @global_agent_one_as_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: global_agent_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -23422,6 +23511,7 @@ define amdgpu_kernel void @global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -23735,6 +23825,7 @@ define amdgpu_kernel void @global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8

diff  --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-cluster.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-cluster.ll
index 40257df684990..d0d527eb363ea 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-cluster.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-cluster.ll
@@ -194,6 +194,7 @@ define amdgpu_kernel void @global_cluster_unordered_load(
 ;
 ; GFX1250-LABEL: global_cluster_unordered_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
@@ -389,6 +390,7 @@ define amdgpu_kernel void @global_cluster_monotonic_load(
 ;
 ; GFX1250-LABEL: global_cluster_monotonic_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
@@ -600,6 +602,7 @@ define amdgpu_kernel void @global_cluster_acquire_load(
 ;
 ; GFX1250-LABEL: global_cluster_acquire_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
@@ -827,6 +830,7 @@ define amdgpu_kernel void @global_cluster_seq_cst_load(
 ;
 ; GFX1250-LABEL: global_cluster_seq_cst_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
@@ -998,6 +1002,7 @@ define amdgpu_kernel void @global_cluster_unordered_store(
 ;
 ; GFX1250-LABEL: global_cluster_unordered_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
@@ -1165,6 +1170,7 @@ define amdgpu_kernel void @global_cluster_monotonic_store(
 ;
 ; GFX1250-LABEL: global_cluster_monotonic_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
@@ -1357,6 +1363,7 @@ define amdgpu_kernel void @global_cluster_release_store(
 ;
 ; GFX1250-LABEL: global_cluster_release_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
@@ -1551,6 +1558,7 @@ define amdgpu_kernel void @global_cluster_seq_cst_store(
 ;
 ; GFX1250-LABEL: global_cluster_seq_cst_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
@@ -1718,6 +1726,7 @@ define amdgpu_kernel void @global_cluster_monotonic_atomicrmw(
 ;
 ; GFX1250-LABEL: global_cluster_monotonic_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -1912,6 +1921,7 @@ define amdgpu_kernel void @global_cluster_acquire_atomicrmw(
 ;
 ; GFX1250-LABEL: global_cluster_acquire_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -2104,6 +2114,7 @@ define amdgpu_kernel void @global_cluster_release_atomicrmw(
 ;
 ; GFX1250-LABEL: global_cluster_release_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -2325,6 +2336,7 @@ define amdgpu_kernel void @global_cluster_acq_rel_atomicrmw(
 ;
 ; GFX1250-LABEL: global_cluster_acq_rel_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -2548,6 +2560,7 @@ define amdgpu_kernel void @global_cluster_seq_cst_atomicrmw(
 ;
 ; GFX1250-LABEL: global_cluster_seq_cst_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -2761,6 +2774,7 @@ define amdgpu_kernel void @global_cluster_acquire_ret_atomicrmw(
 ;
 ; GFX1250-LABEL: global_cluster_acquire_ret_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -2999,6 +3013,7 @@ define amdgpu_kernel void @global_cluster_acq_rel_ret_atomicrmw(
 ;
 ; GFX1250-LABEL: global_cluster_acq_rel_ret_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -3239,6 +3254,7 @@ define amdgpu_kernel void @global_cluster_seq_cst_ret_atomicrmw(
 ;
 ; GFX1250-LABEL: global_cluster_seq_cst_ret_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -3474,6 +3490,7 @@ define amdgpu_kernel void @global_cluster_monotonic_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: global_cluster_monotonic_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -3737,6 +3754,7 @@ define amdgpu_kernel void @global_cluster_acquire_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: global_cluster_acquire_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -3998,6 +4016,7 @@ define amdgpu_kernel void @global_cluster_release_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: global_cluster_release_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -4288,6 +4307,7 @@ define amdgpu_kernel void @global_cluster_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: global_cluster_acq_rel_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -4580,6 +4600,7 @@ define amdgpu_kernel void @global_cluster_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: global_cluster_seq_cst_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -4847,6 +4868,7 @@ define amdgpu_kernel void @global_cluster_monotonic_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: global_cluster_monotonic_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -5112,6 +5134,7 @@ define amdgpu_kernel void @global_cluster_acquire_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: global_cluster_acquire_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -5402,6 +5425,7 @@ define amdgpu_kernel void @global_cluster_release_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: global_cluster_release_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -5694,6 +5718,7 @@ define amdgpu_kernel void @global_cluster_acq_rel_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: global_cluster_acq_rel_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -5986,6 +6011,7 @@ define amdgpu_kernel void @global_cluster_seq_cst_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: global_cluster_seq_cst_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -6278,6 +6304,7 @@ define amdgpu_kernel void @global_cluster_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: global_cluster_monotonic_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -6570,6 +6597,7 @@ define amdgpu_kernel void @global_cluster_acquire_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: global_cluster_acquire_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -6862,6 +6890,7 @@ define amdgpu_kernel void @global_cluster_release_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: global_cluster_release_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -7154,6 +7183,7 @@ define amdgpu_kernel void @global_cluster_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: global_cluster_acq_rel_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -7446,6 +7476,7 @@ define amdgpu_kernel void @global_cluster_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: global_cluster_seq_cst_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -7714,6 +7745,7 @@ define amdgpu_kernel void @global_cluster_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: global_cluster_monotonic_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -7998,6 +8030,7 @@ define amdgpu_kernel void @global_cluster_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: global_cluster_acquire_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -8292,6 +8325,7 @@ define amdgpu_kernel void @global_cluster_release_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: global_cluster_release_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -8603,6 +8637,7 @@ define amdgpu_kernel void @global_cluster_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: global_cluster_acq_rel_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -8915,6 +8950,7 @@ define amdgpu_kernel void @global_cluster_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: global_cluster_seq_cst_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -9202,6 +9238,7 @@ define amdgpu_kernel void @global_cluster_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: global_cluster_monotonic_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -9487,6 +9524,7 @@ define amdgpu_kernel void @global_cluster_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: global_cluster_acquire_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -9797,6 +9835,7 @@ define amdgpu_kernel void @global_cluster_release_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: global_cluster_release_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -10109,6 +10148,7 @@ define amdgpu_kernel void @global_cluster_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: global_cluster_acq_rel_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -10421,6 +10461,7 @@ define amdgpu_kernel void @global_cluster_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: global_cluster_seq_cst_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -10733,6 +10774,7 @@ define amdgpu_kernel void @global_cluster_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: global_cluster_monotonic_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -11045,6 +11087,7 @@ define amdgpu_kernel void @global_cluster_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: global_cluster_acquire_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -11357,6 +11400,7 @@ define amdgpu_kernel void @global_cluster_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: global_cluster_release_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -11669,6 +11713,7 @@ define amdgpu_kernel void @global_cluster_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: global_cluster_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -11981,6 +12026,7 @@ define amdgpu_kernel void @global_cluster_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: global_cluster_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -12187,6 +12233,7 @@ define amdgpu_kernel void @global_cluster_one_as_unordered_load(
 ;
 ; GFX1250-LABEL: global_cluster_one_as_unordered_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
@@ -12382,6 +12429,7 @@ define amdgpu_kernel void @global_cluster_one_as_monotonic_load(
 ;
 ; GFX1250-LABEL: global_cluster_one_as_monotonic_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
@@ -12593,6 +12641,7 @@ define amdgpu_kernel void @global_cluster_one_as_acquire_load(
 ;
 ; GFX1250-LABEL: global_cluster_one_as_acquire_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
@@ -12820,6 +12869,7 @@ define amdgpu_kernel void @global_cluster_one_as_seq_cst_load(
 ;
 ; GFX1250-LABEL: global_cluster_one_as_seq_cst_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
@@ -12991,6 +13041,7 @@ define amdgpu_kernel void @global_cluster_one_as_unordered_store(
 ;
 ; GFX1250-LABEL: global_cluster_one_as_unordered_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
@@ -13158,6 +13209,7 @@ define amdgpu_kernel void @global_cluster_one_as_monotonic_store(
 ;
 ; GFX1250-LABEL: global_cluster_one_as_monotonic_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
@@ -13350,6 +13402,7 @@ define amdgpu_kernel void @global_cluster_one_as_release_store(
 ;
 ; GFX1250-LABEL: global_cluster_one_as_release_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
@@ -13544,6 +13597,7 @@ define amdgpu_kernel void @global_cluster_one_as_seq_cst_store(
 ;
 ; GFX1250-LABEL: global_cluster_one_as_seq_cst_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
@@ -13711,6 +13765,7 @@ define amdgpu_kernel void @global_cluster_one_as_monotonic_atomicrmw(
 ;
 ; GFX1250-LABEL: global_cluster_one_as_monotonic_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -13905,6 +13960,7 @@ define amdgpu_kernel void @global_cluster_one_as_acquire_atomicrmw(
 ;
 ; GFX1250-LABEL: global_cluster_one_as_acquire_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -14097,6 +14153,7 @@ define amdgpu_kernel void @global_cluster_one_as_release_atomicrmw(
 ;
 ; GFX1250-LABEL: global_cluster_one_as_release_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -14318,6 +14375,7 @@ define amdgpu_kernel void @global_cluster_one_as_acq_rel_atomicrmw(
 ;
 ; GFX1250-LABEL: global_cluster_one_as_acq_rel_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -14541,6 +14599,7 @@ define amdgpu_kernel void @global_cluster_one_as_seq_cst_atomicrmw(
 ;
 ; GFX1250-LABEL: global_cluster_one_as_seq_cst_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -14754,6 +14813,7 @@ define amdgpu_kernel void @global_cluster_one_as_acquire_ret_atomicrmw(
 ;
 ; GFX1250-LABEL: global_cluster_one_as_acquire_ret_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -14992,6 +15052,7 @@ define amdgpu_kernel void @global_cluster_one_as_acq_rel_ret_atomicrmw(
 ;
 ; GFX1250-LABEL: global_cluster_one_as_acq_rel_ret_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -15232,6 +15293,7 @@ define amdgpu_kernel void @global_cluster_one_as_seq_cst_ret_atomicrmw(
 ;
 ; GFX1250-LABEL: global_cluster_one_as_seq_cst_ret_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -15467,6 +15529,7 @@ define amdgpu_kernel void @global_cluster_one_as_monotonic_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: global_cluster_one_as_monotonic_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -15730,6 +15793,7 @@ define amdgpu_kernel void @global_cluster_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: global_cluster_one_as_acquire_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -15991,6 +16055,7 @@ define amdgpu_kernel void @global_cluster_one_as_release_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: global_cluster_one_as_release_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -16281,6 +16346,7 @@ define amdgpu_kernel void @global_cluster_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: global_cluster_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -16573,6 +16639,7 @@ define amdgpu_kernel void @global_cluster_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: global_cluster_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -16840,6 +16907,7 @@ define amdgpu_kernel void @global_cluster_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: global_cluster_one_as_monotonic_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -17105,6 +17173,7 @@ define amdgpu_kernel void @global_cluster_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: global_cluster_one_as_acquire_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -17395,6 +17464,7 @@ define amdgpu_kernel void @global_cluster_one_as_release_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: global_cluster_one_as_release_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -17687,6 +17757,7 @@ define amdgpu_kernel void @global_cluster_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: global_cluster_one_as_acq_rel_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -17979,6 +18050,7 @@ define amdgpu_kernel void @global_cluster_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: global_cluster_one_as_seq_cst_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -18271,6 +18343,7 @@ define amdgpu_kernel void @global_cluster_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: global_cluster_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -18563,6 +18636,7 @@ define amdgpu_kernel void @global_cluster_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: global_cluster_one_as_acquire_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -18855,6 +18929,7 @@ define amdgpu_kernel void @global_cluster_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: global_cluster_one_as_release_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -19147,6 +19222,7 @@ define amdgpu_kernel void @global_cluster_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: global_cluster_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -19439,6 +19515,7 @@ define amdgpu_kernel void @global_cluster_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: global_cluster_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -19707,6 +19784,7 @@ define amdgpu_kernel void @global_cluster_one_as_monotonic_monotonic_ret_cmpxchg
 ;
 ; GFX1250-LABEL: global_cluster_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -19991,6 +20069,7 @@ define amdgpu_kernel void @global_cluster_one_as_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: global_cluster_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -20301,6 +20380,7 @@ define amdgpu_kernel void @global_cluster_one_as_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: global_cluster_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -20613,6 +20693,7 @@ define amdgpu_kernel void @global_cluster_one_as_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: global_cluster_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -20900,6 +20981,7 @@ define amdgpu_kernel void @global_cluster_one_as_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: global_cluster_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -21185,6 +21267,7 @@ define amdgpu_kernel void @global_cluster_one_as_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: global_cluster_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -21495,6 +21578,7 @@ define amdgpu_kernel void @global_cluster_one_as_release_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: global_cluster_one_as_release_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -21807,6 +21891,7 @@ define amdgpu_kernel void @global_cluster_one_as_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: global_cluster_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -22119,6 +22204,7 @@ define amdgpu_kernel void @global_cluster_one_as_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: global_cluster_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -22431,6 +22517,7 @@ define amdgpu_kernel void @global_cluster_one_as_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: global_cluster_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -22743,6 +22830,7 @@ define amdgpu_kernel void @global_cluster_one_as_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: global_cluster_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -23055,6 +23143,7 @@ define amdgpu_kernel void @global_cluster_one_as_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: global_cluster_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -23367,6 +23456,7 @@ define amdgpu_kernel void @global_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: global_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -23679,6 +23769,7 @@ define amdgpu_kernel void @global_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: global_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8

diff  --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-lastuse.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-lastuse.ll
index ca7802d295e0b..5082378c6b770 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-lastuse.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-lastuse.ll
@@ -18,6 +18,7 @@ define amdgpu_kernel void @global_last_use_load_0(ptr addrspace(1) %in, ptr addr
 ;
 ; GFX1250-LABEL: global_last_use_load_0:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
@@ -53,6 +54,7 @@ define amdgpu_kernel void @global_last_use_load_1(ptr addrspace(1) %in, ptr addr
 ;
 ; GFX1250-LABEL: global_last_use_load_1:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX1250-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
@@ -89,6 +91,7 @@ define amdgpu_kernel void @global_last_use_and_volatile_load(ptr addrspace(1) %i
 ;
 ; GFX1250-LABEL: global_last_use_and_volatile_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
@@ -123,6 +126,7 @@ define amdgpu_kernel void @global_last_use_and_nontemporal_load(ptr addrspace(1)
 ;
 ; GFX1250-LABEL: global_last_use_and_nontemporal_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX1250-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8

diff  --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll
index 72cbbc0283545..d0328484e159d 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll
@@ -193,6 +193,7 @@ define amdgpu_kernel void @global_nontemporal_load_0(
 ;
 ; GFX1250-LABEL: global_nontemporal_load_0:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
@@ -458,6 +459,7 @@ define amdgpu_kernel void @global_nontemporal_load_1(
 ;
 ; GFX1250-LABEL: global_nontemporal_load_1:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX1250-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
@@ -658,6 +660,7 @@ define amdgpu_kernel void @global_nontemporal_store_0(
 ;
 ; GFX1250-LABEL: global_nontemporal_store_0:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
@@ -897,6 +900,7 @@ define amdgpu_kernel void @global_nontemporal_store_1(
 ;
 ; GFX1250-LABEL: global_nontemporal_store_1:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -1101,6 +1105,7 @@ define amdgpu_kernel void @global_nontemporal_volatile_load(
 ;
 ; GFX1250-LABEL: global_nontemporal_volatile_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8

diff  --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll
index e7f7b1d196be7..3e54b23c82f8c 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll
@@ -194,6 +194,7 @@ define amdgpu_kernel void @global_singlethread_unordered_load(
 ;
 ; GFX1250-LABEL: global_singlethread_unordered_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
@@ -389,6 +390,7 @@ define amdgpu_kernel void @global_singlethread_monotonic_load(
 ;
 ; GFX1250-LABEL: global_singlethread_monotonic_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
@@ -584,6 +586,7 @@ define amdgpu_kernel void @global_singlethread_acquire_load(
 ;
 ; GFX1250-LABEL: global_singlethread_acquire_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
@@ -779,6 +782,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_load(
 ;
 ; GFX1250-LABEL: global_singlethread_seq_cst_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
@@ -947,6 +951,7 @@ define amdgpu_kernel void @global_singlethread_unordered_store(
 ;
 ; GFX1250-LABEL: global_singlethread_unordered_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
@@ -1114,6 +1119,7 @@ define amdgpu_kernel void @global_singlethread_monotonic_store(
 ;
 ; GFX1250-LABEL: global_singlethread_monotonic_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
@@ -1281,6 +1287,7 @@ define amdgpu_kernel void @global_singlethread_release_store(
 ;
 ; GFX1250-LABEL: global_singlethread_release_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
@@ -1448,6 +1455,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_store(
 ;
 ; GFX1250-LABEL: global_singlethread_seq_cst_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
@@ -1613,6 +1621,7 @@ define amdgpu_kernel void @global_singlethread_monotonic_atomicrmw(
 ;
 ; GFX1250-LABEL: global_singlethread_monotonic_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -1778,6 +1787,7 @@ define amdgpu_kernel void @global_singlethread_acquire_atomicrmw(
 ;
 ; GFX1250-LABEL: global_singlethread_acquire_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -1943,6 +1953,7 @@ define amdgpu_kernel void @global_singlethread_release_atomicrmw(
 ;
 ; GFX1250-LABEL: global_singlethread_release_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -2108,6 +2119,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_atomicrmw(
 ;
 ; GFX1250-LABEL: global_singlethread_acq_rel_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -2273,6 +2285,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_atomicrmw(
 ;
 ; GFX1250-LABEL: global_singlethread_seq_cst_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -2466,6 +2479,7 @@ define amdgpu_kernel void @global_singlethread_acquire_ret_atomicrmw(
 ;
 ; GFX1250-LABEL: global_singlethread_acquire_ret_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -2662,6 +2676,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_ret_atomicrmw(
 ;
 ; GFX1250-LABEL: global_singlethread_acq_rel_ret_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -2858,6 +2873,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_ret_atomicrmw(
 ;
 ; GFX1250-LABEL: global_singlethread_seq_cst_ret_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -3090,6 +3106,7 @@ define amdgpu_kernel void @global_singlethread_monotonic_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: global_singlethread_monotonic_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -3324,6 +3341,7 @@ define amdgpu_kernel void @global_singlethread_acquire_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: global_singlethread_acquire_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -3558,6 +3576,7 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: global_singlethread_release_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -3792,6 +3811,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: global_singlethread_acq_rel_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -4026,6 +4046,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: global_singlethread_seq_cst_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -4260,6 +4281,7 @@ define amdgpu_kernel void @global_singlethread_monotonic_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: global_singlethread_monotonic_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -4494,6 +4516,7 @@ define amdgpu_kernel void @global_singlethread_acquire_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: global_singlethread_acquire_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -4728,6 +4751,7 @@ define amdgpu_kernel void @global_singlethread_release_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: global_singlethread_release_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -4962,6 +4986,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: global_singlethread_acq_rel_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -5196,6 +5221,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: global_singlethread_seq_cst_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -5430,6 +5456,7 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: global_singlethread_monotonic_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -5664,6 +5691,7 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: global_singlethread_acquire_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -5898,6 +5926,7 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: global_singlethread_release_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -6132,6 +6161,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: global_singlethread_acq_rel_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -6366,6 +6396,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: global_singlethread_seq_cst_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -6630,6 +6661,7 @@ define amdgpu_kernel void @global_singlethread_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: global_singlethread_monotonic_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -6898,6 +6930,7 @@ define amdgpu_kernel void @global_singlethread_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: global_singlethread_acquire_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -7166,6 +7199,7 @@ define amdgpu_kernel void @global_singlethread_release_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: global_singlethread_release_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -7434,6 +7468,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: global_singlethread_acq_rel_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -7702,6 +7737,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: global_singlethread_seq_cst_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -7970,6 +8006,7 @@ define amdgpu_kernel void @global_singlethread_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: global_singlethread_monotonic_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -8238,6 +8275,7 @@ define amdgpu_kernel void @global_singlethread_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: global_singlethread_acquire_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -8506,6 +8544,7 @@ define amdgpu_kernel void @global_singlethread_release_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: global_singlethread_release_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -8774,6 +8813,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: global_singlethread_acq_rel_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -9042,6 +9082,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: global_singlethread_seq_cst_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -9310,6 +9351,7 @@ define amdgpu_kernel void @global_singlethread_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: global_singlethread_monotonic_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -9578,6 +9620,7 @@ define amdgpu_kernel void @global_singlethread_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: global_singlethread_acquire_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -9846,6 +9889,7 @@ define amdgpu_kernel void @global_singlethread_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: global_singlethread_release_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -10114,6 +10158,7 @@ define amdgpu_kernel void @global_singlethread_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: global_singlethread_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -10382,6 +10427,7 @@ define amdgpu_kernel void @global_singlethread_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: global_singlethread_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -10585,6 +10631,7 @@ define amdgpu_kernel void @global_singlethread_one_as_unordered_load(
 ;
 ; GFX1250-LABEL: global_singlethread_one_as_unordered_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
@@ -10780,6 +10827,7 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_load(
 ;
 ; GFX1250-LABEL: global_singlethread_one_as_monotonic_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
@@ -10975,6 +11023,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_load(
 ;
 ; GFX1250-LABEL: global_singlethread_one_as_acquire_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
@@ -11170,6 +11219,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_load(
 ;
 ; GFX1250-LABEL: global_singlethread_one_as_seq_cst_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
@@ -11338,6 +11388,7 @@ define amdgpu_kernel void @global_singlethread_one_as_unordered_store(
 ;
 ; GFX1250-LABEL: global_singlethread_one_as_unordered_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
@@ -11505,6 +11556,7 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_store(
 ;
 ; GFX1250-LABEL: global_singlethread_one_as_monotonic_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
@@ -11672,6 +11724,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_store(
 ;
 ; GFX1250-LABEL: global_singlethread_one_as_release_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
@@ -11839,6 +11892,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_store(
 ;
 ; GFX1250-LABEL: global_singlethread_one_as_seq_cst_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
@@ -12004,6 +12058,7 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_atomicrmw(
 ;
 ; GFX1250-LABEL: global_singlethread_one_as_monotonic_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -12169,6 +12224,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_atomicrmw(
 ;
 ; GFX1250-LABEL: global_singlethread_one_as_acquire_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -12334,6 +12390,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_atomicrmw(
 ;
 ; GFX1250-LABEL: global_singlethread_one_as_release_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -12499,6 +12556,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_atomicrmw(
 ;
 ; GFX1250-LABEL: global_singlethread_one_as_acq_rel_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -12664,6 +12722,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_atomicrmw(
 ;
 ; GFX1250-LABEL: global_singlethread_one_as_seq_cst_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -12857,6 +12916,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_ret_atomicrmw(
 ;
 ; GFX1250-LABEL: global_singlethread_one_as_acquire_ret_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -13053,6 +13113,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_ret_atomicrmw(
 ;
 ; GFX1250-LABEL: global_singlethread_one_as_acq_rel_ret_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -13249,6 +13310,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_ret_atomicrmw(
 ;
 ; GFX1250-LABEL: global_singlethread_one_as_seq_cst_ret_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -13481,6 +13543,7 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_monotonic_cmpxch
 ;
 ; GFX1250-LABEL: global_singlethread_one_as_monotonic_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -13715,6 +13778,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: global_singlethread_one_as_acquire_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -13949,6 +14013,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: global_singlethread_one_as_release_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -14183,6 +14248,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: global_singlethread_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -14417,6 +14483,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: global_singlethread_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -14651,6 +14718,7 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: global_singlethread_one_as_monotonic_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -14885,6 +14953,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: global_singlethread_one_as_acquire_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -15119,6 +15188,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: global_singlethread_one_as_release_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -15353,6 +15423,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: global_singlethread_one_as_acq_rel_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -15587,6 +15658,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: global_singlethread_one_as_seq_cst_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -15821,6 +15893,7 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: global_singlethread_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -16055,6 +16128,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: global_singlethread_one_as_acquire_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -16289,6 +16363,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: global_singlethread_one_as_release_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -16523,6 +16598,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: global_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -16757,6 +16833,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: global_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -17021,6 +17098,7 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_monotonic_ret_cm
 ;
 ; GFX1250-LABEL: global_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -17289,6 +17367,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_monotonic_ret_cmpx
 ;
 ; GFX1250-LABEL: global_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -17557,6 +17636,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_monotonic_ret_cmpx
 ;
 ; GFX1250-LABEL: global_singlethread_one_as_release_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -17825,6 +17905,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_monotonic_ret_cmpx
 ;
 ; GFX1250-LABEL: global_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -18093,6 +18174,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_monotonic_ret_cmpx
 ;
 ; GFX1250-LABEL: global_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -18361,6 +18443,7 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_acquire_ret_cmpx
 ;
 ; GFX1250-LABEL: global_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -18629,6 +18712,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_acquire_ret_cmpxch
 ;
 ; GFX1250-LABEL: global_singlethread_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -18897,6 +18981,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_acquire_ret_cmpxch
 ;
 ; GFX1250-LABEL: global_singlethread_one_as_release_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -19165,6 +19250,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_acquire_ret_cmpxch
 ;
 ; GFX1250-LABEL: global_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -19433,6 +19519,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_acquire_ret_cmpxch
 ;
 ; GFX1250-LABEL: global_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -19701,6 +19788,7 @@ define amdgpu_kernel void @global_singlethread_one_as_monotonic_seq_cst_ret_cmpx
 ;
 ; GFX1250-LABEL: global_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -19969,6 +20057,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acquire_seq_cst_ret_cmpxch
 ;
 ; GFX1250-LABEL: global_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -20237,6 +20326,7 @@ define amdgpu_kernel void @global_singlethread_one_as_release_seq_cst_ret_cmpxch
 ;
 ; GFX1250-LABEL: global_singlethread_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -20505,6 +20595,7 @@ define amdgpu_kernel void @global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxch
 ;
 ; GFX1250-LABEL: global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -20773,6 +20864,7 @@ define amdgpu_kernel void @global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxch
 ;
 ; GFX1250-LABEL: global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8

diff  --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll
index ee5a8bf742fe7..8155f1b9843a2 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll
@@ -194,6 +194,7 @@ define amdgpu_kernel void @global_system_unordered_load(
 ;
 ; GFX1250-LABEL: global_system_unordered_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
@@ -389,6 +390,7 @@ define amdgpu_kernel void @global_system_monotonic_load(
 ;
 ; GFX1250-LABEL: global_system_monotonic_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
@@ -602,6 +604,7 @@ define amdgpu_kernel void @global_system_acquire_load(
 ;
 ; GFX1250-LABEL: global_system_acquire_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
@@ -831,6 +834,7 @@ define amdgpu_kernel void @global_system_seq_cst_load(
 ;
 ; GFX1250-LABEL: global_system_seq_cst_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
@@ -1002,6 +1006,7 @@ define amdgpu_kernel void @global_system_unordered_store(
 ;
 ; GFX1250-LABEL: global_system_unordered_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
@@ -1169,6 +1174,7 @@ define amdgpu_kernel void @global_system_monotonic_store(
 ;
 ; GFX1250-LABEL: global_system_monotonic_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
@@ -1365,6 +1371,7 @@ define amdgpu_kernel void @global_system_release_store(
 ;
 ; GFX1250-LABEL: global_system_release_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
@@ -1564,6 +1571,7 @@ define amdgpu_kernel void @global_system_seq_cst_store(
 ;
 ; GFX1250-LABEL: global_system_seq_cst_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
@@ -1732,6 +1740,7 @@ define amdgpu_kernel void @global_system_monotonic_atomicrmw(
 ;
 ; GFX1250-LABEL: global_system_monotonic_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -1928,6 +1937,7 @@ define amdgpu_kernel void @global_system_acquire_atomicrmw(
 ;
 ; GFX1250-LABEL: global_system_acquire_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -2124,6 +2134,7 @@ define amdgpu_kernel void @global_system_release_atomicrmw(
 ;
 ; GFX1250-LABEL: global_system_release_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -2352,6 +2363,7 @@ define amdgpu_kernel void @global_system_acq_rel_atomicrmw(
 ;
 ; GFX1250-LABEL: global_system_acq_rel_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -2582,6 +2594,7 @@ define amdgpu_kernel void @global_system_seq_cst_atomicrmw(
 ;
 ; GFX1250-LABEL: global_system_seq_cst_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -2798,6 +2811,7 @@ define amdgpu_kernel void @global_system_acquire_ret_atomicrmw(
 ;
 ; GFX1250-LABEL: global_system_acquire_ret_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -3042,6 +3056,7 @@ define amdgpu_kernel void @global_system_acq_rel_ret_atomicrmw(
 ;
 ; GFX1250-LABEL: global_system_acq_rel_ret_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -3289,6 +3304,7 @@ define amdgpu_kernel void @global_system_seq_cst_ret_atomicrmw(
 ;
 ; GFX1250-LABEL: global_system_seq_cst_ret_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -3525,6 +3541,7 @@ define amdgpu_kernel void @global_system_monotonic_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: global_system_monotonic_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -3790,6 +3807,7 @@ define amdgpu_kernel void @global_system_acquire_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: global_system_acquire_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -4055,6 +4073,7 @@ define amdgpu_kernel void @global_system_release_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: global_system_release_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -4352,6 +4371,7 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: global_system_acq_rel_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -4651,6 +4671,7 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: global_system_seq_cst_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -4921,6 +4942,7 @@ define amdgpu_kernel void @global_system_monotonic_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: global_system_monotonic_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -5188,6 +5210,7 @@ define amdgpu_kernel void @global_system_acquire_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: global_system_acquire_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -5484,6 +5507,7 @@ define amdgpu_kernel void @global_system_release_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: global_system_release_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -5783,6 +5807,7 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: global_system_acq_rel_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -6082,6 +6107,7 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: global_system_seq_cst_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -6381,6 +6407,7 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: global_system_seq_cst_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -6650,6 +6677,7 @@ define amdgpu_kernel void @global_system_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: global_system_monotonic_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -6936,6 +6964,7 @@ define amdgpu_kernel void @global_system_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: global_system_acquire_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -7252,6 +7281,7 @@ define amdgpu_kernel void @global_system_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: global_system_acq_rel_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -7571,6 +7601,7 @@ define amdgpu_kernel void @global_system_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: global_system_seq_cst_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -7861,6 +7892,7 @@ define amdgpu_kernel void @global_system_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: global_system_monotonic_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -8148,6 +8180,7 @@ define amdgpu_kernel void @global_system_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: global_system_acquire_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -8464,6 +8497,7 @@ define amdgpu_kernel void @global_system_release_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: global_system_release_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -8783,6 +8817,7 @@ define amdgpu_kernel void @global_system_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: global_system_acq_rel_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -9102,6 +9137,7 @@ define amdgpu_kernel void @global_system_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: global_system_seq_cst_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -9421,6 +9457,7 @@ define amdgpu_kernel void @global_system_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: global_system_monotonic_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -9740,6 +9777,7 @@ define amdgpu_kernel void @global_system_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: global_system_acquire_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -10059,6 +10097,7 @@ define amdgpu_kernel void @global_system_relese_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: global_system_relese_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -10378,6 +10417,7 @@ define amdgpu_kernel void @global_system_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: global_system_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -10697,6 +10737,7 @@ define amdgpu_kernel void @global_system_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: global_system_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -10904,6 +10945,7 @@ define amdgpu_kernel void @global_system_one_as_unordered_load(
 ;
 ; GFX1250-LABEL: global_system_one_as_unordered_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
@@ -11099,6 +11141,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_load(
 ;
 ; GFX1250-LABEL: global_system_one_as_monotonic_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
@@ -11312,6 +11355,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_load(
 ;
 ; GFX1250-LABEL: global_system_one_as_acquire_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
@@ -11541,6 +11585,7 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_load(
 ;
 ; GFX1250-LABEL: global_system_one_as_seq_cst_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
@@ -11712,6 +11757,7 @@ define amdgpu_kernel void @global_system_one_as_unordered_store(
 ;
 ; GFX1250-LABEL: global_system_one_as_unordered_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
@@ -11879,6 +11925,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_store(
 ;
 ; GFX1250-LABEL: global_system_one_as_monotonic_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
@@ -12075,6 +12122,7 @@ define amdgpu_kernel void @global_system_one_as_release_store(
 ;
 ; GFX1250-LABEL: global_system_one_as_release_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
@@ -12274,6 +12322,7 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_store(
 ;
 ; GFX1250-LABEL: global_system_one_as_seq_cst_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
@@ -12442,6 +12491,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_atomicrmw(
 ;
 ; GFX1250-LABEL: global_system_one_as_monotonic_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -12638,6 +12688,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_atomicrmw(
 ;
 ; GFX1250-LABEL: global_system_one_as_acquire_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -12834,6 +12885,7 @@ define amdgpu_kernel void @global_system_one_as_release_atomicrmw(
 ;
 ; GFX1250-LABEL: global_system_one_as_release_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -13062,6 +13114,7 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_atomicrmw(
 ;
 ; GFX1250-LABEL: global_system_one_as_acq_rel_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -13292,6 +13345,7 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_atomicrmw(
 ;
 ; GFX1250-LABEL: global_system_one_as_seq_cst_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -13508,6 +13562,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_ret_atomicrmw(
 ;
 ; GFX1250-LABEL: global_system_one_as_acquire_ret_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -13752,6 +13807,7 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_ret_atomicrmw(
 ;
 ; GFX1250-LABEL: global_system_one_as_acq_rel_ret_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -13999,6 +14055,7 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_ret_atomicrmw(
 ;
 ; GFX1250-LABEL: global_system_one_as_seq_cst_ret_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -14235,6 +14292,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -14500,6 +14558,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: global_system_one_as_acquire_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -14765,6 +14824,7 @@ define amdgpu_kernel void @global_system_one_as_release_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: global_system_one_as_release_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -15062,6 +15122,7 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: global_system_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -15361,6 +15422,7 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: global_system_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -15631,6 +15693,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: global_system_one_as_monotonic_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -15898,6 +15961,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: global_system_one_as_acquire_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -16194,6 +16258,7 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: global_system_one_as_release_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -16493,6 +16558,7 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: global_system_one_as_acq_rel_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -16792,6 +16858,7 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: global_system_one_as_seq_cst_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -17091,6 +17158,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: global_system_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -17390,6 +17458,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: global_system_one_as_acquire_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -17689,6 +17758,7 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: global_system_one_as_release_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -17988,6 +18058,7 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: global_system_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -18287,6 +18358,7 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: global_system_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -18556,6 +18628,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: global_system_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -18842,6 +18915,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: global_system_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -19140,6 +19214,7 @@ define amdgpu_kernel void @global_system_one_as_release_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: global_system_one_as_release_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -19458,6 +19533,7 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: global_system_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -19777,6 +19853,7 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: global_system_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -20067,6 +20144,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: global_system_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -20354,6 +20432,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: global_system_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -20670,6 +20749,7 @@ define amdgpu_kernel void @global_system_one_as_release_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: global_system_one_as_release_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -20989,6 +21069,7 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: global_system_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -21308,6 +21389,7 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: global_system_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -21627,6 +21709,7 @@ define amdgpu_kernel void @global_system_one_as_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: global_system_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -21946,6 +22029,7 @@ define amdgpu_kernel void @global_system_one_as_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: global_system_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -22265,6 +22349,7 @@ define amdgpu_kernel void @global_system_one_as_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: global_system_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -22584,6 +22669,7 @@ define amdgpu_kernel void @global_system_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: global_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -22903,6 +22989,7 @@ define amdgpu_kernel void @global_system_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: global_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8

diff  --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll
index 2a40ee532be98..0b240887e4025 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll
@@ -150,6 +150,7 @@ define amdgpu_kernel void @global_volatile_load_0(
 ;
 ; GFX1250-LABEL: global_volatile_load_0:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
@@ -354,6 +355,7 @@ define amdgpu_kernel void @global_volatile_load_1(
 ;
 ; GFX1250-LABEL: global_volatile_load_1:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX1250-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
@@ -525,6 +527,7 @@ define amdgpu_kernel void @global_volatile_store_0(
 ;
 ; GFX1250-LABEL: global_volatile_store_0:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
@@ -724,6 +727,7 @@ define amdgpu_kernel void @global_volatile_store_1(
 ;
 ; GFX1250-LABEL: global_volatile_store_1:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -884,6 +888,7 @@ define amdgpu_kernel void @global_volatile_workgroup_acquire_load(
 ;
 ; GFX1250-LABEL: global_volatile_workgroup_acquire_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
@@ -1031,6 +1036,7 @@ define amdgpu_kernel void @global_volatile_workgroup_release_store(
 ;
 ; GFX1250-LABEL: global_volatile_workgroup_release_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0

diff  --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll
index 09eb062d876f6..7f68ab6b3faf2 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll
@@ -194,6 +194,7 @@ define amdgpu_kernel void @global_wavefront_unordered_load(
 ;
 ; GFX1250-LABEL: global_wavefront_unordered_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
@@ -389,6 +390,7 @@ define amdgpu_kernel void @global_wavefront_monotonic_load(
 ;
 ; GFX1250-LABEL: global_wavefront_monotonic_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
@@ -584,6 +586,7 @@ define amdgpu_kernel void @global_wavefront_acquire_load(
 ;
 ; GFX1250-LABEL: global_wavefront_acquire_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
@@ -779,6 +782,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_load(
 ;
 ; GFX1250-LABEL: global_wavefront_seq_cst_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
@@ -947,6 +951,7 @@ define amdgpu_kernel void @global_wavefront_unordered_store(
 ;
 ; GFX1250-LABEL: global_wavefront_unordered_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
@@ -1114,6 +1119,7 @@ define amdgpu_kernel void @global_wavefront_monotonic_store(
 ;
 ; GFX1250-LABEL: global_wavefront_monotonic_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
@@ -1281,6 +1287,7 @@ define amdgpu_kernel void @global_wavefront_release_store(
 ;
 ; GFX1250-LABEL: global_wavefront_release_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
@@ -1448,6 +1455,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_store(
 ;
 ; GFX1250-LABEL: global_wavefront_seq_cst_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
@@ -1613,6 +1621,7 @@ define amdgpu_kernel void @global_wavefront_monotonic_atomicrmw(
 ;
 ; GFX1250-LABEL: global_wavefront_monotonic_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -1778,6 +1787,7 @@ define amdgpu_kernel void @global_wavefront_acquire_atomicrmw(
 ;
 ; GFX1250-LABEL: global_wavefront_acquire_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -1943,6 +1953,7 @@ define amdgpu_kernel void @global_wavefront_release_atomicrmw(
 ;
 ; GFX1250-LABEL: global_wavefront_release_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -2108,6 +2119,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_atomicrmw(
 ;
 ; GFX1250-LABEL: global_wavefront_acq_rel_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -2273,6 +2285,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_atomicrmw(
 ;
 ; GFX1250-LABEL: global_wavefront_seq_cst_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -2466,6 +2479,7 @@ define amdgpu_kernel void @global_wavefront_acquire_ret_atomicrmw(
 ;
 ; GFX1250-LABEL: global_wavefront_acquire_ret_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -2662,6 +2676,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_ret_atomicrmw(
 ;
 ; GFX1250-LABEL: global_wavefront_acq_rel_ret_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -2858,6 +2873,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_ret_atomicrmw(
 ;
 ; GFX1250-LABEL: global_wavefront_seq_cst_ret_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -3090,6 +3106,7 @@ define amdgpu_kernel void @global_wavefront_monotonic_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: global_wavefront_monotonic_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -3324,6 +3341,7 @@ define amdgpu_kernel void @global_wavefront_acquire_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: global_wavefront_acquire_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -3558,6 +3576,7 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: global_wavefront_release_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -3792,6 +3811,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: global_wavefront_acq_rel_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -4026,6 +4046,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: global_wavefront_seq_cst_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -4260,6 +4281,7 @@ define amdgpu_kernel void @global_wavefront_monotonic_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: global_wavefront_monotonic_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -4494,6 +4516,7 @@ define amdgpu_kernel void @global_wavefront_acquire_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: global_wavefront_acquire_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -4728,6 +4751,7 @@ define amdgpu_kernel void @global_wavefront_release_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: global_wavefront_release_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -4962,6 +4986,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: global_wavefront_acq_rel_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -5196,6 +5221,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: global_wavefront_seq_cst_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -5430,6 +5456,7 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: global_wavefront_monotonic_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -5664,6 +5691,7 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: global_wavefront_acquire_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -5898,6 +5926,7 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: global_wavefront_release_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -6132,6 +6161,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: global_wavefront_acq_rel_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -6366,6 +6396,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: global_wavefront_seq_cst_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -6630,6 +6661,7 @@ define amdgpu_kernel void @global_wavefront_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: global_wavefront_monotonic_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -6898,6 +6930,7 @@ define amdgpu_kernel void @global_wavefront_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: global_wavefront_acquire_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -7166,6 +7199,7 @@ define amdgpu_kernel void @global_wavefront_release_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: global_wavefront_release_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -7434,6 +7468,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: global_wavefront_acq_rel_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -7702,6 +7737,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: global_wavefront_seq_cst_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -7970,6 +8006,7 @@ define amdgpu_kernel void @global_wavefront_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: global_wavefront_monotonic_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -8238,6 +8275,7 @@ define amdgpu_kernel void @global_wavefront_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: global_wavefront_acquire_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -8506,6 +8544,7 @@ define amdgpu_kernel void @global_wavefront_release_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: global_wavefront_release_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -8774,6 +8813,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: global_wavefront_acq_rel_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -9042,6 +9082,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: global_wavefront_seq_cst_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -9310,6 +9351,7 @@ define amdgpu_kernel void @global_wavefront_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: global_wavefront_monotonic_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -9578,6 +9620,7 @@ define amdgpu_kernel void @global_wavefront_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: global_wavefront_acquire_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -9846,6 +9889,7 @@ define amdgpu_kernel void @global_wavefront_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: global_wavefront_release_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -10114,6 +10158,7 @@ define amdgpu_kernel void @global_wavefront_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: global_wavefront_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -10382,6 +10427,7 @@ define amdgpu_kernel void @global_wavefront_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: global_wavefront_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -10585,6 +10631,7 @@ define amdgpu_kernel void @global_wavefront_one_as_unordered_load(
 ;
 ; GFX1250-LABEL: global_wavefront_one_as_unordered_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
@@ -10780,6 +10827,7 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_load(
 ;
 ; GFX1250-LABEL: global_wavefront_one_as_monotonic_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
@@ -10975,6 +11023,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_load(
 ;
 ; GFX1250-LABEL: global_wavefront_one_as_acquire_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
@@ -11170,6 +11219,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_load(
 ;
 ; GFX1250-LABEL: global_wavefront_one_as_seq_cst_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
@@ -11338,6 +11388,7 @@ define amdgpu_kernel void @global_wavefront_one_as_unordered_store(
 ;
 ; GFX1250-LABEL: global_wavefront_one_as_unordered_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
@@ -11505,6 +11556,7 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_store(
 ;
 ; GFX1250-LABEL: global_wavefront_one_as_monotonic_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
@@ -11672,6 +11724,7 @@ define amdgpu_kernel void @global_wavefront_one_as_release_store(
 ;
 ; GFX1250-LABEL: global_wavefront_one_as_release_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
@@ -11839,6 +11892,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_store(
 ;
 ; GFX1250-LABEL: global_wavefront_one_as_seq_cst_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
@@ -12004,6 +12058,7 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_atomicrmw(
 ;
 ; GFX1250-LABEL: global_wavefront_one_as_monotonic_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -12169,6 +12224,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_atomicrmw(
 ;
 ; GFX1250-LABEL: global_wavefront_one_as_acquire_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -12334,6 +12390,7 @@ define amdgpu_kernel void @global_wavefront_one_as_release_atomicrmw(
 ;
 ; GFX1250-LABEL: global_wavefront_one_as_release_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -12499,6 +12556,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_atomicrmw(
 ;
 ; GFX1250-LABEL: global_wavefront_one_as_acq_rel_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -12664,6 +12722,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_atomicrmw(
 ;
 ; GFX1250-LABEL: global_wavefront_one_as_seq_cst_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -12857,6 +12916,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_ret_atomicrmw(
 ;
 ; GFX1250-LABEL: global_wavefront_one_as_acquire_ret_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -13053,6 +13113,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_ret_atomicrmw(
 ;
 ; GFX1250-LABEL: global_wavefront_one_as_acq_rel_ret_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -13249,6 +13310,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_ret_atomicrmw(
 ;
 ; GFX1250-LABEL: global_wavefront_one_as_seq_cst_ret_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -13481,6 +13543,7 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: global_wavefront_one_as_monotonic_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -13715,6 +13778,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: global_wavefront_one_as_acquire_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -13949,6 +14013,7 @@ define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: global_wavefront_one_as_release_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -14183,6 +14248,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: global_wavefront_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -14417,6 +14483,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: global_wavefront_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -14651,6 +14718,7 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: global_wavefront_one_as_monotonic_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -14885,6 +14953,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: global_wavefront_one_as_acquire_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -15119,6 +15188,7 @@ define amdgpu_kernel void @global_wavefront_one_as_release_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: global_wavefront_one_as_release_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -15353,6 +15423,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: global_wavefront_one_as_acq_rel_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -15587,6 +15658,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: global_wavefront_one_as_seq_cst_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -15821,6 +15893,7 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: global_wavefront_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -16055,6 +16128,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: global_wavefront_one_as_acquire_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -16289,6 +16363,7 @@ define amdgpu_kernel void @global_wavefront_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: global_wavefront_one_as_release_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -16523,6 +16598,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: global_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -16757,6 +16833,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: global_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -17021,6 +17098,7 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_monotonic_ret_cmpxc
 ;
 ; GFX1250-LABEL: global_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -17289,6 +17367,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_monotonic_ret_cmpxchg
 ;
 ; GFX1250-LABEL: global_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -17557,6 +17636,7 @@ define amdgpu_kernel void @global_wavefront_one_as_release_monotonic_ret_cmpxchg
 ;
 ; GFX1250-LABEL: global_wavefront_one_as_release_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -17825,6 +17905,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg
 ;
 ; GFX1250-LABEL: global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -18093,6 +18174,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg
 ;
 ; GFX1250-LABEL: global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -18361,6 +18443,7 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_acquire_ret_cmpxchg
 ;
 ; GFX1250-LABEL: global_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -18629,6 +18712,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: global_wavefront_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -18897,6 +18981,7 @@ define amdgpu_kernel void @global_wavefront_one_as_release_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: global_wavefront_one_as_release_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -19165,6 +19250,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -19433,6 +19519,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -19701,6 +19788,7 @@ define amdgpu_kernel void @global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg
 ;
 ; GFX1250-LABEL: global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -19969,6 +20057,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -20237,6 +20326,7 @@ define amdgpu_kernel void @global_wavefront_one_as_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: global_wavefront_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -20505,6 +20595,7 @@ define amdgpu_kernel void @global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -20773,6 +20864,7 @@ define amdgpu_kernel void @global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8

diff  --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll
index 868b438151558..3a36890e5088b 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll
@@ -194,6 +194,7 @@ define amdgpu_kernel void @global_workgroup_unordered_load(
 ;
 ; GFX1250-LABEL: global_workgroup_unordered_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
@@ -389,6 +390,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_load(
 ;
 ; GFX1250-LABEL: global_workgroup_monotonic_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
@@ -589,6 +591,7 @@ define amdgpu_kernel void @global_workgroup_acquire_load(
 ;
 ; GFX1250-LABEL: global_workgroup_acquire_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
@@ -804,6 +807,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_load(
 ;
 ; GFX1250-LABEL: global_workgroup_seq_cst_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
@@ -974,6 +978,7 @@ define amdgpu_kernel void @global_workgroup_unordered_store(
 ;
 ; GFX1250-LABEL: global_workgroup_unordered_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
@@ -1141,6 +1146,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_store(
 ;
 ; GFX1250-LABEL: global_workgroup_monotonic_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
@@ -1331,6 +1337,7 @@ define amdgpu_kernel void @global_workgroup_release_store(
 ;
 ; GFX1250-LABEL: global_workgroup_release_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
@@ -1523,6 +1530,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_store(
 ;
 ; GFX1250-LABEL: global_workgroup_seq_cst_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
@@ -1690,6 +1698,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_atomicrmw(
 ;
 ; GFX1250-LABEL: global_workgroup_monotonic_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -1865,6 +1874,7 @@ define amdgpu_kernel void @global_workgroup_acquire_atomicrmw(
 ;
 ; GFX1250-LABEL: global_workgroup_acquire_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -2054,6 +2064,7 @@ define amdgpu_kernel void @global_workgroup_release_atomicrmw(
 ;
 ; GFX1250-LABEL: global_workgroup_release_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -2257,6 +2268,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_atomicrmw(
 ;
 ; GFX1250-LABEL: global_workgroup_acq_rel_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -2461,6 +2473,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_atomicrmw(
 ;
 ; GFX1250-LABEL: global_workgroup_seq_cst_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -2662,6 +2675,7 @@ define amdgpu_kernel void @global_workgroup_acquire_ret_atomicrmw(
 ;
 ; GFX1250-LABEL: global_workgroup_acquire_ret_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -2886,6 +2900,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_ret_atomicrmw(
 ;
 ; GFX1250-LABEL: global_workgroup_acq_rel_ret_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -3112,6 +3127,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_ret_atomicrmw(
 ;
 ; GFX1250-LABEL: global_workgroup_seq_cst_ret_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -3346,6 +3362,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: global_workgroup_monotonic_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -3590,6 +3607,7 @@ define amdgpu_kernel void @global_workgroup_acquire_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: global_workgroup_acquire_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -3848,6 +3866,7 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: global_workgroup_release_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -4120,6 +4139,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: global_workgroup_acq_rel_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -4393,6 +4413,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: global_workgroup_seq_cst_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -4640,6 +4661,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: global_workgroup_monotonic_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -4885,6 +4907,7 @@ define amdgpu_kernel void @global_workgroup_acquire_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: global_workgroup_acquire_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -5156,6 +5179,7 @@ define amdgpu_kernel void @global_workgroup_release_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: global_workgroup_release_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -5429,6 +5453,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: global_workgroup_acq_rel_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -5702,6 +5727,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: global_workgroup_seq_cst_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -5972,6 +5998,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: global_workgroup_monotonic_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -6242,6 +6269,7 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: global_workgroup_acquire_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -6515,6 +6543,7 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: global_workgroup_release_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -6788,6 +6817,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: global_workgroup_acq_rel_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -7061,6 +7091,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -7328,6 +7359,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: global_workgroup_monotonic_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -7601,6 +7633,7 @@ define amdgpu_kernel void @global_workgroup_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: global_workgroup_acquire_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -7892,6 +7925,7 @@ define amdgpu_kernel void @global_workgroup_release_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: global_workgroup_release_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -8190,6 +8224,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: global_workgroup_acq_rel_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -8488,6 +8523,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: global_workgroup_seq_cst_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -8763,6 +8799,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: global_workgroup_monotonic_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -9036,6 +9073,7 @@ define amdgpu_kernel void @global_workgroup_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: global_workgroup_acquire_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -9332,6 +9370,7 @@ define amdgpu_kernel void @global_workgroup_release_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: global_workgroup_release_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -9630,6 +9669,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: global_workgroup_acq_rel_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -9928,6 +9968,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: global_workgroup_seq_cst_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -10226,6 +10267,7 @@ define amdgpu_kernel void @global_workgroup_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: global_workgroup_monotonic_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -10524,6 +10566,7 @@ define amdgpu_kernel void @global_workgroup_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: global_workgroup_acquire_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -10822,6 +10865,7 @@ define amdgpu_kernel void @global_workgroup_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: global_workgroup_release_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -11120,6 +11164,7 @@ define amdgpu_kernel void @global_workgroup_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: global_workgroup_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -11418,6 +11463,7 @@ define amdgpu_kernel void @global_workgroup_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: global_workgroup_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -11623,6 +11669,7 @@ define amdgpu_kernel void @global_workgroup_one_as_unordered_load(
 ;
 ; GFX1250-LABEL: global_workgroup_one_as_unordered_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
@@ -11818,6 +11865,7 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_load(
 ;
 ; GFX1250-LABEL: global_workgroup_one_as_monotonic_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
@@ -12018,6 +12066,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_load(
 ;
 ; GFX1250-LABEL: global_workgroup_one_as_acquire_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
@@ -12230,6 +12279,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_load(
 ;
 ; GFX1250-LABEL: global_workgroup_one_as_seq_cst_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
@@ -12400,6 +12450,7 @@ define amdgpu_kernel void @global_workgroup_one_as_unordered_store(
 ;
 ; GFX1250-LABEL: global_workgroup_one_as_unordered_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
@@ -12567,6 +12618,7 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_store(
 ;
 ; GFX1250-LABEL: global_workgroup_one_as_monotonic_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
@@ -12752,6 +12804,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_store(
 ;
 ; GFX1250-LABEL: global_workgroup_one_as_release_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
@@ -12939,6 +12992,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_store(
 ;
 ; GFX1250-LABEL: global_workgroup_one_as_seq_cst_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
@@ -13106,6 +13160,7 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_atomicrmw(
 ;
 ; GFX1250-LABEL: global_workgroup_one_as_monotonic_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -13281,6 +13336,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_atomicrmw(
 ;
 ; GFX1250-LABEL: global_workgroup_one_as_acquire_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -13465,6 +13521,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_atomicrmw(
 ;
 ; GFX1250-LABEL: global_workgroup_one_as_release_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -13663,6 +13720,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_atomicrmw(
 ;
 ; GFX1250-LABEL: global_workgroup_one_as_acq_rel_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -13862,6 +13920,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_atomicrmw(
 ;
 ; GFX1250-LABEL: global_workgroup_one_as_seq_cst_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -14063,6 +14122,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_ret_atomicrmw(
 ;
 ; GFX1250-LABEL: global_workgroup_one_as_acquire_ret_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -14282,6 +14342,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_ret_atomicrmw(
 ;
 ; GFX1250-LABEL: global_workgroup_one_as_acq_rel_ret_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -14503,6 +14564,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_ret_atomicrmw(
 ;
 ; GFX1250-LABEL: global_workgroup_one_as_seq_cst_ret_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x8
@@ -14737,6 +14799,7 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: global_workgroup_one_as_monotonic_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -14981,6 +15044,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: global_workgroup_one_as_acquire_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -15234,6 +15298,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: global_workgroup_one_as_release_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -15501,6 +15566,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -15769,6 +15835,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -16016,6 +16083,7 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: global_workgroup_one_as_monotonic_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -16261,6 +16329,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: global_workgroup_one_as_acquire_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -16527,6 +16596,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: global_workgroup_one_as_release_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -16795,6 +16865,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -17063,6 +17134,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -17328,6 +17400,7 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: global_workgroup_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -17593,6 +17666,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: global_workgroup_one_as_acquire_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -17861,6 +17935,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: global_workgroup_one_as_release_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -18129,6 +18204,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: global_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -18397,6 +18473,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -18664,6 +18741,7 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_monotonic_ret_cmpxc
 ;
 ; GFX1250-LABEL: global_workgroup_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -18937,6 +19015,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_monotonic_ret_cmpxchg
 ;
 ; GFX1250-LABEL: global_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -19223,6 +19302,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_monotonic_ret_cmpxchg
 ;
 ; GFX1250-LABEL: global_workgroup_one_as_release_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -19516,6 +19596,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg
 ;
 ; GFX1250-LABEL: global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -19809,6 +19890,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg
 ;
 ; GFX1250-LABEL: global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -20084,6 +20166,7 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_acquire_ret_cmpxchg
 ;
 ; GFX1250-LABEL: global_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -20357,6 +20440,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: global_workgroup_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -20648,6 +20732,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: global_workgroup_one_as_release_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -20941,6 +21026,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -21234,6 +21320,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -21527,6 +21614,7 @@ define amdgpu_kernel void @global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg
 ;
 ; GFX1250-LABEL: global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -21820,6 +21908,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -22113,6 +22202,7 @@ define amdgpu_kernel void @global_workgroup_one_as_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: global_workgroup_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -22406,6 +22496,7 @@ define amdgpu_kernel void @global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8
@@ -22699,6 +22790,7 @@ define amdgpu_kernel void @global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x8

diff  --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll
index 712109d2f67f9..336b980e544cf 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll
@@ -181,6 +181,7 @@ define amdgpu_kernel void @local_agent_unordered_load(
 ;
 ; GFX1250-LABEL: local_agent_unordered_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -364,6 +365,7 @@ define amdgpu_kernel void @local_agent_monotonic_load(
 ;
 ; GFX1250-LABEL: local_agent_monotonic_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -552,6 +554,7 @@ define amdgpu_kernel void @local_agent_acquire_load(
 ;
 ; GFX1250-LABEL: local_agent_acquire_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -763,6 +766,7 @@ define amdgpu_kernel void @local_agent_seq_cst_load(
 ;
 ; GFX1250-LABEL: local_agent_seq_cst_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -918,6 +922,7 @@ define amdgpu_kernel void @local_agent_unordered_store(
 ;
 ; GFX1250-LABEL: local_agent_unordered_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -1068,6 +1073,7 @@ define amdgpu_kernel void @local_agent_monotonic_store(
 ;
 ; GFX1250-LABEL: local_agent_monotonic_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -1241,6 +1247,7 @@ define amdgpu_kernel void @local_agent_release_store(
 ;
 ; GFX1250-LABEL: local_agent_release_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -1416,6 +1423,7 @@ define amdgpu_kernel void @local_agent_seq_cst_store(
 ;
 ; GFX1250-LABEL: local_agent_seq_cst_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -1568,6 +1576,7 @@ define amdgpu_kernel void @local_agent_monotonic_atomicrmw(
 ;
 ; GFX1250-LABEL: local_agent_monotonic_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -1734,6 +1743,7 @@ define amdgpu_kernel void @local_agent_acquire_atomicrmw(
 ;
 ; GFX1250-LABEL: local_agent_acquire_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -1908,6 +1918,7 @@ define amdgpu_kernel void @local_agent_release_atomicrmw(
 ;
 ; GFX1250-LABEL: local_agent_release_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -2099,6 +2110,7 @@ define amdgpu_kernel void @local_agent_acq_rel_atomicrmw(
 ;
 ; GFX1250-LABEL: local_agent_acq_rel_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -2291,6 +2303,7 @@ define amdgpu_kernel void @local_agent_seq_cst_atomicrmw(
 ;
 ; GFX1250-LABEL: local_agent_seq_cst_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -2491,6 +2504,7 @@ define amdgpu_kernel void @local_agent_acquire_ret_atomicrmw(
 ;
 ; GFX1250-LABEL: local_agent_acquire_ret_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -2715,6 +2729,7 @@ define amdgpu_kernel void @local_agent_acq_rel_ret_atomicrmw(
 ;
 ; GFX1250-LABEL: local_agent_acq_rel_ret_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -2941,6 +2956,7 @@ define amdgpu_kernel void @local_agent_seq_cst_ret_atomicrmw(
 ;
 ; GFX1250-LABEL: local_agent_seq_cst_ret_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -3123,6 +3139,7 @@ define amdgpu_kernel void @local_agent_monotonic_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: local_agent_monotonic_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -3318,6 +3335,7 @@ define amdgpu_kernel void @local_agent_acquire_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: local_agent_acquire_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -3521,6 +3539,7 @@ define amdgpu_kernel void @local_agent_release_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: local_agent_release_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -3741,6 +3760,7 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: local_agent_acq_rel_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -3962,6 +3982,7 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: local_agent_seq_cst_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -4160,6 +4181,7 @@ define amdgpu_kernel void @local_agent_monotonic_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: local_agent_monotonic_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -4356,6 +4378,7 @@ define amdgpu_kernel void @local_agent_acquire_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: local_agent_acquire_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -4575,6 +4598,7 @@ define amdgpu_kernel void @local_agent_release_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: local_agent_release_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -4796,6 +4820,7 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: local_agent_acq_rel_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -5017,6 +5042,7 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: local_agent_seq_cst_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -5238,6 +5264,7 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: local_agent_monotonic_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -5459,6 +5486,7 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: local_agent_acquire_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -5680,6 +5708,7 @@ define amdgpu_kernel void @local_agent_release_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: local_agent_release_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -5901,6 +5930,7 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: local_agent_acq_rel_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -6122,6 +6152,7 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: local_agent_seq_cst_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -6346,6 +6377,7 @@ define amdgpu_kernel void @local_agent_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: local_agent_monotonic_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -6577,6 +6609,7 @@ define amdgpu_kernel void @local_agent_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: local_agent_acquire_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -6826,6 +6859,7 @@ define amdgpu_kernel void @local_agent_release_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: local_agent_release_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -7082,6 +7116,7 @@ define amdgpu_kernel void @local_agent_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: local_agent_acq_rel_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -7338,6 +7373,7 @@ define amdgpu_kernel void @local_agent_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: local_agent_seq_cst_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -7571,6 +7607,7 @@ define amdgpu_kernel void @local_agent_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: local_agent_monotonic_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -7802,6 +7839,7 @@ define amdgpu_kernel void @local_agent_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: local_agent_acquire_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -8056,6 +8094,7 @@ define amdgpu_kernel void @local_agent_release_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: local_agent_release_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -8312,6 +8351,7 @@ define amdgpu_kernel void @local_agent_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: local_agent_acq_rel_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -8568,6 +8608,7 @@ define amdgpu_kernel void @local_agent_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: local_agent_seq_cst_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -8824,6 +8865,7 @@ define amdgpu_kernel void @local_agent_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: local_agent_monotonic_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -9080,6 +9122,7 @@ define amdgpu_kernel void @local_agent_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: local_agent_acquire_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -9336,6 +9379,7 @@ define amdgpu_kernel void @local_agent_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: local_agent_release_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -9592,6 +9636,7 @@ define amdgpu_kernel void @local_agent_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: local_agent_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -9848,6 +9893,7 @@ define amdgpu_kernel void @local_agent_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: local_agent_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -10038,6 +10084,7 @@ define amdgpu_kernel void @local_agent_one_as_unordered_load(
 ;
 ; GFX1250-LABEL: local_agent_one_as_unordered_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -10221,6 +10268,7 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_load(
 ;
 ; GFX1250-LABEL: local_agent_one_as_monotonic_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -10404,6 +10452,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_load(
 ;
 ; GFX1250-LABEL: local_agent_one_as_acquire_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -10587,6 +10636,7 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_load(
 ;
 ; GFX1250-LABEL: local_agent_one_as_seq_cst_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -10740,6 +10790,7 @@ define amdgpu_kernel void @local_agent_one_as_unordered_store(
 ;
 ; GFX1250-LABEL: local_agent_one_as_unordered_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -10890,6 +10941,7 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_store(
 ;
 ; GFX1250-LABEL: local_agent_one_as_monotonic_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -11040,6 +11092,7 @@ define amdgpu_kernel void @local_agent_one_as_release_store(
 ;
 ; GFX1250-LABEL: local_agent_one_as_release_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -11190,6 +11243,7 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_store(
 ;
 ; GFX1250-LABEL: local_agent_one_as_seq_cst_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -11340,6 +11394,7 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_atomicrmw(
 ;
 ; GFX1250-LABEL: local_agent_one_as_monotonic_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -11490,6 +11545,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_atomicrmw(
 ;
 ; GFX1250-LABEL: local_agent_one_as_acquire_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -11640,6 +11696,7 @@ define amdgpu_kernel void @local_agent_one_as_release_atomicrmw(
 ;
 ; GFX1250-LABEL: local_agent_one_as_release_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -11790,6 +11847,7 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_atomicrmw(
 ;
 ; GFX1250-LABEL: local_agent_one_as_acq_rel_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -11940,6 +11998,7 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_atomicrmw(
 ;
 ; GFX1250-LABEL: local_agent_one_as_seq_cst_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -12132,6 +12191,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_ret_atomicrmw(
 ;
 ; GFX1250-LABEL: local_agent_one_as_acquire_ret_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -12328,6 +12388,7 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_ret_atomicrmw(
 ;
 ; GFX1250-LABEL: local_agent_one_as_acq_rel_ret_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -12524,6 +12585,7 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_ret_atomicrmw(
 ;
 ; GFX1250-LABEL: local_agent_one_as_seq_cst_ret_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -12704,6 +12766,7 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: local_agent_one_as_monotonic_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -12883,6 +12946,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: local_agent_one_as_acquire_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -13062,6 +13126,7 @@ define amdgpu_kernel void @local_agent_one_as_release_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: local_agent_one_as_release_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -13241,6 +13306,7 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: local_agent_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -13420,6 +13486,7 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: local_agent_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -13599,6 +13666,7 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: local_agent_one_as_monotonic_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -13778,6 +13846,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: local_agent_one_as_acquire_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -13957,6 +14026,7 @@ define amdgpu_kernel void @local_agent_one_as_release_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: local_agent_one_as_release_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -14136,6 +14206,7 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: local_agent_one_as_acq_rel_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -14315,6 +14386,7 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: local_agent_one_as_seq_cst_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -14494,6 +14566,7 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: local_agent_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -14673,6 +14746,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: local_agent_one_as_acquire_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -14852,6 +14926,7 @@ define amdgpu_kernel void @local_agent_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: local_agent_one_as_release_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -15031,6 +15106,7 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: local_agent_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -15210,6 +15286,7 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: local_agent_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -15431,6 +15508,7 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: local_agent_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -15657,6 +15735,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: local_agent_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -15883,6 +15962,7 @@ define amdgpu_kernel void @local_agent_one_as_release_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: local_agent_one_as_release_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -16109,6 +16189,7 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: local_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -16335,6 +16416,7 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: local_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -16561,6 +16643,7 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: local_agent_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -16787,6 +16870,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: local_agent_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -17013,6 +17097,7 @@ define amdgpu_kernel void @local_agent_one_as_release_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: local_agent_one_as_release_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -17239,6 +17324,7 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: local_agent_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -17465,6 +17551,7 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: local_agent_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -17691,6 +17778,7 @@ define amdgpu_kernel void @local_agent_one_as_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: local_agent_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -17917,6 +18005,7 @@ define amdgpu_kernel void @local_agent_one_as_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: local_agent_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -18143,6 +18232,7 @@ define amdgpu_kernel void @local_agent_one_as_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: local_agent_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -18369,6 +18459,7 @@ define amdgpu_kernel void @local_agent_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: local_agent_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -18595,6 +18686,7 @@ define amdgpu_kernel void @local_agent_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: local_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8

diff  --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-cluster.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-cluster.ll
index 6d1e4e6a96119..7d11fd74ec863 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-cluster.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-cluster.ll
@@ -181,6 +181,7 @@ define amdgpu_kernel void @local_cluster_unordered_load(
 ;
 ; GFX1250-LABEL: local_cluster_unordered_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -364,6 +365,7 @@ define amdgpu_kernel void @local_cluster_monotonic_load(
 ;
 ; GFX1250-LABEL: local_cluster_monotonic_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -552,6 +554,7 @@ define amdgpu_kernel void @local_cluster_acquire_load(
 ;
 ; GFX1250-LABEL: local_cluster_acquire_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -763,6 +766,7 @@ define amdgpu_kernel void @local_cluster_seq_cst_load(
 ;
 ; GFX1250-LABEL: local_cluster_seq_cst_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -918,6 +922,7 @@ define amdgpu_kernel void @local_cluster_unordered_store(
 ;
 ; GFX1250-LABEL: local_cluster_unordered_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -1068,6 +1073,7 @@ define amdgpu_kernel void @local_cluster_monotonic_store(
 ;
 ; GFX1250-LABEL: local_cluster_monotonic_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -1241,6 +1247,7 @@ define amdgpu_kernel void @local_cluster_release_store(
 ;
 ; GFX1250-LABEL: local_cluster_release_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -1416,6 +1423,7 @@ define amdgpu_kernel void @local_cluster_seq_cst_store(
 ;
 ; GFX1250-LABEL: local_cluster_seq_cst_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -1568,6 +1576,7 @@ define amdgpu_kernel void @local_cluster_monotonic_atomicrmw(
 ;
 ; GFX1250-LABEL: local_cluster_monotonic_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -1734,6 +1743,7 @@ define amdgpu_kernel void @local_cluster_acquire_atomicrmw(
 ;
 ; GFX1250-LABEL: local_cluster_acquire_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -1908,6 +1918,7 @@ define amdgpu_kernel void @local_cluster_release_atomicrmw(
 ;
 ; GFX1250-LABEL: local_cluster_release_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -2099,6 +2110,7 @@ define amdgpu_kernel void @local_cluster_acq_rel_atomicrmw(
 ;
 ; GFX1250-LABEL: local_cluster_acq_rel_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -2291,6 +2303,7 @@ define amdgpu_kernel void @local_cluster_seq_cst_atomicrmw(
 ;
 ; GFX1250-LABEL: local_cluster_seq_cst_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -2491,6 +2504,7 @@ define amdgpu_kernel void @local_cluster_acquire_ret_atomicrmw(
 ;
 ; GFX1250-LABEL: local_cluster_acquire_ret_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -2715,6 +2729,7 @@ define amdgpu_kernel void @local_cluster_acq_rel_ret_atomicrmw(
 ;
 ; GFX1250-LABEL: local_cluster_acq_rel_ret_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -2941,6 +2956,7 @@ define amdgpu_kernel void @local_cluster_seq_cst_ret_atomicrmw(
 ;
 ; GFX1250-LABEL: local_cluster_seq_cst_ret_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -3123,6 +3139,7 @@ define amdgpu_kernel void @local_cluster_monotonic_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: local_cluster_monotonic_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -3318,6 +3335,7 @@ define amdgpu_kernel void @local_cluster_acquire_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: local_cluster_acquire_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -3521,6 +3539,7 @@ define amdgpu_kernel void @local_cluster_release_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: local_cluster_release_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -3741,6 +3760,7 @@ define amdgpu_kernel void @local_cluster_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: local_cluster_acq_rel_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -3962,6 +3982,7 @@ define amdgpu_kernel void @local_cluster_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: local_cluster_seq_cst_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -4160,6 +4181,7 @@ define amdgpu_kernel void @local_cluster_monotonic_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: local_cluster_monotonic_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -4356,6 +4378,7 @@ define amdgpu_kernel void @local_cluster_acquire_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: local_cluster_acquire_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -4575,6 +4598,7 @@ define amdgpu_kernel void @local_cluster_release_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: local_cluster_release_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -4796,6 +4820,7 @@ define amdgpu_kernel void @local_cluster_acq_rel_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: local_cluster_acq_rel_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -5017,6 +5042,7 @@ define amdgpu_kernel void @local_cluster_seq_cst_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: local_cluster_seq_cst_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -5238,6 +5264,7 @@ define amdgpu_kernel void @local_cluster_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: local_cluster_monotonic_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -5459,6 +5486,7 @@ define amdgpu_kernel void @local_cluster_acquire_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: local_cluster_acquire_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -5680,6 +5708,7 @@ define amdgpu_kernel void @local_cluster_release_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: local_cluster_release_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -5901,6 +5930,7 @@ define amdgpu_kernel void @local_cluster_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: local_cluster_acq_rel_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -6122,6 +6152,7 @@ define amdgpu_kernel void @local_cluster_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: local_cluster_seq_cst_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -6346,6 +6377,7 @@ define amdgpu_kernel void @local_cluster_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: local_cluster_monotonic_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -6577,6 +6609,7 @@ define amdgpu_kernel void @local_cluster_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: local_cluster_acquire_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -6826,6 +6859,7 @@ define amdgpu_kernel void @local_cluster_release_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: local_cluster_release_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -7082,6 +7116,7 @@ define amdgpu_kernel void @local_cluster_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: local_cluster_acq_rel_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -7338,6 +7373,7 @@ define amdgpu_kernel void @local_cluster_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: local_cluster_seq_cst_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -7571,6 +7607,7 @@ define amdgpu_kernel void @local_cluster_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: local_cluster_monotonic_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -7802,6 +7839,7 @@ define amdgpu_kernel void @local_cluster_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: local_cluster_acquire_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -8056,6 +8094,7 @@ define amdgpu_kernel void @local_cluster_release_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: local_cluster_release_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -8312,6 +8351,7 @@ define amdgpu_kernel void @local_cluster_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: local_cluster_acq_rel_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -8568,6 +8608,7 @@ define amdgpu_kernel void @local_cluster_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: local_cluster_seq_cst_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -8824,6 +8865,7 @@ define amdgpu_kernel void @local_cluster_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: local_cluster_monotonic_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -9080,6 +9122,7 @@ define amdgpu_kernel void @local_cluster_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: local_cluster_acquire_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -9336,6 +9379,7 @@ define amdgpu_kernel void @local_cluster_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: local_cluster_release_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -9592,6 +9636,7 @@ define amdgpu_kernel void @local_cluster_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: local_cluster_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -9848,6 +9893,7 @@ define amdgpu_kernel void @local_cluster_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: local_cluster_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -10038,6 +10084,7 @@ define amdgpu_kernel void @local_cluster_one_as_unordered_load(
 ;
 ; GFX1250-LABEL: local_cluster_one_as_unordered_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -10221,6 +10268,7 @@ define amdgpu_kernel void @local_cluster_one_as_monotonic_load(
 ;
 ; GFX1250-LABEL: local_cluster_one_as_monotonic_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -10404,6 +10452,7 @@ define amdgpu_kernel void @local_cluster_one_as_acquire_load(
 ;
 ; GFX1250-LABEL: local_cluster_one_as_acquire_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -10587,6 +10636,7 @@ define amdgpu_kernel void @local_cluster_one_as_seq_cst_load(
 ;
 ; GFX1250-LABEL: local_cluster_one_as_seq_cst_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -10740,6 +10790,7 @@ define amdgpu_kernel void @local_cluster_one_as_unordered_store(
 ;
 ; GFX1250-LABEL: local_cluster_one_as_unordered_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -10890,6 +10941,7 @@ define amdgpu_kernel void @local_cluster_one_as_monotonic_store(
 ;
 ; GFX1250-LABEL: local_cluster_one_as_monotonic_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -11040,6 +11092,7 @@ define amdgpu_kernel void @local_cluster_one_as_release_store(
 ;
 ; GFX1250-LABEL: local_cluster_one_as_release_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -11190,6 +11243,7 @@ define amdgpu_kernel void @local_cluster_one_as_seq_cst_store(
 ;
 ; GFX1250-LABEL: local_cluster_one_as_seq_cst_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -11340,6 +11394,7 @@ define amdgpu_kernel void @local_cluster_one_as_monotonic_atomicrmw(
 ;
 ; GFX1250-LABEL: local_cluster_one_as_monotonic_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -11490,6 +11545,7 @@ define amdgpu_kernel void @local_cluster_one_as_acquire_atomicrmw(
 ;
 ; GFX1250-LABEL: local_cluster_one_as_acquire_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -11640,6 +11696,7 @@ define amdgpu_kernel void @local_cluster_one_as_release_atomicrmw(
 ;
 ; GFX1250-LABEL: local_cluster_one_as_release_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -11790,6 +11847,7 @@ define amdgpu_kernel void @local_cluster_one_as_acq_rel_atomicrmw(
 ;
 ; GFX1250-LABEL: local_cluster_one_as_acq_rel_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -11940,6 +11998,7 @@ define amdgpu_kernel void @local_cluster_one_as_seq_cst_atomicrmw(
 ;
 ; GFX1250-LABEL: local_cluster_one_as_seq_cst_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -12132,6 +12191,7 @@ define amdgpu_kernel void @local_cluster_one_as_acquire_ret_atomicrmw(
 ;
 ; GFX1250-LABEL: local_cluster_one_as_acquire_ret_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -12328,6 +12388,7 @@ define amdgpu_kernel void @local_cluster_one_as_acq_rel_ret_atomicrmw(
 ;
 ; GFX1250-LABEL: local_cluster_one_as_acq_rel_ret_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -12524,6 +12585,7 @@ define amdgpu_kernel void @local_cluster_one_as_seq_cst_ret_atomicrmw(
 ;
 ; GFX1250-LABEL: local_cluster_one_as_seq_cst_ret_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -12704,6 +12766,7 @@ define amdgpu_kernel void @local_cluster_one_as_monotonic_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: local_cluster_one_as_monotonic_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -12883,6 +12946,7 @@ define amdgpu_kernel void @local_cluster_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: local_cluster_one_as_acquire_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -13062,6 +13126,7 @@ define amdgpu_kernel void @local_cluster_one_as_release_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: local_cluster_one_as_release_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -13241,6 +13306,7 @@ define amdgpu_kernel void @local_cluster_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: local_cluster_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -13420,6 +13486,7 @@ define amdgpu_kernel void @local_cluster_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: local_cluster_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -13599,6 +13666,7 @@ define amdgpu_kernel void @local_cluster_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: local_cluster_one_as_monotonic_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -13778,6 +13846,7 @@ define amdgpu_kernel void @local_cluster_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: local_cluster_one_as_acquire_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -13957,6 +14026,7 @@ define amdgpu_kernel void @local_cluster_one_as_release_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: local_cluster_one_as_release_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -14136,6 +14206,7 @@ define amdgpu_kernel void @local_cluster_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: local_cluster_one_as_acq_rel_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -14315,6 +14386,7 @@ define amdgpu_kernel void @local_cluster_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: local_cluster_one_as_seq_cst_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -14494,6 +14566,7 @@ define amdgpu_kernel void @local_cluster_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: local_cluster_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -14673,6 +14746,7 @@ define amdgpu_kernel void @local_cluster_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: local_cluster_one_as_acquire_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -14852,6 +14926,7 @@ define amdgpu_kernel void @local_cluster_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: local_cluster_one_as_release_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -15031,6 +15106,7 @@ define amdgpu_kernel void @local_cluster_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: local_cluster_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -15210,6 +15286,7 @@ define amdgpu_kernel void @local_cluster_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: local_cluster_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -15431,6 +15508,7 @@ define amdgpu_kernel void @local_cluster_one_as_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: local_cluster_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -15657,6 +15735,7 @@ define amdgpu_kernel void @local_cluster_one_as_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: local_cluster_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -15883,6 +15962,7 @@ define amdgpu_kernel void @local_cluster_one_as_release_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: local_cluster_one_as_release_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -16109,6 +16189,7 @@ define amdgpu_kernel void @local_cluster_one_as_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: local_cluster_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -16335,6 +16416,7 @@ define amdgpu_kernel void @local_cluster_one_as_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: local_cluster_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -16561,6 +16643,7 @@ define amdgpu_kernel void @local_cluster_one_as_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: local_cluster_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -16787,6 +16870,7 @@ define amdgpu_kernel void @local_cluster_one_as_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: local_cluster_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -17013,6 +17097,7 @@ define amdgpu_kernel void @local_cluster_one_as_release_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: local_cluster_one_as_release_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -17239,6 +17324,7 @@ define amdgpu_kernel void @local_cluster_one_as_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: local_cluster_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -17465,6 +17551,7 @@ define amdgpu_kernel void @local_cluster_one_as_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: local_cluster_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -17691,6 +17778,7 @@ define amdgpu_kernel void @local_cluster_one_as_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: local_cluster_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -17917,6 +18005,7 @@ define amdgpu_kernel void @local_cluster_one_as_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: local_cluster_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -18143,6 +18232,7 @@ define amdgpu_kernel void @local_cluster_one_as_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: local_cluster_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -18369,6 +18459,7 @@ define amdgpu_kernel void @local_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: local_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -18595,6 +18686,7 @@ define amdgpu_kernel void @local_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: local_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8

diff  --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll
index 9888204b997a9..550afd4141742 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll
@@ -197,6 +197,7 @@ define amdgpu_kernel void @local_nontemporal_load_0(
 ;
 ; GFX1250-LABEL: local_nontemporal_load_0:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
@@ -428,6 +429,7 @@ define amdgpu_kernel void @local_nontemporal_load_1(
 ;
 ; GFX1250-LABEL: local_nontemporal_load_1:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
@@ -612,6 +614,7 @@ define amdgpu_kernel void @local_nontemporal_store_0(
 ;
 ; GFX1250-LABEL: local_nontemporal_store_0:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -813,6 +816,7 @@ define amdgpu_kernel void @local_nontemporal_store_1(
 ;
 ; GFX1250-LABEL: local_nontemporal_store_1:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -1017,6 +1021,7 @@ define amdgpu_kernel void @local_nontemporal_volatile_load(
 ;
 ; GFX1250-LABEL: local_nontemporal_volatile_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0

diff  --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-singlethread.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-singlethread.ll
index 1800acbbf605b..e72a28b3efc66 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-singlethread.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-singlethread.ll
@@ -181,6 +181,7 @@ define amdgpu_kernel void @local_singlethread_unordered_load(
 ;
 ; GFX1250-LABEL: local_singlethread_unordered_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -364,6 +365,7 @@ define amdgpu_kernel void @local_singlethread_monotonic_load(
 ;
 ; GFX1250-LABEL: local_singlethread_monotonic_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -547,6 +549,7 @@ define amdgpu_kernel void @local_singlethread_acquire_load(
 ;
 ; GFX1250-LABEL: local_singlethread_acquire_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -730,6 +733,7 @@ define amdgpu_kernel void @local_singlethread_seq_cst_load(
 ;
 ; GFX1250-LABEL: local_singlethread_seq_cst_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -883,6 +887,7 @@ define amdgpu_kernel void @local_singlethread_unordered_store(
 ;
 ; GFX1250-LABEL: local_singlethread_unordered_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -1033,6 +1038,7 @@ define amdgpu_kernel void @local_singlethread_monotonic_store(
 ;
 ; GFX1250-LABEL: local_singlethread_monotonic_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -1183,6 +1189,7 @@ define amdgpu_kernel void @local_singlethread_release_store(
 ;
 ; GFX1250-LABEL: local_singlethread_release_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -1333,6 +1340,7 @@ define amdgpu_kernel void @local_singlethread_seq_cst_store(
 ;
 ; GFX1250-LABEL: local_singlethread_seq_cst_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -1483,6 +1491,7 @@ define amdgpu_kernel void @local_singlethread_monotonic_atomicrmw(
 ;
 ; GFX1250-LABEL: local_singlethread_monotonic_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -1633,6 +1642,7 @@ define amdgpu_kernel void @local_singlethread_acquire_atomicrmw(
 ;
 ; GFX1250-LABEL: local_singlethread_acquire_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -1783,6 +1793,7 @@ define amdgpu_kernel void @local_singlethread_release_atomicrmw(
 ;
 ; GFX1250-LABEL: local_singlethread_release_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -1933,6 +1944,7 @@ define amdgpu_kernel void @local_singlethread_acq_rel_atomicrmw(
 ;
 ; GFX1250-LABEL: local_singlethread_acq_rel_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -2083,6 +2095,7 @@ define amdgpu_kernel void @local_singlethread_seq_cst_atomicrmw(
 ;
 ; GFX1250-LABEL: local_singlethread_seq_cst_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -2275,6 +2288,7 @@ define amdgpu_kernel void @local_singlethread_acquire_ret_atomicrmw(
 ;
 ; GFX1250-LABEL: local_singlethread_acquire_ret_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -2471,6 +2485,7 @@ define amdgpu_kernel void @local_singlethread_acq_rel_ret_atomicrmw(
 ;
 ; GFX1250-LABEL: local_singlethread_acq_rel_ret_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -2667,6 +2682,7 @@ define amdgpu_kernel void @local_singlethread_seq_cst_ret_atomicrmw(
 ;
 ; GFX1250-LABEL: local_singlethread_seq_cst_ret_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -2847,6 +2863,7 @@ define amdgpu_kernel void @local_singlethread_monotonic_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: local_singlethread_monotonic_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -3026,6 +3043,7 @@ define amdgpu_kernel void @local_singlethread_acquire_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: local_singlethread_acquire_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -3205,6 +3223,7 @@ define amdgpu_kernel void @local_singlethread_release_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: local_singlethread_release_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -3384,6 +3403,7 @@ define amdgpu_kernel void @local_singlethread_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: local_singlethread_acq_rel_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -3563,6 +3583,7 @@ define amdgpu_kernel void @local_singlethread_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: local_singlethread_seq_cst_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -3742,6 +3763,7 @@ define amdgpu_kernel void @local_singlethread_monotonic_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: local_singlethread_monotonic_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -3921,6 +3943,7 @@ define amdgpu_kernel void @local_singlethread_acquire_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: local_singlethread_acquire_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -4100,6 +4123,7 @@ define amdgpu_kernel void @local_singlethread_release_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: local_singlethread_release_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -4279,6 +4303,7 @@ define amdgpu_kernel void @local_singlethread_acq_rel_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: local_singlethread_acq_rel_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -4458,6 +4483,7 @@ define amdgpu_kernel void @local_singlethread_seq_cst_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: local_singlethread_seq_cst_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -4637,6 +4663,7 @@ define amdgpu_kernel void @local_singlethread_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: local_singlethread_monotonic_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -4816,6 +4843,7 @@ define amdgpu_kernel void @local_singlethread_acquire_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: local_singlethread_acquire_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -4995,6 +5023,7 @@ define amdgpu_kernel void @local_singlethread_release_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: local_singlethread_release_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -5174,6 +5203,7 @@ define amdgpu_kernel void @local_singlethread_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: local_singlethread_acq_rel_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -5353,6 +5383,7 @@ define amdgpu_kernel void @local_singlethread_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: local_singlethread_seq_cst_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -5574,6 +5605,7 @@ define amdgpu_kernel void @local_singlethread_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: local_singlethread_monotonic_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -5800,6 +5832,7 @@ define amdgpu_kernel void @local_singlethread_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: local_singlethread_acquire_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -6026,6 +6059,7 @@ define amdgpu_kernel void @local_singlethread_release_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: local_singlethread_release_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -6252,6 +6286,7 @@ define amdgpu_kernel void @local_singlethread_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: local_singlethread_acq_rel_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -6478,6 +6513,7 @@ define amdgpu_kernel void @local_singlethread_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: local_singlethread_seq_cst_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -6704,6 +6740,7 @@ define amdgpu_kernel void @local_singlethread_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: local_singlethread_monotonic_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -6930,6 +6967,7 @@ define amdgpu_kernel void @local_singlethread_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: local_singlethread_acquire_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -7156,6 +7194,7 @@ define amdgpu_kernel void @local_singlethread_release_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: local_singlethread_release_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -7382,6 +7421,7 @@ define amdgpu_kernel void @local_singlethread_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: local_singlethread_acq_rel_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -7608,6 +7648,7 @@ define amdgpu_kernel void @local_singlethread_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: local_singlethread_seq_cst_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -7834,6 +7875,7 @@ define amdgpu_kernel void @local_singlethread_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: local_singlethread_monotonic_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -8060,6 +8102,7 @@ define amdgpu_kernel void @local_singlethread_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: local_singlethread_acquire_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -8286,6 +8329,7 @@ define amdgpu_kernel void @local_singlethread_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: local_singlethread_release_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -8512,6 +8556,7 @@ define amdgpu_kernel void @local_singlethread_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: local_singlethread_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -8738,6 +8783,7 @@ define amdgpu_kernel void @local_singlethread_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: local_singlethread_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -8926,6 +8972,7 @@ define amdgpu_kernel void @local_singlethread_one_as_unordered_load(
 ;
 ; GFX1250-LABEL: local_singlethread_one_as_unordered_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -9109,6 +9156,7 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_load(
 ;
 ; GFX1250-LABEL: local_singlethread_one_as_monotonic_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -9292,6 +9340,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_load(
 ;
 ; GFX1250-LABEL: local_singlethread_one_as_acquire_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -9475,6 +9524,7 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_load(
 ;
 ; GFX1250-LABEL: local_singlethread_one_as_seq_cst_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -9628,6 +9678,7 @@ define amdgpu_kernel void @local_singlethread_one_as_unordered_store(
 ;
 ; GFX1250-LABEL: local_singlethread_one_as_unordered_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -9778,6 +9829,7 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_store(
 ;
 ; GFX1250-LABEL: local_singlethread_one_as_monotonic_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -9928,6 +9980,7 @@ define amdgpu_kernel void @local_singlethread_one_as_release_store(
 ;
 ; GFX1250-LABEL: local_singlethread_one_as_release_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -10078,6 +10131,7 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_store(
 ;
 ; GFX1250-LABEL: local_singlethread_one_as_seq_cst_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -10228,6 +10282,7 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_atomicrmw(
 ;
 ; GFX1250-LABEL: local_singlethread_one_as_monotonic_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -10378,6 +10433,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_atomicrmw(
 ;
 ; GFX1250-LABEL: local_singlethread_one_as_acquire_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -10528,6 +10584,7 @@ define amdgpu_kernel void @local_singlethread_one_as_release_atomicrmw(
 ;
 ; GFX1250-LABEL: local_singlethread_one_as_release_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -10678,6 +10735,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_atomicrmw(
 ;
 ; GFX1250-LABEL: local_singlethread_one_as_acq_rel_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -10828,6 +10886,7 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_atomicrmw(
 ;
 ; GFX1250-LABEL: local_singlethread_one_as_seq_cst_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -11020,6 +11079,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_ret_atomicrmw(
 ;
 ; GFX1250-LABEL: local_singlethread_one_as_acquire_ret_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -11216,6 +11276,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_ret_atomicrmw(
 ;
 ; GFX1250-LABEL: local_singlethread_one_as_acq_rel_ret_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -11412,6 +11473,7 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_ret_atomicrmw(
 ;
 ; GFX1250-LABEL: local_singlethread_one_as_seq_cst_ret_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -11592,6 +11654,7 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_monotonic_cmpxchg
 ;
 ; GFX1250-LABEL: local_singlethread_one_as_monotonic_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -11771,6 +11834,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: local_singlethread_one_as_acquire_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -11950,6 +12014,7 @@ define amdgpu_kernel void @local_singlethread_one_as_release_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: local_singlethread_one_as_release_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -12129,6 +12194,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: local_singlethread_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -12308,6 +12374,7 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: local_singlethread_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -12487,6 +12554,7 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: local_singlethread_one_as_monotonic_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -12666,6 +12734,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: local_singlethread_one_as_acquire_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -12845,6 +12914,7 @@ define amdgpu_kernel void @local_singlethread_one_as_release_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: local_singlethread_one_as_release_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -13024,6 +13094,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: local_singlethread_one_as_acq_rel_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -13203,6 +13274,7 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: local_singlethread_one_as_seq_cst_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -13382,6 +13454,7 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: local_singlethread_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -13561,6 +13634,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: local_singlethread_one_as_acquire_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -13740,6 +13814,7 @@ define amdgpu_kernel void @local_singlethread_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: local_singlethread_one_as_release_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -13919,6 +13994,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: local_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -14098,6 +14174,7 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: local_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -14319,6 +14396,7 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_monotonic_ret_cmp
 ;
 ; GFX1250-LABEL: local_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -14545,6 +14623,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_monotonic_ret_cmpxc
 ;
 ; GFX1250-LABEL: local_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -14771,6 +14850,7 @@ define amdgpu_kernel void @local_singlethread_one_as_release_monotonic_ret_cmpxc
 ;
 ; GFX1250-LABEL: local_singlethread_one_as_release_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -14997,6 +15077,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_monotonic_ret_cmpxc
 ;
 ; GFX1250-LABEL: local_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -15223,6 +15304,7 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_monotonic_ret_cmpxc
 ;
 ; GFX1250-LABEL: local_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -15449,6 +15531,7 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_acquire_ret_cmpxc
 ;
 ; GFX1250-LABEL: local_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -15675,6 +15758,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_acquire_ret_cmpxchg
 ;
 ; GFX1250-LABEL: local_singlethread_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -15901,6 +15985,7 @@ define amdgpu_kernel void @local_singlethread_one_as_release_acquire_ret_cmpxchg
 ;
 ; GFX1250-LABEL: local_singlethread_one_as_release_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -16127,6 +16212,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_acquire_ret_cmpxchg
 ;
 ; GFX1250-LABEL: local_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -16353,6 +16439,7 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_acquire_ret_cmpxchg
 ;
 ; GFX1250-LABEL: local_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -16579,6 +16666,7 @@ define amdgpu_kernel void @local_singlethread_one_as_monotonic_seq_cst_ret_cmpxc
 ;
 ; GFX1250-LABEL: local_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -16805,6 +16893,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acquire_seq_cst_ret_cmpxchg
 ;
 ; GFX1250-LABEL: local_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -17031,6 +17120,7 @@ define amdgpu_kernel void @local_singlethread_one_as_release_seq_cst_ret_cmpxchg
 ;
 ; GFX1250-LABEL: local_singlethread_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -17257,6 +17347,7 @@ define amdgpu_kernel void @local_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg
 ;
 ; GFX1250-LABEL: local_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -17483,6 +17574,7 @@ define amdgpu_kernel void @local_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg
 ;
 ; GFX1250-LABEL: local_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8

diff  --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll
index 577d2ca9514bb..3c9f00829c9c8 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll
@@ -181,6 +181,7 @@ define amdgpu_kernel void @local_system_unordered_load(
 ;
 ; GFX1250-LABEL: local_system_unordered_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -364,6 +365,7 @@ define amdgpu_kernel void @local_system_monotonic_load(
 ;
 ; GFX1250-LABEL: local_system_monotonic_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -552,6 +554,7 @@ define amdgpu_kernel void @local_system_acquire_load(
 ;
 ; GFX1250-LABEL: local_system_acquire_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -763,6 +766,7 @@ define amdgpu_kernel void @local_system_seq_cst_load(
 ;
 ; GFX1250-LABEL: local_system_seq_cst_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -918,6 +922,7 @@ define amdgpu_kernel void @local_system_unordered_store(
 ;
 ; GFX1250-LABEL: local_system_unordered_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -1068,6 +1073,7 @@ define amdgpu_kernel void @local_system_monotonic_store(
 ;
 ; GFX1250-LABEL: local_system_monotonic_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -1241,6 +1247,7 @@ define amdgpu_kernel void @local_system_release_store(
 ;
 ; GFX1250-LABEL: local_system_release_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -1416,6 +1423,7 @@ define amdgpu_kernel void @local_system_seq_cst_store(
 ;
 ; GFX1250-LABEL: local_system_seq_cst_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -1568,6 +1576,7 @@ define amdgpu_kernel void @local_system_monotonic_atomicrmw(
 ;
 ; GFX1250-LABEL: local_system_monotonic_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -1734,6 +1743,7 @@ define amdgpu_kernel void @local_system_acquire_atomicrmw(
 ;
 ; GFX1250-LABEL: local_system_acquire_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -1908,6 +1918,7 @@ define amdgpu_kernel void @local_system_release_atomicrmw(
 ;
 ; GFX1250-LABEL: local_system_release_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -2099,6 +2110,7 @@ define amdgpu_kernel void @local_system_acq_rel_atomicrmw(
 ;
 ; GFX1250-LABEL: local_system_acq_rel_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -2291,6 +2303,7 @@ define amdgpu_kernel void @local_system_seq_cst_atomicrmw(
 ;
 ; GFX1250-LABEL: local_system_seq_cst_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -2491,6 +2504,7 @@ define amdgpu_kernel void @local_system_acquire_ret_atomicrmw(
 ;
 ; GFX1250-LABEL: local_system_acquire_ret_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -2715,6 +2729,7 @@ define amdgpu_kernel void @local_system_acq_rel_ret_atomicrmw(
 ;
 ; GFX1250-LABEL: local_system_acq_rel_ret_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -2941,6 +2956,7 @@ define amdgpu_kernel void @local_system_seq_cst_ret_atomicrmw(
 ;
 ; GFX1250-LABEL: local_system_seq_cst_ret_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -3123,6 +3139,7 @@ define amdgpu_kernel void @local_system_monotonic_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: local_system_monotonic_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -3318,6 +3335,7 @@ define amdgpu_kernel void @local_system_acquire_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: local_system_acquire_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -3521,6 +3539,7 @@ define amdgpu_kernel void @local_system_release_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: local_system_release_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -3741,6 +3760,7 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: local_system_acq_rel_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -3962,6 +3982,7 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: local_system_seq_cst_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -4160,6 +4181,7 @@ define amdgpu_kernel void @local_system_monotonic_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: local_system_monotonic_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -4356,6 +4378,7 @@ define amdgpu_kernel void @local_system_acquire_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: local_system_acquire_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -4575,6 +4598,7 @@ define amdgpu_kernel void @local_system_release_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: local_system_release_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -4796,6 +4820,7 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: local_system_acq_rel_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -5017,6 +5042,7 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: local_system_seq_cst_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -5238,6 +5264,7 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: local_system_monotonic_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -5459,6 +5486,7 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: local_system_acquire_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -5680,6 +5708,7 @@ define amdgpu_kernel void @local_system_release_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: local_system_release_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -5901,6 +5930,7 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: local_system_acq_rel_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -6122,6 +6152,7 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: local_system_seq_cst_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -6346,6 +6377,7 @@ define amdgpu_kernel void @local_system_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: local_system_monotonic_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -6577,6 +6609,7 @@ define amdgpu_kernel void @local_system_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: local_system_acquire_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -6826,6 +6859,7 @@ define amdgpu_kernel void @local_system_release_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: local_system_release_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -7082,6 +7116,7 @@ define amdgpu_kernel void @local_system_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: local_system_acq_rel_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -7338,6 +7373,7 @@ define amdgpu_kernel void @local_system_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: local_system_seq_cst_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -7571,6 +7607,7 @@ define amdgpu_kernel void @local_system_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: local_system_monotonic_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -7802,6 +7839,7 @@ define amdgpu_kernel void @local_system_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: local_system_acquire_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -8056,6 +8094,7 @@ define amdgpu_kernel void @local_system_release_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: local_system_release_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -8312,6 +8351,7 @@ define amdgpu_kernel void @local_system_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: local_system_acq_rel_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -8568,6 +8608,7 @@ define amdgpu_kernel void @local_system_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: local_system_seq_cst_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -8824,6 +8865,7 @@ define amdgpu_kernel void @local_system_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: local_system_monotonic_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -9080,6 +9122,7 @@ define amdgpu_kernel void @local_system_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: local_system_acquire_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -9336,6 +9379,7 @@ define amdgpu_kernel void @local_system_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: local_system_release_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -9592,6 +9636,7 @@ define amdgpu_kernel void @local_system_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: local_system_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -9848,6 +9893,7 @@ define amdgpu_kernel void @local_system_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: local_system_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -10038,6 +10084,7 @@ define amdgpu_kernel void @local_system_one_as_unordered_load(
 ;
 ; GFX1250-LABEL: local_system_one_as_unordered_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -10221,6 +10268,7 @@ define amdgpu_kernel void @local_system_one_as_monotonic_load(
 ;
 ; GFX1250-LABEL: local_system_one_as_monotonic_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -10404,6 +10452,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_load(
 ;
 ; GFX1250-LABEL: local_system_one_as_acquire_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -10587,6 +10636,7 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_load(
 ;
 ; GFX1250-LABEL: local_system_one_as_seq_cst_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -10740,6 +10790,7 @@ define amdgpu_kernel void @local_system_one_as_unordered_store(
 ;
 ; GFX1250-LABEL: local_system_one_as_unordered_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -10890,6 +10941,7 @@ define amdgpu_kernel void @local_system_one_as_monotonic_store(
 ;
 ; GFX1250-LABEL: local_system_one_as_monotonic_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -11040,6 +11092,7 @@ define amdgpu_kernel void @local_system_one_as_release_store(
 ;
 ; GFX1250-LABEL: local_system_one_as_release_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -11190,6 +11243,7 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_store(
 ;
 ; GFX1250-LABEL: local_system_one_as_seq_cst_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -11340,6 +11394,7 @@ define amdgpu_kernel void @local_system_one_as_monotonic_atomicrmw(
 ;
 ; GFX1250-LABEL: local_system_one_as_monotonic_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -11490,6 +11545,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_atomicrmw(
 ;
 ; GFX1250-LABEL: local_system_one_as_acquire_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -11640,6 +11696,7 @@ define amdgpu_kernel void @local_system_one_as_release_atomicrmw(
 ;
 ; GFX1250-LABEL: local_system_one_as_release_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -11790,6 +11847,7 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_atomicrmw(
 ;
 ; GFX1250-LABEL: local_system_one_as_acq_rel_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -11940,6 +11998,7 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_atomicrmw(
 ;
 ; GFX1250-LABEL: local_system_one_as_seq_cst_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -12132,6 +12191,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_ret_atomicrmw(
 ;
 ; GFX1250-LABEL: local_system_one_as_acquire_ret_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -12328,6 +12388,7 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_ret_atomicrmw(
 ;
 ; GFX1250-LABEL: local_system_one_as_acq_rel_ret_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -12524,6 +12585,7 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_ret_atomicrmw(
 ;
 ; GFX1250-LABEL: local_system_one_as_seq_cst_ret_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -12704,6 +12766,7 @@ define amdgpu_kernel void @local_system_one_as_monotonic_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: local_system_one_as_monotonic_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -12883,6 +12946,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: local_system_one_as_acquire_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -13062,6 +13126,7 @@ define amdgpu_kernel void @local_system_one_as_release_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: local_system_one_as_release_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -13241,6 +13306,7 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: local_system_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -13420,6 +13486,7 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: local_system_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -13599,6 +13666,7 @@ define amdgpu_kernel void @local_system_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: local_system_one_as_monotonic_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -13778,6 +13846,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: local_system_one_as_acquire_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -13957,6 +14026,7 @@ define amdgpu_kernel void @local_system_one_as_release_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: local_system_one_as_release_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -14136,6 +14206,7 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: local_system_one_as_acq_rel_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -14315,6 +14386,7 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: local_system_one_as_seq_cst_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -14494,6 +14566,7 @@ define amdgpu_kernel void @local_system_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: local_system_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -14673,6 +14746,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: local_system_one_as_acquire_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -14852,6 +14926,7 @@ define amdgpu_kernel void @local_system_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: local_system_one_as_release_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -15031,6 +15106,7 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: local_system_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -15210,6 +15286,7 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: local_system_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -15431,6 +15508,7 @@ define amdgpu_kernel void @local_system_one_as_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: local_system_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -15657,6 +15735,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: local_system_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -15883,6 +15962,7 @@ define amdgpu_kernel void @local_system_one_as_release_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: local_system_one_as_release_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -16109,6 +16189,7 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: local_system_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -16335,6 +16416,7 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: local_system_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -16561,6 +16643,7 @@ define amdgpu_kernel void @local_system_one_as_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: local_system_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -16787,6 +16870,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: local_system_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -17013,6 +17097,7 @@ define amdgpu_kernel void @local_system_one_as_release_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: local_system_one_as_release_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -17239,6 +17324,7 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: local_system_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -17465,6 +17551,7 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: local_system_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -17691,6 +17778,7 @@ define amdgpu_kernel void @local_system_one_as_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: local_system_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -17917,6 +18005,7 @@ define amdgpu_kernel void @local_system_one_as_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: local_system_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -18143,6 +18232,7 @@ define amdgpu_kernel void @local_system_one_as_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: local_system_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -18369,6 +18459,7 @@ define amdgpu_kernel void @local_system_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: local_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -18595,6 +18686,7 @@ define amdgpu_kernel void @local_system_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: local_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8

diff  --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll
index 33c516c61e42c..f7ac29ef69477 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll
@@ -145,6 +145,7 @@ define amdgpu_kernel void @local_volatile_load_0(
 ;
 ; GFX1250-LABEL: local_volatile_load_0:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
@@ -316,6 +317,7 @@ define amdgpu_kernel void @local_volatile_load_1(
 ;
 ; GFX1250-LABEL: local_volatile_load_1:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
@@ -452,6 +454,7 @@ define amdgpu_kernel void @local_volatile_store_0(
 ;
 ; GFX1250-LABEL: local_volatile_store_0:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -597,6 +600,7 @@ define amdgpu_kernel void @local_volatile_store_1(
 ;
 ; GFX1250-LABEL: local_volatile_store_1:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -740,6 +744,7 @@ define amdgpu_kernel void @local_volatile_workgroup_acquire_load(
 ;
 ; GFX1250-LABEL: local_volatile_workgroup_acquire_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -872,6 +877,7 @@ define amdgpu_kernel void @local_volatile_workgroup_release_store(
 ;
 ; GFX1250-LABEL: local_volatile_workgroup_release_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0

diff  --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-wavefront.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-wavefront.ll
index 7e345ed6e2716..73a557f85b05d 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-wavefront.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-wavefront.ll
@@ -181,6 +181,7 @@ define amdgpu_kernel void @local_wavefront_unordered_load(
 ;
 ; GFX1250-LABEL: local_wavefront_unordered_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -364,6 +365,7 @@ define amdgpu_kernel void @local_wavefront_monotonic_load(
 ;
 ; GFX1250-LABEL: local_wavefront_monotonic_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -547,6 +549,7 @@ define amdgpu_kernel void @local_wavefront_acquire_load(
 ;
 ; GFX1250-LABEL: local_wavefront_acquire_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -730,6 +733,7 @@ define amdgpu_kernel void @local_wavefront_seq_cst_load(
 ;
 ; GFX1250-LABEL: local_wavefront_seq_cst_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -883,6 +887,7 @@ define amdgpu_kernel void @local_wavefront_unordered_store(
 ;
 ; GFX1250-LABEL: local_wavefront_unordered_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -1033,6 +1038,7 @@ define amdgpu_kernel void @local_wavefront_monotonic_store(
 ;
 ; GFX1250-LABEL: local_wavefront_monotonic_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -1183,6 +1189,7 @@ define amdgpu_kernel void @local_wavefront_release_store(
 ;
 ; GFX1250-LABEL: local_wavefront_release_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -1333,6 +1340,7 @@ define amdgpu_kernel void @local_wavefront_seq_cst_store(
 ;
 ; GFX1250-LABEL: local_wavefront_seq_cst_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -1483,6 +1491,7 @@ define amdgpu_kernel void @local_wavefront_monotonic_atomicrmw(
 ;
 ; GFX1250-LABEL: local_wavefront_monotonic_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -1633,6 +1642,7 @@ define amdgpu_kernel void @local_wavefront_acquire_atomicrmw(
 ;
 ; GFX1250-LABEL: local_wavefront_acquire_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -1783,6 +1793,7 @@ define amdgpu_kernel void @local_wavefront_release_atomicrmw(
 ;
 ; GFX1250-LABEL: local_wavefront_release_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -1933,6 +1944,7 @@ define amdgpu_kernel void @local_wavefront_acq_rel_atomicrmw(
 ;
 ; GFX1250-LABEL: local_wavefront_acq_rel_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -2083,6 +2095,7 @@ define amdgpu_kernel void @local_wavefront_seq_cst_atomicrmw(
 ;
 ; GFX1250-LABEL: local_wavefront_seq_cst_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -2275,6 +2288,7 @@ define amdgpu_kernel void @local_wavefront_acquire_ret_atomicrmw(
 ;
 ; GFX1250-LABEL: local_wavefront_acquire_ret_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -2471,6 +2485,7 @@ define amdgpu_kernel void @local_wavefront_acq_rel_ret_atomicrmw(
 ;
 ; GFX1250-LABEL: local_wavefront_acq_rel_ret_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -2667,6 +2682,7 @@ define amdgpu_kernel void @local_wavefront_seq_cst_ret_atomicrmw(
 ;
 ; GFX1250-LABEL: local_wavefront_seq_cst_ret_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -2847,6 +2863,7 @@ define amdgpu_kernel void @local_wavefront_monotonic_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: local_wavefront_monotonic_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -3026,6 +3043,7 @@ define amdgpu_kernel void @local_wavefront_acquire_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: local_wavefront_acquire_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -3205,6 +3223,7 @@ define amdgpu_kernel void @local_wavefront_release_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: local_wavefront_release_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -3384,6 +3403,7 @@ define amdgpu_kernel void @local_wavefront_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: local_wavefront_acq_rel_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -3563,6 +3583,7 @@ define amdgpu_kernel void @local_wavefront_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: local_wavefront_seq_cst_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -3742,6 +3763,7 @@ define amdgpu_kernel void @local_wavefront_monotonic_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: local_wavefront_monotonic_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -3921,6 +3943,7 @@ define amdgpu_kernel void @local_wavefront_acquire_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: local_wavefront_acquire_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -4100,6 +4123,7 @@ define amdgpu_kernel void @local_wavefront_release_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: local_wavefront_release_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -4279,6 +4303,7 @@ define amdgpu_kernel void @local_wavefront_acq_rel_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: local_wavefront_acq_rel_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -4458,6 +4483,7 @@ define amdgpu_kernel void @local_wavefront_seq_cst_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: local_wavefront_seq_cst_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -4637,6 +4663,7 @@ define amdgpu_kernel void @local_wavefront_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: local_wavefront_monotonic_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -4816,6 +4843,7 @@ define amdgpu_kernel void @local_wavefront_acquire_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: local_wavefront_acquire_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -4995,6 +5023,7 @@ define amdgpu_kernel void @local_wavefront_release_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: local_wavefront_release_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -5174,6 +5203,7 @@ define amdgpu_kernel void @local_wavefront_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: local_wavefront_acq_rel_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -5353,6 +5383,7 @@ define amdgpu_kernel void @local_wavefront_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: local_wavefront_seq_cst_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -5574,6 +5605,7 @@ define amdgpu_kernel void @local_wavefront_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: local_wavefront_monotonic_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -5800,6 +5832,7 @@ define amdgpu_kernel void @local_wavefront_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: local_wavefront_acquire_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -6026,6 +6059,7 @@ define amdgpu_kernel void @local_wavefront_release_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: local_wavefront_release_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -6252,6 +6286,7 @@ define amdgpu_kernel void @local_wavefront_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: local_wavefront_acq_rel_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -6478,6 +6513,7 @@ define amdgpu_kernel void @local_wavefront_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: local_wavefront_seq_cst_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -6704,6 +6740,7 @@ define amdgpu_kernel void @local_wavefront_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: local_wavefront_monotonic_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -6930,6 +6967,7 @@ define amdgpu_kernel void @local_wavefront_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: local_wavefront_acquire_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -7156,6 +7194,7 @@ define amdgpu_kernel void @local_wavefront_release_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: local_wavefront_release_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -7382,6 +7421,7 @@ define amdgpu_kernel void @local_wavefront_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: local_wavefront_acq_rel_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -7608,6 +7648,7 @@ define amdgpu_kernel void @local_wavefront_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: local_wavefront_seq_cst_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -7834,6 +7875,7 @@ define amdgpu_kernel void @local_wavefront_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: local_wavefront_monotonic_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -8060,6 +8102,7 @@ define amdgpu_kernel void @local_wavefront_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: local_wavefront_acquire_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -8286,6 +8329,7 @@ define amdgpu_kernel void @local_wavefront_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: local_wavefront_release_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -8512,6 +8556,7 @@ define amdgpu_kernel void @local_wavefront_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: local_wavefront_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -8738,6 +8783,7 @@ define amdgpu_kernel void @local_wavefront_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: local_wavefront_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -8926,6 +8972,7 @@ define amdgpu_kernel void @local_wavefront_one_as_unordered_load(
 ;
 ; GFX1250-LABEL: local_wavefront_one_as_unordered_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -9109,6 +9156,7 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_load(
 ;
 ; GFX1250-LABEL: local_wavefront_one_as_monotonic_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -9292,6 +9340,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_load(
 ;
 ; GFX1250-LABEL: local_wavefront_one_as_acquire_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -9475,6 +9524,7 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_load(
 ;
 ; GFX1250-LABEL: local_wavefront_one_as_seq_cst_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -9628,6 +9678,7 @@ define amdgpu_kernel void @local_wavefront_one_as_unordered_store(
 ;
 ; GFX1250-LABEL: local_wavefront_one_as_unordered_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -9778,6 +9829,7 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_store(
 ;
 ; GFX1250-LABEL: local_wavefront_one_as_monotonic_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -9928,6 +9980,7 @@ define amdgpu_kernel void @local_wavefront_one_as_release_store(
 ;
 ; GFX1250-LABEL: local_wavefront_one_as_release_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -10078,6 +10131,7 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_store(
 ;
 ; GFX1250-LABEL: local_wavefront_one_as_seq_cst_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -10228,6 +10282,7 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_atomicrmw(
 ;
 ; GFX1250-LABEL: local_wavefront_one_as_monotonic_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -10378,6 +10433,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_atomicrmw(
 ;
 ; GFX1250-LABEL: local_wavefront_one_as_acquire_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -10528,6 +10584,7 @@ define amdgpu_kernel void @local_wavefront_one_as_release_atomicrmw(
 ;
 ; GFX1250-LABEL: local_wavefront_one_as_release_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -10678,6 +10735,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_atomicrmw(
 ;
 ; GFX1250-LABEL: local_wavefront_one_as_acq_rel_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -10828,6 +10886,7 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_atomicrmw(
 ;
 ; GFX1250-LABEL: local_wavefront_one_as_seq_cst_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -11020,6 +11079,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_ret_atomicrmw(
 ;
 ; GFX1250-LABEL: local_wavefront_one_as_acquire_ret_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -11216,6 +11276,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_ret_atomicrmw(
 ;
 ; GFX1250-LABEL: local_wavefront_one_as_acq_rel_ret_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -11412,6 +11473,7 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_ret_atomicrmw(
 ;
 ; GFX1250-LABEL: local_wavefront_one_as_seq_cst_ret_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -11592,6 +11654,7 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: local_wavefront_one_as_monotonic_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -11771,6 +11834,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: local_wavefront_one_as_acquire_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -11950,6 +12014,7 @@ define amdgpu_kernel void @local_wavefront_one_as_release_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: local_wavefront_one_as_release_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -12129,6 +12194,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: local_wavefront_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -12308,6 +12374,7 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: local_wavefront_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -12487,6 +12554,7 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: local_wavefront_one_as_monotonic_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -12666,6 +12734,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: local_wavefront_one_as_acquire_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -12845,6 +12914,7 @@ define amdgpu_kernel void @local_wavefront_one_as_release_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: local_wavefront_one_as_release_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -13024,6 +13094,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: local_wavefront_one_as_acq_rel_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -13203,6 +13274,7 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: local_wavefront_one_as_seq_cst_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -13382,6 +13454,7 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: local_wavefront_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -13561,6 +13634,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: local_wavefront_one_as_acquire_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -13740,6 +13814,7 @@ define amdgpu_kernel void @local_wavefront_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: local_wavefront_one_as_release_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -13919,6 +13994,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: local_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -14098,6 +14174,7 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: local_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -14319,6 +14396,7 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_monotonic_ret_cmpxch
 ;
 ; GFX1250-LABEL: local_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -14545,6 +14623,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: local_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -14771,6 +14850,7 @@ define amdgpu_kernel void @local_wavefront_one_as_release_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: local_wavefront_one_as_release_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -14997,6 +15077,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: local_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -15223,6 +15304,7 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: local_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -15449,6 +15531,7 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: local_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -15675,6 +15758,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: local_wavefront_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -15901,6 +15985,7 @@ define amdgpu_kernel void @local_wavefront_one_as_release_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: local_wavefront_one_as_release_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -16127,6 +16212,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: local_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -16353,6 +16439,7 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: local_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -16579,6 +16666,7 @@ define amdgpu_kernel void @local_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: local_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -16805,6 +16893,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: local_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -17031,6 +17120,7 @@ define amdgpu_kernel void @local_wavefront_one_as_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: local_wavefront_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -17257,6 +17347,7 @@ define amdgpu_kernel void @local_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: local_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -17483,6 +17574,7 @@ define amdgpu_kernel void @local_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: local_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8

diff  --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll
index ab4d7834b23a5..44a965b753198 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll
@@ -181,6 +181,7 @@ define amdgpu_kernel void @local_workgroup_unordered_load(
 ;
 ; GFX1250-LABEL: local_workgroup_unordered_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -364,6 +365,7 @@ define amdgpu_kernel void @local_workgroup_monotonic_load(
 ;
 ; GFX1250-LABEL: local_workgroup_monotonic_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -552,6 +554,7 @@ define amdgpu_kernel void @local_workgroup_acquire_load(
 ;
 ; GFX1250-LABEL: local_workgroup_acquire_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -763,6 +766,7 @@ define amdgpu_kernel void @local_workgroup_seq_cst_load(
 ;
 ; GFX1250-LABEL: local_workgroup_seq_cst_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -918,6 +922,7 @@ define amdgpu_kernel void @local_workgroup_unordered_store(
 ;
 ; GFX1250-LABEL: local_workgroup_unordered_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -1068,6 +1073,7 @@ define amdgpu_kernel void @local_workgroup_monotonic_store(
 ;
 ; GFX1250-LABEL: local_workgroup_monotonic_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -1241,6 +1247,7 @@ define amdgpu_kernel void @local_workgroup_release_store(
 ;
 ; GFX1250-LABEL: local_workgroup_release_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -1416,6 +1423,7 @@ define amdgpu_kernel void @local_workgroup_seq_cst_store(
 ;
 ; GFX1250-LABEL: local_workgroup_seq_cst_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -1568,6 +1576,7 @@ define amdgpu_kernel void @local_workgroup_monotonic_atomicrmw(
 ;
 ; GFX1250-LABEL: local_workgroup_monotonic_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -1734,6 +1743,7 @@ define amdgpu_kernel void @local_workgroup_acquire_atomicrmw(
 ;
 ; GFX1250-LABEL: local_workgroup_acquire_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -1908,6 +1918,7 @@ define amdgpu_kernel void @local_workgroup_release_atomicrmw(
 ;
 ; GFX1250-LABEL: local_workgroup_release_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -2099,6 +2110,7 @@ define amdgpu_kernel void @local_workgroup_acq_rel_atomicrmw(
 ;
 ; GFX1250-LABEL: local_workgroup_acq_rel_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -2291,6 +2303,7 @@ define amdgpu_kernel void @local_workgroup_seq_cst_atomicrmw(
 ;
 ; GFX1250-LABEL: local_workgroup_seq_cst_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -2491,6 +2504,7 @@ define amdgpu_kernel void @local_workgroup_acquire_ret_atomicrmw(
 ;
 ; GFX1250-LABEL: local_workgroup_acquire_ret_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -2715,6 +2729,7 @@ define amdgpu_kernel void @local_workgroup_acq_rel_ret_atomicrmw(
 ;
 ; GFX1250-LABEL: local_workgroup_acq_rel_ret_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -2941,6 +2956,7 @@ define amdgpu_kernel void @local_workgroup_seq_cst_ret_atomicrmw(
 ;
 ; GFX1250-LABEL: local_workgroup_seq_cst_ret_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -3123,6 +3139,7 @@ define amdgpu_kernel void @local_workgroup_monotonic_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: local_workgroup_monotonic_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -3318,6 +3335,7 @@ define amdgpu_kernel void @local_workgroup_acquire_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: local_workgroup_acquire_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -3521,6 +3539,7 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: local_workgroup_release_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -3741,6 +3760,7 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: local_workgroup_acq_rel_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -3962,6 +3982,7 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: local_workgroup_seq_cst_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -4160,6 +4181,7 @@ define amdgpu_kernel void @local_workgroup_monotonic_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: local_workgroup_monotonic_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -4356,6 +4378,7 @@ define amdgpu_kernel void @local_workgroup_acquire_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: local_workgroup_acquire_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -4575,6 +4598,7 @@ define amdgpu_kernel void @local_workgroup_release_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: local_workgroup_release_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -4796,6 +4820,7 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: local_workgroup_acq_rel_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -5017,6 +5042,7 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: local_workgroup_seq_cst_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -5238,6 +5264,7 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: local_workgroup_monotonic_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -5459,6 +5486,7 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: local_workgroup_acquire_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -5680,6 +5708,7 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: local_workgroup_release_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -5901,6 +5930,7 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: local_workgroup_acq_rel_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -6122,6 +6152,7 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: local_workgroup_seq_cst_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -6346,6 +6377,7 @@ define amdgpu_kernel void @local_workgroup_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: local_workgroup_monotonic_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -6577,6 +6609,7 @@ define amdgpu_kernel void @local_workgroup_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: local_workgroup_acquire_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -6826,6 +6859,7 @@ define amdgpu_kernel void @local_workgroup_release_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: local_workgroup_release_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -7082,6 +7116,7 @@ define amdgpu_kernel void @local_workgroup_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: local_workgroup_acq_rel_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -7338,6 +7373,7 @@ define amdgpu_kernel void @local_workgroup_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: local_workgroup_seq_cst_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -7571,6 +7607,7 @@ define amdgpu_kernel void @local_workgroup_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: local_workgroup_monotonic_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -7802,6 +7839,7 @@ define amdgpu_kernel void @local_workgroup_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: local_workgroup_acquire_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -8056,6 +8094,7 @@ define amdgpu_kernel void @local_workgroup_release_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: local_workgroup_release_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -8312,6 +8351,7 @@ define amdgpu_kernel void @local_workgroup_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: local_workgroup_acq_rel_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -8568,6 +8608,7 @@ define amdgpu_kernel void @local_workgroup_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: local_workgroup_seq_cst_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -8824,6 +8865,7 @@ define amdgpu_kernel void @local_workgroup_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: local_workgroup_monotonic_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -9080,6 +9122,7 @@ define amdgpu_kernel void @local_workgroup_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: local_workgroup_acquire_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -9336,6 +9379,7 @@ define amdgpu_kernel void @local_workgroup_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: local_workgroup_release_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -9592,6 +9636,7 @@ define amdgpu_kernel void @local_workgroup_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: local_workgroup_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -9848,6 +9893,7 @@ define amdgpu_kernel void @local_workgroup_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: local_workgroup_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -10038,6 +10084,7 @@ define amdgpu_kernel void @local_workgroup_one_as_unordered_load(
 ;
 ; GFX1250-LABEL: local_workgroup_one_as_unordered_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -10221,6 +10268,7 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_load(
 ;
 ; GFX1250-LABEL: local_workgroup_one_as_monotonic_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -10404,6 +10452,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_load(
 ;
 ; GFX1250-LABEL: local_workgroup_one_as_acquire_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -10587,6 +10636,7 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_load(
 ;
 ; GFX1250-LABEL: local_workgroup_one_as_seq_cst_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -10740,6 +10790,7 @@ define amdgpu_kernel void @local_workgroup_one_as_unordered_store(
 ;
 ; GFX1250-LABEL: local_workgroup_one_as_unordered_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -10890,6 +10941,7 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_store(
 ;
 ; GFX1250-LABEL: local_workgroup_one_as_monotonic_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -11040,6 +11092,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_store(
 ;
 ; GFX1250-LABEL: local_workgroup_one_as_release_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -11190,6 +11243,7 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_store(
 ;
 ; GFX1250-LABEL: local_workgroup_one_as_seq_cst_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -11340,6 +11394,7 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_atomicrmw(
 ;
 ; GFX1250-LABEL: local_workgroup_one_as_monotonic_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -11490,6 +11545,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_atomicrmw(
 ;
 ; GFX1250-LABEL: local_workgroup_one_as_acquire_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -11640,6 +11696,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_atomicrmw(
 ;
 ; GFX1250-LABEL: local_workgroup_one_as_release_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -11790,6 +11847,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_atomicrmw(
 ;
 ; GFX1250-LABEL: local_workgroup_one_as_acq_rel_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -11940,6 +11998,7 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_atomicrmw(
 ;
 ; GFX1250-LABEL: local_workgroup_one_as_seq_cst_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -12132,6 +12191,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_ret_atomicrmw(
 ;
 ; GFX1250-LABEL: local_workgroup_one_as_acquire_ret_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -12328,6 +12388,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_ret_atomicrmw(
 ;
 ; GFX1250-LABEL: local_workgroup_one_as_acq_rel_ret_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -12524,6 +12585,7 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_ret_atomicrmw(
 ;
 ; GFX1250-LABEL: local_workgroup_one_as_seq_cst_ret_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -12704,6 +12766,7 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: local_workgroup_one_as_monotonic_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -12883,6 +12946,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: local_workgroup_one_as_acquire_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -13062,6 +13126,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: local_workgroup_one_as_release_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -13241,6 +13306,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: local_workgroup_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -13420,6 +13486,7 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: local_workgroup_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -13599,6 +13666,7 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: local_workgroup_one_as_monotonic_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -13778,6 +13846,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: local_workgroup_one_as_acquire_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -13957,6 +14026,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: local_workgroup_one_as_release_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -14136,6 +14206,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: local_workgroup_one_as_acq_rel_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -14315,6 +14386,7 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: local_workgroup_one_as_seq_cst_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -14494,6 +14566,7 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: local_workgroup_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -14673,6 +14746,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: local_workgroup_one_as_acquire_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -14852,6 +14926,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: local_workgroup_one_as_release_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -15031,6 +15106,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: local_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -15210,6 +15286,7 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: local_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -15431,6 +15508,7 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_monotonic_ret_cmpxch
 ;
 ; GFX1250-LABEL: local_workgroup_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -15657,6 +15735,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: local_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -15883,6 +15962,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: local_workgroup_one_as_release_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -16109,6 +16189,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: local_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -16335,6 +16416,7 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: local_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -16561,6 +16643,7 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: local_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -16787,6 +16870,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: local_workgroup_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -17013,6 +17097,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: local_workgroup_one_as_release_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -17239,6 +17324,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: local_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -17465,6 +17551,7 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: local_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -17691,6 +17778,7 @@ define amdgpu_kernel void @local_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: local_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -17917,6 +18005,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: local_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -18143,6 +18232,7 @@ define amdgpu_kernel void @local_workgroup_one_as_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: local_workgroup_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -18369,6 +18459,7 @@ define amdgpu_kernel void @local_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: local_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -18595,6 +18686,7 @@ define amdgpu_kernel void @local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: local_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8

diff  --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-agent.ll
index 06a0514404fbc..f520943983b3a 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-agent.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-agent.ll
@@ -179,6 +179,7 @@ define amdgpu_kernel void @private_agent_unordered_load(
 ;
 ; GFX1250-LABEL: private_agent_unordered_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s1, 0
@@ -378,6 +379,7 @@ define amdgpu_kernel void @private_agent_monotonic_load(
 ;
 ; GFX1250-LABEL: private_agent_monotonic_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s1, 0
@@ -577,6 +579,7 @@ define amdgpu_kernel void @private_agent_acquire_load(
 ;
 ; GFX1250-LABEL: private_agent_acquire_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s1, 0
@@ -777,6 +780,7 @@ define amdgpu_kernel void @private_agent_seq_cst_load(
 ;
 ; GFX1250-LABEL: private_agent_seq_cst_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s1, 0
@@ -959,6 +963,7 @@ define amdgpu_kernel void @private_agent_unordered_store(
 ;
 ; GFX1250-LABEL: private_agent_unordered_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s1, 0
@@ -1137,6 +1142,7 @@ define amdgpu_kernel void @private_agent_monotonic_store(
 ;
 ; GFX1250-LABEL: private_agent_monotonic_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s1, 0
@@ -1315,6 +1321,7 @@ define amdgpu_kernel void @private_agent_release_store(
 ;
 ; GFX1250-LABEL: private_agent_release_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s1, 0
@@ -1496,6 +1503,7 @@ define amdgpu_kernel void @private_agent_seq_cst_store(
 ;
 ; GFX1250-LABEL: private_agent_seq_cst_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s1, 0
@@ -1677,6 +1685,7 @@ define amdgpu_kernel void @private_agent_monotonic_atomicrmw(
 ;
 ; GFX1250-LABEL: private_agent_monotonic_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s1, 0
@@ -1855,6 +1864,7 @@ define amdgpu_kernel void @private_agent_acquire_atomicrmw(
 ;
 ; GFX1250-LABEL: private_agent_acquire_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s1, 0
@@ -2035,6 +2045,7 @@ define amdgpu_kernel void @private_agent_release_atomicrmw(
 ;
 ; GFX1250-LABEL: private_agent_release_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s1, 0
@@ -2216,6 +2227,7 @@ define amdgpu_kernel void @private_agent_acq_rel_atomicrmw(
 ;
 ; GFX1250-LABEL: private_agent_acq_rel_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s1, 0
@@ -2399,6 +2411,7 @@ define amdgpu_kernel void @private_agent_seq_cst_atomicrmw(
 ;
 ; GFX1250-LABEL: private_agent_seq_cst_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s1, 0
@@ -2636,6 +2649,7 @@ define amdgpu_kernel void @private_agent_acquire_ret_atomicrmw(
 ;
 ; GFX1250-LABEL: private_agent_acquire_ret_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s2, 0
@@ -2872,6 +2886,7 @@ define amdgpu_kernel void @private_agent_acq_rel_ret_atomicrmw(
 ;
 ; GFX1250-LABEL: private_agent_acq_rel_ret_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s2, 0
@@ -3111,6 +3126,7 @@ define amdgpu_kernel void @private_agent_seq_cst_ret_atomicrmw(
 ;
 ; GFX1250-LABEL: private_agent_seq_cst_ret_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s2, 0
@@ -3380,6 +3396,7 @@ define amdgpu_kernel void @private_agent_monotonic_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: private_agent_monotonic_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -3649,6 +3666,7 @@ define amdgpu_kernel void @private_agent_acquire_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: private_agent_acquire_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -3920,6 +3938,7 @@ define amdgpu_kernel void @private_agent_release_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: private_agent_release_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -4192,6 +4211,7 @@ define amdgpu_kernel void @private_agent_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: private_agent_acq_rel_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -4466,6 +4486,7 @@ define amdgpu_kernel void @private_agent_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: private_agent_seq_cst_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -4740,6 +4761,7 @@ define amdgpu_kernel void @private_agent_monotonic_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: private_agent_monotonic_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -5011,6 +5033,7 @@ define amdgpu_kernel void @private_agent_acquire_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: private_agent_acquire_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -5282,6 +5305,7 @@ define amdgpu_kernel void @private_agent_release_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: private_agent_release_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -5556,6 +5580,7 @@ define amdgpu_kernel void @private_agent_acq_rel_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: private_agent_acq_rel_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -5830,6 +5855,7 @@ define amdgpu_kernel void @private_agent_seq_cst_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: private_agent_seq_cst_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -6104,6 +6130,7 @@ define amdgpu_kernel void @private_agent_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: private_agent_monotonic_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -6378,6 +6405,7 @@ define amdgpu_kernel void @private_agent_acquire_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: private_agent_acquire_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -6652,6 +6680,7 @@ define amdgpu_kernel void @private_agent_release_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: private_agent_release_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -6926,6 +6955,7 @@ define amdgpu_kernel void @private_agent_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: private_agent_acq_rel_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -7200,6 +7230,7 @@ define amdgpu_kernel void @private_agent_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: private_agent_seq_cst_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -7502,6 +7533,7 @@ define amdgpu_kernel void @private_agent_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: private_agent_monotonic_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -7803,6 +7835,7 @@ define amdgpu_kernel void @private_agent_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: private_agent_acquire_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -8105,6 +8138,7 @@ define amdgpu_kernel void @private_agent_release_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: private_agent_release_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -8409,6 +8443,7 @@ define amdgpu_kernel void @private_agent_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: private_agent_acq_rel_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -8714,6 +8749,7 @@ define amdgpu_kernel void @private_agent_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: private_agent_seq_cst_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -9019,6 +9055,7 @@ define amdgpu_kernel void @private_agent_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: private_agent_monotonic_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -9321,6 +9358,7 @@ define amdgpu_kernel void @private_agent_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: private_agent_acquire_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -9623,6 +9661,7 @@ define amdgpu_kernel void @private_agent_release_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: private_agent_release_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -9928,6 +9967,7 @@ define amdgpu_kernel void @private_agent_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: private_agent_acq_rel_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -10233,6 +10273,7 @@ define amdgpu_kernel void @private_agent_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: private_agent_seq_cst_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -10538,6 +10579,7 @@ define amdgpu_kernel void @private_agent_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: private_agent_monotonic_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -10843,6 +10885,7 @@ define amdgpu_kernel void @private_agent_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: private_agent_acquire_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -11148,6 +11191,7 @@ define amdgpu_kernel void @private_agent_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: private_agent_release_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -11453,6 +11497,7 @@ define amdgpu_kernel void @private_agent_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: private_agent_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -11758,6 +11803,7 @@ define amdgpu_kernel void @private_agent_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: private_agent_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -11971,6 +12017,7 @@ define amdgpu_kernel void @private_agent_one_as_unordered_load(
 ;
 ; GFX1250-LABEL: private_agent_one_as_unordered_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s1, 0
@@ -12170,6 +12217,7 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_load(
 ;
 ; GFX1250-LABEL: private_agent_one_as_monotonic_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s1, 0
@@ -12369,6 +12417,7 @@ define amdgpu_kernel void @private_agent_one_as_acquire_load(
 ;
 ; GFX1250-LABEL: private_agent_one_as_acquire_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s1, 0
@@ -12570,6 +12619,7 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_load(
 ;
 ; GFX1250-LABEL: private_agent_one_as_seq_cst_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s1, 0
@@ -12753,6 +12803,7 @@ define amdgpu_kernel void @private_agent_one_as_unordered_store(
 ;
 ; GFX1250-LABEL: private_agent_one_as_unordered_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s1, 0
@@ -12931,6 +12982,7 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_store(
 ;
 ; GFX1250-LABEL: private_agent_one_as_monotonic_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s1, 0
@@ -13109,6 +13161,7 @@ define amdgpu_kernel void @private_agent_one_as_release_store(
 ;
 ; GFX1250-LABEL: private_agent_one_as_release_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s1, 0
@@ -13290,6 +13343,7 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_store(
 ;
 ; GFX1250-LABEL: private_agent_one_as_seq_cst_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s1, 0
@@ -13471,6 +13525,7 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_atomicrmw(
 ;
 ; GFX1250-LABEL: private_agent_one_as_monotonic_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s1, 0
@@ -13649,6 +13704,7 @@ define amdgpu_kernel void @private_agent_one_as_acquire_atomicrmw(
 ;
 ; GFX1250-LABEL: private_agent_one_as_acquire_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s1, 0
@@ -13829,6 +13885,7 @@ define amdgpu_kernel void @private_agent_one_as_release_atomicrmw(
 ;
 ; GFX1250-LABEL: private_agent_one_as_release_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s1, 0
@@ -14010,6 +14067,7 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_atomicrmw(
 ;
 ; GFX1250-LABEL: private_agent_one_as_acq_rel_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s1, 0
@@ -14193,6 +14251,7 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_atomicrmw(
 ;
 ; GFX1250-LABEL: private_agent_one_as_seq_cst_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s1, 0
@@ -14430,6 +14489,7 @@ define amdgpu_kernel void @private_agent_one_as_acquire_ret_atomicrmw(
 ;
 ; GFX1250-LABEL: private_agent_one_as_acquire_ret_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s2, 0
@@ -14667,6 +14727,7 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_ret_atomicrmw(
 ;
 ; GFX1250-LABEL: private_agent_one_as_acq_rel_ret_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s2, 0
@@ -14907,6 +14968,7 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_ret_atomicrmw(
 ;
 ; GFX1250-LABEL: private_agent_one_as_seq_cst_ret_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s2, 0
@@ -15177,6 +15239,7 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: private_agent_one_as_monotonic_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -15446,6 +15509,7 @@ define amdgpu_kernel void @private_agent_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: private_agent_one_as_acquire_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -15717,6 +15781,7 @@ define amdgpu_kernel void @private_agent_one_as_release_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: private_agent_one_as_release_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -15989,6 +16054,7 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: private_agent_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -16263,6 +16329,7 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: private_agent_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -16537,6 +16604,7 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: private_agent_one_as_monotonic_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -16808,6 +16876,7 @@ define amdgpu_kernel void @private_agent_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: private_agent_one_as_acquire_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -17079,6 +17148,7 @@ define amdgpu_kernel void @private_agent_one_as_release_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: private_agent_one_as_release_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -17353,6 +17423,7 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: private_agent_one_as_acq_rel_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -17627,6 +17698,7 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: private_agent_one_as_seq_cst_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -17901,6 +17973,7 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: private_agent_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -18175,6 +18248,7 @@ define amdgpu_kernel void @private_agent_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: private_agent_one_as_acquire_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -18449,6 +18523,7 @@ define amdgpu_kernel void @private_agent_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: private_agent_one_as_release_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -18723,6 +18798,7 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: private_agent_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -18997,6 +19073,7 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: private_agent_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -19299,6 +19376,7 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: private_agent_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -19600,6 +19678,7 @@ define amdgpu_kernel void @private_agent_one_as_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: private_agent_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -19903,6 +19982,7 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: private_agent_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -20209,6 +20289,7 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: private_agent_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -20515,6 +20596,7 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: private_agent_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -20818,6 +20900,7 @@ define amdgpu_kernel void @private_agent_one_as_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: private_agent_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -21121,6 +21204,7 @@ define amdgpu_kernel void @private_agent_one_as_release_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: private_agent_one_as_release_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -21427,6 +21511,7 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: private_agent_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -21733,6 +21818,7 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: private_agent_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -22039,6 +22125,7 @@ define amdgpu_kernel void @private_agent_one_as_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: private_agent_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -22345,6 +22432,7 @@ define amdgpu_kernel void @private_agent_one_as_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: private_agent_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -22651,6 +22739,7 @@ define amdgpu_kernel void @private_agent_one_as_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: private_agent_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -22957,6 +23046,7 @@ define amdgpu_kernel void @private_agent_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: private_agent_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -23263,6 +23353,7 @@ define amdgpu_kernel void @private_agent_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: private_agent_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8

diff  --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-cluster.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-cluster.ll
index 8fcaeccbc0397..24f9fe0297c63 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-cluster.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-cluster.ll
@@ -179,6 +179,7 @@ define amdgpu_kernel void @private_cluster_unordered_load(
 ;
 ; GFX1250-LABEL: private_cluster_unordered_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s1, 0
@@ -378,6 +379,7 @@ define amdgpu_kernel void @private_cluster_monotonic_load(
 ;
 ; GFX1250-LABEL: private_cluster_monotonic_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s1, 0
@@ -577,6 +579,7 @@ define amdgpu_kernel void @private_cluster_acquire_load(
 ;
 ; GFX1250-LABEL: private_cluster_acquire_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s1, 0
@@ -777,6 +780,7 @@ define amdgpu_kernel void @private_cluster_seq_cst_load(
 ;
 ; GFX1250-LABEL: private_cluster_seq_cst_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s1, 0
@@ -959,6 +963,7 @@ define amdgpu_kernel void @private_cluster_unordered_store(
 ;
 ; GFX1250-LABEL: private_cluster_unordered_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s1, 0
@@ -1137,6 +1142,7 @@ define amdgpu_kernel void @private_cluster_monotonic_store(
 ;
 ; GFX1250-LABEL: private_cluster_monotonic_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s1, 0
@@ -1315,6 +1321,7 @@ define amdgpu_kernel void @private_cluster_release_store(
 ;
 ; GFX1250-LABEL: private_cluster_release_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s1, 0
@@ -1495,6 +1502,7 @@ define amdgpu_kernel void @private_cluster_seq_cst_store(
 ;
 ; GFX1250-LABEL: private_cluster_seq_cst_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s1, 0
@@ -1675,6 +1683,7 @@ define amdgpu_kernel void @private_cluster_monotonic_atomicrmw(
 ;
 ; GFX1250-LABEL: private_cluster_monotonic_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s1, 0
@@ -1853,6 +1862,7 @@ define amdgpu_kernel void @private_cluster_acquire_atomicrmw(
 ;
 ; GFX1250-LABEL: private_cluster_acquire_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s1, 0
@@ -2033,6 +2043,7 @@ define amdgpu_kernel void @private_cluster_release_atomicrmw(
 ;
 ; GFX1250-LABEL: private_cluster_release_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s1, 0
@@ -2213,6 +2224,7 @@ define amdgpu_kernel void @private_cluster_acq_rel_atomicrmw(
 ;
 ; GFX1250-LABEL: private_cluster_acq_rel_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s1, 0
@@ -2395,6 +2407,7 @@ define amdgpu_kernel void @private_cluster_seq_cst_atomicrmw(
 ;
 ; GFX1250-LABEL: private_cluster_seq_cst_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s1, 0
@@ -2631,6 +2644,7 @@ define amdgpu_kernel void @private_cluster_acquire_ret_atomicrmw(
 ;
 ; GFX1250-LABEL: private_cluster_acquire_ret_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s2, 0
@@ -2867,6 +2881,7 @@ define amdgpu_kernel void @private_cluster_acq_rel_ret_atomicrmw(
 ;
 ; GFX1250-LABEL: private_cluster_acq_rel_ret_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s2, 0
@@ -3105,6 +3120,7 @@ define amdgpu_kernel void @private_cluster_seq_cst_ret_atomicrmw(
 ;
 ; GFX1250-LABEL: private_cluster_seq_cst_ret_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s2, 0
@@ -3373,6 +3389,7 @@ define amdgpu_kernel void @private_cluster_monotonic_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: private_cluster_monotonic_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -3642,6 +3659,7 @@ define amdgpu_kernel void @private_cluster_acquire_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: private_cluster_acquire_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -3913,6 +3931,7 @@ define amdgpu_kernel void @private_cluster_release_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: private_cluster_release_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -4184,6 +4203,7 @@ define amdgpu_kernel void @private_cluster_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: private_cluster_acq_rel_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -4457,6 +4477,7 @@ define amdgpu_kernel void @private_cluster_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: private_cluster_seq_cst_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -4730,6 +4751,7 @@ define amdgpu_kernel void @private_cluster_monotonic_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: private_cluster_monotonic_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -5001,6 +5023,7 @@ define amdgpu_kernel void @private_cluster_acquire_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: private_cluster_acquire_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -5272,6 +5295,7 @@ define amdgpu_kernel void @private_cluster_release_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: private_cluster_release_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -5545,6 +5569,7 @@ define amdgpu_kernel void @private_cluster_acq_rel_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: private_cluster_acq_rel_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -5818,6 +5843,7 @@ define amdgpu_kernel void @private_cluster_seq_cst_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: private_cluster_seq_cst_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -6091,6 +6117,7 @@ define amdgpu_kernel void @private_cluster_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: private_cluster_monotonic_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -6364,6 +6391,7 @@ define amdgpu_kernel void @private_cluster_acquire_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: private_cluster_acquire_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -6637,6 +6665,7 @@ define amdgpu_kernel void @private_cluster_release_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: private_cluster_release_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -6910,6 +6939,7 @@ define amdgpu_kernel void @private_cluster_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: private_cluster_acq_rel_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -7183,6 +7213,7 @@ define amdgpu_kernel void @private_cluster_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: private_cluster_seq_cst_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -7484,6 +7515,7 @@ define amdgpu_kernel void @private_cluster_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: private_cluster_monotonic_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -7785,6 +7817,7 @@ define amdgpu_kernel void @private_cluster_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: private_cluster_acquire_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -8087,6 +8120,7 @@ define amdgpu_kernel void @private_cluster_release_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: private_cluster_release_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -8390,6 +8424,7 @@ define amdgpu_kernel void @private_cluster_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: private_cluster_acq_rel_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -8694,6 +8729,7 @@ define amdgpu_kernel void @private_cluster_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: private_cluster_seq_cst_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -8998,6 +9034,7 @@ define amdgpu_kernel void @private_cluster_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: private_cluster_monotonic_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -9300,6 +9337,7 @@ define amdgpu_kernel void @private_cluster_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: private_cluster_acquire_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -9602,6 +9640,7 @@ define amdgpu_kernel void @private_cluster_release_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: private_cluster_release_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -9906,6 +9945,7 @@ define amdgpu_kernel void @private_cluster_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: private_cluster_acq_rel_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -10210,6 +10250,7 @@ define amdgpu_kernel void @private_cluster_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: private_cluster_seq_cst_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -10514,6 +10555,7 @@ define amdgpu_kernel void @private_cluster_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: private_cluster_monotonic_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -10818,6 +10860,7 @@ define amdgpu_kernel void @private_cluster_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: private_cluster_acquire_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -11122,6 +11165,7 @@ define amdgpu_kernel void @private_cluster_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: private_cluster_release_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -11426,6 +11470,7 @@ define amdgpu_kernel void @private_cluster_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: private_cluster_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -11730,6 +11775,7 @@ define amdgpu_kernel void @private_cluster_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: private_cluster_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -11942,6 +11988,7 @@ define amdgpu_kernel void @private_cluster_one_as_unordered_load(
 ;
 ; GFX1250-LABEL: private_cluster_one_as_unordered_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s1, 0
@@ -12141,6 +12188,7 @@ define amdgpu_kernel void @private_cluster_one_as_monotonic_load(
 ;
 ; GFX1250-LABEL: private_cluster_one_as_monotonic_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s1, 0
@@ -12340,6 +12388,7 @@ define amdgpu_kernel void @private_cluster_one_as_acquire_load(
 ;
 ; GFX1250-LABEL: private_cluster_one_as_acquire_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s1, 0
@@ -12541,6 +12590,7 @@ define amdgpu_kernel void @private_cluster_one_as_seq_cst_load(
 ;
 ; GFX1250-LABEL: private_cluster_one_as_seq_cst_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s1, 0
@@ -12724,6 +12774,7 @@ define amdgpu_kernel void @private_cluster_one_as_unordered_store(
 ;
 ; GFX1250-LABEL: private_cluster_one_as_unordered_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s1, 0
@@ -12902,6 +12953,7 @@ define amdgpu_kernel void @private_cluster_one_as_monotonic_store(
 ;
 ; GFX1250-LABEL: private_cluster_one_as_monotonic_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s1, 0
@@ -13080,6 +13132,7 @@ define amdgpu_kernel void @private_cluster_one_as_release_store(
 ;
 ; GFX1250-LABEL: private_cluster_one_as_release_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s1, 0
@@ -13260,6 +13313,7 @@ define amdgpu_kernel void @private_cluster_one_as_seq_cst_store(
 ;
 ; GFX1250-LABEL: private_cluster_one_as_seq_cst_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s1, 0
@@ -13440,6 +13494,7 @@ define amdgpu_kernel void @private_cluster_one_as_monotonic_atomicrmw(
 ;
 ; GFX1250-LABEL: private_cluster_one_as_monotonic_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s1, 0
@@ -13618,6 +13673,7 @@ define amdgpu_kernel void @private_cluster_one_as_acquire_atomicrmw(
 ;
 ; GFX1250-LABEL: private_cluster_one_as_acquire_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s1, 0
@@ -13798,6 +13854,7 @@ define amdgpu_kernel void @private_cluster_one_as_release_atomicrmw(
 ;
 ; GFX1250-LABEL: private_cluster_one_as_release_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s1, 0
@@ -13978,6 +14035,7 @@ define amdgpu_kernel void @private_cluster_one_as_acq_rel_atomicrmw(
 ;
 ; GFX1250-LABEL: private_cluster_one_as_acq_rel_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s1, 0
@@ -14160,6 +14218,7 @@ define amdgpu_kernel void @private_cluster_one_as_seq_cst_atomicrmw(
 ;
 ; GFX1250-LABEL: private_cluster_one_as_seq_cst_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s1, 0
@@ -14396,6 +14455,7 @@ define amdgpu_kernel void @private_cluster_one_as_acquire_ret_atomicrmw(
 ;
 ; GFX1250-LABEL: private_cluster_one_as_acquire_ret_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s2, 0
@@ -14633,6 +14693,7 @@ define amdgpu_kernel void @private_cluster_one_as_acq_rel_ret_atomicrmw(
 ;
 ; GFX1250-LABEL: private_cluster_one_as_acq_rel_ret_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s2, 0
@@ -14872,6 +14933,7 @@ define amdgpu_kernel void @private_cluster_one_as_seq_cst_ret_atomicrmw(
 ;
 ; GFX1250-LABEL: private_cluster_one_as_seq_cst_ret_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s2, 0
@@ -15141,6 +15203,7 @@ define amdgpu_kernel void @private_cluster_one_as_monotonic_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: private_cluster_one_as_monotonic_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -15410,6 +15473,7 @@ define amdgpu_kernel void @private_cluster_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: private_cluster_one_as_acquire_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -15681,6 +15745,7 @@ define amdgpu_kernel void @private_cluster_one_as_release_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: private_cluster_one_as_release_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -15952,6 +16017,7 @@ define amdgpu_kernel void @private_cluster_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: private_cluster_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -16225,6 +16291,7 @@ define amdgpu_kernel void @private_cluster_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: private_cluster_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -16498,6 +16565,7 @@ define amdgpu_kernel void @private_cluster_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: private_cluster_one_as_monotonic_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -16769,6 +16837,7 @@ define amdgpu_kernel void @private_cluster_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: private_cluster_one_as_acquire_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -17040,6 +17109,7 @@ define amdgpu_kernel void @private_cluster_one_as_release_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: private_cluster_one_as_release_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -17313,6 +17383,7 @@ define amdgpu_kernel void @private_cluster_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: private_cluster_one_as_acq_rel_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -17586,6 +17657,7 @@ define amdgpu_kernel void @private_cluster_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: private_cluster_one_as_seq_cst_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -17859,6 +17931,7 @@ define amdgpu_kernel void @private_cluster_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: private_cluster_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -18132,6 +18205,7 @@ define amdgpu_kernel void @private_cluster_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: private_cluster_one_as_acquire_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -18405,6 +18479,7 @@ define amdgpu_kernel void @private_cluster_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: private_cluster_one_as_release_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -18678,6 +18753,7 @@ define amdgpu_kernel void @private_cluster_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: private_cluster_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -18951,6 +19027,7 @@ define amdgpu_kernel void @private_cluster_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: private_cluster_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -19252,6 +19329,7 @@ define amdgpu_kernel void @private_cluster_one_as_monotonic_monotonic_ret_cmpxch
 ;
 ; GFX1250-LABEL: private_cluster_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -19553,6 +19631,7 @@ define amdgpu_kernel void @private_cluster_one_as_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: private_cluster_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -19856,6 +19935,7 @@ define amdgpu_kernel void @private_cluster_one_as_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: private_cluster_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -20161,6 +20241,7 @@ define amdgpu_kernel void @private_cluster_one_as_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: private_cluster_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -20466,6 +20547,7 @@ define amdgpu_kernel void @private_cluster_one_as_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: private_cluster_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -20769,6 +20851,7 @@ define amdgpu_kernel void @private_cluster_one_as_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: private_cluster_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -21072,6 +21155,7 @@ define amdgpu_kernel void @private_cluster_one_as_release_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: private_cluster_one_as_release_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -21377,6 +21461,7 @@ define amdgpu_kernel void @private_cluster_one_as_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: private_cluster_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -21682,6 +21767,7 @@ define amdgpu_kernel void @private_cluster_one_as_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: private_cluster_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -21987,6 +22073,7 @@ define amdgpu_kernel void @private_cluster_one_as_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: private_cluster_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -22292,6 +22379,7 @@ define amdgpu_kernel void @private_cluster_one_as_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: private_cluster_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -22597,6 +22685,7 @@ define amdgpu_kernel void @private_cluster_one_as_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: private_cluster_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -22902,6 +22991,7 @@ define amdgpu_kernel void @private_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: private_cluster_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -23207,6 +23297,7 @@ define amdgpu_kernel void @private_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: private_cluster_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8

diff  --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-lastuse.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-lastuse.ll
index 80ea48be0b893..bd84a87c2538f 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-lastuse.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-lastuse.ll
@@ -17,6 +17,7 @@ define amdgpu_kernel void @private_last_use_load_0(ptr addrspace(5) %in, ptr add
 ;
 ; GFX1250-LABEL: private_last_use_load_0:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
@@ -51,6 +52,7 @@ define amdgpu_kernel void @private_last_use_load_1(ptr addrspace(5) %in, ptr add
 ;
 ; GFX1250-LABEL: private_last_use_load_1:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
@@ -86,6 +88,7 @@ define amdgpu_kernel void @private_last_use_and_volatile_load(ptr addrspace(5) %
 ;
 ; GFX1250-LABEL: private_last_use_and_volatile_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
@@ -114,6 +117,7 @@ define amdgpu_kernel void @private_last_use_and_nontemporal_load(ptr addrspace(5
 ;
 ; GFX1250-LABEL: private_last_use_and_nontemporal_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0

diff  --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll
index 6c19722ad6e33..ac16108076d8c 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll
@@ -205,6 +205,7 @@ define amdgpu_kernel void @private_nontemporal_load_0(
 ;
 ; GFX1250-LABEL: private_nontemporal_load_0:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
@@ -453,6 +454,7 @@ define amdgpu_kernel void @private_nontemporal_load_1(
 ;
 ; GFX1250-LABEL: private_nontemporal_load_1:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
@@ -644,6 +646,7 @@ define amdgpu_kernel void @private_nontemporal_store_0(
 ;
 ; GFX1250-LABEL: private_nontemporal_store_0:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -862,6 +865,7 @@ define amdgpu_kernel void @private_nontemporal_store_1(
 ;
 ; GFX1250-LABEL: private_nontemporal_store_1:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -1077,6 +1081,7 @@ define amdgpu_kernel void @private_nontemporal_volatile_load(
 ;
 ; GFX1250-LABEL: private_nontemporal_volatile_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0

diff  --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-singlethread.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-singlethread.ll
index f9189c8419898..583eaeb38870b 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-singlethread.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-singlethread.ll
@@ -179,6 +179,7 @@ define amdgpu_kernel void @private_singlethread_unordered_load(
 ;
 ; GFX1250-LABEL: private_singlethread_unordered_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s1, 0
@@ -378,6 +379,7 @@ define amdgpu_kernel void @private_singlethread_monotonic_load(
 ;
 ; GFX1250-LABEL: private_singlethread_monotonic_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s1, 0
@@ -577,6 +579,7 @@ define amdgpu_kernel void @private_singlethread_acquire_load(
 ;
 ; GFX1250-LABEL: private_singlethread_acquire_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s1, 0
@@ -776,6 +779,7 @@ define amdgpu_kernel void @private_singlethread_seq_cst_load(
 ;
 ; GFX1250-LABEL: private_singlethread_seq_cst_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s1, 0
@@ -955,6 +959,7 @@ define amdgpu_kernel void @private_singlethread_unordered_store(
 ;
 ; GFX1250-LABEL: private_singlethread_unordered_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s1, 0
@@ -1133,6 +1138,7 @@ define amdgpu_kernel void @private_singlethread_monotonic_store(
 ;
 ; GFX1250-LABEL: private_singlethread_monotonic_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s1, 0
@@ -1311,6 +1317,7 @@ define amdgpu_kernel void @private_singlethread_release_store(
 ;
 ; GFX1250-LABEL: private_singlethread_release_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s1, 0
@@ -1489,6 +1496,7 @@ define amdgpu_kernel void @private_singlethread_seq_cst_store(
 ;
 ; GFX1250-LABEL: private_singlethread_seq_cst_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s1, 0
@@ -1667,6 +1675,7 @@ define amdgpu_kernel void @private_singlethread_monotonic_atomicrmw(
 ;
 ; GFX1250-LABEL: private_singlethread_monotonic_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s1, 0
@@ -1845,6 +1854,7 @@ define amdgpu_kernel void @private_singlethread_acquire_atomicrmw(
 ;
 ; GFX1250-LABEL: private_singlethread_acquire_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s1, 0
@@ -2023,6 +2033,7 @@ define amdgpu_kernel void @private_singlethread_release_atomicrmw(
 ;
 ; GFX1250-LABEL: private_singlethread_release_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s1, 0
@@ -2201,6 +2212,7 @@ define amdgpu_kernel void @private_singlethread_acq_rel_atomicrmw(
 ;
 ; GFX1250-LABEL: private_singlethread_acq_rel_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s1, 0
@@ -2379,6 +2391,7 @@ define amdgpu_kernel void @private_singlethread_seq_cst_atomicrmw(
 ;
 ; GFX1250-LABEL: private_singlethread_seq_cst_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s1, 0
@@ -2611,6 +2624,7 @@ define amdgpu_kernel void @private_singlethread_acquire_ret_atomicrmw(
 ;
 ; GFX1250-LABEL: private_singlethread_acquire_ret_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s2, 0
@@ -2846,6 +2860,7 @@ define amdgpu_kernel void @private_singlethread_acq_rel_ret_atomicrmw(
 ;
 ; GFX1250-LABEL: private_singlethread_acq_rel_ret_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s2, 0
@@ -3081,6 +3096,7 @@ define amdgpu_kernel void @private_singlethread_seq_cst_ret_atomicrmw(
 ;
 ; GFX1250-LABEL: private_singlethread_seq_cst_ret_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s2, 0
@@ -3346,6 +3362,7 @@ define amdgpu_kernel void @private_singlethread_monotonic_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: private_singlethread_monotonic_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -3615,6 +3632,7 @@ define amdgpu_kernel void @private_singlethread_acquire_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: private_singlethread_acquire_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -3884,6 +3902,7 @@ define amdgpu_kernel void @private_singlethread_release_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: private_singlethread_release_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -4153,6 +4172,7 @@ define amdgpu_kernel void @private_singlethread_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: private_singlethread_acq_rel_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -4422,6 +4442,7 @@ define amdgpu_kernel void @private_singlethread_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: private_singlethread_seq_cst_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -4691,6 +4712,7 @@ define amdgpu_kernel void @private_singlethread_monotonic_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: private_singlethread_monotonic_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -4960,6 +4982,7 @@ define amdgpu_kernel void @private_singlethread_acquire_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: private_singlethread_acquire_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -5229,6 +5252,7 @@ define amdgpu_kernel void @private_singlethread_release_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: private_singlethread_release_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -5498,6 +5522,7 @@ define amdgpu_kernel void @private_singlethread_acq_rel_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: private_singlethread_acq_rel_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -5767,6 +5792,7 @@ define amdgpu_kernel void @private_singlethread_seq_cst_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: private_singlethread_seq_cst_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -6036,6 +6062,7 @@ define amdgpu_kernel void @private_singlethread_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: private_singlethread_monotonic_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -6305,6 +6332,7 @@ define amdgpu_kernel void @private_singlethread_acquire_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: private_singlethread_acquire_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -6574,6 +6602,7 @@ define amdgpu_kernel void @private_singlethread_release_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: private_singlethread_release_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -6843,6 +6872,7 @@ define amdgpu_kernel void @private_singlethread_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: private_singlethread_acq_rel_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -7112,6 +7142,7 @@ define amdgpu_kernel void @private_singlethread_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: private_singlethread_seq_cst_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -7409,6 +7440,7 @@ define amdgpu_kernel void @private_singlethread_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: private_singlethread_monotonic_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -7710,6 +7742,7 @@ define amdgpu_kernel void @private_singlethread_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: private_singlethread_acquire_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -8011,6 +8044,7 @@ define amdgpu_kernel void @private_singlethread_release_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: private_singlethread_release_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -8312,6 +8346,7 @@ define amdgpu_kernel void @private_singlethread_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: private_singlethread_acq_rel_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -8613,6 +8648,7 @@ define amdgpu_kernel void @private_singlethread_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: private_singlethread_seq_cst_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -8914,6 +8950,7 @@ define amdgpu_kernel void @private_singlethread_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: private_singlethread_monotonic_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -9215,6 +9252,7 @@ define amdgpu_kernel void @private_singlethread_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: private_singlethread_acquire_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -9516,6 +9554,7 @@ define amdgpu_kernel void @private_singlethread_release_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: private_singlethread_release_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -9817,6 +9856,7 @@ define amdgpu_kernel void @private_singlethread_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: private_singlethread_acq_rel_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -10118,6 +10158,7 @@ define amdgpu_kernel void @private_singlethread_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: private_singlethread_seq_cst_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -10419,6 +10460,7 @@ define amdgpu_kernel void @private_singlethread_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: private_singlethread_monotonic_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -10720,6 +10762,7 @@ define amdgpu_kernel void @private_singlethread_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: private_singlethread_acquire_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -11021,6 +11064,7 @@ define amdgpu_kernel void @private_singlethread_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: private_singlethread_release_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -11322,6 +11366,7 @@ define amdgpu_kernel void @private_singlethread_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: private_singlethread_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -11623,6 +11668,7 @@ define amdgpu_kernel void @private_singlethread_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: private_singlethread_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -11832,6 +11878,7 @@ define amdgpu_kernel void @private_singlethread_one_as_unordered_load(
 ;
 ; GFX1250-LABEL: private_singlethread_one_as_unordered_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s1, 0
@@ -12031,6 +12078,7 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_load(
 ;
 ; GFX1250-LABEL: private_singlethread_one_as_monotonic_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s1, 0
@@ -12230,6 +12278,7 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_load(
 ;
 ; GFX1250-LABEL: private_singlethread_one_as_acquire_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s1, 0
@@ -12429,6 +12478,7 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_load(
 ;
 ; GFX1250-LABEL: private_singlethread_one_as_seq_cst_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s1, 0
@@ -12608,6 +12658,7 @@ define amdgpu_kernel void @private_singlethread_one_as_unordered_store(
 ;
 ; GFX1250-LABEL: private_singlethread_one_as_unordered_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s1, 0
@@ -12786,6 +12837,7 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_store(
 ;
 ; GFX1250-LABEL: private_singlethread_one_as_monotonic_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s1, 0
@@ -12964,6 +13016,7 @@ define amdgpu_kernel void @private_singlethread_one_as_release_store(
 ;
 ; GFX1250-LABEL: private_singlethread_one_as_release_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s1, 0
@@ -13142,6 +13195,7 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_store(
 ;
 ; GFX1250-LABEL: private_singlethread_one_as_seq_cst_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s1, 0
@@ -13320,6 +13374,7 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_atomicrmw(
 ;
 ; GFX1250-LABEL: private_singlethread_one_as_monotonic_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s1, 0
@@ -13498,6 +13553,7 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_atomicrmw(
 ;
 ; GFX1250-LABEL: private_singlethread_one_as_acquire_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s1, 0
@@ -13676,6 +13732,7 @@ define amdgpu_kernel void @private_singlethread_one_as_release_atomicrmw(
 ;
 ; GFX1250-LABEL: private_singlethread_one_as_release_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s1, 0
@@ -13854,6 +13911,7 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_atomicrmw(
 ;
 ; GFX1250-LABEL: private_singlethread_one_as_acq_rel_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s1, 0
@@ -14032,6 +14090,7 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_atomicrmw(
 ;
 ; GFX1250-LABEL: private_singlethread_one_as_seq_cst_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s1, 0
@@ -14264,6 +14323,7 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_ret_atomicrmw(
 ;
 ; GFX1250-LABEL: private_singlethread_one_as_acquire_ret_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s2, 0
@@ -14499,6 +14559,7 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_ret_atomicrmw(
 ;
 ; GFX1250-LABEL: private_singlethread_one_as_acq_rel_ret_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s2, 0
@@ -14734,6 +14795,7 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_ret_atomicrmw(
 ;
 ; GFX1250-LABEL: private_singlethread_one_as_seq_cst_ret_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s2, 0
@@ -14999,6 +15061,7 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_monotonic_cmpxc
 ;
 ; GFX1250-LABEL: private_singlethread_one_as_monotonic_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -15268,6 +15331,7 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_monotonic_cmpxchg
 ;
 ; GFX1250-LABEL: private_singlethread_one_as_acquire_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -15537,6 +15601,7 @@ define amdgpu_kernel void @private_singlethread_one_as_release_monotonic_cmpxchg
 ;
 ; GFX1250-LABEL: private_singlethread_one_as_release_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -15806,6 +15871,7 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_monotonic_cmpxchg
 ;
 ; GFX1250-LABEL: private_singlethread_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -16075,6 +16141,7 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_monotonic_cmpxchg
 ;
 ; GFX1250-LABEL: private_singlethread_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -16344,6 +16411,7 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_acquire_cmpxchg
 ;
 ; GFX1250-LABEL: private_singlethread_one_as_monotonic_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -16613,6 +16681,7 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: private_singlethread_one_as_acquire_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -16882,6 +16951,7 @@ define amdgpu_kernel void @private_singlethread_one_as_release_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: private_singlethread_one_as_release_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -17151,6 +17221,7 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: private_singlethread_one_as_acq_rel_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -17420,6 +17491,7 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: private_singlethread_one_as_seq_cst_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -17689,6 +17761,7 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_seq_cst_cmpxchg
 ;
 ; GFX1250-LABEL: private_singlethread_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -17958,6 +18031,7 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: private_singlethread_one_as_acquire_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -18227,6 +18301,7 @@ define amdgpu_kernel void @private_singlethread_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: private_singlethread_one_as_release_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -18496,6 +18571,7 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: private_singlethread_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -18765,6 +18841,7 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: private_singlethread_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -19062,6 +19139,7 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_monotonic_ret_c
 ;
 ; GFX1250-LABEL: private_singlethread_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -19363,6 +19441,7 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_monotonic_ret_cmp
 ;
 ; GFX1250-LABEL: private_singlethread_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -19664,6 +19743,7 @@ define amdgpu_kernel void @private_singlethread_one_as_release_monotonic_ret_cmp
 ;
 ; GFX1250-LABEL: private_singlethread_one_as_release_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -19965,6 +20045,7 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_monotonic_ret_cmp
 ;
 ; GFX1250-LABEL: private_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -20266,6 +20347,7 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_monotonic_ret_cmp
 ;
 ; GFX1250-LABEL: private_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -20567,6 +20649,7 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_acquire_ret_cmp
 ;
 ; GFX1250-LABEL: private_singlethread_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -20868,6 +20951,7 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_acquire_ret_cmpxc
 ;
 ; GFX1250-LABEL: private_singlethread_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -21169,6 +21253,7 @@ define amdgpu_kernel void @private_singlethread_one_as_release_acquire_ret_cmpxc
 ;
 ; GFX1250-LABEL: private_singlethread_one_as_release_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -21470,6 +21555,7 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_acquire_ret_cmpxc
 ;
 ; GFX1250-LABEL: private_singlethread_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -21771,6 +21857,7 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_acquire_ret_cmpxc
 ;
 ; GFX1250-LABEL: private_singlethread_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -22072,6 +22159,7 @@ define amdgpu_kernel void @private_singlethread_one_as_monotonic_seq_cst_ret_cmp
 ;
 ; GFX1250-LABEL: private_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -22373,6 +22461,7 @@ define amdgpu_kernel void @private_singlethread_one_as_acquire_seq_cst_ret_cmpxc
 ;
 ; GFX1250-LABEL: private_singlethread_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -22674,6 +22763,7 @@ define amdgpu_kernel void @private_singlethread_one_as_release_seq_cst_ret_cmpxc
 ;
 ; GFX1250-LABEL: private_singlethread_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -22975,6 +23065,7 @@ define amdgpu_kernel void @private_singlethread_one_as_acq_rel_seq_cst_ret_cmpxc
 ;
 ; GFX1250-LABEL: private_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -23276,6 +23367,7 @@ define amdgpu_kernel void @private_singlethread_one_as_seq_cst_seq_cst_ret_cmpxc
 ;
 ; GFX1250-LABEL: private_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8

diff  --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-system.ll
index e4708f544d721..1bf9db321fa0c 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-system.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-system.ll
@@ -179,6 +179,7 @@ define amdgpu_kernel void @private_system_unordered_load(
 ;
 ; GFX1250-LABEL: private_system_unordered_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s1, 0
@@ -378,6 +379,7 @@ define amdgpu_kernel void @private_system_monotonic_load(
 ;
 ; GFX1250-LABEL: private_system_monotonic_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s1, 0
@@ -577,6 +579,7 @@ define amdgpu_kernel void @private_system_acquire_load(
 ;
 ; GFX1250-LABEL: private_system_acquire_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s1, 0
@@ -777,6 +780,7 @@ define amdgpu_kernel void @private_system_seq_cst_load(
 ;
 ; GFX1250-LABEL: private_system_seq_cst_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s1, 0
@@ -959,6 +963,7 @@ define amdgpu_kernel void @private_system_unordered_store(
 ;
 ; GFX1250-LABEL: private_system_unordered_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s1, 0
@@ -1137,6 +1142,7 @@ define amdgpu_kernel void @private_system_monotonic_store(
 ;
 ; GFX1250-LABEL: private_system_monotonic_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s1, 0
@@ -1315,6 +1321,7 @@ define amdgpu_kernel void @private_system_release_store(
 ;
 ; GFX1250-LABEL: private_system_release_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s1, 0
@@ -1496,6 +1503,7 @@ define amdgpu_kernel void @private_system_seq_cst_store(
 ;
 ; GFX1250-LABEL: private_system_seq_cst_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s1, 0
@@ -1677,6 +1685,7 @@ define amdgpu_kernel void @private_system_monotonic_atomicrmw(
 ;
 ; GFX1250-LABEL: private_system_monotonic_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s1, 0
@@ -1855,6 +1864,7 @@ define amdgpu_kernel void @private_system_acquire_atomicrmw(
 ;
 ; GFX1250-LABEL: private_system_acquire_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s1, 0
@@ -2035,6 +2045,7 @@ define amdgpu_kernel void @private_system_release_atomicrmw(
 ;
 ; GFX1250-LABEL: private_system_release_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s1, 0
@@ -2216,6 +2227,7 @@ define amdgpu_kernel void @private_system_acq_rel_atomicrmw(
 ;
 ; GFX1250-LABEL: private_system_acq_rel_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s1, 0
@@ -2399,6 +2411,7 @@ define amdgpu_kernel void @private_system_seq_cst_atomicrmw(
 ;
 ; GFX1250-LABEL: private_system_seq_cst_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s1, 0
@@ -2636,6 +2649,7 @@ define amdgpu_kernel void @private_system_acquire_ret_atomicrmw(
 ;
 ; GFX1250-LABEL: private_system_acquire_ret_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s2, 0
@@ -2872,6 +2886,7 @@ define amdgpu_kernel void @private_system_acq_rel_ret_atomicrmw(
 ;
 ; GFX1250-LABEL: private_system_acq_rel_ret_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s2, 0
@@ -3111,6 +3126,7 @@ define amdgpu_kernel void @private_system_seq_cst_ret_atomicrmw(
 ;
 ; GFX1250-LABEL: private_system_seq_cst_ret_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s2, 0
@@ -3380,6 +3396,7 @@ define amdgpu_kernel void @private_system_monotonic_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: private_system_monotonic_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -3649,6 +3666,7 @@ define amdgpu_kernel void @private_system_acquire_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: private_system_acquire_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -3920,6 +3938,7 @@ define amdgpu_kernel void @private_system_release_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: private_system_release_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -4192,6 +4211,7 @@ define amdgpu_kernel void @private_system_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: private_system_acq_rel_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -4466,6 +4486,7 @@ define amdgpu_kernel void @private_system_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: private_system_seq_cst_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -4740,6 +4761,7 @@ define amdgpu_kernel void @private_system_monotonic_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: private_system_monotonic_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -5011,6 +5033,7 @@ define amdgpu_kernel void @private_system_acquire_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: private_system_acquire_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -5282,6 +5305,7 @@ define amdgpu_kernel void @private_system_release_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: private_system_release_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -5556,6 +5580,7 @@ define amdgpu_kernel void @private_system_acq_rel_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: private_system_acq_rel_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -5830,6 +5855,7 @@ define amdgpu_kernel void @private_system_seq_cst_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: private_system_seq_cst_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -6104,6 +6130,7 @@ define amdgpu_kernel void @private_system_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: private_system_seq_cst_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -6406,6 +6433,7 @@ define amdgpu_kernel void @private_system_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: private_system_monotonic_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -6707,6 +6735,7 @@ define amdgpu_kernel void @private_system_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: private_system_acquire_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -7009,6 +7038,7 @@ define amdgpu_kernel void @private_system_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: private_system_acq_rel_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -7314,6 +7344,7 @@ define amdgpu_kernel void @private_system_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: private_system_seq_cst_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -7619,6 +7650,7 @@ define amdgpu_kernel void @private_system_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: private_system_monotonic_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -7921,6 +7953,7 @@ define amdgpu_kernel void @private_system_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: private_system_acquire_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -8223,6 +8256,7 @@ define amdgpu_kernel void @private_system_release_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: private_system_release_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -8528,6 +8562,7 @@ define amdgpu_kernel void @private_system_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: private_system_acq_rel_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -8833,6 +8868,7 @@ define amdgpu_kernel void @private_system_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: private_system_seq_cst_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -9138,6 +9174,7 @@ define amdgpu_kernel void @private_system_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: private_system_monotonic_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -9443,6 +9480,7 @@ define amdgpu_kernel void @private_system_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: private_system_acquire_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -9748,6 +9786,7 @@ define amdgpu_kernel void @private_system_relese_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: private_system_relese_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -10053,6 +10092,7 @@ define amdgpu_kernel void @private_system_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: private_system_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -10358,6 +10398,7 @@ define amdgpu_kernel void @private_system_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: private_system_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -10571,6 +10612,7 @@ define amdgpu_kernel void @private_system_one_as_unordered_load(
 ;
 ; GFX1250-LABEL: private_system_one_as_unordered_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s1, 0
@@ -10770,6 +10812,7 @@ define amdgpu_kernel void @private_system_one_as_monotonic_load(
 ;
 ; GFX1250-LABEL: private_system_one_as_monotonic_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s1, 0
@@ -10969,6 +11012,7 @@ define amdgpu_kernel void @private_system_one_as_acquire_load(
 ;
 ; GFX1250-LABEL: private_system_one_as_acquire_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s1, 0
@@ -11170,6 +11214,7 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_load(
 ;
 ; GFX1250-LABEL: private_system_one_as_seq_cst_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s1, 0
@@ -11353,6 +11398,7 @@ define amdgpu_kernel void @private_system_one_as_unordered_store(
 ;
 ; GFX1250-LABEL: private_system_one_as_unordered_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s1, 0
@@ -11531,6 +11577,7 @@ define amdgpu_kernel void @private_system_one_as_monotonic_store(
 ;
 ; GFX1250-LABEL: private_system_one_as_monotonic_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s1, 0
@@ -11709,6 +11756,7 @@ define amdgpu_kernel void @private_system_one_as_release_store(
 ;
 ; GFX1250-LABEL: private_system_one_as_release_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s1, 0
@@ -11890,6 +11938,7 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_store(
 ;
 ; GFX1250-LABEL: private_system_one_as_seq_cst_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s1, 0
@@ -12071,6 +12120,7 @@ define amdgpu_kernel void @private_system_one_as_monotonic_atomicrmw(
 ;
 ; GFX1250-LABEL: private_system_one_as_monotonic_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s1, 0
@@ -12249,6 +12299,7 @@ define amdgpu_kernel void @private_system_one_as_acquire_atomicrmw(
 ;
 ; GFX1250-LABEL: private_system_one_as_acquire_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s1, 0
@@ -12429,6 +12480,7 @@ define amdgpu_kernel void @private_system_one_as_release_atomicrmw(
 ;
 ; GFX1250-LABEL: private_system_one_as_release_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s1, 0
@@ -12610,6 +12662,7 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_atomicrmw(
 ;
 ; GFX1250-LABEL: private_system_one_as_acq_rel_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s1, 0
@@ -12793,6 +12846,7 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_atomicrmw(
 ;
 ; GFX1250-LABEL: private_system_one_as_seq_cst_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s1, 0
@@ -13030,6 +13084,7 @@ define amdgpu_kernel void @private_system_one_as_acquire_ret_atomicrmw(
 ;
 ; GFX1250-LABEL: private_system_one_as_acquire_ret_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s2, 0
@@ -13267,6 +13322,7 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_ret_atomicrmw(
 ;
 ; GFX1250-LABEL: private_system_one_as_acq_rel_ret_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s2, 0
@@ -13507,6 +13563,7 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_ret_atomicrmw(
 ;
 ; GFX1250-LABEL: private_system_one_as_seq_cst_ret_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s2, 0
@@ -13777,6 +13834,7 @@ define amdgpu_kernel void @private_system_one_as_monotonic_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: private_system_one_as_monotonic_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -14046,6 +14104,7 @@ define amdgpu_kernel void @private_system_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: private_system_one_as_acquire_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -14317,6 +14376,7 @@ define amdgpu_kernel void @private_system_one_as_release_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: private_system_one_as_release_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -14589,6 +14649,7 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: private_system_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -14863,6 +14924,7 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: private_system_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -15137,6 +15199,7 @@ define amdgpu_kernel void @private_system_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: private_system_one_as_monotonic_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -15408,6 +15471,7 @@ define amdgpu_kernel void @private_system_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: private_system_one_as_acquire_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -15679,6 +15743,7 @@ define amdgpu_kernel void @private_system_one_as_release_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: private_system_one_as_release_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -15953,6 +16018,7 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: private_system_one_as_acq_rel_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -16227,6 +16293,7 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: private_system_one_as_seq_cst_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -16501,6 +16568,7 @@ define amdgpu_kernel void @private_system_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: private_system_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -16775,6 +16843,7 @@ define amdgpu_kernel void @private_system_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: private_system_one_as_acquire_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -17049,6 +17118,7 @@ define amdgpu_kernel void @private_system_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: private_system_one_as_release_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -17323,6 +17393,7 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: private_system_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -17597,6 +17668,7 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: private_system_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -17899,6 +17971,7 @@ define amdgpu_kernel void @private_system_one_as_monotonic_monotonic_ret_cmpxchg
 ;
 ; GFX1250-LABEL: private_system_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -18200,6 +18273,7 @@ define amdgpu_kernel void @private_system_one_as_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: private_system_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -18503,6 +18577,7 @@ define amdgpu_kernel void @private_system_one_as_release_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: private_system_one_as_release_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -18807,6 +18882,7 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: private_system_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -19113,6 +19189,7 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: private_system_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -19419,6 +19496,7 @@ define amdgpu_kernel void @private_system_one_as_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: private_system_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -19722,6 +19800,7 @@ define amdgpu_kernel void @private_system_one_as_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: private_system_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -20025,6 +20104,7 @@ define amdgpu_kernel void @private_system_one_as_release_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: private_system_one_as_release_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -20331,6 +20411,7 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: private_system_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -20637,6 +20718,7 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: private_system_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -20943,6 +21025,7 @@ define amdgpu_kernel void @private_system_one_as_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: private_system_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -21249,6 +21332,7 @@ define amdgpu_kernel void @private_system_one_as_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: private_system_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -21555,6 +21639,7 @@ define amdgpu_kernel void @private_system_one_as_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: private_system_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -21861,6 +21946,7 @@ define amdgpu_kernel void @private_system_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: private_system_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -22167,6 +22253,7 @@ define amdgpu_kernel void @private_system_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: private_system_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8

diff  --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll
index 7c23b76cec3e9..8d717c58c4a7c 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll
@@ -159,6 +159,7 @@ define amdgpu_kernel void @private_volatile_load_0(
 ;
 ; GFX1250-LABEL: private_volatile_load_0:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
@@ -351,6 +352,7 @@ define amdgpu_kernel void @private_volatile_load_1(
 ;
 ; GFX1250-LABEL: private_volatile_load_1:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v1, v0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x8
@@ -515,6 +517,7 @@ define amdgpu_kernel void @private_volatile_store_0(
 ;
 ; GFX1250-LABEL: private_volatile_store_0:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -697,6 +700,7 @@ define amdgpu_kernel void @private_volatile_store_1(
 ;
 ; GFX1250-LABEL: private_volatile_store_1:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b64 s[2:3], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0

diff  --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-wavefront.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-wavefront.ll
index d4c562a149b9a..a9c163a87df61 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-wavefront.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-wavefront.ll
@@ -179,6 +179,7 @@ define amdgpu_kernel void @private_wavefront_unordered_load(
 ;
 ; GFX1250-LABEL: private_wavefront_unordered_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s1, 0
@@ -378,6 +379,7 @@ define amdgpu_kernel void @private_wavefront_monotonic_load(
 ;
 ; GFX1250-LABEL: private_wavefront_monotonic_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s1, 0
@@ -577,6 +579,7 @@ define amdgpu_kernel void @private_wavefront_acquire_load(
 ;
 ; GFX1250-LABEL: private_wavefront_acquire_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s1, 0
@@ -776,6 +779,7 @@ define amdgpu_kernel void @private_wavefront_seq_cst_load(
 ;
 ; GFX1250-LABEL: private_wavefront_seq_cst_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s1, 0
@@ -955,6 +959,7 @@ define amdgpu_kernel void @private_wavefront_unordered_store(
 ;
 ; GFX1250-LABEL: private_wavefront_unordered_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s1, 0
@@ -1133,6 +1138,7 @@ define amdgpu_kernel void @private_wavefront_monotonic_store(
 ;
 ; GFX1250-LABEL: private_wavefront_monotonic_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s1, 0
@@ -1311,6 +1317,7 @@ define amdgpu_kernel void @private_wavefront_release_store(
 ;
 ; GFX1250-LABEL: private_wavefront_release_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s1, 0
@@ -1489,6 +1496,7 @@ define amdgpu_kernel void @private_wavefront_seq_cst_store(
 ;
 ; GFX1250-LABEL: private_wavefront_seq_cst_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s1, 0
@@ -1667,6 +1675,7 @@ define amdgpu_kernel void @private_wavefront_monotonic_atomicrmw(
 ;
 ; GFX1250-LABEL: private_wavefront_monotonic_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s1, 0
@@ -1845,6 +1854,7 @@ define amdgpu_kernel void @private_wavefront_acquire_atomicrmw(
 ;
 ; GFX1250-LABEL: private_wavefront_acquire_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s1, 0
@@ -2023,6 +2033,7 @@ define amdgpu_kernel void @private_wavefront_release_atomicrmw(
 ;
 ; GFX1250-LABEL: private_wavefront_release_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s1, 0
@@ -2201,6 +2212,7 @@ define amdgpu_kernel void @private_wavefront_acq_rel_atomicrmw(
 ;
 ; GFX1250-LABEL: private_wavefront_acq_rel_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s1, 0
@@ -2379,6 +2391,7 @@ define amdgpu_kernel void @private_wavefront_seq_cst_atomicrmw(
 ;
 ; GFX1250-LABEL: private_wavefront_seq_cst_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s1, 0
@@ -2611,6 +2624,7 @@ define amdgpu_kernel void @private_wavefront_acquire_ret_atomicrmw(
 ;
 ; GFX1250-LABEL: private_wavefront_acquire_ret_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s2, 0
@@ -2846,6 +2860,7 @@ define amdgpu_kernel void @private_wavefront_acq_rel_ret_atomicrmw(
 ;
 ; GFX1250-LABEL: private_wavefront_acq_rel_ret_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s2, 0
@@ -3081,6 +3096,7 @@ define amdgpu_kernel void @private_wavefront_seq_cst_ret_atomicrmw(
 ;
 ; GFX1250-LABEL: private_wavefront_seq_cst_ret_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s2, 0
@@ -3346,6 +3362,7 @@ define amdgpu_kernel void @private_wavefront_monotonic_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: private_wavefront_monotonic_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -3615,6 +3632,7 @@ define amdgpu_kernel void @private_wavefront_acquire_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: private_wavefront_acquire_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -3884,6 +3902,7 @@ define amdgpu_kernel void @private_wavefront_release_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: private_wavefront_release_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -4153,6 +4172,7 @@ define amdgpu_kernel void @private_wavefront_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: private_wavefront_acq_rel_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -4422,6 +4442,7 @@ define amdgpu_kernel void @private_wavefront_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: private_wavefront_seq_cst_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -4691,6 +4712,7 @@ define amdgpu_kernel void @private_wavefront_monotonic_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: private_wavefront_monotonic_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -4960,6 +4982,7 @@ define amdgpu_kernel void @private_wavefront_acquire_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: private_wavefront_acquire_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -5229,6 +5252,7 @@ define amdgpu_kernel void @private_wavefront_release_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: private_wavefront_release_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -5498,6 +5522,7 @@ define amdgpu_kernel void @private_wavefront_acq_rel_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: private_wavefront_acq_rel_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -5767,6 +5792,7 @@ define amdgpu_kernel void @private_wavefront_seq_cst_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: private_wavefront_seq_cst_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -6036,6 +6062,7 @@ define amdgpu_kernel void @private_wavefront_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: private_wavefront_monotonic_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -6305,6 +6332,7 @@ define amdgpu_kernel void @private_wavefront_acquire_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: private_wavefront_acquire_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -6574,6 +6602,7 @@ define amdgpu_kernel void @private_wavefront_release_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: private_wavefront_release_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -6843,6 +6872,7 @@ define amdgpu_kernel void @private_wavefront_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: private_wavefront_acq_rel_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -7112,6 +7142,7 @@ define amdgpu_kernel void @private_wavefront_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: private_wavefront_seq_cst_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -7409,6 +7440,7 @@ define amdgpu_kernel void @private_wavefront_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: private_wavefront_monotonic_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -7710,6 +7742,7 @@ define amdgpu_kernel void @private_wavefront_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: private_wavefront_acquire_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -8011,6 +8044,7 @@ define amdgpu_kernel void @private_wavefront_release_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: private_wavefront_release_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -8312,6 +8346,7 @@ define amdgpu_kernel void @private_wavefront_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: private_wavefront_acq_rel_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -8613,6 +8648,7 @@ define amdgpu_kernel void @private_wavefront_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: private_wavefront_seq_cst_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -8914,6 +8950,7 @@ define amdgpu_kernel void @private_wavefront_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: private_wavefront_monotonic_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -9215,6 +9252,7 @@ define amdgpu_kernel void @private_wavefront_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: private_wavefront_acquire_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -9516,6 +9554,7 @@ define amdgpu_kernel void @private_wavefront_release_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: private_wavefront_release_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -9817,6 +9856,7 @@ define amdgpu_kernel void @private_wavefront_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: private_wavefront_acq_rel_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -10118,6 +10158,7 @@ define amdgpu_kernel void @private_wavefront_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: private_wavefront_seq_cst_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -10419,6 +10460,7 @@ define amdgpu_kernel void @private_wavefront_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: private_wavefront_monotonic_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -10720,6 +10762,7 @@ define amdgpu_kernel void @private_wavefront_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: private_wavefront_acquire_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -11021,6 +11064,7 @@ define amdgpu_kernel void @private_wavefront_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: private_wavefront_release_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -11322,6 +11366,7 @@ define amdgpu_kernel void @private_wavefront_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: private_wavefront_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -11623,6 +11668,7 @@ define amdgpu_kernel void @private_wavefront_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: private_wavefront_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -11832,6 +11878,7 @@ define amdgpu_kernel void @private_wavefront_one_as_unordered_load(
 ;
 ; GFX1250-LABEL: private_wavefront_one_as_unordered_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s1, 0
@@ -12031,6 +12078,7 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_load(
 ;
 ; GFX1250-LABEL: private_wavefront_one_as_monotonic_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s1, 0
@@ -12230,6 +12278,7 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_load(
 ;
 ; GFX1250-LABEL: private_wavefront_one_as_acquire_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s1, 0
@@ -12429,6 +12478,7 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_load(
 ;
 ; GFX1250-LABEL: private_wavefront_one_as_seq_cst_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s1, 0
@@ -12608,6 +12658,7 @@ define amdgpu_kernel void @private_wavefront_one_as_unordered_store(
 ;
 ; GFX1250-LABEL: private_wavefront_one_as_unordered_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s1, 0
@@ -12786,6 +12837,7 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_store(
 ;
 ; GFX1250-LABEL: private_wavefront_one_as_monotonic_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s1, 0
@@ -12964,6 +13016,7 @@ define amdgpu_kernel void @private_wavefront_one_as_release_store(
 ;
 ; GFX1250-LABEL: private_wavefront_one_as_release_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s1, 0
@@ -13142,6 +13195,7 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_store(
 ;
 ; GFX1250-LABEL: private_wavefront_one_as_seq_cst_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s1, 0
@@ -13320,6 +13374,7 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_atomicrmw(
 ;
 ; GFX1250-LABEL: private_wavefront_one_as_monotonic_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s1, 0
@@ -13498,6 +13553,7 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_atomicrmw(
 ;
 ; GFX1250-LABEL: private_wavefront_one_as_acquire_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s1, 0
@@ -13676,6 +13732,7 @@ define amdgpu_kernel void @private_wavefront_one_as_release_atomicrmw(
 ;
 ; GFX1250-LABEL: private_wavefront_one_as_release_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s1, 0
@@ -13854,6 +13911,7 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_atomicrmw(
 ;
 ; GFX1250-LABEL: private_wavefront_one_as_acq_rel_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s1, 0
@@ -14032,6 +14090,7 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_atomicrmw(
 ;
 ; GFX1250-LABEL: private_wavefront_one_as_seq_cst_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s1, 0
@@ -14264,6 +14323,7 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_ret_atomicrmw(
 ;
 ; GFX1250-LABEL: private_wavefront_one_as_acquire_ret_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s2, 0
@@ -14499,6 +14559,7 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_ret_atomicrmw(
 ;
 ; GFX1250-LABEL: private_wavefront_one_as_acq_rel_ret_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s2, 0
@@ -14734,6 +14795,7 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_ret_atomicrmw(
 ;
 ; GFX1250-LABEL: private_wavefront_one_as_seq_cst_ret_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s2, 0
@@ -14999,6 +15061,7 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: private_wavefront_one_as_monotonic_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -15268,6 +15331,7 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: private_wavefront_one_as_acquire_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -15537,6 +15601,7 @@ define amdgpu_kernel void @private_wavefront_one_as_release_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: private_wavefront_one_as_release_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -15806,6 +15871,7 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: private_wavefront_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -16075,6 +16141,7 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: private_wavefront_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -16344,6 +16411,7 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: private_wavefront_one_as_monotonic_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -16613,6 +16681,7 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: private_wavefront_one_as_acquire_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -16882,6 +16951,7 @@ define amdgpu_kernel void @private_wavefront_one_as_release_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: private_wavefront_one_as_release_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -17151,6 +17221,7 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: private_wavefront_one_as_acq_rel_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -17420,6 +17491,7 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: private_wavefront_one_as_seq_cst_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -17689,6 +17761,7 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: private_wavefront_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -17958,6 +18031,7 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: private_wavefront_one_as_acquire_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -18227,6 +18301,7 @@ define amdgpu_kernel void @private_wavefront_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: private_wavefront_one_as_release_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -18496,6 +18571,7 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: private_wavefront_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -18765,6 +18841,7 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: private_wavefront_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -19062,6 +19139,7 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_monotonic_ret_cmpx
 ;
 ; GFX1250-LABEL: private_wavefront_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -19363,6 +19441,7 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_monotonic_ret_cmpxch
 ;
 ; GFX1250-LABEL: private_wavefront_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -19664,6 +19743,7 @@ define amdgpu_kernel void @private_wavefront_one_as_release_monotonic_ret_cmpxch
 ;
 ; GFX1250-LABEL: private_wavefront_one_as_release_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -19965,6 +20045,7 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_monotonic_ret_cmpxch
 ;
 ; GFX1250-LABEL: private_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -20266,6 +20347,7 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_monotonic_ret_cmpxch
 ;
 ; GFX1250-LABEL: private_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -20567,6 +20649,7 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_acquire_ret_cmpxch
 ;
 ; GFX1250-LABEL: private_wavefront_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -20868,6 +20951,7 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: private_wavefront_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -21169,6 +21253,7 @@ define amdgpu_kernel void @private_wavefront_one_as_release_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: private_wavefront_one_as_release_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -21470,6 +21555,7 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: private_wavefront_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -21771,6 +21857,7 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: private_wavefront_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -22072,6 +22159,7 @@ define amdgpu_kernel void @private_wavefront_one_as_monotonic_seq_cst_ret_cmpxch
 ;
 ; GFX1250-LABEL: private_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -22373,6 +22461,7 @@ define amdgpu_kernel void @private_wavefront_one_as_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: private_wavefront_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -22674,6 +22763,7 @@ define amdgpu_kernel void @private_wavefront_one_as_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: private_wavefront_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -22975,6 +23065,7 @@ define amdgpu_kernel void @private_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: private_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -23276,6 +23367,7 @@ define amdgpu_kernel void @private_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: private_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8

diff  --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-workgroup.ll
index 53a8a0a1b694e..7561a60518bf8 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-workgroup.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-workgroup.ll
@@ -179,6 +179,7 @@ define amdgpu_kernel void @private_workgroup_unordered_load(
 ;
 ; GFX1250-LABEL: private_workgroup_unordered_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s1, 0
@@ -378,6 +379,7 @@ define amdgpu_kernel void @private_workgroup_monotonic_load(
 ;
 ; GFX1250-LABEL: private_workgroup_monotonic_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s1, 0
@@ -577,6 +579,7 @@ define amdgpu_kernel void @private_workgroup_acquire_load(
 ;
 ; GFX1250-LABEL: private_workgroup_acquire_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s1, 0
@@ -776,6 +779,7 @@ define amdgpu_kernel void @private_workgroup_seq_cst_load(
 ;
 ; GFX1250-LABEL: private_workgroup_seq_cst_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s1, 0
@@ -957,6 +961,7 @@ define amdgpu_kernel void @private_workgroup_unordered_store(
 ;
 ; GFX1250-LABEL: private_workgroup_unordered_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s1, 0
@@ -1135,6 +1140,7 @@ define amdgpu_kernel void @private_workgroup_monotonic_store(
 ;
 ; GFX1250-LABEL: private_workgroup_monotonic_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s1, 0
@@ -1313,6 +1319,7 @@ define amdgpu_kernel void @private_workgroup_release_store(
 ;
 ; GFX1250-LABEL: private_workgroup_release_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s1, 0
@@ -1493,6 +1500,7 @@ define amdgpu_kernel void @private_workgroup_seq_cst_store(
 ;
 ; GFX1250-LABEL: private_workgroup_seq_cst_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s1, 0
@@ -1673,6 +1681,7 @@ define amdgpu_kernel void @private_workgroup_monotonic_atomicrmw(
 ;
 ; GFX1250-LABEL: private_workgroup_monotonic_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s1, 0
@@ -1851,6 +1860,7 @@ define amdgpu_kernel void @private_workgroup_acquire_atomicrmw(
 ;
 ; GFX1250-LABEL: private_workgroup_acquire_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s1, 0
@@ -2030,6 +2040,7 @@ define amdgpu_kernel void @private_workgroup_release_atomicrmw(
 ;
 ; GFX1250-LABEL: private_workgroup_release_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s1, 0
@@ -2210,6 +2221,7 @@ define amdgpu_kernel void @private_workgroup_acq_rel_atomicrmw(
 ;
 ; GFX1250-LABEL: private_workgroup_acq_rel_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s1, 0
@@ -2391,6 +2403,7 @@ define amdgpu_kernel void @private_workgroup_seq_cst_atomicrmw(
 ;
 ; GFX1250-LABEL: private_workgroup_seq_cst_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s1, 0
@@ -2626,6 +2639,7 @@ define amdgpu_kernel void @private_workgroup_acquire_ret_atomicrmw(
 ;
 ; GFX1250-LABEL: private_workgroup_acquire_ret_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s2, 0
@@ -2861,6 +2875,7 @@ define amdgpu_kernel void @private_workgroup_acq_rel_ret_atomicrmw(
 ;
 ; GFX1250-LABEL: private_workgroup_acq_rel_ret_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s2, 0
@@ -3098,6 +3113,7 @@ define amdgpu_kernel void @private_workgroup_seq_cst_ret_atomicrmw(
 ;
 ; GFX1250-LABEL: private_workgroup_seq_cst_ret_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s2, 0
@@ -3365,6 +3381,7 @@ define amdgpu_kernel void @private_workgroup_monotonic_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: private_workgroup_monotonic_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -3634,6 +3651,7 @@ define amdgpu_kernel void @private_workgroup_acquire_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: private_workgroup_acquire_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -3904,6 +3922,7 @@ define amdgpu_kernel void @private_workgroup_release_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: private_workgroup_release_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -4175,6 +4194,7 @@ define amdgpu_kernel void @private_workgroup_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: private_workgroup_acq_rel_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -4447,6 +4467,7 @@ define amdgpu_kernel void @private_workgroup_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: private_workgroup_seq_cst_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -4719,6 +4740,7 @@ define amdgpu_kernel void @private_workgroup_monotonic_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: private_workgroup_monotonic_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -4989,6 +5011,7 @@ define amdgpu_kernel void @private_workgroup_acquire_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: private_workgroup_acquire_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -5259,6 +5282,7 @@ define amdgpu_kernel void @private_workgroup_release_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: private_workgroup_release_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -5531,6 +5555,7 @@ define amdgpu_kernel void @private_workgroup_acq_rel_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: private_workgroup_acq_rel_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -5803,6 +5828,7 @@ define amdgpu_kernel void @private_workgroup_seq_cst_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: private_workgroup_seq_cst_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -6075,6 +6101,7 @@ define amdgpu_kernel void @private_workgroup_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: private_workgroup_monotonic_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -6347,6 +6374,7 @@ define amdgpu_kernel void @private_workgroup_acquire_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: private_workgroup_acquire_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -6619,6 +6647,7 @@ define amdgpu_kernel void @private_workgroup_release_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: private_workgroup_release_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -6891,6 +6920,7 @@ define amdgpu_kernel void @private_workgroup_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: private_workgroup_acq_rel_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -7163,6 +7193,7 @@ define amdgpu_kernel void @private_workgroup_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: private_workgroup_seq_cst_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -7463,6 +7494,7 @@ define amdgpu_kernel void @private_workgroup_monotonic_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: private_workgroup_monotonic_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -7764,6 +7796,7 @@ define amdgpu_kernel void @private_workgroup_acquire_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: private_workgroup_acquire_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -8065,6 +8098,7 @@ define amdgpu_kernel void @private_workgroup_release_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: private_workgroup_release_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -8368,6 +8402,7 @@ define amdgpu_kernel void @private_workgroup_acq_rel_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: private_workgroup_acq_rel_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -8671,6 +8706,7 @@ define amdgpu_kernel void @private_workgroup_seq_cst_monotonic_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: private_workgroup_seq_cst_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -8974,6 +9010,7 @@ define amdgpu_kernel void @private_workgroup_monotonic_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: private_workgroup_monotonic_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -9275,6 +9312,7 @@ define amdgpu_kernel void @private_workgroup_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: private_workgroup_acquire_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -9576,6 +9614,7 @@ define amdgpu_kernel void @private_workgroup_release_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: private_workgroup_release_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -9879,6 +9918,7 @@ define amdgpu_kernel void @private_workgroup_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: private_workgroup_acq_rel_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -10182,6 +10222,7 @@ define amdgpu_kernel void @private_workgroup_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: private_workgroup_seq_cst_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -10485,6 +10526,7 @@ define amdgpu_kernel void @private_workgroup_monotonic_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: private_workgroup_monotonic_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -10788,6 +10830,7 @@ define amdgpu_kernel void @private_workgroup_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: private_workgroup_acquire_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -11091,6 +11134,7 @@ define amdgpu_kernel void @private_workgroup_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: private_workgroup_release_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -11394,6 +11438,7 @@ define amdgpu_kernel void @private_workgroup_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: private_workgroup_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -11697,6 +11742,7 @@ define amdgpu_kernel void @private_workgroup_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: private_workgroup_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -11908,6 +11954,7 @@ define amdgpu_kernel void @private_workgroup_one_as_unordered_load(
 ;
 ; GFX1250-LABEL: private_workgroup_one_as_unordered_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s1, 0
@@ -12107,6 +12154,7 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_load(
 ;
 ; GFX1250-LABEL: private_workgroup_one_as_monotonic_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s1, 0
@@ -12306,6 +12354,7 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_load(
 ;
 ; GFX1250-LABEL: private_workgroup_one_as_acquire_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s1, 0
@@ -12505,6 +12554,7 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_load(
 ;
 ; GFX1250-LABEL: private_workgroup_one_as_seq_cst_load:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s1, 0
@@ -12686,6 +12736,7 @@ define amdgpu_kernel void @private_workgroup_one_as_unordered_store(
 ;
 ; GFX1250-LABEL: private_workgroup_one_as_unordered_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s1, 0
@@ -12864,6 +12915,7 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_store(
 ;
 ; GFX1250-LABEL: private_workgroup_one_as_monotonic_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s1, 0
@@ -13042,6 +13094,7 @@ define amdgpu_kernel void @private_workgroup_one_as_release_store(
 ;
 ; GFX1250-LABEL: private_workgroup_one_as_release_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s1, 0
@@ -13222,6 +13275,7 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_store(
 ;
 ; GFX1250-LABEL: private_workgroup_one_as_seq_cst_store:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s1, 0
@@ -13402,6 +13456,7 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_atomicrmw(
 ;
 ; GFX1250-LABEL: private_workgroup_one_as_monotonic_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s1, 0
@@ -13580,6 +13635,7 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_atomicrmw(
 ;
 ; GFX1250-LABEL: private_workgroup_one_as_acquire_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s1, 0
@@ -13759,6 +13815,7 @@ define amdgpu_kernel void @private_workgroup_one_as_release_atomicrmw(
 ;
 ; GFX1250-LABEL: private_workgroup_one_as_release_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s1, 0
@@ -13939,6 +13996,7 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_atomicrmw(
 ;
 ; GFX1250-LABEL: private_workgroup_one_as_acq_rel_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s1, 0
@@ -14120,6 +14178,7 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_atomicrmw(
 ;
 ; GFX1250-LABEL: private_workgroup_one_as_seq_cst_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s1, 0
@@ -14355,6 +14414,7 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_ret_atomicrmw(
 ;
 ; GFX1250-LABEL: private_workgroup_one_as_acquire_ret_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s2, 0
@@ -14590,6 +14650,7 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_ret_atomicrmw(
 ;
 ; GFX1250-LABEL: private_workgroup_one_as_acq_rel_ret_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s2, 0
@@ -14827,6 +14888,7 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_ret_atomicrmw(
 ;
 ; GFX1250-LABEL: private_workgroup_one_as_seq_cst_ret_atomicrmw:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_mov_b32 s2, 0
@@ -15094,6 +15156,7 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: private_workgroup_one_as_monotonic_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -15363,6 +15426,7 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: private_workgroup_one_as_acquire_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -15633,6 +15697,7 @@ define amdgpu_kernel void @private_workgroup_one_as_release_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: private_workgroup_one_as_release_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -15904,6 +15969,7 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: private_workgroup_one_as_acq_rel_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -16176,6 +16242,7 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_monotonic_cmpxchg(
 ;
 ; GFX1250-LABEL: private_workgroup_one_as_seq_cst_monotonic_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -16448,6 +16515,7 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: private_workgroup_one_as_monotonic_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -16718,6 +16786,7 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: private_workgroup_one_as_acquire_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -16988,6 +17057,7 @@ define amdgpu_kernel void @private_workgroup_one_as_release_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: private_workgroup_one_as_release_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -17260,6 +17330,7 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: private_workgroup_one_as_acq_rel_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -17532,6 +17603,7 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_acquire_cmpxchg(
 ;
 ; GFX1250-LABEL: private_workgroup_one_as_seq_cst_acquire_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -17804,6 +17876,7 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: private_workgroup_one_as_monotonic_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -18076,6 +18149,7 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: private_workgroup_one_as_acquire_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -18348,6 +18422,7 @@ define amdgpu_kernel void @private_workgroup_one_as_release_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: private_workgroup_one_as_release_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -18620,6 +18695,7 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: private_workgroup_one_as_acq_rel_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -18892,6 +18968,7 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_seq_cst_cmpxchg(
 ;
 ; GFX1250-LABEL: private_workgroup_one_as_seq_cst_seq_cst_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x8
@@ -19192,6 +19269,7 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_monotonic_ret_cmpx
 ;
 ; GFX1250-LABEL: private_workgroup_one_as_monotonic_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -19493,6 +19571,7 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_monotonic_ret_cmpxch
 ;
 ; GFX1250-LABEL: private_workgroup_one_as_acquire_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -19794,6 +19873,7 @@ define amdgpu_kernel void @private_workgroup_one_as_release_monotonic_ret_cmpxch
 ;
 ; GFX1250-LABEL: private_workgroup_one_as_release_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -20097,6 +20177,7 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_monotonic_ret_cmpxch
 ;
 ; GFX1250-LABEL: private_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -20400,6 +20481,7 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_monotonic_ret_cmpxch
 ;
 ; GFX1250-LABEL: private_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -20703,6 +20785,7 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_acquire_ret_cmpxch
 ;
 ; GFX1250-LABEL: private_workgroup_one_as_monotonic_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -21004,6 +21087,7 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: private_workgroup_one_as_acquire_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -21305,6 +21389,7 @@ define amdgpu_kernel void @private_workgroup_one_as_release_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: private_workgroup_one_as_release_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -21608,6 +21693,7 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: private_workgroup_one_as_acq_rel_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -21911,6 +21997,7 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_acquire_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: private_workgroup_one_as_seq_cst_acquire_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -22214,6 +22301,7 @@ define amdgpu_kernel void @private_workgroup_one_as_monotonic_seq_cst_ret_cmpxch
 ;
 ; GFX1250-LABEL: private_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -22517,6 +22605,7 @@ define amdgpu_kernel void @private_workgroup_one_as_acquire_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: private_workgroup_one_as_acquire_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -22820,6 +22909,7 @@ define amdgpu_kernel void @private_workgroup_one_as_release_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: private_workgroup_one_as_release_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -23123,6 +23213,7 @@ define amdgpu_kernel void @private_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: private_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8
@@ -23426,6 +23517,7 @@ define amdgpu_kernel void @private_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg(
 ;
 ; GFX1250-LABEL: private_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4
 ; GFX1250-NEXT:    s_load_b32 s1, s[4:5], 0x8

diff  --git a/llvm/test/CodeGen/AMDGPU/min.ll b/llvm/test/CodeGen/AMDGPU/min.ll
index 0458a64991028..eff0680fe9a31 100644
--- a/llvm/test/CodeGen/AMDGPU/min.ll
+++ b/llvm/test/CodeGen/AMDGPU/min.ll
@@ -129,6 +129,7 @@ define amdgpu_kernel void @v_test_imin_sle_i32(ptr addrspace(1) %out, ptr addrsp
 ;
 ; GFX1250-LABEL: v_test_imin_sle_i32:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[6:7], s[4:5], 0x10
@@ -226,6 +227,7 @@ define amdgpu_kernel void @s_test_imin_sle_i32(ptr addrspace(1) %out, i32 %a, i3
 ;
 ; GFX1250-LABEL: s_test_imin_sle_i32:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -313,6 +315,7 @@ define amdgpu_kernel void @s_test_imin_sle_v1i32(ptr addrspace(1) %out, <1 x i32
 ;
 ; GFX1250-LABEL: s_test_imin_sle_v1i32:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -439,6 +442,7 @@ define amdgpu_kernel void @s_test_imin_sle_v4i32(ptr addrspace(1) %out, <4 x i32
 ;
 ; GFX1250-LABEL: s_test_imin_sle_v4i32:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    s_load_b256 s[8:15], s[4:5], 0x10
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
@@ -574,6 +578,7 @@ define amdgpu_kernel void @s_test_imin_sle_i8(ptr addrspace(1) %out, [8 x i32],
 ;
 ; GFX1250-LABEL: s_test_imin_sle_i8:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_clause 0x2
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4c
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x28
@@ -829,6 +834,7 @@ define amdgpu_kernel void @s_test_imin_sle_v4i8(ptr addrspace(1) %out, [8 x i32]
 ;
 ; GFX1250-LABEL: s_test_imin_sle_v4i8:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_clause 0x2
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x28
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x4c
@@ -974,6 +980,7 @@ define amdgpu_kernel void @s_test_imin_sle_v2i16(ptr addrspace(1) %out, <2 x i16
 ;
 ; GFX1250-LABEL: s_test_imin_sle_v2i16:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -1170,6 +1177,7 @@ define amdgpu_kernel void @s_test_imin_sle_v4i16(ptr addrspace(1) %out, <4 x i16
 ;
 ; GFX1250-LABEL: s_test_imin_sle_v4i16:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x8
 ; GFX1250-NEXT:    s_load_b64 s[6:7], s[4:5], 0x0
@@ -1305,6 +1313,7 @@ define amdgpu_kernel void @v_test_imin_slt_i32(ptr addrspace(1) %out, ptr addrsp
 ;
 ; GFX1250-LABEL: v_test_imin_slt_i32:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[6:7], s[4:5], 0x10
@@ -1482,6 +1491,7 @@ define amdgpu_kernel void @v_test_imin_slt_i16(ptr addrspace(1) %out, ptr addrsp
 ;
 ; GFX1250-TRUE16-LABEL: v_test_imin_slt_i16:
 ; GFX1250-TRUE16:       ; %bb.0:
+; GFX1250-TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-TRUE16-NEXT:    s_clause 0x1
 ; GFX1250-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
 ; GFX1250-TRUE16-NEXT:    s_load_b64 s[6:7], s[4:5], 0x10
@@ -1497,6 +1507,7 @@ define amdgpu_kernel void @v_test_imin_slt_i16(ptr addrspace(1) %out, ptr addrsp
 ;
 ; GFX1250-FAKE16-LABEL: v_test_imin_slt_i16:
 ; GFX1250-FAKE16:       ; %bb.0:
+; GFX1250-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-FAKE16-NEXT:    s_clause 0x1
 ; GFX1250-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
 ; GFX1250-FAKE16-NEXT:    s_load_b64 s[6:7], s[4:5], 0x10
@@ -1595,6 +1606,7 @@ define amdgpu_kernel void @s_test_imin_slt_i32(ptr addrspace(1) %out, i32 %a, i3
 ;
 ; GFX1250-LABEL: s_test_imin_slt_i32:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -1699,6 +1711,7 @@ define amdgpu_kernel void @s_test_imin_slt_v2i32(ptr addrspace(1) %out, <2 x i32
 ;
 ; GFX1250-LABEL: s_test_imin_slt_v2i32:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x8
 ; GFX1250-NEXT:    s_load_b64 s[6:7], s[4:5], 0x0
@@ -1796,6 +1809,7 @@ define amdgpu_kernel void @s_test_imin_slt_imm_i32(ptr addrspace(1) %out, i32 %a
 ;
 ; GFX1250-LABEL: s_test_imin_slt_imm_i32:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b96 s[0:2], s[4:5], 0x0
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -1890,6 +1904,7 @@ define amdgpu_kernel void @s_test_imin_sle_imm_i32(ptr addrspace(1) %out, i32 %a
 ;
 ; GFX1250-LABEL: s_test_imin_sle_imm_i32:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b96 s[0:2], s[4:5], 0x0
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -2024,6 +2039,7 @@ define amdgpu_kernel void @v_test_umin_ule_i32(ptr addrspace(1) %out, ptr addrsp
 ;
 ; GFX1250-LABEL: v_test_umin_ule_i32:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[6:7], s[4:5], 0x10
@@ -2184,6 +2200,7 @@ define amdgpu_kernel void @v_test_umin_ule_v3i32(ptr addrspace(1) %out, ptr addr
 ;
 ; GFX1250-LABEL: v_test_umin_ule_v3i32:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[6:7], s[4:5], 0x10
@@ -2387,6 +2404,7 @@ define amdgpu_kernel void @v_test_umin_ule_v3i16(ptr addrspace(1) %out, ptr addr
 ;
 ; GFX1250-LABEL: v_test_umin_ule_v3i16:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[6:7], s[4:5], 0x10
@@ -2490,6 +2508,7 @@ define amdgpu_kernel void @s_test_umin_ule_i32(ptr addrspace(1) %out, i32 %a, i3
 ;
 ; GFX1250-LABEL: s_test_umin_ule_i32:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -2624,6 +2643,7 @@ define amdgpu_kernel void @v_test_umin_ult_i32(ptr addrspace(1) %out, ptr addrsp
 ;
 ; GFX1250-LABEL: v_test_umin_ult_i32:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[6:7], s[4:5], 0x10
@@ -2784,6 +2804,7 @@ define amdgpu_kernel void @v_test_umin_ult_i8(ptr addrspace(1) %out, ptr addrspa
 ;
 ; GFX1250-TRUE16-LABEL: v_test_umin_ult_i8:
 ; GFX1250-TRUE16:       ; %bb.0:
+; GFX1250-TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-TRUE16-NEXT:    s_clause 0x1
 ; GFX1250-TRUE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
 ; GFX1250-TRUE16-NEXT:    s_load_b64 s[6:7], s[4:5], 0x10
@@ -2799,6 +2820,7 @@ define amdgpu_kernel void @v_test_umin_ult_i8(ptr addrspace(1) %out, ptr addrspa
 ;
 ; GFX1250-FAKE16-LABEL: v_test_umin_ult_i8:
 ; GFX1250-FAKE16:       ; %bb.0:
+; GFX1250-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-FAKE16-NEXT:    s_clause 0x1
 ; GFX1250-FAKE16-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
 ; GFX1250-FAKE16-NEXT:    s_load_b64 s[6:7], s[4:5], 0x10
@@ -2897,6 +2919,7 @@ define amdgpu_kernel void @s_test_umin_ult_i32(ptr addrspace(1) %out, i32 %a, i3
 ;
 ; GFX1250-LABEL: s_test_umin_ult_i32:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -3051,6 +3074,7 @@ define amdgpu_kernel void @v_test_umin_ult_i32_multi_use(ptr addrspace(1) %out0,
 ;
 ; GFX1250-LABEL: v_test_umin_ult_i32_multi_use:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b256 s[8:15], s[4:5], 0x0
 ; GFX1250-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -3247,6 +3271,7 @@ define amdgpu_kernel void @v_test_umin_ult_i16_multi_use(ptr addrspace(1) %out0,
 ;
 ; GFX1250-LABEL: v_test_umin_ult_i16_multi_use:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b256 s[8:15], s[4:5], 0x0
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -3347,6 +3372,7 @@ define amdgpu_kernel void @s_test_umin_ult_v1i32(ptr addrspace(1) %out, <1 x i32
 ;
 ; GFX1250-LABEL: s_test_umin_ult_v1i32:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -3536,6 +3562,7 @@ define amdgpu_kernel void @s_test_umin_ult_v8i32(ptr addrspace(1) %out, <8 x i32
 ;
 ; GFX1250-LABEL: s_test_umin_ult_v8i32:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    s_load_b512 s[8:23], s[4:5], 0x20
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
@@ -3852,6 +3879,7 @@ define amdgpu_kernel void @s_test_umin_ult_v8i16(ptr addrspace(1) %out, <8 x i16
 ;
 ; GFX1250-LABEL: s_test_umin_ult_v8i16:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    s_load_b256 s[8:15], s[4:5], 0x10
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
@@ -3976,6 +4004,7 @@ define amdgpu_kernel void @simplify_demanded_bits_test_umin_ult_i16(ptr addrspac
 ;
 ; GFX1250-LABEL: simplify_demanded_bits_test_umin_ult_i16:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_clause 0x2
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x28
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x4c
@@ -4105,6 +4134,7 @@ define amdgpu_kernel void @simplify_demanded_bits_test_min_slt_i16(ptr addrspace
 ;
 ; GFX1250-LABEL: simplify_demanded_bits_test_min_slt_i16:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_clause 0x2
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x28
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x4c
@@ -4237,6 +4267,7 @@ define amdgpu_kernel void @s_test_imin_sle_i16(ptr addrspace(1) %out, i16 %a, i1
 ;
 ; GFX1250-LABEL: s_test_imin_sle_i16:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b96 s[0:2], s[4:5], 0x0
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -4365,6 +4396,7 @@ define amdgpu_kernel void @test_umin_ult_i64(ptr addrspace(1) %out, i64 %a, i64
 ;
 ; GFX1250-LABEL: test_umin_ult_i64:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[6:7], s[4:5], 0x10
@@ -4489,6 +4521,7 @@ define amdgpu_kernel void @test_umin_ule_i64(ptr addrspace(1) %out, i64 %a, i64
 ;
 ; GFX1250-LABEL: test_umin_ule_i64:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[6:7], s[4:5], 0x10
@@ -4613,6 +4646,7 @@ define amdgpu_kernel void @test_imin_slt_i64(ptr addrspace(1) %out, i64 %a, i64
 ;
 ; GFX1250-LABEL: test_imin_slt_i64:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[6:7], s[4:5], 0x10
@@ -4737,6 +4771,7 @@ define amdgpu_kernel void @test_imin_sle_i64(ptr addrspace(1) %out, i64 %a, i64
 ;
 ; GFX1250-LABEL: test_imin_sle_i64:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[6:7], s[4:5], 0x10
@@ -4899,6 +4934,7 @@ define amdgpu_kernel void @v_test_imin_sle_v2i16(ptr addrspace(1) %out, ptr addr
 ;
 ; GFX1250-LABEL: v_test_imin_sle_v2i16:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[6:7], s[4:5], 0x10
@@ -5069,6 +5105,7 @@ define amdgpu_kernel void @v_test_imin_ule_v2i16(ptr addrspace(1) %out, ptr addr
 ;
 ; GFX1250-LABEL: v_test_imin_ule_v2i16:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
 ; GFX1250-NEXT:    s_load_b64 s[6:7], s[4:5], 0x10

diff  --git a/llvm/test/CodeGen/AMDGPU/minmax.ll b/llvm/test/CodeGen/AMDGPU/minmax.ll
index 60570bdbfc23b..dacee9a0173c5 100644
--- a/llvm/test/CodeGen/AMDGPU/minmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/minmax.ll
@@ -41,25 +41,67 @@ define i32 @test_minmax_i32(i32 %a, i32 %b, i32 %c) {
 }
 
 define amdgpu_ps void @s_test_minmax_i32(i32 inreg %a, i32 inreg %b, i32 inreg %c, ptr addrspace(1) inreg %out) {
-; SDAG-LABEL: s_test_minmax_i32:
-; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_max_i32 s0, s0, s1
-; SDAG-NEXT:    s_mov_b32 s5, s4
-; SDAG-NEXT:    s_min_i32 s0, s0, s2
-; SDAG-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
-; SDAG-NEXT:    s_mov_b32 s4, s3
-; SDAG-NEXT:    global_store_b32 v0, v1, s[4:5]
-; SDAG-NEXT:    s_endpgm
-;
-; GISEL-LABEL: s_test_minmax_i32:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_max_i32 s0, s0, s1
-; GISEL-NEXT:    s_mov_b32 s6, s3
-; GISEL-NEXT:    s_min_i32 s0, s0, s2
-; GISEL-NEXT:    s_mov_b32 s7, s4
-; GISEL-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, 0
-; GISEL-NEXT:    global_store_b32 v1, v0, s[6:7]
-; GISEL-NEXT:    s_endpgm
+; SDAG-GFX11-LABEL: s_test_minmax_i32:
+; SDAG-GFX11:       ; %bb.0:
+; SDAG-GFX11-NEXT:    s_max_i32 s0, s0, s1
+; SDAG-GFX11-NEXT:    s_mov_b32 s5, s4
+; SDAG-GFX11-NEXT:    s_min_i32 s0, s0, s2
+; SDAG-GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT:    s_mov_b32 s4, s3
+; SDAG-GFX11-NEXT:    global_store_b32 v0, v1, s[4:5]
+; SDAG-GFX11-NEXT:    s_endpgm
+;
+; GISEL-GFX11-LABEL: s_test_minmax_i32:
+; GISEL-GFX11:       ; %bb.0:
+; GISEL-GFX11-NEXT:    s_max_i32 s0, s0, s1
+; GISEL-GFX11-NEXT:    s_mov_b32 s6, s3
+; GISEL-GFX11-NEXT:    s_min_i32 s0, s0, s2
+; GISEL-GFX11-NEXT:    s_mov_b32 s7, s4
+; GISEL-GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, 0
+; GISEL-GFX11-NEXT:    global_store_b32 v1, v0, s[6:7]
+; GISEL-GFX11-NEXT:    s_endpgm
+;
+; SDAG-GFX12-LABEL: s_test_minmax_i32:
+; SDAG-GFX12:       ; %bb.0:
+; SDAG-GFX12-NEXT:    s_max_i32 s0, s0, s1
+; SDAG-GFX12-NEXT:    s_mov_b32 s5, s4
+; SDAG-GFX12-NEXT:    s_min_i32 s0, s0, s2
+; SDAG-GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX12-NEXT:    s_mov_b32 s4, s3
+; SDAG-GFX12-NEXT:    global_store_b32 v0, v1, s[4:5]
+; SDAG-GFX12-NEXT:    s_endpgm
+;
+; GISEL-GFX12-LABEL: s_test_minmax_i32:
+; GISEL-GFX12:       ; %bb.0:
+; GISEL-GFX12-NEXT:    s_max_i32 s0, s0, s1
+; GISEL-GFX12-NEXT:    s_mov_b32 s6, s3
+; GISEL-GFX12-NEXT:    s_min_i32 s0, s0, s2
+; GISEL-GFX12-NEXT:    s_mov_b32 s7, s4
+; GISEL-GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, 0
+; GISEL-GFX12-NEXT:    global_store_b32 v1, v0, s[6:7]
+; GISEL-GFX12-NEXT:    s_endpgm
+;
+; SDAG-GFX1250-LABEL: s_test_minmax_i32:
+; SDAG-GFX1250:       ; %bb.0:
+; SDAG-GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; SDAG-GFX1250-NEXT:    s_max_i32 s0, s0, s1
+; SDAG-GFX1250-NEXT:    s_mov_b32 s5, s4
+; SDAG-GFX1250-NEXT:    s_min_i32 s0, s0, s2
+; SDAG-GFX1250-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX1250-NEXT:    s_mov_b32 s4, s3
+; SDAG-GFX1250-NEXT:    global_store_b32 v0, v1, s[4:5]
+; SDAG-GFX1250-NEXT:    s_endpgm
+;
+; GISEL-GFX1250-LABEL: s_test_minmax_i32:
+; GISEL-GFX1250:       ; %bb.0:
+; GISEL-GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GISEL-GFX1250-NEXT:    s_max_i32 s0, s0, s1
+; GISEL-GFX1250-NEXT:    s_mov_b32 s6, s3
+; GISEL-GFX1250-NEXT:    s_min_i32 s0, s0, s2
+; GISEL-GFX1250-NEXT:    s_mov_b32 s7, s4
+; GISEL-GFX1250-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, 0
+; GISEL-GFX1250-NEXT:    global_store_b32 v1, v0, s[6:7]
+; GISEL-GFX1250-NEXT:    s_endpgm
   %smax = call i32 @llvm.smax.i32(i32 %a, i32 %b)
   %sminmax = call i32 @llvm.smin.i32(i32 %smax, i32 %c)
   store i32 %sminmax, ptr addrspace(1) %out
@@ -213,25 +255,67 @@ define i32 @test_minmax_u32(i32 %a, i32 %b, i32 %c) {
 }
 
 define amdgpu_ps void @s_test_minmax_u32(i32 inreg %a, i32 inreg %b, i32 inreg %c, ptr addrspace(1) inreg %out) {
-; SDAG-LABEL: s_test_minmax_u32:
-; SDAG:       ; %bb.0:
-; SDAG-NEXT:    s_max_u32 s0, s0, s1
-; SDAG-NEXT:    s_mov_b32 s5, s4
-; SDAG-NEXT:    s_min_u32 s0, s0, s2
-; SDAG-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
-; SDAG-NEXT:    s_mov_b32 s4, s3
-; SDAG-NEXT:    global_store_b32 v0, v1, s[4:5]
-; SDAG-NEXT:    s_endpgm
-;
-; GISEL-LABEL: s_test_minmax_u32:
-; GISEL:       ; %bb.0:
-; GISEL-NEXT:    s_max_u32 s0, s0, s1
-; GISEL-NEXT:    s_mov_b32 s6, s3
-; GISEL-NEXT:    s_min_u32 s0, s0, s2
-; GISEL-NEXT:    s_mov_b32 s7, s4
-; GISEL-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, 0
-; GISEL-NEXT:    global_store_b32 v1, v0, s[6:7]
-; GISEL-NEXT:    s_endpgm
+; SDAG-GFX11-LABEL: s_test_minmax_u32:
+; SDAG-GFX11:       ; %bb.0:
+; SDAG-GFX11-NEXT:    s_max_u32 s0, s0, s1
+; SDAG-GFX11-NEXT:    s_mov_b32 s5, s4
+; SDAG-GFX11-NEXT:    s_min_u32 s0, s0, s2
+; SDAG-GFX11-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX11-NEXT:    s_mov_b32 s4, s3
+; SDAG-GFX11-NEXT:    global_store_b32 v0, v1, s[4:5]
+; SDAG-GFX11-NEXT:    s_endpgm
+;
+; GISEL-GFX11-LABEL: s_test_minmax_u32:
+; GISEL-GFX11:       ; %bb.0:
+; GISEL-GFX11-NEXT:    s_max_u32 s0, s0, s1
+; GISEL-GFX11-NEXT:    s_mov_b32 s6, s3
+; GISEL-GFX11-NEXT:    s_min_u32 s0, s0, s2
+; GISEL-GFX11-NEXT:    s_mov_b32 s7, s4
+; GISEL-GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, 0
+; GISEL-GFX11-NEXT:    global_store_b32 v1, v0, s[6:7]
+; GISEL-GFX11-NEXT:    s_endpgm
+;
+; SDAG-GFX12-LABEL: s_test_minmax_u32:
+; SDAG-GFX12:       ; %bb.0:
+; SDAG-GFX12-NEXT:    s_max_u32 s0, s0, s1
+; SDAG-GFX12-NEXT:    s_mov_b32 s5, s4
+; SDAG-GFX12-NEXT:    s_min_u32 s0, s0, s2
+; SDAG-GFX12-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX12-NEXT:    s_mov_b32 s4, s3
+; SDAG-GFX12-NEXT:    global_store_b32 v0, v1, s[4:5]
+; SDAG-GFX12-NEXT:    s_endpgm
+;
+; GISEL-GFX12-LABEL: s_test_minmax_u32:
+; GISEL-GFX12:       ; %bb.0:
+; GISEL-GFX12-NEXT:    s_max_u32 s0, s0, s1
+; GISEL-GFX12-NEXT:    s_mov_b32 s6, s3
+; GISEL-GFX12-NEXT:    s_min_u32 s0, s0, s2
+; GISEL-GFX12-NEXT:    s_mov_b32 s7, s4
+; GISEL-GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, 0
+; GISEL-GFX12-NEXT:    global_store_b32 v1, v0, s[6:7]
+; GISEL-GFX12-NEXT:    s_endpgm
+;
+; SDAG-GFX1250-LABEL: s_test_minmax_u32:
+; SDAG-GFX1250:       ; %bb.0:
+; SDAG-GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; SDAG-GFX1250-NEXT:    s_max_u32 s0, s0, s1
+; SDAG-GFX1250-NEXT:    s_mov_b32 s5, s4
+; SDAG-GFX1250-NEXT:    s_min_u32 s0, s0, s2
+; SDAG-GFX1250-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
+; SDAG-GFX1250-NEXT:    s_mov_b32 s4, s3
+; SDAG-GFX1250-NEXT:    global_store_b32 v0, v1, s[4:5]
+; SDAG-GFX1250-NEXT:    s_endpgm
+;
+; GISEL-GFX1250-LABEL: s_test_minmax_u32:
+; GISEL-GFX1250:       ; %bb.0:
+; GISEL-GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GISEL-GFX1250-NEXT:    s_max_u32 s0, s0, s1
+; GISEL-GFX1250-NEXT:    s_mov_b32 s6, s3
+; GISEL-GFX1250-NEXT:    s_min_u32 s0, s0, s2
+; GISEL-GFX1250-NEXT:    s_mov_b32 s7, s4
+; GISEL-GFX1250-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, 0
+; GISEL-GFX1250-NEXT:    global_store_b32 v1, v0, s[6:7]
+; GISEL-GFX1250-NEXT:    s_endpgm
   %smax = call i32 @llvm.umax.i32(i32 %a, i32 %b)
   %sminmax = call i32 @llvm.umin.i32(i32 %smax, i32 %c)
   store i32 %sminmax, ptr addrspace(1) %out
@@ -460,6 +544,7 @@ define amdgpu_ps void @s_test_minmax_f32_ieee_false(float inreg %a, float inreg
 ;
 ; SDAG-GFX1250-LABEL: s_test_minmax_f32_ieee_false:
 ; SDAG-GFX1250:       ; %bb.0:
+; SDAG-GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; SDAG-GFX1250-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0
 ; SDAG-GFX1250-NEXT:    s_mov_b32 s5, s4
 ; SDAG-GFX1250-NEXT:    s_mov_b32 s4, s3
@@ -469,6 +554,7 @@ define amdgpu_ps void @s_test_minmax_f32_ieee_false(float inreg %a, float inreg
 ;
 ; GISEL-GFX1250-LABEL: s_test_minmax_f32_ieee_false:
 ; GISEL-GFX1250:       ; %bb.0:
+; GISEL-GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-GFX1250-NEXT:    s_max_num_f32 s0, s0, s1
 ; GISEL-GFX1250-NEXT:    s_mov_b32 s6, s3
 ; GISEL-GFX1250-NEXT:    s_mov_b32 s7, s4
@@ -496,6 +582,7 @@ define amdgpu_ps float @test_minmax_commuted_f32_ieee_false(float %a, float %b,
 ;
 ; GFX1250-LABEL: test_minmax_commuted_f32_ieee_false:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_maxmin_num_f32 v0, v0, v1, v2
 ; GFX1250-NEXT:    ; return to shader part epilog
   %max = call float @llvm.maxnum.f32(float %a, float %b)
@@ -579,6 +666,7 @@ define amdgpu_ps float @test_maxmin_commuted_f32_ieee_false(float %a, float %b,
 ;
 ; GFX1250-LABEL: test_maxmin_commuted_f32_ieee_false:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_minmax_num_f32 v0, v0, v1, v2
 ; GFX1250-NEXT:    ; return to shader part epilog
   %min = call float @llvm.minnum.f32(float %a, float %b)
@@ -697,21 +785,25 @@ define amdgpu_ps half @test_minmax_f16_ieee_false(half %a, half %b, half %c) {
 ;
 ; SDAG-GFX1250-TRUE16-LABEL: test_minmax_f16_ieee_false:
 ; SDAG-GFX1250-TRUE16:       ; %bb.0:
+; SDAG-GFX1250-TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; SDAG-GFX1250-TRUE16-NEXT:    v_maxmin_num_f16 v0.l, v0.l, v1.l, v2.l
 ; SDAG-GFX1250-TRUE16-NEXT:    ; return to shader part epilog
 ;
 ; SDAG-GFX1250-FAKE16-LABEL: test_minmax_f16_ieee_false:
 ; SDAG-GFX1250-FAKE16:       ; %bb.0:
+; SDAG-GFX1250-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; SDAG-GFX1250-FAKE16-NEXT:    v_maxmin_num_f16 v0, v0, v1, v2
 ; SDAG-GFX1250-FAKE16-NEXT:    ; return to shader part epilog
 ;
 ; GISEL-GFX1250-TRUE16-LABEL: test_minmax_f16_ieee_false:
 ; GISEL-GFX1250-TRUE16:       ; %bb.0:
+; GISEL-GFX1250-TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-GFX1250-TRUE16-NEXT:    v_maxmin_num_f16 v0.l, v0.l, v1.l, v2.l
 ; GISEL-GFX1250-TRUE16-NEXT:    ; return to shader part epilog
 ;
 ; GISEL-GFX1250-FAKE16-LABEL: test_minmax_f16_ieee_false:
 ; GISEL-GFX1250-FAKE16:       ; %bb.0:
+; GISEL-GFX1250-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-GFX1250-FAKE16-NEXT:    v_maxmin_num_f16 v0, v0, v1, v2
 ; GISEL-GFX1250-FAKE16-NEXT:    ; return to shader part epilog
   %max = call half @llvm.maxnum.f16(half %a, half %b)
@@ -801,6 +893,7 @@ define amdgpu_ps void @s_test_minmax_f16_ieee_false(half inreg %a, half inreg %b
 ;
 ; SDAG-GFX1250-TRUE16-LABEL: s_test_minmax_f16_ieee_false:
 ; SDAG-GFX1250-TRUE16:       ; %bb.0:
+; SDAG-GFX1250-TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; SDAG-GFX1250-TRUE16-NEXT:    v_mov_b16_e32 v0.l, s2
 ; SDAG-GFX1250-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
 ; SDAG-GFX1250-TRUE16-NEXT:    s_mov_b32 s5, s4
@@ -811,6 +904,7 @@ define amdgpu_ps void @s_test_minmax_f16_ieee_false(half inreg %a, half inreg %b
 ;
 ; SDAG-GFX1250-FAKE16-LABEL: s_test_minmax_f16_ieee_false:
 ; SDAG-GFX1250-FAKE16:       ; %bb.0:
+; SDAG-GFX1250-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; SDAG-GFX1250-FAKE16-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 0
 ; SDAG-GFX1250-FAKE16-NEXT:    s_mov_b32 s5, s4
 ; SDAG-GFX1250-FAKE16-NEXT:    s_mov_b32 s4, s3
@@ -820,6 +914,7 @@ define amdgpu_ps void @s_test_minmax_f16_ieee_false(half inreg %a, half inreg %b
 ;
 ; GISEL-GFX1250-TRUE16-LABEL: s_test_minmax_f16_ieee_false:
 ; GISEL-GFX1250-TRUE16:       ; %bb.0:
+; GISEL-GFX1250-TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-GFX1250-TRUE16-NEXT:    s_max_num_f16 s0, s0, s1
 ; GISEL-GFX1250-TRUE16-NEXT:    v_mov_b32_e32 v1, 0
 ; GISEL-GFX1250-TRUE16-NEXT:    s_mov_b32 s6, s3
@@ -831,6 +926,7 @@ define amdgpu_ps void @s_test_minmax_f16_ieee_false(half inreg %a, half inreg %b
 ;
 ; GISEL-GFX1250-FAKE16-LABEL: s_test_minmax_f16_ieee_false:
 ; GISEL-GFX1250-FAKE16:       ; %bb.0:
+; GISEL-GFX1250-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-GFX1250-FAKE16-NEXT:    s_max_num_f16 s0, s0, s1
 ; GISEL-GFX1250-FAKE16-NEXT:    s_mov_b32 s6, s3
 ; GISEL-GFX1250-FAKE16-NEXT:    s_mov_b32 s7, s4
@@ -1021,21 +1117,25 @@ define amdgpu_ps half @test_maxmin_f16_ieee_false(half %a, half %b, half %c) {
 ;
 ; SDAG-GFX1250-TRUE16-LABEL: test_maxmin_f16_ieee_false:
 ; SDAG-GFX1250-TRUE16:       ; %bb.0:
+; SDAG-GFX1250-TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; SDAG-GFX1250-TRUE16-NEXT:    v_minmax_num_f16 v0.l, v0.l, v1.l, v2.l
 ; SDAG-GFX1250-TRUE16-NEXT:    ; return to shader part epilog
 ;
 ; SDAG-GFX1250-FAKE16-LABEL: test_maxmin_f16_ieee_false:
 ; SDAG-GFX1250-FAKE16:       ; %bb.0:
+; SDAG-GFX1250-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; SDAG-GFX1250-FAKE16-NEXT:    v_minmax_num_f16 v0, v0, v1, v2
 ; SDAG-GFX1250-FAKE16-NEXT:    ; return to shader part epilog
 ;
 ; GISEL-GFX1250-TRUE16-LABEL: test_maxmin_f16_ieee_false:
 ; GISEL-GFX1250-TRUE16:       ; %bb.0:
+; GISEL-GFX1250-TRUE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-GFX1250-TRUE16-NEXT:    v_minmax_num_f16 v0.l, v0.l, v1.l, v2.l
 ; GISEL-GFX1250-TRUE16-NEXT:    ; return to shader part epilog
 ;
 ; GISEL-GFX1250-FAKE16-LABEL: test_maxmin_f16_ieee_false:
 ; GISEL-GFX1250-FAKE16:       ; %bb.0:
+; GISEL-GFX1250-FAKE16-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-GFX1250-FAKE16-NEXT:    v_minmax_num_f16 v0, v0, v1, v2
 ; GISEL-GFX1250-FAKE16-NEXT:    ; return to shader part epilog
   %min = call half @llvm.minnum.f16(half %a, half %b)
@@ -1297,3 +1397,6 @@ declare half @llvm.maxnum.f16(half, half)
 declare float @llvm.minnum.f32(float, float)
 declare float @llvm.maxnum.f32(float, float)
 attributes #0 = { nounwind "no-nans-fp-math"="true" }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GISEL: {{.*}}
+; SDAG: {{.*}}

diff  --git a/llvm/test/CodeGen/AMDGPU/mul.ll b/llvm/test/CodeGen/AMDGPU/mul.ll
index 4681d589ac217..f1130100725c8 100644
--- a/llvm/test/CodeGen/AMDGPU/mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/mul.ll
@@ -127,6 +127,7 @@ define amdgpu_kernel void @test_mul_v2i32(ptr addrspace(1) %out, ptr addrspace(1
 ;
 ; GFX1250-LABEL: test_mul_v2i32:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1250-NEXT:    s_mov_b32 s6, -1
 ; GFX1250-NEXT:    s_mov_b32 s7, 0x31016000
@@ -308,6 +309,7 @@ define amdgpu_kernel void @v_mul_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %
 ;
 ; GFX1250-LABEL: v_mul_v4i32:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1250-NEXT:    s_mov_b32 s6, -1
 ; GFX1250-NEXT:    s_mov_b32 s7, 0x31016000
@@ -447,6 +449,7 @@ define amdgpu_kernel void @s_trunc_i64_mul_to_i32(ptr addrspace(1) %out, i64 %a,
 ;
 ; GFX1250-LABEL: s_trunc_i64_mul_to_i32:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x34
@@ -614,6 +617,7 @@ define amdgpu_kernel void @v_trunc_i64_mul_to_i32(ptr addrspace(1) %out, ptr add
 ;
 ; GFX1250-LABEL: v_trunc_i64_mul_to_i32:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1250-NEXT:    s_load_b64 s[8:9], s[4:5], 0x34
@@ -752,6 +756,7 @@ define amdgpu_kernel void @mul64_sext_c(ptr addrspace(1) %out, i32 %in) {
 ;
 ; GFX1250-LABEL: mul64_sext_c:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b96 s[0:2], s[4:5], 0x24
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-NEXT:    s_ashr_i32 s3, s2, 31
@@ -868,6 +873,7 @@ define amdgpu_kernel void @mul64_zext_c(ptr addrspace(1) %out, i32 %in) {
 ;
 ; GFX1250-LABEL: mul64_zext_c:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b96 s[0:2], s[4:5], 0x24
 ; GFX1250-NEXT:    s_mov_b32 s3, 0
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -1016,6 +1022,7 @@ define amdgpu_kernel void @v_mul64_sext_c(ptr addrspace(1) %out, ptr addrspace(1
 ;
 ; GFX1250-LABEL: v_mul64_sext_c:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1250-NEXT:    s_mov_b32 s6, -1
 ; GFX1250-NEXT:    s_mov_b32 s7, 0x31016000
@@ -1179,6 +1186,7 @@ define amdgpu_kernel void @v_mul64_zext_c(ptr addrspace(1) %out, ptr addrspace(1
 ;
 ; GFX1250-LABEL: v_mul64_zext_c:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1250-NEXT:    s_mov_b32 s6, -1
 ; GFX1250-NEXT:    s_mov_b32 s7, 0x31016000
@@ -1338,6 +1346,7 @@ define amdgpu_kernel void @v_mul64_sext_inline_imm(ptr addrspace(1) %out, ptr ad
 ;
 ; GFX1250-LABEL: v_mul64_sext_inline_imm:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1250-NEXT:    s_mov_b32 s6, -1
 ; GFX1250-NEXT:    s_mov_b32 s7, 0x31016000
@@ -1466,6 +1475,7 @@ define amdgpu_kernel void @s_mul_i32(ptr addrspace(1) %out, [8 x i32], i32 %a, [
 ;
 ; GFX1250-LABEL: s_mul_i32:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_clause 0x2
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4c
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x70
@@ -1605,6 +1615,7 @@ define amdgpu_kernel void @v_mul_i32(ptr addrspace(1) %out, ptr addrspace(1) %in
 ;
 ; GFX1250-LABEL: v_mul_i32:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1250-NEXT:    s_mov_b32 s6, -1
 ; GFX1250-NEXT:    s_mov_b32 s7, 0x31016000
@@ -1738,6 +1749,7 @@ define amdgpu_kernel void @s_mul_i1(ptr addrspace(1) %out, [8 x i32], i1 %a, [8
 ;
 ; GFX1250-LABEL: s_mul_i1:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_clause 0x2
 ; GFX1250-NEXT:    s_load_b32 s2, s[4:5], 0x4c
 ; GFX1250-NEXT:    s_load_b32 s3, s[4:5], 0x70
@@ -1913,6 +1925,7 @@ define amdgpu_kernel void @v_mul_i1(ptr addrspace(1) %out, ptr addrspace(1) %in)
 ;
 ; GFX1250-LABEL: v_mul_i1:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1250-NEXT:    s_mov_b32 s6, -1
 ; GFX1250-NEXT:    s_mov_b32 s7, 0x31016000
@@ -2092,6 +2105,7 @@ define amdgpu_kernel void @s_mul_i64(ptr addrspace(1) %out, i64 %a, i64 %b) noun
 ;
 ; GFX1250-LABEL: s_mul_i64:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1250-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
@@ -2293,6 +2307,7 @@ define amdgpu_kernel void @v_mul_i64(ptr addrspace(1) %out, ptr addrspace(1) %ap
 ;
 ; GFX1250-LABEL: v_mul_i64:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1250-NEXT:    s_load_b64 s[8:9], s[4:5], 0x34
@@ -2558,6 +2573,7 @@ define amdgpu_kernel void @mul32_in_branch(ptr addrspace(1) %out, ptr addrspace(
 ;
 ; GFX1250-LABEL: mul32_in_branch:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x34
 ; GFX1250-NEXT:    s_mov_b32 s6, 0
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -2846,6 +2862,7 @@ define amdgpu_kernel void @mul64_in_branch(ptr addrspace(1) %out, ptr addrspace(
 ;
 ; GFX1250-LABEL: mul64_in_branch:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b256 s[8:15], s[4:5], 0x24
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-NEXT:    s_cmp_lg_u64 s[12:13], 0
@@ -3216,6 +3233,7 @@ define amdgpu_kernel void @s_mul_i128(ptr addrspace(1) %out, [8 x i32], i128 %a,
 ;
 ; GFX1250-LABEL: s_mul_i128:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_clause 0x2
 ; GFX1250-NEXT:    s_load_b128 s[8:11], s[4:5], 0x7c
 ; GFX1250-NEXT:    s_load_b128 s[12:15], s[4:5], 0x4c
@@ -3537,6 +3555,7 @@ define amdgpu_kernel void @v_mul_i128(ptr addrspace(1) %out, ptr addrspace(1) %a
 ;
 ; GFX1250-LABEL: v_mul_i128:
 ; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x2c
 ; GFX1250-NEXT:    v_and_b32_e32 v16, 0x3ff, v0
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0

diff  --git a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll
index 6b45d31da0e95..1ed024f7aed36 100644
--- a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll
+++ b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll
@@ -34,6 +34,7 @@ define amdgpu_kernel void @fadd_v2_vv(ptr addrspace(1) %a) {
 ;
 ; GFX1250-LABEL: fadd_v2_vv:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1250-NEXT:    v_and_b32_e32 v2, 0x3ff, v0
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -77,6 +78,7 @@ define amdgpu_kernel void @fadd_v2_vs(ptr addrspace(1) %a, <2 x float> %x) {
 ;
 ; GFX1250-LABEL: fadd_v2_vs:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1250-NEXT:    v_and_b32_e32 v4, 0x3ff, v0
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -141,6 +143,7 @@ define amdgpu_kernel void @fadd_v4_vs(ptr addrspace(1) %a, <4 x float> %x) {
 ;
 ; GFX1250-SDAG-LABEL: fadd_v4_vs:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    s_clause 0x1
 ; GFX1250-SDAG-NEXT:    s_load_b64 s[6:7], s[4:5], 0x24
 ; GFX1250-SDAG-NEXT:    s_load_b128 s[0:3], s[4:5], 0x34
@@ -158,6 +161,7 @@ define amdgpu_kernel void @fadd_v4_vs(ptr addrspace(1) %a, <4 x float> %x) {
 ;
 ; GFX1250-GISEL-LABEL: fadd_v4_vs:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    s_clause 0x1
 ; GFX1250-GISEL-NEXT:    s_load_b64 s[6:7], s[4:5], 0x24
 ; GFX1250-GISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x34
@@ -343,6 +347,7 @@ define amdgpu_kernel void @fadd_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
 ;
 ; GFX1250-SDAG-LABEL: fadd_v32_vs:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    s_clause 0x2
 ; GFX1250-SDAG-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1250-SDAG-NEXT:    s_load_b512 s[36:51], s[4:5], 0xa4
@@ -409,6 +414,7 @@ define amdgpu_kernel void @fadd_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
 ;
 ; GFX1250-GISEL-LABEL: fadd_v32_vs:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1250-GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX1250-GISEL-NEXT:    s_clause 0x1
@@ -527,6 +533,7 @@ define amdgpu_kernel void @fadd_v2_v_imm(ptr addrspace(1) %a) {
 ;
 ; GFX1250-SDAG-LABEL: fadd_v2_v_imm:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1250-SDAG-NEXT:    v_and_b32_e32 v2, 0x3ff, v0
 ; GFX1250-SDAG-NEXT:    s_mov_b32 s2, 0x42c80000
@@ -539,6 +546,7 @@ define amdgpu_kernel void @fadd_v2_v_imm(ptr addrspace(1) %a) {
 ;
 ; GFX1250-GISEL-LABEL: fadd_v2_v_imm:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1250-GISEL-NEXT:    v_and_b32_e32 v4, 0x3ff, v0
 ; GFX1250-GISEL-NEXT:    s_mov_b32 s2, 0x42c80000
@@ -599,6 +607,7 @@ define amdgpu_kernel void @fadd_v2_v_v_splat(ptr addrspace(1) %a) {
 ;
 ; GFX1250-SDAG-LABEL: fadd_v2_v_v_splat:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1250-SDAG-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX1250-SDAG-NEXT:    s_wait_kmcnt 0x0
@@ -610,6 +619,7 @@ define amdgpu_kernel void @fadd_v2_v_v_splat(ptr addrspace(1) %a) {
 ;
 ; GFX1250-GISEL-LABEL: fadd_v2_v_v_splat:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1250-GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
@@ -672,6 +682,7 @@ define amdgpu_kernel void @fadd_v2_v_lit_splat(ptr addrspace(1) %a) {
 ;
 ; GFX1250-SDAG-LABEL: fadd_v2_v_lit_splat:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1250-SDAG-NEXT:    v_and_b32_e32 v2, 0x3ff, v0
 ; GFX1250-SDAG-NEXT:    s_wait_kmcnt 0x0
@@ -683,6 +694,7 @@ define amdgpu_kernel void @fadd_v2_v_lit_splat(ptr addrspace(1) %a) {
 ;
 ; GFX1250-GISEL-LABEL: fadd_v2_v_lit_splat:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1250-GISEL-NEXT:    v_and_b32_e32 v4, 0x3ff, v0
 ; GFX1250-GISEL-NEXT:    s_mov_b32 s2, 1.0
@@ -731,6 +743,7 @@ define amdgpu_kernel void @fadd_v2_v_lit_hi0(ptr addrspace(1) %a) {
 ;
 ; GFX1250-SDAG-LABEL: fadd_v2_v_lit_hi0:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1250-SDAG-NEXT:    v_and_b32_e32 v4, 0x3ff, v0
 ; GFX1250-SDAG-NEXT:    v_mov_b64_e32 v[2:3], 0x3f800000
@@ -743,6 +756,7 @@ define amdgpu_kernel void @fadd_v2_v_lit_hi0(ptr addrspace(1) %a) {
 ;
 ; GFX1250-GISEL-LABEL: fadd_v2_v_lit_hi0:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1250-GISEL-NEXT:    v_and_b32_e32 v4, 0x3ff, v0
 ; GFX1250-GISEL-NEXT:    s_mov_b64 s[2:3], 0x3f800000
@@ -791,6 +805,7 @@ define amdgpu_kernel void @fadd_v2_v_lit_lo0(ptr addrspace(1) %a) {
 ;
 ; GFX1250-SDAG-LABEL: fadd_v2_v_lit_lo0:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1250-SDAG-NEXT:    v_and_b32_e32 v4, 0x3ff, v0
 ; GFX1250-SDAG-NEXT:    v_mov_b64_e32 v[2:3], 0x3f80000000000000
@@ -803,6 +818,7 @@ define amdgpu_kernel void @fadd_v2_v_lit_lo0(ptr addrspace(1) %a) {
 ;
 ; GFX1250-GISEL-LABEL: fadd_v2_v_lit_lo0:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1250-GISEL-NEXT:    v_and_b32_e32 v4, 0x3ff, v0
 ; GFX1250-GISEL-NEXT:    s_mov_b64 s[2:3], 0x3f80000000000000
@@ -851,6 +867,7 @@ define amdgpu_kernel void @fadd_v2_v_unfoldable_lit(ptr addrspace(1) %a) {
 ;
 ; GFX1250-SDAG-LABEL: fadd_v2_v_unfoldable_lit:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1250-SDAG-NEXT:    v_and_b32_e32 v4, 0x3ff, v0
 ; GFX1250-SDAG-NEXT:    v_mov_b64_e32 v[2:3], 0x400000003f800000
@@ -863,6 +880,7 @@ define amdgpu_kernel void @fadd_v2_v_unfoldable_lit(ptr addrspace(1) %a) {
 ;
 ; GFX1250-GISEL-LABEL: fadd_v2_v_unfoldable_lit:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1250-GISEL-NEXT:    v_and_b32_e32 v4, 0x3ff, v0
 ; GFX1250-GISEL-NEXT:    s_mov_b64 s[2:3], 0x400000003f800000
@@ -927,6 +945,7 @@ define amdgpu_kernel void @fadd_v2_v_fneg(ptr addrspace(1) %a, float %x) {
 ;
 ; GFX1250-SDAG-LABEL: fadd_v2_v_fneg:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    s_load_b96 s[0:2], s[4:5], 0x24
 ; GFX1250-SDAG-NEXT:    v_and_b32_e32 v2, 0x3ff, v0
 ; GFX1250-SDAG-NEXT:    s_wait_kmcnt 0x0
@@ -938,6 +957,7 @@ define amdgpu_kernel void @fadd_v2_v_fneg(ptr addrspace(1) %a, float %x) {
 ;
 ; GFX1250-GISEL-LABEL: fadd_v2_v_fneg:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    s_load_b96 s[0:2], s[4:5], 0x24
 ; GFX1250-GISEL-NEXT:    v_and_b32_e32 v4, 0x3ff, v0
 ; GFX1250-GISEL-NEXT:    s_wait_kmcnt 0x0
@@ -1004,6 +1024,7 @@ define amdgpu_kernel void @fadd_v2_v_fneg_lo(ptr addrspace(1) %a, float %x) {
 ;
 ; GFX1250-SDAG-LABEL: fadd_v2_v_fneg_lo:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    s_load_b96 s[0:2], s[4:5], 0x24
 ; GFX1250-SDAG-NEXT:    v_and_b32_e32 v2, 0x3ff, v0
 ; GFX1250-SDAG-NEXT:    s_wait_kmcnt 0x0
@@ -1015,6 +1036,7 @@ define amdgpu_kernel void @fadd_v2_v_fneg_lo(ptr addrspace(1) %a, float %x) {
 ;
 ; GFX1250-GISEL-LABEL: fadd_v2_v_fneg_lo:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    s_load_b96 s[0:2], s[4:5], 0x24
 ; GFX1250-GISEL-NEXT:    v_and_b32_e32 v4, 0x3ff, v0
 ; GFX1250-GISEL-NEXT:    s_wait_kmcnt 0x0
@@ -1081,6 +1103,7 @@ define amdgpu_kernel void @fadd_v2_v_fneg_hi(ptr addrspace(1) %a, float %x) {
 ;
 ; GFX1250-SDAG-LABEL: fadd_v2_v_fneg_hi:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    s_load_b96 s[0:2], s[4:5], 0x24
 ; GFX1250-SDAG-NEXT:    v_and_b32_e32 v2, 0x3ff, v0
 ; GFX1250-SDAG-NEXT:    s_wait_kmcnt 0x0
@@ -1092,6 +1115,7 @@ define amdgpu_kernel void @fadd_v2_v_fneg_hi(ptr addrspace(1) %a, float %x) {
 ;
 ; GFX1250-GISEL-LABEL: fadd_v2_v_fneg_hi:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    s_load_b96 s[0:2], s[4:5], 0x24
 ; GFX1250-GISEL-NEXT:    v_and_b32_e32 v4, 0x3ff, v0
 ; GFX1250-GISEL-NEXT:    s_wait_kmcnt 0x0
@@ -1155,6 +1179,7 @@ define amdgpu_kernel void @fadd_v2_v_fneg_lo2(ptr addrspace(1) %a, float %x, flo
 ;
 ; GFX1250-SDAG-LABEL: fadd_v2_v_fneg_lo2:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1250-SDAG-NEXT:    v_and_b32_e32 v4, 0x3ff, v0
 ; GFX1250-SDAG-NEXT:    s_wait_kmcnt 0x0
@@ -1168,6 +1193,7 @@ define amdgpu_kernel void @fadd_v2_v_fneg_lo2(ptr addrspace(1) %a, float %x, flo
 ;
 ; GFX1250-GISEL-LABEL: fadd_v2_v_fneg_lo2:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1250-GISEL-NEXT:    v_and_b32_e32 v4, 0x3ff, v0
 ; GFX1250-GISEL-NEXT:    s_wait_kmcnt 0x0
@@ -1231,6 +1257,7 @@ define amdgpu_kernel void @fadd_v2_v_fneg_hi2(ptr addrspace(1) %a, float %x, flo
 ;
 ; GFX1250-SDAG-LABEL: fadd_v2_v_fneg_hi2:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1250-SDAG-NEXT:    v_and_b32_e32 v4, 0x3ff, v0
 ; GFX1250-SDAG-NEXT:    s_wait_kmcnt 0x0
@@ -1244,6 +1271,7 @@ define amdgpu_kernel void @fadd_v2_v_fneg_hi2(ptr addrspace(1) %a, float %x, flo
 ;
 ; GFX1250-GISEL-LABEL: fadd_v2_v_fneg_hi2:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1250-GISEL-NEXT:    v_and_b32_e32 v4, 0x3ff, v0
 ; GFX1250-GISEL-NEXT:    s_wait_kmcnt 0x0
@@ -1293,6 +1321,7 @@ define amdgpu_kernel void @fmul_v2_vv(ptr addrspace(1) %a) {
 ;
 ; GFX1250-LABEL: fmul_v2_vv:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1250-NEXT:    v_and_b32_e32 v2, 0x3ff, v0
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -1336,6 +1365,7 @@ define amdgpu_kernel void @fmul_v2_vs(ptr addrspace(1) %a, <2 x float> %x) {
 ;
 ; GFX1250-LABEL: fmul_v2_vs:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1250-NEXT:    v_and_b32_e32 v4, 0x3ff, v0
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -1400,6 +1430,7 @@ define amdgpu_kernel void @fmul_v4_vs(ptr addrspace(1) %a, <4 x float> %x) {
 ;
 ; GFX1250-SDAG-LABEL: fmul_v4_vs:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    s_clause 0x1
 ; GFX1250-SDAG-NEXT:    s_load_b64 s[6:7], s[4:5], 0x24
 ; GFX1250-SDAG-NEXT:    s_load_b128 s[0:3], s[4:5], 0x34
@@ -1417,6 +1448,7 @@ define amdgpu_kernel void @fmul_v4_vs(ptr addrspace(1) %a, <4 x float> %x) {
 ;
 ; GFX1250-GISEL-LABEL: fmul_v4_vs:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    s_clause 0x1
 ; GFX1250-GISEL-NEXT:    s_load_b64 s[6:7], s[4:5], 0x24
 ; GFX1250-GISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x34
@@ -1602,6 +1634,7 @@ define amdgpu_kernel void @fmul_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
 ;
 ; GFX1250-SDAG-LABEL: fmul_v32_vs:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    s_clause 0x2
 ; GFX1250-SDAG-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1250-SDAG-NEXT:    s_load_b512 s[36:51], s[4:5], 0xa4
@@ -1668,6 +1701,7 @@ define amdgpu_kernel void @fmul_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
 ;
 ; GFX1250-GISEL-LABEL: fmul_v32_vs:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1250-GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX1250-GISEL-NEXT:    s_clause 0x1
@@ -1785,6 +1819,7 @@ define amdgpu_kernel void @fmul_v2_v_imm(ptr addrspace(1) %a) {
 ;
 ; GFX1250-SDAG-LABEL: fmul_v2_v_imm:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1250-SDAG-NEXT:    v_and_b32_e32 v2, 0x3ff, v0
 ; GFX1250-SDAG-NEXT:    s_mov_b32 s2, 0x42c80000
@@ -1797,6 +1832,7 @@ define amdgpu_kernel void @fmul_v2_v_imm(ptr addrspace(1) %a) {
 ;
 ; GFX1250-GISEL-LABEL: fmul_v2_v_imm:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1250-GISEL-NEXT:    v_and_b32_e32 v4, 0x3ff, v0
 ; GFX1250-GISEL-NEXT:    s_mov_b32 s2, 0x42c80000
@@ -1857,6 +1893,7 @@ define amdgpu_kernel void @fmul_v2_v_v_splat(ptr addrspace(1) %a) {
 ;
 ; GFX1250-SDAG-LABEL: fmul_v2_v_v_splat:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1250-SDAG-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX1250-SDAG-NEXT:    s_wait_kmcnt 0x0
@@ -1868,6 +1905,7 @@ define amdgpu_kernel void @fmul_v2_v_v_splat(ptr addrspace(1) %a) {
 ;
 ; GFX1250-GISEL-LABEL: fmul_v2_v_v_splat:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1250-GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
@@ -1930,6 +1968,7 @@ define amdgpu_kernel void @fmul_v2_v_lit_splat(ptr addrspace(1) %a) {
 ;
 ; GFX1250-SDAG-LABEL: fmul_v2_v_lit_splat:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1250-SDAG-NEXT:    v_and_b32_e32 v2, 0x3ff, v0
 ; GFX1250-SDAG-NEXT:    s_wait_kmcnt 0x0
@@ -1941,6 +1980,7 @@ define amdgpu_kernel void @fmul_v2_v_lit_splat(ptr addrspace(1) %a) {
 ;
 ; GFX1250-GISEL-LABEL: fmul_v2_v_lit_splat:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1250-GISEL-NEXT:    v_and_b32_e32 v4, 0x3ff, v0
 ; GFX1250-GISEL-NEXT:    s_mov_b32 s2, 4.0
@@ -1990,6 +2030,7 @@ define amdgpu_kernel void @fmul_v2_v_unfoldable_lit(ptr addrspace(1) %a) {
 ;
 ; GFX1250-SDAG-LABEL: fmul_v2_v_unfoldable_lit:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1250-SDAG-NEXT:    v_and_b32_e32 v4, 0x3ff, v0
 ; GFX1250-SDAG-NEXT:    v_mov_b64_e32 v[2:3], 0x4040000040800000
@@ -2002,6 +2043,7 @@ define amdgpu_kernel void @fmul_v2_v_unfoldable_lit(ptr addrspace(1) %a) {
 ;
 ; GFX1250-GISEL-LABEL: fmul_v2_v_unfoldable_lit:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1250-GISEL-NEXT:    v_and_b32_e32 v4, 0x3ff, v0
 ; GFX1250-GISEL-NEXT:    s_mov_b64 s[2:3], 0x4040000040800000
@@ -2065,6 +2107,7 @@ define amdgpu_kernel void @fmul_v2_v_fneg(ptr addrspace(1) %a, float %x) {
 ;
 ; GFX1250-SDAG-LABEL: fmul_v2_v_fneg:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    s_load_b96 s[0:2], s[4:5], 0x24
 ; GFX1250-SDAG-NEXT:    v_and_b32_e32 v2, 0x3ff, v0
 ; GFX1250-SDAG-NEXT:    s_wait_kmcnt 0x0
@@ -2076,6 +2119,7 @@ define amdgpu_kernel void @fmul_v2_v_fneg(ptr addrspace(1) %a, float %x) {
 ;
 ; GFX1250-GISEL-LABEL: fmul_v2_v_fneg:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    s_load_b96 s[0:2], s[4:5], 0x24
 ; GFX1250-GISEL-NEXT:    v_and_b32_e32 v4, 0x3ff, v0
 ; GFX1250-GISEL-NEXT:    s_wait_kmcnt 0x0
@@ -2125,6 +2169,7 @@ define amdgpu_kernel void @fma_v2_vv(ptr addrspace(1) %a) {
 ;
 ; GFX1250-LABEL: fma_v2_vv:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1250-NEXT:    v_and_b32_e32 v2, 0x3ff, v0
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -2168,6 +2213,7 @@ define amdgpu_kernel void @fma_v2_vs(ptr addrspace(1) %a, <2 x float> %x) {
 ;
 ; GFX1250-LABEL: fma_v2_vs:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1250-NEXT:    v_and_b32_e32 v4, 0x3ff, v0
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -2232,6 +2278,7 @@ define amdgpu_kernel void @fma_v4_vs(ptr addrspace(1) %a, <4 x float> %x) {
 ;
 ; GFX1250-SDAG-LABEL: fma_v4_vs:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    s_clause 0x1
 ; GFX1250-SDAG-NEXT:    s_load_b64 s[6:7], s[4:5], 0x24
 ; GFX1250-SDAG-NEXT:    s_load_b128 s[0:3], s[4:5], 0x34
@@ -2249,6 +2296,7 @@ define amdgpu_kernel void @fma_v4_vs(ptr addrspace(1) %a, <4 x float> %x) {
 ;
 ; GFX1250-GISEL-LABEL: fma_v4_vs:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    s_clause 0x1
 ; GFX1250-GISEL-NEXT:    s_load_b64 s[6:7], s[4:5], 0x24
 ; GFX1250-GISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x34
@@ -2434,6 +2482,7 @@ define amdgpu_kernel void @fma_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
 ;
 ; GFX1250-SDAG-LABEL: fma_v32_vs:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1250-SDAG-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX1250-SDAG-NEXT:    s_clause 0x1
@@ -2499,6 +2548,7 @@ define amdgpu_kernel void @fma_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
 ;
 ; GFX1250-GISEL-LABEL: fma_v32_vs:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1250-GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX1250-GISEL-NEXT:    s_clause 0x1
@@ -2639,6 +2689,7 @@ define amdgpu_kernel void @fma_v2_v_imm(ptr addrspace(1) %a) {
 ;
 ; GFX1250-SDAG-LABEL: fma_v2_v_imm:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1250-SDAG-NEXT:    v_and_b32_e32 v2, 0x3ff, v0
 ; GFX1250-SDAG-NEXT:    s_mov_b32 s2, 0x43480000
@@ -2653,6 +2704,7 @@ define amdgpu_kernel void @fma_v2_v_imm(ptr addrspace(1) %a) {
 ;
 ; GFX1250-GISEL-LABEL: fma_v2_v_imm:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1250-GISEL-NEXT:    v_and_b32_e32 v6, 0x3ff, v0
 ; GFX1250-GISEL-NEXT:    s_mov_b32 s2, 0x42c80000
@@ -2716,6 +2768,7 @@ define amdgpu_kernel void @fma_v2_v_v_splat(ptr addrspace(1) %a) {
 ;
 ; GFX1250-SDAG-LABEL: fma_v2_v_v_splat:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1250-SDAG-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX1250-SDAG-NEXT:    s_wait_kmcnt 0x0
@@ -2727,6 +2780,7 @@ define amdgpu_kernel void @fma_v2_v_v_splat(ptr addrspace(1) %a) {
 ;
 ; GFX1250-GISEL-LABEL: fma_v2_v_v_splat:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1250-GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX1250-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
@@ -2809,6 +2863,7 @@ define amdgpu_kernel void @fma_v2_v_lit_splat(ptr addrspace(1) %a) {
 ;
 ; GFX1250-SDAG-LABEL: fma_v2_v_lit_splat:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1250-SDAG-NEXT:    v_and_b32_e32 v2, 0x3ff, v0
 ; GFX1250-SDAG-NEXT:    s_wait_kmcnt 0x0
@@ -2820,6 +2875,7 @@ define amdgpu_kernel void @fma_v2_v_lit_splat(ptr addrspace(1) %a) {
 ;
 ; GFX1250-GISEL-LABEL: fma_v2_v_lit_splat:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1250-GISEL-NEXT:    v_and_b32_e32 v6, 0x3ff, v0
 ; GFX1250-GISEL-NEXT:    s_mov_b32 s2, 4.0
@@ -2909,6 +2965,7 @@ define amdgpu_kernel void @fma_v2_v_unfoldable_lit(ptr addrspace(1) %a) {
 ;
 ; GFX1250-SDAG-LABEL: fma_v2_v_unfoldable_lit:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1250-SDAG-NEXT:    v_and_b32_e32 v6, 0x3ff, v0
 ; GFX1250-SDAG-NEXT:    v_mov_b64_e32 v[2:3], 0x4040000040800000
@@ -2922,6 +2979,7 @@ define amdgpu_kernel void @fma_v2_v_unfoldable_lit(ptr addrspace(1) %a) {
 ;
 ; GFX1250-GISEL-LABEL: fma_v2_v_unfoldable_lit:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1250-GISEL-NEXT:    v_and_b32_e32 v6, 0x3ff, v0
 ; GFX1250-GISEL-NEXT:    s_mov_b64 s[2:3], 0x4040000040800000
@@ -2987,6 +3045,7 @@ define amdgpu_kernel void @fma_v2_v_fneg(ptr addrspace(1) %a, float %x) {
 ;
 ; GFX1250-SDAG-LABEL: fma_v2_v_fneg:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    s_load_b96 s[0:2], s[4:5], 0x24
 ; GFX1250-SDAG-NEXT:    v_and_b32_e32 v2, 0x3ff, v0
 ; GFX1250-SDAG-NEXT:    s_wait_kmcnt 0x0
@@ -2998,6 +3057,7 @@ define amdgpu_kernel void @fma_v2_v_fneg(ptr addrspace(1) %a, float %x) {
 ;
 ; GFX1250-GISEL-LABEL: fma_v2_v_fneg:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    s_load_b96 s[0:2], s[4:5], 0x24
 ; GFX1250-GISEL-NEXT:    v_and_b32_e32 v4, 0x3ff, v0
 ; GFX1250-GISEL-NEXT:    s_wait_kmcnt 0x0
@@ -3068,6 +3128,7 @@ define amdgpu_kernel void @add_vector_neg_bitcast_scalar_lo(ptr addrspace(1) %ou
 ;
 ; GFX1250-SDAG-LABEL: add_vector_neg_bitcast_scalar_lo:
 ; GFX1250-SDAG:       ; %bb.0: ; %bb
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1250-SDAG-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s2
@@ -3081,6 +3142,7 @@ define amdgpu_kernel void @add_vector_neg_bitcast_scalar_lo(ptr addrspace(1) %ou
 ;
 ; GFX1250-GISEL-LABEL: add_vector_neg_bitcast_scalar_lo:
 ; GFX1250-GISEL:       ; %bb.0: ; %bb
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1250-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v2, s3
@@ -3161,6 +3223,7 @@ define amdgpu_kernel void @fma_vector_vector_neg_scalar_lo_scalar_hi(ptr addrspa
 ;
 ; GFX1250-SDAG-LABEL: fma_vector_vector_neg_scalar_lo_scalar_hi:
 ; GFX1250-SDAG:       ; %bb.0: ; %bb
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1250-SDAG-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v6, 0 :: v_dual_mov_b32 v2, s2
@@ -3176,6 +3239,7 @@ define amdgpu_kernel void @fma_vector_vector_neg_scalar_lo_scalar_hi(ptr addrspa
 ;
 ; GFX1250-GISEL-LABEL: fma_vector_vector_neg_scalar_lo_scalar_hi:
 ; GFX1250-GISEL:       ; %bb.0: ; %bb
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1250-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, s3
@@ -3255,6 +3319,7 @@ define amdgpu_kernel void @shuffle_add_f32(ptr addrspace(1) %out, ptr addrspace(
 ;
 ; GFX1250-SDAG-LABEL: shuffle_add_f32:
 ; GFX1250-SDAG:       ; %bb.0: ; %bb
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    s_load_b96 s[0:2], s[4:5], 0x24
 ; GFX1250-SDAG-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v2, s2
@@ -3267,6 +3332,7 @@ define amdgpu_kernel void @shuffle_add_f32(ptr addrspace(1) %out, ptr addrspace(
 ;
 ; GFX1250-GISEL-LABEL: shuffle_add_f32:
 ; GFX1250-GISEL:       ; %bb.0: ; %bb
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    s_load_b96 s[0:2], s[4:5], 0x24
 ; GFX1250-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-GISEL-NEXT:    v_mov_b32_e32 v2, s2
@@ -3343,6 +3409,7 @@ define amdgpu_kernel void @shuffle_neg_add_f32(ptr addrspace(1) %out, ptr addrsp
 ;
 ; GFX1250-SDAG-LABEL: shuffle_neg_add_f32:
 ; GFX1250-SDAG:       ; %bb.0: ; %bb
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    s_load_b96 s[0:2], s[4:5], 0x24
 ; GFX1250-SDAG-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v2, s2
@@ -3358,6 +3425,7 @@ define amdgpu_kernel void @shuffle_neg_add_f32(ptr addrspace(1) %out, ptr addrsp
 ;
 ; GFX1250-GISEL-LABEL: shuffle_neg_add_f32:
 ; GFX1250-GISEL:       ; %bb.0: ; %bb
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    s_load_b96 s[0:2], s[4:5], 0x24
 ; GFX1250-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-GISEL-NEXT:    v_mov_b32_e32 v2, s2
@@ -3433,6 +3501,7 @@ define amdgpu_kernel void @fadd_fadd_fsub_0(<2 x float> %arg) {
 ;
 ; GFX1250-SDAG-LABEL: fadd_fadd_fsub_0:
 ; GFX1250-SDAG:       ; %bb.0: ; %bb
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1250-SDAG-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-SDAG-NEXT:    s_add_f32 s1, s1, 0
@@ -3444,6 +3513,7 @@ define amdgpu_kernel void @fadd_fadd_fsub_0(<2 x float> %arg) {
 ;
 ; GFX1250-GISEL-LABEL: fadd_fadd_fsub_0:
 ; GFX1250-GISEL:       ; %bb.0: ; %bb
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1250-GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-GISEL-NEXT:    v_mov_b64_e32 v[0:1], s[0:1]
@@ -3533,6 +3603,7 @@ define amdgpu_kernel void @fadd_fadd_fsub(<2 x float> %arg, <2 x float> %arg1, p
 ;
 ; GFX1250-SDAG-LABEL: fadd_fadd_fsub:
 ; GFX1250-SDAG:       ; %bb.0: ; %bb
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    s_clause 0x1
 ; GFX1250-SDAG-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1250-SDAG-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
@@ -3550,6 +3621,7 @@ define amdgpu_kernel void @fadd_fadd_fsub(<2 x float> %arg, <2 x float> %arg1, p
 ;
 ; GFX1250-GISEL-LABEL: fadd_fadd_fsub:
 ; GFX1250-GISEL:       ; %bb.0: ; %bb
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    s_clause 0x1
 ; GFX1250-GISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1250-GISEL-NEXT:    s_load_b64 s[6:7], s[4:5], 0x34
@@ -3621,6 +3693,7 @@ define amdgpu_kernel void @fadd_shuffle_v4(ptr addrspace(1) %arg) {
 ;
 ; GFX1250-SDAG-LABEL: fadd_shuffle_v4:
 ; GFX1250-SDAG:       ; %bb.0: ; %bb
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1250-SDAG-NEXT:    v_and_b32_e32 v4, 0x3ff, v0
 ; GFX1250-SDAG-NEXT:    s_wait_kmcnt 0x0
@@ -3633,6 +3706,7 @@ define amdgpu_kernel void @fadd_shuffle_v4(ptr addrspace(1) %arg) {
 ;
 ; GFX1250-GISEL-LABEL: fadd_shuffle_v4:
 ; GFX1250-GISEL:       ; %bb.0: ; %bb
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1250-GISEL-NEXT:    v_and_b32_e32 v6, 0x3ff, v0
 ; GFX1250-GISEL-NEXT:    s_wait_kmcnt 0x0
@@ -3693,6 +3767,7 @@ define amdgpu_kernel void @fneg_v2f32_vec(ptr addrspace(1) %a) {
 ;
 ; GFX1250-SDAG-LABEL: fneg_v2f32_vec:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1250-SDAG-NEXT:    v_and_b32_e32 v2, 0x3ff, v0
 ; GFX1250-SDAG-NEXT:    s_wait_kmcnt 0x0
@@ -3704,6 +3779,7 @@ define amdgpu_kernel void @fneg_v2f32_vec(ptr addrspace(1) %a) {
 ;
 ; GFX1250-GISEL-LABEL: fneg_v2f32_vec:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
 ; GFX1250-GISEL-NEXT:    v_and_b32_e32 v2, 0x3ff, v0
 ; GFX1250-GISEL-NEXT:    s_wait_kmcnt 0x0
@@ -3756,6 +3832,7 @@ define amdgpu_kernel void @fneg_v2f32_scalar(ptr addrspace(1) %a, <2 x float> %x
 ;
 ; GFX1250-SDAG-LABEL: fneg_v2f32_scalar:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1250-SDAG-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-SDAG-NEXT:    s_xor_b32 s2, s2, 0x80000000
@@ -3767,6 +3844,7 @@ define amdgpu_kernel void @fneg_v2f32_scalar(ptr addrspace(1) %a, <2 x float> %x
 ;
 ; GFX1250-GISEL-LABEL: fneg_v2f32_scalar:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
 ; GFX1250-GISEL-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1250-GISEL-NEXT:    s_wait_kmcnt 0x0

diff  --git a/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs.ll b/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs.ll
index 6671201ca2b94..6f222aa7d6977 100644
--- a/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs.ll
+++ b/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs.ll
@@ -34,6 +34,7 @@ define amdgpu_kernel void @preload_block_count_x(ptr addrspace(1) inreg %out) #0
 ;
 ; GFX1250-LABEL: preload_block_count_x:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s4
 ; GFX1250-NEXT:    global_store_b32 v0, v1, s[2:3]
 ; GFX1250-NEXT:    s_endpgm
@@ -75,6 +76,7 @@ define amdgpu_kernel void @preload_unused_arg_block_count_x(ptr addrspace(1) inr
 ;
 ; GFX1250-LABEL: preload_unused_arg_block_count_x:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s6
 ; GFX1250-NEXT:    global_store_b32 v0, v1, s[2:3]
 ; GFX1250-NEXT:    s_endpgm
@@ -117,6 +119,7 @@ define amdgpu_kernel void @no_free_sgprs_block_count_x(ptr addrspace(1) inreg %o
 ;
 ; GFX1250-LABEL: no_free_sgprs_block_count_x:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s18
 ; GFX1250-NEXT:    global_store_b32 v0, v1, s[8:9]
 ; GFX1250-NEXT:    s_endpgm
@@ -149,6 +152,7 @@ define amdgpu_kernel void @no_inreg_block_count_x(ptr addrspace(1) %out) #0 {
 ;
 ; GFX1250-LABEL: no_inreg_block_count_x:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b96 s[0:2], s[0:1], 0x0
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
@@ -186,6 +190,7 @@ define amdgpu_kernel void @mixed_inreg_block_count_x(ptr addrspace(1) %out, i32
 ;
 ; GFX1250-LABEL: mixed_inreg_block_count_x:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    s_load_b32 s4, s[0:1], 0x10
 ; GFX1250-NEXT:    s_load_b64 s[2:3], s[0:1], 0x0
@@ -232,6 +237,7 @@ define amdgpu_kernel void @incorrect_type_i64_block_count_x(ptr addrspace(1) inr
 ;
 ; GFX1250-LABEL: incorrect_type_i64_block_count_x:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b64 s[0:1], s[0:1], 0x8
 ; GFX1250-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -277,6 +283,7 @@ define amdgpu_kernel void @incorrect_type_i16_block_count_x(ptr addrspace(1) inr
 ;
 ; GFX1250-LABEL: incorrect_type_i16_block_count_x:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX1250-NEXT:    global_load_u16 v1, v0, s[0:1] offset:8
 ; GFX1250-NEXT:    s_wait_loadcnt 0x0
@@ -318,6 +325,7 @@ define amdgpu_kernel void @preload_block_count_y(ptr addrspace(1) inreg %out) #0
 ;
 ; GFX1250-LABEL: preload_block_count_y:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s5
 ; GFX1250-NEXT:    global_store_b32 v0, v1, s[2:3]
 ; GFX1250-NEXT:    s_endpgm
@@ -361,6 +369,7 @@ define amdgpu_kernel void @random_incorrect_offset(ptr addrspace(1) inreg %out)
 ;
 ; GFX1250-LABEL: random_incorrect_offset:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b32 s0, s[0:1], 0xa
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
@@ -405,6 +414,7 @@ define amdgpu_kernel void @preload_block_count_z(ptr addrspace(1) inreg %out) #0
 ;
 ; GFX1250-LABEL: preload_block_count_z:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s6
 ; GFX1250-NEXT:    global_store_b32 v0, v1, s[2:3]
 ; GFX1250-NEXT:    s_endpgm
@@ -451,6 +461,7 @@ define amdgpu_kernel void @preload_block_count_x_imparg_align_ptr_i8(ptr addrspa
 ;
 ; GFX1250-LABEL: preload_block_count_x_imparg_align_ptr_i8:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_and_b32 s0, s4, 0xff
 ; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GFX1250-NEXT:    s_add_co_i32 s0, s6, s0
@@ -501,6 +512,7 @@ define amdgpu_kernel void @preload_block_count_xyz(ptr addrspace(1) inreg %out)
 ;
 ; GFX1250-LABEL: preload_block_count_xyz:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s4
 ; GFX1250-NEXT:    v_dual_mov_b32 v1, s5 :: v_dual_mov_b32 v2, s6
 ; GFX1250-NEXT:    global_store_b96 v3, v[0:2], s[2:3]
@@ -552,6 +564,7 @@ define amdgpu_kernel void @preload_workgroup_size_x(ptr addrspace(1) inreg %out)
 ;
 ; GFX1250-LABEL: preload_workgroup_size_x:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_and_b32 s0, s7, 0xffff
 ; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1250-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
@@ -598,6 +611,7 @@ define amdgpu_kernel void @preload_workgroup_size_y(ptr addrspace(1) inreg %out)
 ;
 ; GFX1250-LABEL: preload_workgroup_size_y:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_lshr_b32 s0, s7, 16
 ; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1250-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
@@ -646,6 +660,7 @@ define amdgpu_kernel void @preload_workgroup_size_z(ptr addrspace(1) inreg %out)
 ;
 ; GFX1250-LABEL: preload_workgroup_size_z:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_and_b32 s0, s8, 0xffff
 ; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1250-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
@@ -702,6 +717,7 @@ define amdgpu_kernel void @preload_workgroup_size_xyz(ptr addrspace(1) inreg %ou
 ;
 ; GFX1250-LABEL: preload_workgroup_size_xyz:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_lshr_b32 s0, s7, 16
 ; GFX1250-NEXT:    s_and_b32 s1, s7, 0xffff
 ; GFX1250-NEXT:    s_and_b32 s4, s8, 0xffff
@@ -761,6 +777,7 @@ define amdgpu_kernel void @preload_remainder_x(ptr addrspace(1) inreg %out) #0 {
 ;
 ; GFX1250-LABEL: preload_remainder_x:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_lshr_b32 s0, s8, 16
 ; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1250-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
@@ -807,6 +824,7 @@ define amdgpu_kernel void @preloadremainder_y(ptr addrspace(1) inreg %out) #0 {
 ;
 ; GFX1250-LABEL: preloadremainder_y:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_and_b32 s0, s9, 0xffff
 ; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1250-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
@@ -853,6 +871,7 @@ define amdgpu_kernel void @preloadremainder_z(ptr addrspace(1) inreg %out) #0 {
 ;
 ; GFX1250-LABEL: preloadremainder_z:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_lshr_b32 s0, s9, 16
 ; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1250-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
@@ -907,6 +926,7 @@ define amdgpu_kernel void @preloadremainder_xyz(ptr addrspace(1) inreg %out) #0
 ;
 ; GFX1250-LABEL: preloadremainder_xyz:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_lshr_b32 s0, s9, 16
 ; GFX1250-NEXT:    s_lshr_b32 s1, s8, 16
 ; GFX1250-NEXT:    s_and_b32 s4, s9, 0xffff
@@ -964,6 +984,7 @@ define amdgpu_kernel void @no_free_sgprs_preloadremainder_z(ptr addrspace(1) inr
 ;
 ; GFX1250-LABEL: no_free_sgprs_preloadremainder_z:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_lshr_b32 s0, s15, 16
 ; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1250-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
@@ -1012,6 +1033,7 @@ define amdgpu_kernel void @preload_block_max_user_sgprs(ptr addrspace(1) inreg %
 ;
 ; GFX1250-LABEL: preload_block_max_user_sgprs:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s12
 ; GFX1250-NEXT:    global_store_b32 v0, v1, s[2:3]
 ; GFX1250-NEXT:    s_endpgm
@@ -1060,6 +1082,7 @@ define amdgpu_kernel void @preload_block_count_z_workgroup_size_z_remainder_z(pt
 ;
 ; GFX1250-LABEL: preload_block_count_z_workgroup_size_z_remainder_z:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_lshr_b32 s0, s9, 16
 ; GFX1250-NEXT:    s_and_b32 s1, s8, 0xffff
 ; GFX1250-NEXT:    v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s6

diff  --git a/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll b/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll
index c1764c94ea2de..c3e283e8a01bf 100644
--- a/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll
+++ b/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll
@@ -36,6 +36,7 @@ define amdgpu_kernel void @ptr1_i8(ptr addrspace(1) inreg %out, i8 inreg %arg0)
 ;
 ; GFX1250-LABEL: ptr1_i8:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_and_b32 s0, s4, 0xff
 ; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1250-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
@@ -79,6 +80,7 @@ define amdgpu_kernel void @ptr1_i8_zext_arg(ptr addrspace(1) inreg %out, i8 zero
 ;
 ; GFX1250-LABEL: ptr1_i8_zext_arg:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_and_b32 s0, s4, 0xff
 ; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1250-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
@@ -122,6 +124,7 @@ define amdgpu_kernel void @ptr1_i16_preload_arg(ptr addrspace(1) inreg %out, i16
 ;
 ; GFX1250-LABEL: ptr1_i16_preload_arg:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_and_b32 s0, s4, 0xffff
 ; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1250-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
@@ -163,6 +166,7 @@ define amdgpu_kernel void @ptr1_i32_preload_arg(ptr addrspace(1) inreg %out, i32
 ;
 ; GFX1250-LABEL: ptr1_i32_preload_arg:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s4
 ; GFX1250-NEXT:    global_store_b32 v0, v1, s[2:3]
 ; GFX1250-NEXT:    s_endpgm
@@ -205,6 +209,7 @@ define amdgpu_kernel void @i32_ptr1_i32_preload_arg(i32 inreg %arg0, ptr addrspa
 ;
 ; GFX1250-LABEL: i32_ptr1_i32_preload_arg:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_add_co_i32 s0, s2, s6
 ; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1250-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
@@ -252,6 +257,7 @@ define amdgpu_kernel void @ptr1_i16_i16_preload_arg(ptr addrspace(1) inreg %out,
 ;
 ; GFX1250-LABEL: ptr1_i16_i16_preload_arg:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_lshr_b32 s0, s4, 16
 ; GFX1250-NEXT:    s_and_b32 s1, s4, 0xffff
 ; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
@@ -297,6 +303,7 @@ define amdgpu_kernel void @ptr1_v2i8_preload_arg(ptr addrspace(1) inreg %out, <2
 ;
 ; GFX1250-LABEL: ptr1_v2i8_preload_arg:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s4
 ; GFX1250-NEXT:    global_store_b16 v0, v1, s[2:3]
 ; GFX1250-NEXT:    s_endpgm
@@ -346,6 +353,7 @@ define amdgpu_kernel void @byref_preload_arg(ptr addrspace(1) inreg %out, ptr ad
 ;
 ; GFX1250-LABEL: byref_preload_arg:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b64 s[4:5], s[0:1], 0x100
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s4
@@ -404,6 +412,7 @@ define amdgpu_kernel void @byref_staggered_preload_arg(ptr addrspace(1) inreg %o
 ;
 ; GFX1250-LABEL: byref_staggered_preload_arg:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b64 s[4:5], s[0:1], 0x100
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s4
@@ -471,6 +480,7 @@ define amdgpu_kernel void @v8i32_arg(ptr addrspace(1) nocapture inreg %out, <8 x
 ;
 ; GFX1250-LABEL: v8i32_arg:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v0, s14
 ; GFX1250-NEXT:    v_dual_mov_b32 v1, s15 :: v_dual_mov_b32 v2, s16
 ; GFX1250-NEXT:    v_dual_mov_b32 v3, s17 :: v_dual_mov_b32 v4, s10
@@ -518,6 +528,7 @@ define amdgpu_kernel void @v3i16_preload_arg(ptr addrspace(1) nocapture inreg %o
 ;
 ; GFX1250-LABEL: v3i16_preload_arg:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s5
 ; GFX1250-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX1250-NEXT:    s_clause 0x1
@@ -563,6 +574,7 @@ define amdgpu_kernel void @v3i32_preload_arg(ptr addrspace(1) nocapture inreg %o
 ;
 ; GFX1250-LABEL: v3i32_preload_arg:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
 ; GFX1250-NEXT:    v_dual_mov_b32 v2, s8 :: v_dual_mov_b32 v3, 0
 ; GFX1250-NEXT:    global_store_b96 v3, v[0:2], s[2:3]
@@ -606,6 +618,7 @@ define amdgpu_kernel void @v3f32_preload_arg(ptr addrspace(1) nocapture inreg %o
 ;
 ; GFX1250-LABEL: v3f32_preload_arg:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s6
 ; GFX1250-NEXT:    v_dual_mov_b32 v1, s7 :: v_dual_mov_b32 v2, s8
 ; GFX1250-NEXT:    global_store_b96 v3, v[0:2], s[2:3]
@@ -662,6 +675,7 @@ define amdgpu_kernel void @v5i8_preload_arg(ptr addrspace(1) nocapture inreg %ou
 ;
 ; GFX1250-LABEL: v5i8_preload_arg:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_pack_lh_b32_b16 s0, 0, s4
 ; GFX1250-NEXT:    s_and_b32 s1, s4, 0xffff
 ; GFX1250-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s5
@@ -733,6 +747,7 @@ define amdgpu_kernel void @v5f64_arg(ptr addrspace(1) nocapture inreg %out, <5 x
 ;
 ; GFX1250-LABEL: v5f64_arg:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    s_load_b64 s[12:13], s[0:1], 0x60
 ; GFX1250-NEXT:    s_load_b256 s[4:11], s[0:1], 0x40
@@ -812,6 +827,7 @@ define amdgpu_kernel void @v8i8_preload_arg(ptr addrspace(1) inreg %out, <8 x i8
 ;
 ; GFX1250-LABEL: v8i8_preload_arg:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_pack_lh_b32_b16 s0, 0, s5
 ; GFX1250-NEXT:    s_pack_lh_b32_b16 s1, 0, s4
 ; GFX1250-NEXT:    s_and_b32 s4, s4, 0xffff
@@ -857,6 +873,7 @@ define amdgpu_kernel void @i64_kernel_preload_arg(ptr addrspace(1) inreg %out, i
 ;
 ; GFX1250-LABEL: i64_kernel_preload_arg:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b64_e32 v[0:1], s[4:5]
 ; GFX1250-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1250-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
@@ -895,6 +912,7 @@ define amdgpu_kernel void @f64_kernel_preload_arg(ptr addrspace(1) inreg %out, d
 ;
 ; GFX1250-LABEL: f64_kernel_preload_arg:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_mov_b64_e32 v[0:1], s[4:5]
 ; GFX1250-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX1250-NEXT:    global_store_b64 v2, v[0:1], s[2:3]
@@ -934,6 +952,7 @@ define amdgpu_kernel void @half_kernel_preload_arg(ptr addrspace(1) inreg %out,
 ;
 ; GFX1250-LABEL: half_kernel_preload_arg:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s4
 ; GFX1250-NEXT:    global_store_b16 v0, v1, s[2:3]
 ; GFX1250-NEXT:    s_endpgm
@@ -972,6 +991,7 @@ define amdgpu_kernel void @bfloat_kernel_preload_arg(ptr addrspace(1) inreg %out
 ;
 ; GFX1250-LABEL: bfloat_kernel_preload_arg:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s4
 ; GFX1250-NEXT:    global_store_b16 v0, v1, s[2:3]
 ; GFX1250-NEXT:    s_endpgm
@@ -1010,6 +1030,7 @@ define amdgpu_kernel void @v2bfloat_kernel_preload_arg(ptr addrspace(1) inreg %o
 ;
 ; GFX1250-LABEL: v2bfloat_kernel_preload_arg:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s4
 ; GFX1250-NEXT:    global_store_b32 v0, v1, s[2:3]
 ; GFX1250-NEXT:    s_endpgm
@@ -1051,6 +1072,7 @@ define amdgpu_kernel void @v3bfloat_kernel_preload_arg(ptr addrspace(1) inreg %o
 ;
 ; GFX1250-LABEL: v3bfloat_kernel_preload_arg:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s5
 ; GFX1250-NEXT:    v_mov_b32_e32 v2, s4
 ; GFX1250-NEXT:    s_clause 0x1
@@ -1096,6 +1118,7 @@ define amdgpu_kernel void @v6bfloat_kernel_preload_arg(ptr addrspace(1) inreg %o
 ;
 ; GFX1250-LABEL: v6bfloat_kernel_preload_arg:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
 ; GFX1250-NEXT:    v_dual_mov_b32 v2, s8 :: v_dual_mov_b32 v3, 0
 ; GFX1250-NEXT:    global_store_b96 v3, v[0:2], s[2:3]
@@ -1148,6 +1171,7 @@ define amdgpu_kernel void @half_v7bfloat_kernel_preload_arg(ptr addrspace(1) inr
 ;
 ; GFX1250-LABEL: half_v7bfloat_kernel_preload_arg:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v4, s4
 ; GFX1250-NEXT:    v_dual_mov_b32 v5, s9 :: v_dual_mov_b32 v2, s8
 ; GFX1250-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
@@ -1194,6 +1218,7 @@ define amdgpu_kernel void @i1_kernel_preload_arg(ptr addrspace(1) inreg %out, i1
 ;
 ; GFX1250-LABEL: i1_kernel_preload_arg:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_and_b32 s0, s4, 1
 ; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1250-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0
@@ -1240,6 +1265,7 @@ define amdgpu_kernel void @fp128_kernel_preload_arg(ptr addrspace(1) inreg %out,
 ;
 ; GFX1250-LABEL: fp128_kernel_preload_arg:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v0, s6
 ; GFX1250-NEXT:    v_dual_mov_b32 v1, s7 :: v_dual_mov_b32 v2, s8
 ; GFX1250-NEXT:    v_mov_b32_e32 v3, s9
@@ -1299,6 +1325,7 @@ define amdgpu_kernel void @v7i8_kernel_preload_arg(ptr addrspace(1) inreg %out,
 ;
 ; GFX1250-LABEL: v7i8_kernel_preload_arg:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_pack_lh_b32_b16 s0, 0, s4
 ; GFX1250-NEXT:    s_and_b32 s1, s4, 0xffff
 ; GFX1250-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s5
@@ -1353,6 +1380,7 @@ define amdgpu_kernel void @v7half_kernel_preload_arg(ptr addrspace(1) inreg %out
 ;
 ; GFX1250-LABEL: v7half_kernel_preload_arg:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v4, s9
 ; GFX1250-NEXT:    v_dual_mov_b32 v2, s8 :: v_dual_mov_b32 v0, s6
 ; GFX1250-NEXT:    v_mov_b32_e32 v1, s7
@@ -1399,6 +1427,7 @@ define amdgpu_kernel void @i16_i32_kernel_preload_arg(ptr addrspace(1) inreg %ou
 ;
 ; GFX1250-LABEL: i16_i32_kernel_preload_arg:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s4
 ; GFX1250-NEXT:    v_mov_b32_e32 v2, s5
 ; GFX1250-NEXT:    s_clause 0x1
@@ -1450,6 +1479,7 @@ define amdgpu_kernel void @i16_v3i32_kernel_preload_arg(ptr addrspace(1) inreg %
 ;
 ; GFX1250-LABEL: i16_v3i32_kernel_preload_arg:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v4, s4
 ; GFX1250-NEXT:    v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7
 ; GFX1250-NEXT:    v_mov_b32_e32 v2, s8
@@ -1495,6 +1525,7 @@ define amdgpu_kernel void @i16_i16_kernel_preload_arg(ptr addrspace(1) inreg %ou
 ;
 ; GFX1250-LABEL: i16_i16_kernel_preload_arg:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s4
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b16 v0, v1, s[2:3]
@@ -1548,6 +1579,7 @@ define amdgpu_kernel void @i16_v2i8_kernel_preload_arg(ptr addrspace(1) inreg %o
 ;
 ; GFX1250-LABEL: i16_v2i8_kernel_preload_arg:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s4
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    global_store_b16 v0, v1, s[2:3]
@@ -1597,6 +1629,7 @@ define amdgpu_kernel void @i32_ptr1_i32_staggered_preload_arg(i32 inreg %arg0, p
 ;
 ; GFX1250-LABEL: i32_ptr1_i32_staggered_preload_arg:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b96 s[4:6], s[0:1], 0x8
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-NEXT:    s_add_co_i32 s0, s2, s6
@@ -1641,6 +1674,7 @@ define amdgpu_kernel void @ptr1_i8_trailing_unused(ptr addrspace(1) inreg %out,
 ;
 ; GFX1250-LABEL: ptr1_i8_trailing_unused:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_and_b32 s0, s4, 0xff
 ; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1250-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0

diff  --git a/llvm/test/CodeGen/AMDGPU/reassoc-mul-add-1-to-mad.ll b/llvm/test/CodeGen/AMDGPU/reassoc-mul-add-1-to-mad.ll
index b2bcb74e4184f..40676cef1bc5e 100644
--- a/llvm/test/CodeGen/AMDGPU/reassoc-mul-add-1-to-mad.ll
+++ b/llvm/test/CodeGen/AMDGPU/reassoc-mul-add-1-to-mad.ll
@@ -4083,6 +4083,7 @@ define amdgpu_kernel void @compute_mad(ptr addrspace(4) %i18, ptr addrspace(4) %
 ;
 ; GFX1250-LABEL: compute_mad:
 ; GFX1250:       ; %bb.0: ; %bb
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_load_b96 s[8:10], s[4:5], 0x10
 ; GFX1250-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
 ; GFX1250-NEXT:    s_wait_kmcnt 0x0
@@ -4189,6 +4190,7 @@ define amdgpu_ps i32 @s_mul_add_1_i32(i32 inreg %x, i32 inreg %y) {
 ;
 ; GFX1250-LABEL: s_mul_add_1_i32:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_add_co_i32 s1, s1, 1
 ; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1250-NEXT:    s_mul_i32 s0, s0, s1
@@ -4225,6 +4227,7 @@ define amdgpu_ps i32 @s_mul_add_1_i32_commute(i32 inreg %x, i32 inreg %y) {
 ;
 ; GFX1250-LABEL: s_mul_add_1_i32_commute:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_add_co_i32 s1, s1, 1
 ; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
 ; GFX1250-NEXT:    s_mul_i32 s0, s1, s0

diff  --git a/llvm/test/CodeGen/AMDGPU/s-cluster-barrier.ll b/llvm/test/CodeGen/AMDGPU/s-cluster-barrier.ll
index dc2e09dda2193..9309fd0f5c15f 100644
--- a/llvm/test/CodeGen/AMDGPU/s-cluster-barrier.ll
+++ b/llvm/test/CodeGen/AMDGPU/s-cluster-barrier.ll
@@ -5,6 +5,7 @@
 define amdgpu_kernel void @kernel1() #0 {
 ; GFX12-LABEL: kernel1:
 ; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX12-NEXT:    s_cmp_eq_u32 0, 0
 ; GFX12-NEXT:    s_barrier_signal_isfirst -1
 ; GFX12-NEXT:    s_barrier_wait -1

diff  --git a/llvm/test/CodeGen/AMDGPU/scale-offset-flat.ll b/llvm/test/CodeGen/AMDGPU/scale-offset-flat.ll
index a18847b56a330..67411721f1b41 100644
--- a/llvm/test/CodeGen/AMDGPU/scale-offset-flat.ll
+++ b/llvm/test/CodeGen/AMDGPU/scale-offset-flat.ll
@@ -5,6 +5,7 @@
 define amdgpu_ps float @flat_load_b32_idxprom(ptr align 4 inreg %p, i32 %idx) {
 ; GCN-LABEL: flat_load_b32_idxprom:
 ; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    flat_load_b32 v0, v0, s[0:1] scale_offset
 ; GCN-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GCN-NEXT:    ; return to shader part epilog
@@ -18,6 +19,7 @@ entry:
 define amdgpu_ps float @flat_load_b32_idx32(ptr align 4 inreg %p, i32 %idx) {
 ; GCN-LABEL: flat_load_b32_idx32:
 ; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    flat_load_b32 v0, v0, s[0:1] scale_offset
 ; GCN-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GCN-NEXT:    ; return to shader part epilog
@@ -30,6 +32,7 @@ entry:
 define amdgpu_ps float @flat_load_b32_idxprom_wrong_stride(ptr align 4 inreg %p, i32 %idx) {
 ; GCN-LABEL: flat_load_b32_idxprom_wrong_stride:
 ; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GCN-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 3, s[0:1]
@@ -46,6 +49,7 @@ entry:
 define amdgpu_ps float @flat_load_b16_idxprom_ioffset(ptr align 4 inreg %p, i32 %idx) {
 ; GCN-LABEL: flat_load_b16_idxprom_ioffset:
 ; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    flat_load_u16 v0, v0, s[0:1] offset:32 scale_offset
 ; GCN-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GCN-NEXT:    ; return to shader part epilog
@@ -62,6 +66,7 @@ entry:
 define amdgpu_ps <2 x float> @flat_load_b64_idxprom(ptr align 4 inreg %p, i32 %idx) {
 ; GCN-LABEL: flat_load_b64_idxprom:
 ; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    flat_load_b64 v[0:1], v0, s[0:1] scale_offset
 ; GCN-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GCN-NEXT:    ; return to shader part epilog
@@ -75,6 +80,7 @@ entry:
 define amdgpu_ps <3 x float> @flat_load_b96_idxprom(ptr align 4 inreg %p, i32 %idx) {
 ; GCN-LABEL: flat_load_b96_idxprom:
 ; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    flat_load_b96 v[0:2], v0, s[0:1] scale_offset
 ; GCN-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GCN-NEXT:    ; return to shader part epilog
@@ -88,6 +94,7 @@ entry:
 define amdgpu_ps <3 x float> @flat_load_b96_idxpromi_ioffset(ptr align 4 inreg %p, i32 %idx) {
 ; GCN-LABEL: flat_load_b96_idxpromi_ioffset:
 ; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    flat_load_b96 v[0:2], v0, s[0:1] offset:192 scale_offset
 ; GCN-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GCN-NEXT:    ; return to shader part epilog
@@ -102,6 +109,7 @@ entry:
 define amdgpu_ps <4 x float> @flat_load_b128_idxprom(ptr align 4 inreg %p, i32 %idx) {
 ; GCN-LABEL: flat_load_b128_idxprom:
 ; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    flat_load_b128 v[0:3], v0, s[0:1] scale_offset
 ; GCN-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GCN-NEXT:    ; return to shader part epilog
@@ -115,6 +123,7 @@ entry:
 define amdgpu_ps float @flat_load_b32_idxprom_range(ptr align 4 inreg %p, ptr align 4 %pp) {
 ; GCN-LABEL: flat_load_b32_idxprom_range:
 ; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    flat_load_b32 v0, v[0:1]
 ; GCN-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GCN-NEXT:    flat_load_b32 v0, v0, s[0:1] scale_offset
@@ -131,6 +140,7 @@ entry:
 define amdgpu_ps float @flat_load_b32_idxprom_range_ioffset(ptr align 4 inreg %p, ptr align 4 %pp) {
 ; GCN-LABEL: flat_load_b32_idxprom_range_ioffset:
 ; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    flat_load_b32 v0, v[0:1]
 ; GCN-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GCN-NEXT:    flat_load_b32 v0, v0, s[0:1] offset:64 scale_offset
@@ -150,6 +160,7 @@ entry:
 define amdgpu_ps float @flat_load_b8_idxprom_range_ioffset(ptr align 4 inreg %p, ptr align 4 %pp) {
 ; GCN-LABEL: flat_load_b8_idxprom_range_ioffset:
 ; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    flat_load_b32 v0, v[0:1]
 ; GCN-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GCN-NEXT:    flat_load_u8 v0, v0, s[0:1] offset:16
@@ -169,6 +180,7 @@ entry:
 define amdgpu_ps float @flat_load_b16_idxprom_range(ptr align 4 inreg %p, ptr align 4 %pp) {
 ; GCN-LABEL: flat_load_b16_idxprom_range:
 ; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    flat_load_b32 v0, v[0:1]
 ; GCN-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GCN-NEXT:    flat_load_u16 v0, v0, s[0:1] scale_offset
@@ -187,6 +199,7 @@ entry:
 define amdgpu_ps float @flat_load_b16_idxprom_range_ioffset(ptr align 4 inreg %p, ptr align 4 %pp) {
 ; GCN-LABEL: flat_load_b16_idxprom_range_ioffset:
 ; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    flat_load_b32 v0, v[0:1]
 ; GCN-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GCN-NEXT:    flat_load_u16 v0, v0, s[0:1] offset:32 scale_offset
@@ -206,6 +219,7 @@ entry:
 define amdgpu_ps <2 x float> @flat_load_b64_idxprom_range(ptr align 4 inreg %p, ptr align 4 %pp) {
 ; GCN-LABEL: flat_load_b64_idxprom_range:
 ; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    flat_load_b32 v0, v[0:1]
 ; GCN-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GCN-NEXT:    flat_load_b64 v[0:1], v0, s[0:1] scale_offset
@@ -222,6 +236,7 @@ entry:
 define amdgpu_ps <3 x float> @flat_load_b96_idxprom_range(ptr align 4 inreg %p, ptr align 4 %pp) {
 ; GCN-LABEL: flat_load_b96_idxprom_range:
 ; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    flat_load_b32 v0, v[0:1]
 ; GCN-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GCN-NEXT:    flat_load_b96 v[0:2], v0, s[0:1] scale_offset
@@ -238,6 +253,7 @@ entry:
 define amdgpu_ps <3 x float> @flat_load_b96_idxprom_range_ioffset(ptr align 4 inreg %p, ptr align 4 %pp) {
 ; GCN-LABEL: flat_load_b96_idxprom_range_ioffset:
 ; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    flat_load_b32 v0, v[0:1]
 ; GCN-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GCN-NEXT:    flat_load_b96 v[0:2], v0, s[0:1] offset:192 scale_offset
@@ -255,6 +271,7 @@ entry:
 define amdgpu_ps <4 x float> @flat_load_b128_idxprom_range(ptr align 4 inreg %p, ptr align 4 %pp) {
 ; GCN-LABEL: flat_load_b128_idxprom_range:
 ; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    flat_load_b32 v0, v[0:1]
 ; GCN-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GCN-NEXT:    flat_load_b128 v[0:3], v0, s[0:1] scale_offset
@@ -271,6 +288,7 @@ entry:
 define amdgpu_ps void @flat_store_b32_idxprom(ptr align 4 inreg %p, i32 %idx) {
 ; GCN-LABEL: flat_store_b32_idxprom:
 ; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    v_mov_b32_e32 v1, 1.0
 ; GCN-NEXT:    flat_store_b32 v0, v1, s[0:1] scale_offset
 ; GCN-NEXT:    s_endpgm
@@ -284,6 +302,7 @@ entry:
 define amdgpu_ps void @flat_store_b16_idxprom(ptr align 2 inreg %p, i32 %idx) {
 ; GCN-LABEL: flat_store_b16_idxprom:
 ; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    v_mov_b32_e32 v1, 1
 ; GCN-NEXT:    flat_store_b16 v0, v1, s[0:1] scale_offset
 ; GCN-NEXT:    s_endpgm
@@ -297,6 +316,7 @@ entry:
 define amdgpu_ps void @flat_store_b64_idxprom(ptr align 4 inreg %p, i32 %idx) {
 ; GCN-LABEL: flat_store_b64_idxprom:
 ; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    v_mov_b64_e32 v[2:3], 1.0
 ; GCN-NEXT:    flat_store_b64 v0, v[2:3], s[0:1] scale_offset
 ; GCN-NEXT:    s_endpgm
@@ -310,6 +330,7 @@ entry:
 define amdgpu_ps void @flat_atomicrmw_b32_idxprom(ptr align 4 inreg %p, i32 %idx) {
 ; GCN-LABEL: flat_atomicrmw_b32_idxprom:
 ; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    v_mov_b32_e32 v1, 1
 ; GCN-NEXT:    flat_atomic_add_u32 v0, v1, s[0:1] scale_offset scope:SCOPE_SYS
 ; GCN-NEXT:    s_endpgm
@@ -323,6 +344,7 @@ entry:
 define amdgpu_ps <2 x float> @flat_atomicrmw_b64_rtn_idxprom(ptr align 8 inreg %p, i32 %idx) {
 ; SDAG-LABEL: flat_atomicrmw_b64_rtn_idxprom:
 ; SDAG:       ; %bb.0: ; %entry
+; SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; SDAG-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; SDAG-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; SDAG-NEXT:    v_lshl_add_u64 v[2:3], v[0:1], 3, s[0:1]
@@ -364,6 +386,7 @@ define amdgpu_ps <2 x float> @flat_atomicrmw_b64_rtn_idxprom(ptr align 8 inreg %
 ;
 ; GISEL-LABEL: flat_atomicrmw_b64_rtn_idxprom:
 ; GISEL:       ; %bb.0: ; %entry
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    v_mov_b32_e32 v2, v0
 ; GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GISEL-NEXT:    v_ashrrev_i32_e32 v3, 31, v2

diff  --git a/llvm/test/CodeGen/AMDGPU/scale-offset-global.ll b/llvm/test/CodeGen/AMDGPU/scale-offset-global.ll
index a0fca0e2bdc72..d2db2bb19f15b 100644
--- a/llvm/test/CodeGen/AMDGPU/scale-offset-global.ll
+++ b/llvm/test/CodeGen/AMDGPU/scale-offset-global.ll
@@ -5,6 +5,7 @@
 define amdgpu_ps float @global_load_b32_idxprom(ptr addrspace(1) align 4 inreg %p, i32 %idx) {
 ; GCN-LABEL: global_load_b32_idxprom:
 ; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    global_load_b32 v0, v0, s[0:1] scale_offset
 ; GCN-NEXT:    s_wait_loadcnt 0x0
 ; GCN-NEXT:    ; return to shader part epilog
@@ -18,6 +19,7 @@ entry:
 define amdgpu_ps float @global_load_b32_idx32(ptr addrspace(1) align 4 inreg %p, i32 %idx) {
 ; GCN-LABEL: global_load_b32_idx32:
 ; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    global_load_b32 v0, v0, s[0:1] scale_offset
 ; GCN-NEXT:    s_wait_loadcnt 0x0
 ; GCN-NEXT:    ; return to shader part epilog
@@ -30,6 +32,7 @@ entry:
 define amdgpu_ps float @global_load_b32_idxprom_wrong_stride(ptr addrspace(1) align 4 inreg %p, i32 %idx) {
 ; GCN-LABEL: global_load_b32_idxprom_wrong_stride:
 ; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    v_ashrrev_i32_e32 v1, 31, v0
 ; GCN-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GCN-NEXT:    v_lshl_add_u64 v[0:1], v[0:1], 3, s[0:1]
@@ -46,6 +49,7 @@ entry:
 define amdgpu_ps float @global_load_b16_idxprom_ioffset(ptr addrspace(1) align 4 inreg %p, i32 %idx) {
 ; GCN-LABEL: global_load_b16_idxprom_ioffset:
 ; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    global_load_u16 v0, v0, s[0:1] offset:32 scale_offset
 ; GCN-NEXT:    s_wait_loadcnt 0x0
 ; GCN-NEXT:    ; return to shader part epilog
@@ -62,6 +66,7 @@ entry:
 define amdgpu_ps <2 x float> @global_load_b64_idxprom(ptr addrspace(1) align 4 inreg %p, i32 %idx) {
 ; GCN-LABEL: global_load_b64_idxprom:
 ; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    global_load_b64 v[0:1], v0, s[0:1] scale_offset
 ; GCN-NEXT:    s_wait_loadcnt 0x0
 ; GCN-NEXT:    ; return to shader part epilog
@@ -75,6 +80,7 @@ entry:
 define amdgpu_ps <3 x float> @global_load_b96_idxprom(ptr addrspace(1) align 4 inreg %p, i32 %idx) {
 ; GCN-LABEL: global_load_b96_idxprom:
 ; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    global_load_b96 v[0:2], v0, s[0:1] scale_offset
 ; GCN-NEXT:    s_wait_loadcnt 0x0
 ; GCN-NEXT:    ; return to shader part epilog
@@ -88,6 +94,7 @@ entry:
 define amdgpu_ps <3 x float> @global_load_b96_idxpromi_ioffset(ptr addrspace(1) align 4 inreg %p, i32 %idx) {
 ; GCN-LABEL: global_load_b96_idxpromi_ioffset:
 ; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    global_load_b96 v[0:2], v0, s[0:1] offset:192 scale_offset
 ; GCN-NEXT:    s_wait_loadcnt 0x0
 ; GCN-NEXT:    ; return to shader part epilog
@@ -102,6 +109,7 @@ entry:
 define amdgpu_ps <4 x float> @global_load_b128_idxprom(ptr addrspace(1) align 4 inreg %p, i32 %idx) {
 ; GCN-LABEL: global_load_b128_idxprom:
 ; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    global_load_b128 v[0:3], v0, s[0:1] scale_offset
 ; GCN-NEXT:    s_wait_loadcnt 0x0
 ; GCN-NEXT:    ; return to shader part epilog
@@ -115,6 +123,7 @@ entry:
 define amdgpu_ps float @global_load_b32_idxprom_range(ptr addrspace(1) align 4 inreg %p, ptr addrspace(1) align 4 %pp) {
 ; GCN-LABEL: global_load_b32_idxprom_range:
 ; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    global_load_b32 v0, v[0:1], off
 ; GCN-NEXT:    s_wait_loadcnt 0x0
 ; GCN-NEXT:    global_load_b32 v0, v0, s[0:1] scale_offset
@@ -131,6 +140,7 @@ entry:
 define amdgpu_ps float @global_load_b32_idxprom_range_ioffset(ptr addrspace(1) align 4 inreg %p, ptr addrspace(1) align 4 %pp) {
 ; GCN-LABEL: global_load_b32_idxprom_range_ioffset:
 ; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    global_load_b32 v0, v[0:1], off
 ; GCN-NEXT:    s_wait_loadcnt 0x0
 ; GCN-NEXT:    global_load_b32 v0, v0, s[0:1] offset:64 scale_offset
@@ -150,6 +160,7 @@ entry:
 define amdgpu_ps float @global_load_b8_idxprom_range_ioffset(ptr addrspace(1) align 4 inreg %p, ptr addrspace(1) align 4 %pp) {
 ; GCN-LABEL: global_load_b8_idxprom_range_ioffset:
 ; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    global_load_b32 v0, v[0:1], off
 ; GCN-NEXT:    s_wait_loadcnt 0x0
 ; GCN-NEXT:    global_load_u8 v0, v0, s[0:1] offset:16
@@ -169,6 +180,7 @@ entry:
 define amdgpu_ps float @global_load_b16_idxprom_range(ptr addrspace(1) align 4 inreg %p, ptr addrspace(1) align 4 %pp) {
 ; GCN-LABEL: global_load_b16_idxprom_range:
 ; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    global_load_b32 v0, v[0:1], off
 ; GCN-NEXT:    s_wait_loadcnt 0x0
 ; GCN-NEXT:    global_load_u16 v0, v0, s[0:1] scale_offset
@@ -187,6 +199,7 @@ entry:
 define amdgpu_ps float @global_load_b16_idxprom_range_ioffset(ptr addrspace(1) align 4 inreg %p, ptr addrspace(1) align 4 %pp) {
 ; GCN-LABEL: global_load_b16_idxprom_range_ioffset:
 ; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    global_load_b32 v0, v[0:1], off
 ; GCN-NEXT:    s_wait_loadcnt 0x0
 ; GCN-NEXT:    global_load_u16 v0, v0, s[0:1] offset:32 scale_offset
@@ -206,6 +219,7 @@ entry:
 define amdgpu_ps <2 x float> @global_load_b64_idxprom_range(ptr addrspace(1) align 4 inreg %p, ptr addrspace(1) align 4 %pp) {
 ; GCN-LABEL: global_load_b64_idxprom_range:
 ; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    global_load_b32 v0, v[0:1], off
 ; GCN-NEXT:    s_wait_loadcnt 0x0
 ; GCN-NEXT:    global_load_b64 v[0:1], v0, s[0:1] scale_offset
@@ -222,6 +236,7 @@ entry:
 define amdgpu_ps <3 x float> @global_load_b96_idxprom_range(ptr addrspace(1) align 4 inreg %p, ptr addrspace(1) align 4 %pp) {
 ; GCN-LABEL: global_load_b96_idxprom_range:
 ; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    global_load_b32 v0, v[0:1], off
 ; GCN-NEXT:    s_wait_loadcnt 0x0
 ; GCN-NEXT:    global_load_b96 v[0:2], v0, s[0:1] scale_offset
@@ -238,6 +253,7 @@ entry:
 define amdgpu_ps <3 x float> @global_load_b96_idxprom_range_ioffset(ptr addrspace(1) align 4 inreg %p, ptr addrspace(1) align 4 %pp) {
 ; GCN-LABEL: global_load_b96_idxprom_range_ioffset:
 ; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    global_load_b32 v0, v[0:1], off
 ; GCN-NEXT:    s_wait_loadcnt 0x0
 ; GCN-NEXT:    global_load_b96 v[0:2], v0, s[0:1] offset:192 scale_offset
@@ -255,6 +271,7 @@ entry:
 define amdgpu_ps <4 x float> @global_load_b128_idxprom_range(ptr addrspace(1) align 4 inreg %p, ptr addrspace(1) align 4 %pp) {
 ; GCN-LABEL: global_load_b128_idxprom_range:
 ; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    global_load_b32 v0, v[0:1], off
 ; GCN-NEXT:    s_wait_loadcnt 0x0
 ; GCN-NEXT:    global_load_b128 v[0:3], v0, s[0:1] scale_offset
@@ -271,6 +288,7 @@ entry:
 define amdgpu_ps void @global_store_b32_idxprom(ptr addrspace(1) align 4 inreg %p, i32 %idx) {
 ; GCN-LABEL: global_store_b32_idxprom:
 ; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    v_mov_b32_e32 v1, 1.0
 ; GCN-NEXT:    global_store_b32 v0, v1, s[0:1] scale_offset
 ; GCN-NEXT:    s_endpgm
@@ -284,6 +302,7 @@ entry:
 define amdgpu_ps void @global_store_b16_idxprom(ptr addrspace(1) align 2 inreg %p, i32 %idx) {
 ; GCN-LABEL: global_store_b16_idxprom:
 ; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    v_mov_b32_e32 v1, 1
 ; GCN-NEXT:    global_store_b16 v0, v1, s[0:1] scale_offset
 ; GCN-NEXT:    s_endpgm
@@ -297,6 +316,7 @@ entry:
 define amdgpu_ps void @global_store_b64_idxprom(ptr addrspace(1) align 4 inreg %p, i32 %idx) {
 ; GCN-LABEL: global_store_b64_idxprom:
 ; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    v_mov_b64_e32 v[2:3], 1.0
 ; GCN-NEXT:    global_store_b64 v0, v[2:3], s[0:1] scale_offset
 ; GCN-NEXT:    s_endpgm
@@ -310,6 +330,7 @@ entry:
 define amdgpu_ps void @global_atomicrmw_b32_idxprom(ptr addrspace(1) align 4 inreg %p, i32 %idx) {
 ; GCN-LABEL: global_atomicrmw_b32_idxprom:
 ; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    v_mov_b32_e32 v1, 1
 ; GCN-NEXT:    global_atomic_add_u32 v0, v1, s[0:1] scale_offset scope:SCOPE_SYS
 ; GCN-NEXT:    s_endpgm
@@ -323,6 +344,7 @@ entry:
 define amdgpu_ps <2 x float> @global_atomicrmw_b64_rtn_idxprom(ptr addrspace(1) align 8 inreg %p, i32 %idx) {
 ; GCN-LABEL: global_atomicrmw_b64_rtn_idxprom:
 ; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    v_mov_b64_e32 v[2:3], 1
 ; GCN-NEXT:    global_atomic_add_u64 v[0:1], v0, v[2:3], s[0:1] scale_offset th:TH_ATOMIC_RETURN scope:SCOPE_SYS
 ; GCN-NEXT:    s_wait_loadcnt 0x0

diff  --git a/llvm/test/CodeGen/AMDGPU/scale-offset-scratch.ll b/llvm/test/CodeGen/AMDGPU/scale-offset-scratch.ll
index ba4fedf5bb009..438601f8cae35 100644
--- a/llvm/test/CodeGen/AMDGPU/scale-offset-scratch.ll
+++ b/llvm/test/CodeGen/AMDGPU/scale-offset-scratch.ll
@@ -5,6 +5,7 @@
 define amdgpu_ps float @scratch_load_b32_alloca_idxprom(i32 %idx) {
 ; GCN-LABEL: scratch_load_b32_alloca_idxprom:
 ; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    scratch_load_b32 v0, v0, off scale_offset
 ; GCN-NEXT:    s_wait_loadcnt 0x0
 ; GCN-NEXT:    ; return to shader part epilog
@@ -19,6 +20,7 @@ entry:
 define amdgpu_ps float @scratch_load_b32_idxprom(ptr addrspace(5) align 4 inreg %p, i32 %idx) {
 ; GCN-LABEL: scratch_load_b32_idxprom:
 ; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    scratch_load_b32 v0, v0, s0 scale_offset
 ; GCN-NEXT:    s_wait_loadcnt 0x0
 ; GCN-NEXT:    ; return to shader part epilog
@@ -32,6 +34,7 @@ entry:
 define amdgpu_ps float @scratch_load_b32_idx32(ptr addrspace(5) align 4 inreg %p, i32 %idx) {
 ; GCN-LABEL: scratch_load_b32_idx32:
 ; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    scratch_load_b32 v0, v0, s0 scale_offset
 ; GCN-NEXT:    s_wait_loadcnt 0x0
 ; GCN-NEXT:    ; return to shader part epilog
@@ -44,6 +47,7 @@ entry:
 define amdgpu_ps float @scratch_load_b32_idxprom_wrong_stride(ptr addrspace(5) align 4 inreg %p, i32 %idx) {
 ; GCN-LABEL: scratch_load_b32_idxprom_wrong_stride:
 ; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    v_lshlrev_b32_e32 v0, 3, v0
 ; GCN-NEXT:    scratch_load_b32 v0, v0, s0
 ; GCN-NEXT:    s_wait_loadcnt 0x0
@@ -58,6 +62,7 @@ entry:
 define amdgpu_ps float @scratch_load_b16_idxprom_ioffset(ptr addrspace(5) align 4 inreg %p, i32 %idx) {
 ; GCN-LABEL: scratch_load_b16_idxprom_ioffset:
 ; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    scratch_load_u16 v0, v0, s0 offset:32 scale_offset
 ; GCN-NEXT:    s_wait_loadcnt 0x0
 ; GCN-NEXT:    ; return to shader part epilog
@@ -74,6 +79,7 @@ entry:
 define amdgpu_ps <2 x float> @scratch_load_b64_idxprom(ptr addrspace(5) align 4 inreg %p, i32 %idx) {
 ; GCN-LABEL: scratch_load_b64_idxprom:
 ; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    scratch_load_b64 v[0:1], v0, s0 scale_offset
 ; GCN-NEXT:    s_wait_loadcnt 0x0
 ; GCN-NEXT:    ; return to shader part epilog
@@ -87,6 +93,7 @@ entry:
 define amdgpu_ps <3 x float> @scratch_load_b96_idxprom(ptr addrspace(5) align 4 inreg %p, i32 %idx) {
 ; GCN-LABEL: scratch_load_b96_idxprom:
 ; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    scratch_load_b96 v[0:2], v0, s0 scale_offset
 ; GCN-NEXT:    s_wait_loadcnt 0x0
 ; GCN-NEXT:    ; return to shader part epilog
@@ -100,6 +107,7 @@ entry:
 define amdgpu_ps <3 x float> @scratch_load_b96_idxpromi_ioffset(ptr addrspace(5) align 4 inreg %p, i32 %idx) {
 ; GCN-LABEL: scratch_load_b96_idxpromi_ioffset:
 ; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    scratch_load_b96 v[0:2], v0, s0 offset:192 scale_offset
 ; GCN-NEXT:    s_wait_loadcnt 0x0
 ; GCN-NEXT:    ; return to shader part epilog
@@ -114,6 +122,7 @@ entry:
 define amdgpu_ps <4 x float> @scratch_load_b128_idxprom(ptr addrspace(5) align 4 inreg %p, i32 %idx) {
 ; GCN-LABEL: scratch_load_b128_idxprom:
 ; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    scratch_load_b128 v[0:3], v0, s0 scale_offset
 ; GCN-NEXT:    s_wait_loadcnt 0x0
 ; GCN-NEXT:    ; return to shader part epilog
@@ -127,6 +136,7 @@ entry:
 define amdgpu_ps float @scratch_load_b32_idxprom_range(ptr addrspace(5) align 4 inreg %p, ptr addrspace(5) align 4 %pp) {
 ; GCN-LABEL: scratch_load_b32_idxprom_range:
 ; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    scratch_load_b32 v0, v0, off
 ; GCN-NEXT:    s_wait_loadcnt 0x0
 ; GCN-NEXT:    scratch_load_b32 v0, v0, s0 scale_offset
@@ -143,6 +153,7 @@ entry:
 define amdgpu_ps float @scratch_load_b32_idxprom_range_ioffset(ptr addrspace(5) align 4 inreg %p, ptr addrspace(5) align 4 %pp) {
 ; GCN-LABEL: scratch_load_b32_idxprom_range_ioffset:
 ; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    scratch_load_b32 v0, v0, off
 ; GCN-NEXT:    s_wait_loadcnt 0x0
 ; GCN-NEXT:    scratch_load_b32 v0, v0, s0 offset:64 scale_offset
@@ -160,6 +171,7 @@ entry:
 define amdgpu_ps float @scratch_load_b8_idxprom_range_ioffset(ptr addrspace(5) align 4 inreg %p, ptr addrspace(5) align 4 %pp) {
 ; GCN-LABEL: scratch_load_b8_idxprom_range_ioffset:
 ; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    scratch_load_b32 v0, v0, off
 ; GCN-NEXT:    s_wait_loadcnt 0x0
 ; GCN-NEXT:    scratch_load_u8 v0, v0, s0 offset:16
@@ -179,6 +191,7 @@ entry:
 define amdgpu_ps float @scratch_load_b16_idxprom_range(ptr addrspace(5) align 4 inreg %p, ptr addrspace(5) align 4 %pp) {
 ; GCN-LABEL: scratch_load_b16_idxprom_range:
 ; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    scratch_load_b32 v0, v0, off
 ; GCN-NEXT:    s_wait_loadcnt 0x0
 ; GCN-NEXT:    scratch_load_u16 v0, v0, s0 scale_offset
@@ -197,6 +210,7 @@ entry:
 define amdgpu_ps float @scratch_load_b16_idxprom_range_ioffset(ptr addrspace(5) align 4 inreg %p, ptr addrspace(5) align 4 %pp) {
 ; GCN-LABEL: scratch_load_b16_idxprom_range_ioffset:
 ; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    scratch_load_b32 v0, v0, off
 ; GCN-NEXT:    s_wait_loadcnt 0x0
 ; GCN-NEXT:    scratch_load_u16 v0, v0, s0 offset:32 scale_offset
@@ -216,6 +230,7 @@ entry:
 define amdgpu_ps <2 x float> @scratch_load_b64_idxprom_range(ptr addrspace(5) align 4 inreg %p, ptr addrspace(5) align 4 %pp) {
 ; GCN-LABEL: scratch_load_b64_idxprom_range:
 ; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    scratch_load_b32 v0, v0, off
 ; GCN-NEXT:    s_wait_loadcnt 0x0
 ; GCN-NEXT:    scratch_load_b64 v[0:1], v0, s0 scale_offset
@@ -234,6 +249,7 @@ entry:
 define amdgpu_ps <3 x float> @scratch_load_b96_idxprom_range(ptr addrspace(5) align 4 inreg %p, ptr addrspace(5) align 4 %pp) {
 ; GCN-LABEL: scratch_load_b96_idxprom_range:
 ; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    scratch_load_b32 v0, v0, off
 ; GCN-NEXT:    s_wait_loadcnt 0x0
 ; GCN-NEXT:    scratch_load_b96 v[0:2], v0, s0 scale_offset
@@ -250,6 +266,7 @@ entry:
 define amdgpu_ps <3 x float> @scratch_load_b96_idxprom_range_ioffset(ptr addrspace(5) align 4 inreg %p, ptr addrspace(5) align 4 %pp) {
 ; GCN-LABEL: scratch_load_b96_idxprom_range_ioffset:
 ; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    scratch_load_b32 v0, v0, off
 ; GCN-NEXT:    s_wait_loadcnt 0x0
 ; GCN-NEXT:    scratch_load_b96 v[0:2], v0, s0 offset:192 scale_offset
@@ -267,6 +284,7 @@ entry:
 define amdgpu_ps <4 x float> @scratch_load_b128_idxprom_range(ptr addrspace(5) align 4 inreg %p, ptr addrspace(5) align 4 %pp) {
 ; GCN-LABEL: scratch_load_b128_idxprom_range:
 ; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    scratch_load_b32 v0, v0, off
 ; GCN-NEXT:    s_wait_loadcnt 0x0
 ; GCN-NEXT:    scratch_load_b128 v[0:3], v0, s0 scale_offset
@@ -283,6 +301,7 @@ entry:
 define amdgpu_ps void @scratch_store_b32_idxprom(ptr addrspace(5) align 4 inreg %p, i32 %idx) {
 ; GCN-LABEL: scratch_store_b32_idxprom:
 ; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    v_mov_b32_e32 v1, 1.0
 ; GCN-NEXT:    scratch_store_b32 v0, v1, s0 scale_offset
 ; GCN-NEXT:    s_endpgm
@@ -296,6 +315,7 @@ entry:
 define amdgpu_ps void @scratch_store_b16_idxprom(ptr addrspace(5) align 2 inreg %p, i32 %idx) {
 ; GCN-LABEL: scratch_store_b16_idxprom:
 ; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    v_mov_b32_e32 v1, 1
 ; GCN-NEXT:    scratch_store_b16 v0, v1, s0 scale_offset
 ; GCN-NEXT:    s_endpgm
@@ -309,6 +329,7 @@ entry:
 define amdgpu_ps void @scratch_store_b64_idxprom(ptr addrspace(5) align 4 inreg %p, i32 %idx) {
 ; GCN-LABEL: scratch_store_b64_idxprom:
 ; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    v_mov_b64_e32 v[2:3], 1.0
 ; GCN-NEXT:    scratch_store_b64 v0, v[2:3], s0 scale_offset
 ; GCN-NEXT:    s_endpgm

diff  --git a/llvm/test/CodeGen/AMDGPU/scale-offset-smem.ll b/llvm/test/CodeGen/AMDGPU/scale-offset-smem.ll
index e0ea08d276979..7b9faf47793ce 100644
--- a/llvm/test/CodeGen/AMDGPU/scale-offset-smem.ll
+++ b/llvm/test/CodeGen/AMDGPU/scale-offset-smem.ll
@@ -5,6 +5,7 @@
 define amdgpu_ps float @s_load_b32_idxprom(ptr addrspace(4) align 4 inreg %p, i32 inreg %idx) {
 ; GCN-LABEL: s_load_b32_idxprom:
 ; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    s_load_b32 s0, s[0:1], s2 offset:0x0 scale_offset
 ; GCN-NEXT:    s_wait_kmcnt 0x0
 ; GCN-NEXT:    v_mov_b32_e32 v0, s0
@@ -21,6 +22,7 @@ entry:
 define amdgpu_ps float @s_load_b32_idx32(ptr addrspace(4) align 4 inreg %p, i32 inreg %idx) {
 ; SDAG-LABEL: s_load_b32_idx32:
 ; SDAG:       ; %bb.0: ; %entry
+; SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; SDAG-NEXT:    s_ashr_i32 s3, s2, 31
 ; SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; SDAG-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
@@ -32,6 +34,7 @@ define amdgpu_ps float @s_load_b32_idx32(ptr addrspace(4) align 4 inreg %p, i32
 ;
 ; GISEL-LABEL: s_load_b32_idx32:
 ; GISEL:       ; %bb.0: ; %entry
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    s_ashr_i32 s3, s2, 31
 ; GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GISEL-NEXT:    s_lshl_b64 s[2:3], s[2:3], 2
@@ -50,6 +53,7 @@ entry:
 define amdgpu_ps float @s_load_b32_idxprom_wrong_stride(ptr addrspace(4) align 4 inreg %p, i32 inreg %idx) {
 ; SDAG-LABEL: s_load_b32_idxprom_wrong_stride:
 ; SDAG:       ; %bb.0: ; %entry
+; SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; SDAG-NEXT:    s_mov_b32 s3, 0
 ; SDAG-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; SDAG-NEXT:    s_lshl_b64 s[2:3], s[2:3], 3
@@ -61,6 +65,7 @@ define amdgpu_ps float @s_load_b32_idxprom_wrong_stride(ptr addrspace(4) align 4
 ;
 ; GISEL-LABEL: s_load_b32_idxprom_wrong_stride:
 ; GISEL:       ; %bb.0: ; %entry
+; GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GISEL-NEXT:    s_mov_b32 s3, 0
 ; GISEL-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
 ; GISEL-NEXT:    s_lshl_b64 s[2:3], s[2:3], 3
@@ -80,6 +85,7 @@ entry:
 define amdgpu_ps float @s_load_b16_idxprom_ioffset(ptr addrspace(4) align 4 inreg %p, i32 inreg %idx) {
 ; GCN-LABEL: s_load_b16_idxprom_ioffset:
 ; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    s_load_u16 s0, s[0:1], s2 offset:0x20 scale_offset
 ; GCN-NEXT:    s_wait_kmcnt 0x0
 ; GCN-NEXT:    v_mov_b32_e32 v0, s0
@@ -97,6 +103,7 @@ entry:
 define amdgpu_ps <2 x float> @s_load_b64_idxprom(ptr addrspace(4) align 4 inreg %p, i32 inreg %idx) {
 ; GCN-LABEL: s_load_b64_idxprom:
 ; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    s_load_b64 s[4:5], s[0:1], s2 offset:0x0 scale_offset
 ; GCN-NEXT:    s_wait_kmcnt 0x0
 ; GCN-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
@@ -111,6 +118,7 @@ entry:
 define amdgpu_ps <3 x float> @s_load_b96_idxprom(ptr addrspace(4) align 4 inreg %p, i32 inreg %idx) {
 ; GCN-LABEL: s_load_b96_idxprom:
 ; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    s_load_b96 s[4:6], s[0:1], s2 offset:0x0 scale_offset
 ; GCN-NEXT:    s_wait_kmcnt 0x0
 ; GCN-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
@@ -126,6 +134,7 @@ entry:
 define amdgpu_ps <4 x float> @s_load_b128_idxprom(ptr addrspace(4) align 4 inreg %p, i32 inreg %idx) {
 ; GCN-LABEL: s_load_b128_idxprom:
 ; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    s_load_b128 s[4:7], s[0:1], s2 offset:0x0 scale_offset
 ; GCN-NEXT:    s_wait_kmcnt 0x0
 ; GCN-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
@@ -141,6 +150,7 @@ entry:
 define amdgpu_ps <8 x float> @s_load_b256_idxprom(ptr addrspace(4) align 4 inreg %p, i32 inreg %idx) {
 ; GCN-LABEL: s_load_b256_idxprom:
 ; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    s_load_b256 s[4:11], s[0:1], s2 offset:0x0 scale_offset
 ; GCN-NEXT:    s_wait_kmcnt 0x0
 ; GCN-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
@@ -158,6 +168,7 @@ entry:
 define amdgpu_ps <16 x float> @s_load_b512_idxprom(ptr addrspace(4) align 4 inreg %p, i32 inreg %idx) {
 ; GCN-LABEL: s_load_b512_idxprom:
 ; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    s_load_b512 s[4:19], s[0:1], s2 offset:0x0 scale_offset
 ; GCN-NEXT:    s_wait_kmcnt 0x0
 ; GCN-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
@@ -179,6 +190,7 @@ entry:
 define amdgpu_ps float @s_load_b32_idxprom_range(ptr addrspace(4) align 4 inreg %p) {
 ; GCN-LABEL: s_load_b32_idxprom_range:
 ; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GCN-NEXT:    s_wait_kmcnt 0x0
 ; GCN-NEXT:    s_load_b32 s0, s[0:1], s2 offset:0x0 scale_offset
@@ -196,6 +208,7 @@ entry:
 define amdgpu_ps float @s_load_b32_idxprom_range_ioffset(ptr addrspace(4) align 4 inreg %p) {
 ; GCN-LABEL: s_load_b32_idxprom_range_ioffset:
 ; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GCN-NEXT:    s_wait_kmcnt 0x0
 ; GCN-NEXT:    s_load_b32 s0, s[0:1], s2 offset:0x40 scale_offset
@@ -216,6 +229,7 @@ entry:
 define amdgpu_ps float @s_load_b8_idxprom_range_ioffset(ptr addrspace(4) align 4 inreg %p) {
 ; GCN-LABEL: s_load_b8_idxprom_range_ioffset:
 ; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GCN-NEXT:    s_wait_kmcnt 0x0
 ; GCN-NEXT:    s_load_u8 s0, s[0:1], s2 offset:0x10
@@ -236,6 +250,7 @@ entry:
 define amdgpu_ps float @s_load_b16_idxprom_range(ptr addrspace(4) align 4 inreg %p) {
 ; GCN-LABEL: s_load_b16_idxprom_range:
 ; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GCN-NEXT:    s_wait_kmcnt 0x0
 ; GCN-NEXT:    s_load_u16 s0, s[0:1], s2 offset:0x0 scale_offset
@@ -255,6 +270,7 @@ entry:
 define amdgpu_ps float @s_load_b16_idxprom_range_ioffset(ptr addrspace(4) align 4 inreg %p) {
 ; GCN-LABEL: s_load_b16_idxprom_range_ioffset:
 ; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GCN-NEXT:    s_wait_kmcnt 0x0
 ; GCN-NEXT:    s_load_u16 s0, s[0:1], s2 offset:0x20 scale_offset
@@ -275,6 +291,7 @@ entry:
 define amdgpu_ps <2 x float> @s_load_b64_idxprom_range(ptr addrspace(4) align 4 inreg %p) {
 ; GCN-LABEL: s_load_b64_idxprom_range:
 ; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    s_load_b32 s4, s[0:1], 0x0
 ; GCN-NEXT:    s_wait_kmcnt 0x0
 ; GCN-NEXT:    s_load_b64 s[2:3], s[0:1], s4 offset:0x0 scale_offset
@@ -292,6 +309,7 @@ entry:
 define amdgpu_ps <3 x float> @s_load_b96_idxprom_range(ptr addrspace(4) align 4 inreg %p) {
 ; GCN-LABEL: s_load_b96_idxprom_range:
 ; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GCN-NEXT:    s_wait_kmcnt 0x0
 ; GCN-NEXT:    s_load_b96 s[4:6], s[0:1], s2 offset:0x0 scale_offset
@@ -310,6 +328,7 @@ entry:
 define amdgpu_ps <4 x float> @s_load_b128_idxprom_range(ptr addrspace(4) align 4 inreg %p) {
 ; GCN-LABEL: s_load_b128_idxprom_range:
 ; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GCN-NEXT:    s_wait_kmcnt 0x0
 ; GCN-NEXT:    s_load_b128 s[4:7], s[0:1], s2 offset:0x0 scale_offset
@@ -328,6 +347,7 @@ entry:
 define amdgpu_ps <8 x float> @s_load_b256_idxprom_range(ptr addrspace(4) align 4 inreg %p) {
 ; GCN-LABEL: s_load_b256_idxprom_range:
 ; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GCN-NEXT:    s_wait_kmcnt 0x0
 ; GCN-NEXT:    s_load_b256 s[4:11], s[0:1], s2 offset:0x0 scale_offset
@@ -348,6 +368,7 @@ entry:
 define amdgpu_ps <16 x float> @s_load_b512_idxprom_range(ptr addrspace(4) align 4 inreg %p) {
 ; GCN-LABEL: s_load_b512_idxprom_range:
 ; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GCN-NEXT:    s_load_b32 s2, s[0:1], 0x0
 ; GCN-NEXT:    s_wait_kmcnt 0x0
 ; GCN-NEXT:    s_load_b512 s[4:19], s[0:1], s2 offset:0x0 scale_offset

diff  --git a/llvm/test/CodeGen/AMDGPU/sub_u64.ll b/llvm/test/CodeGen/AMDGPU/sub_u64.ll
index f79fbd98f1e09..80dd796afb13c 100644
--- a/llvm/test/CodeGen/AMDGPU/sub_u64.ll
+++ b/llvm/test/CodeGen/AMDGPU/sub_u64.ll
@@ -12,6 +12,7 @@ define amdgpu_ps <2 x float> @test_sub_u64_vv(i64 %a, i64 %b) {
 ;
 ; GFX1250-LABEL: test_sub_u64_vv:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_sub_nc_u64_e32 v[0:1], v[0:1], v[2:3]
 ; GFX1250-NEXT:    ; return to shader part epilog
   %sub = sub i64 %a, %b
@@ -29,6 +30,7 @@ define amdgpu_ps <2 x float> @test_sub_u64_vs(i64 %a, i64 inreg %b) {
 ;
 ; GFX1250-LABEL: test_sub_u64_vs:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_sub_nc_u64_e64 v[0:1], v[0:1], s[0:1]
 ; GFX1250-NEXT:    ; return to shader part epilog
   %sub = sub i64 %a, %b
@@ -46,6 +48,7 @@ define amdgpu_ps <2 x float> @test_sub_u64_sv(i64 inreg %a, i64 %b) {
 ;
 ; GFX1250-LABEL: test_sub_u64_sv:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_sub_nc_u64_e32 v[0:1], s[0:1], v[0:1]
 ; GFX1250-NEXT:    ; return to shader part epilog
   %sub = sub i64 %a, %b
@@ -54,12 +57,20 @@ define amdgpu_ps <2 x float> @test_sub_u64_sv(i64 inreg %a, i64 %b) {
 }
 
 define amdgpu_ps <2 x float> @test_sub_u64_ss(i64 inreg %a, i64 inreg %b) {
-; GCN-LABEL: test_sub_u64_ss:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_sub_nc_u64 s[0:1], s[0:1], s[2:3]
-; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GCN-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GCN-NEXT:    ; return to shader part epilog
+; GFX12-LABEL: test_sub_u64_ss:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_sub_nc_u64 s[0:1], s[0:1], s[2:3]
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT:    ; return to shader part epilog
+;
+; GFX1250-LABEL: test_sub_u64_ss:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-NEXT:    s_sub_nc_u64 s[0:1], s[0:1], s[2:3]
+; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX1250-NEXT:    ; return to shader part epilog
   %sub = sub i64 %a, %b
   %ret = bitcast i64 %sub to <2 x float>
   ret <2 x float> %ret
@@ -75,6 +86,7 @@ define amdgpu_ps <2 x float> @test_sub_u64_inline_lit_v(i64 %a) {
 ;
 ; GFX1250-LABEL: test_sub_u64_inline_lit_v:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_sub_nc_u64_e32 v[0:1], 5, v[0:1]
 ; GFX1250-NEXT:    ; return to shader part epilog
   %sub = sub i64 5, %a
@@ -92,6 +104,7 @@ define amdgpu_ps <2 x float> @test_sub_u64_v_inline_lit(i64 %a) {
 ;
 ; GFX1250-LABEL: test_sub_u64_v_inline_lit:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_add_nc_u64_e32 v[0:1], -5, v[0:1]
 ; GFX1250-NEXT:    ; return to shader part epilog
   %sub = sub i64 %a, 5
@@ -109,6 +122,7 @@ define amdgpu_ps <2 x float> @test_sub_u64_small_imm_v(i64 %a) {
 ;
 ; GFX1250-LABEL: test_sub_u64_small_imm_v:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_sub_nc_u64_e32 v[0:1], 0x1f4, v[0:1]
 ; GFX1250-NEXT:    ; return to shader part epilog
   %sub = sub i64 500, %a
@@ -126,6 +140,7 @@ define amdgpu_ps <2 x float> @test_sub_u64_64bit_imm_v(i64 %a) {
 ;
 ; GFX1250-LABEL: test_sub_u64_64bit_imm_v:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    v_sub_nc_u64_e32 v[0:1], 0x13b9ac9ff, v[0:1]
 ; GFX1250-NEXT:    ; return to shader part epilog
   %sub = sub i64 5294967295, %a
@@ -134,13 +149,23 @@ define amdgpu_ps <2 x float> @test_sub_u64_64bit_imm_v(i64 %a) {
 }
 
 define amdgpu_ps <2 x float> @test_sub_u64_small_imm_s(i64 inreg %a) {
-; GCN-LABEL: test_sub_u64_small_imm_s:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_sub_nc_u64 s[0:1], 0x1f4, s[0:1]
-; GCN-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
-; GCN-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
-; GCN-NEXT:    ; return to shader part epilog
+; GFX12-LABEL: test_sub_u64_small_imm_s:
+; GFX12:       ; %bb.0:
+; GFX12-NEXT:    s_sub_nc_u64 s[0:1], 0x1f4, s[0:1]
+; GFX12-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX12-NEXT:    ; return to shader part epilog
+;
+; GFX1250-LABEL: test_sub_u64_small_imm_s:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-NEXT:    s_sub_nc_u64 s[0:1], 0x1f4, s[0:1]
+; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX1250-NEXT:    ; return to shader part epilog
   %sub = sub i64 500, %a
   %ret = bitcast i64 %sub to <2 x float>
   ret <2 x float> %ret
 }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GCN: {{.*}}

diff  --git a/llvm/test/CodeGen/AMDGPU/v_ashr_pk.ll b/llvm/test/CodeGen/AMDGPU/v_ashr_pk.ll
index 3d74b171400ac..44f82d4b36fc1 100644
--- a/llvm/test/CodeGen/AMDGPU/v_ashr_pk.ll
+++ b/llvm/test/CodeGen/AMDGPU/v_ashr_pk.ll
@@ -17,6 +17,7 @@ define amdgpu_kernel void @v_ashr_pk_i8_i32(ptr addrspace(1) %out, i32 %src0, i3
 ;
 ; GFX1250-LABEL: v_ashr_pk_i8_i32:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    s_load_b96 s[0:2], s[4:5], 0x2c
 ; GFX1250-NEXT:    s_load_b64 s[6:7], s[4:5], 0x24
@@ -58,6 +59,7 @@ define amdgpu_kernel void @v_ashr_pk_u8_i32(ptr addrspace(1) %out, i32 %src0, i3
 ;
 ; GFX1250-LABEL: v_ashr_pk_u8_i32:
 ; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-NEXT:    s_clause 0x1
 ; GFX1250-NEXT:    s_load_b96 s[0:2], s[4:5], 0x2c
 ; GFX1250-NEXT:    s_load_b64 s[6:7], s[4:5], 0x24

diff  --git a/llvm/test/CodeGen/AMDGPU/wait-before-stores-with-scope_sys.ll b/llvm/test/CodeGen/AMDGPU/wait-before-stores-with-scope_sys.ll
index 985bcbd6ff4f4..8c32a1a76a4db 100644
--- a/llvm/test/CodeGen/AMDGPU/wait-before-stores-with-scope_sys.ll
+++ b/llvm/test/CodeGen/AMDGPU/wait-before-stores-with-scope_sys.ll
@@ -12,12 +12,14 @@ define amdgpu_ps void @intrinsic_store_system_scope(i32 %val, <4 x i32> inreg %r
 ;
 ; GFX1250-SDAG-LABEL: intrinsic_store_system_scope:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
 ; GFX1250-SDAG-NEXT:    buffer_store_b32 v0, v[2:3], s[0:3], s4 idxen offen scope:SCOPE_SYS
 ; GFX1250-SDAG-NEXT:    s_endpgm
 ;
 ; GFX1250-GISEL-LABEL: intrinsic_store_system_scope:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
 ; GFX1250-GISEL-NEXT:    buffer_store_b32 v0, v[4:5], s[0:3], s4 idxen offen scope:SCOPE_SYS
 ; GFX1250-GISEL-NEXT:    s_endpgm
@@ -34,6 +36,7 @@ define amdgpu_ps void @generic_store_volatile(i32 %val, ptr addrspace(1) %out) {
 ;
 ; GFX1250-SDAG-LABEL: generic_store_volatile:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1
 ; GFX1250-SDAG-NEXT:    global_store_b32 v[2:3], v0, off scope:SCOPE_SYS
 ; GFX1250-SDAG-NEXT:    s_wait_storecnt 0x0
@@ -41,6 +44,7 @@ define amdgpu_ps void @generic_store_volatile(i32 %val, ptr addrspace(1) %out) {
 ;
 ; GFX1250-GISEL-LABEL: generic_store_volatile:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v2
 ; GFX1250-GISEL-NEXT:    global_store_b32 v[4:5], v0, off scope:SCOPE_SYS
 ; GFX1250-GISEL-NEXT:    s_wait_storecnt 0x0

diff  --git a/llvm/test/CodeGen/AMDGPU/whole-wave-functions.ll b/llvm/test/CodeGen/AMDGPU/whole-wave-functions.ll
index 75817105e74fd..49031e1d818f0 100644
--- a/llvm/test/CodeGen/AMDGPU/whole-wave-functions.ll
+++ b/llvm/test/CodeGen/AMDGPU/whole-wave-functions.ll
@@ -18,7 +18,7 @@ define amdgpu_gfx_whole_wave i32 @basic_test(i1 %active, i32 %a, i32 %b) {
 ; DAGISEL-NEXT:    s_wait_bvhcnt 0x0
 ; DAGISEL-NEXT:    s_wait_kmcnt 0x0
 ; DAGISEL-NEXT:    s_xor_saveexec_b32 vcc_lo, -1
-; DAGISEL-NEXT:    s_clause 0x1
+; DAGISEL-NEXT:    s_clause 0x1 ; 8-byte Folded Spill
 ; DAGISEL-NEXT:    scratch_store_b32 off, v0, s32
 ; DAGISEL-NEXT:    scratch_store_b32 off, v1, s32 offset:4
 ; DAGISEL-NEXT:    s_mov_b32 exec_lo, -1
@@ -27,7 +27,7 @@ define amdgpu_gfx_whole_wave i32 @basic_test(i1 %active, i32 %a, i32 %b) {
 ; DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; DAGISEL-NEXT:    v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
 ; DAGISEL-NEXT:    s_xor_b32 exec_lo, vcc_lo, -1
-; DAGISEL-NEXT:    s_clause 0x1
+; DAGISEL-NEXT:    s_clause 0x1 ; 8-byte Folded Reload
 ; DAGISEL-NEXT:    scratch_load_b32 v0, off, s32
 ; DAGISEL-NEXT:    scratch_load_b32 v1, off, s32 offset:4
 ; DAGISEL-NEXT:    s_mov_b32 exec_lo, vcc_lo
@@ -42,7 +42,7 @@ define amdgpu_gfx_whole_wave i32 @basic_test(i1 %active, i32 %a, i32 %b) {
 ; GISEL-NEXT:    s_wait_bvhcnt 0x0
 ; GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GISEL-NEXT:    s_xor_saveexec_b32 vcc_lo, -1
-; GISEL-NEXT:    s_clause 0x1
+; GISEL-NEXT:    s_clause 0x1 ; 8-byte Folded Spill
 ; GISEL-NEXT:    scratch_store_b32 off, v0, s32
 ; GISEL-NEXT:    scratch_store_b32 off, v1, s32 offset:4
 ; GISEL-NEXT:    s_mov_b32 exec_lo, -1
@@ -51,7 +51,7 @@ define amdgpu_gfx_whole_wave i32 @basic_test(i1 %active, i32 %a, i32 %b) {
 ; GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-NEXT:    v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
 ; GISEL-NEXT:    s_xor_b32 exec_lo, vcc_lo, -1
-; GISEL-NEXT:    s_clause 0x1
+; GISEL-NEXT:    s_clause 0x1 ; 8-byte Folded Reload
 ; GISEL-NEXT:    scratch_load_b32 v0, off, s32
 ; GISEL-NEXT:    scratch_load_b32 v1, off, s32 offset:4
 ; GISEL-NEXT:    s_mov_b32 exec_lo, vcc_lo
@@ -66,7 +66,7 @@ define amdgpu_gfx_whole_wave i32 @basic_test(i1 %active, i32 %a, i32 %b) {
 ; DAGISEL64-NEXT:    s_wait_bvhcnt 0x0
 ; DAGISEL64-NEXT:    s_wait_kmcnt 0x0
 ; DAGISEL64-NEXT:    s_xor_saveexec_b64 vcc, -1
-; DAGISEL64-NEXT:    s_clause 0x1
+; DAGISEL64-NEXT:    s_clause 0x1 ; 8-byte Folded Spill
 ; DAGISEL64-NEXT:    scratch_store_b32 off, v0, s32
 ; DAGISEL64-NEXT:    scratch_store_b32 off, v1, s32 offset:4
 ; DAGISEL64-NEXT:    s_mov_b64 exec, -1
@@ -76,7 +76,7 @@ define amdgpu_gfx_whole_wave i32 @basic_test(i1 %active, i32 %a, i32 %b) {
 ; DAGISEL64-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; DAGISEL64-NEXT:    v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
 ; DAGISEL64-NEXT:    s_xor_b64 exec, vcc, -1
-; DAGISEL64-NEXT:    s_clause 0x1
+; DAGISEL64-NEXT:    s_clause 0x1 ; 8-byte Folded Reload
 ; DAGISEL64-NEXT:    scratch_load_b32 v0, off, s32
 ; DAGISEL64-NEXT:    scratch_load_b32 v1, off, s32 offset:4
 ; DAGISEL64-NEXT:    s_mov_b64 exec, vcc
@@ -91,7 +91,7 @@ define amdgpu_gfx_whole_wave i32 @basic_test(i1 %active, i32 %a, i32 %b) {
 ; GISEL64-NEXT:    s_wait_bvhcnt 0x0
 ; GISEL64-NEXT:    s_wait_kmcnt 0x0
 ; GISEL64-NEXT:    s_xor_saveexec_b64 vcc, -1
-; GISEL64-NEXT:    s_clause 0x1
+; GISEL64-NEXT:    s_clause 0x1 ; 8-byte Folded Spill
 ; GISEL64-NEXT:    scratch_store_b32 off, v0, s32
 ; GISEL64-NEXT:    scratch_store_b32 off, v1, s32 offset:4
 ; GISEL64-NEXT:    s_mov_b64 exec, -1
@@ -101,7 +101,7 @@ define amdgpu_gfx_whole_wave i32 @basic_test(i1 %active, i32 %a, i32 %b) {
 ; GISEL64-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GISEL64-NEXT:    v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
 ; GISEL64-NEXT:    s_xor_b64 exec, vcc, -1
-; GISEL64-NEXT:    s_clause 0x1
+; GISEL64-NEXT:    s_clause 0x1 ; 8-byte Folded Reload
 ; GISEL64-NEXT:    scratch_load_b32 v0, off, s32
 ; GISEL64-NEXT:    scratch_load_b32 v1, off, s32 offset:4
 ; GISEL64-NEXT:    s_mov_b64 exec, vcc
@@ -113,7 +113,7 @@ define amdgpu_gfx_whole_wave i32 @basic_test(i1 %active, i32 %a, i32 %b) {
 ; GFX1250-DAGISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-DAGISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-DAGISEL-NEXT:    s_xor_saveexec_b32 vcc_lo, -1
-; GFX1250-DAGISEL-NEXT:    s_clause 0x1
+; GFX1250-DAGISEL-NEXT:    s_clause 0x1 ; 8-byte Folded Spill
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v0, s32
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v1, s32 offset:4
 ; GFX1250-DAGISEL-NEXT:    s_wait_xcnt 0x0
@@ -122,7 +122,7 @@ define amdgpu_gfx_whole_wave i32 @basic_test(i1 %active, i32 %a, i32 %b) {
 ; GFX1250-DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-DAGISEL-NEXT:    v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
 ; GFX1250-DAGISEL-NEXT:    s_xor_b32 exec_lo, vcc_lo, -1
-; GFX1250-DAGISEL-NEXT:    s_clause 0x1
+; GFX1250-DAGISEL-NEXT:    s_clause 0x1 ; 8-byte Folded Reload
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v0, off, s32
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v1, off, s32 offset:4
 ; GFX1250-DAGISEL-NEXT:    s_wait_xcnt 0x0
@@ -145,7 +145,7 @@ define amdgpu_gfx_whole_wave i32 @single_use_of_active(i1 %active, i32 %a, i32 %
 ; DAGISEL-NEXT:    s_wait_bvhcnt 0x0
 ; DAGISEL-NEXT:    s_wait_kmcnt 0x0
 ; DAGISEL-NEXT:    s_xor_saveexec_b32 vcc_lo, -1
-; DAGISEL-NEXT:    s_clause 0x1
+; DAGISEL-NEXT:    s_clause 0x1 ; 8-byte Folded Spill
 ; DAGISEL-NEXT:    scratch_store_b32 off, v0, s32
 ; DAGISEL-NEXT:    scratch_store_b32 off, v1, s32 offset:4
 ; DAGISEL-NEXT:    s_mov_b32 exec_lo, -1
@@ -154,7 +154,7 @@ define amdgpu_gfx_whole_wave i32 @single_use_of_active(i1 %active, i32 %a, i32 %
 ; DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; DAGISEL-NEXT:    v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
 ; DAGISEL-NEXT:    s_xor_b32 exec_lo, vcc_lo, -1
-; DAGISEL-NEXT:    s_clause 0x1
+; DAGISEL-NEXT:    s_clause 0x1 ; 8-byte Folded Reload
 ; DAGISEL-NEXT:    scratch_load_b32 v0, off, s32
 ; DAGISEL-NEXT:    scratch_load_b32 v1, off, s32 offset:4
 ; DAGISEL-NEXT:    s_mov_b32 exec_lo, vcc_lo
@@ -169,7 +169,7 @@ define amdgpu_gfx_whole_wave i32 @single_use_of_active(i1 %active, i32 %a, i32 %
 ; GISEL-NEXT:    s_wait_bvhcnt 0x0
 ; GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GISEL-NEXT:    s_xor_saveexec_b32 vcc_lo, -1
-; GISEL-NEXT:    s_clause 0x1
+; GISEL-NEXT:    s_clause 0x1 ; 8-byte Folded Spill
 ; GISEL-NEXT:    scratch_store_b32 off, v0, s32
 ; GISEL-NEXT:    scratch_store_b32 off, v1, s32 offset:4
 ; GISEL-NEXT:    s_mov_b32 exec_lo, -1
@@ -178,7 +178,7 @@ define amdgpu_gfx_whole_wave i32 @single_use_of_active(i1 %active, i32 %a, i32 %
 ; GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-NEXT:    v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
 ; GISEL-NEXT:    s_xor_b32 exec_lo, vcc_lo, -1
-; GISEL-NEXT:    s_clause 0x1
+; GISEL-NEXT:    s_clause 0x1 ; 8-byte Folded Reload
 ; GISEL-NEXT:    scratch_load_b32 v0, off, s32
 ; GISEL-NEXT:    scratch_load_b32 v1, off, s32 offset:4
 ; GISEL-NEXT:    s_mov_b32 exec_lo, vcc_lo
@@ -193,7 +193,7 @@ define amdgpu_gfx_whole_wave i32 @single_use_of_active(i1 %active, i32 %a, i32 %
 ; DAGISEL64-NEXT:    s_wait_bvhcnt 0x0
 ; DAGISEL64-NEXT:    s_wait_kmcnt 0x0
 ; DAGISEL64-NEXT:    s_xor_saveexec_b64 vcc, -1
-; DAGISEL64-NEXT:    s_clause 0x1
+; DAGISEL64-NEXT:    s_clause 0x1 ; 8-byte Folded Spill
 ; DAGISEL64-NEXT:    scratch_store_b32 off, v0, s32
 ; DAGISEL64-NEXT:    scratch_store_b32 off, v1, s32 offset:4
 ; DAGISEL64-NEXT:    s_mov_b64 exec, -1
@@ -202,7 +202,7 @@ define amdgpu_gfx_whole_wave i32 @single_use_of_active(i1 %active, i32 %a, i32 %
 ; DAGISEL64-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; DAGISEL64-NEXT:    v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
 ; DAGISEL64-NEXT:    s_xor_b64 exec, vcc, -1
-; DAGISEL64-NEXT:    s_clause 0x1
+; DAGISEL64-NEXT:    s_clause 0x1 ; 8-byte Folded Reload
 ; DAGISEL64-NEXT:    scratch_load_b32 v0, off, s32
 ; DAGISEL64-NEXT:    scratch_load_b32 v1, off, s32 offset:4
 ; DAGISEL64-NEXT:    s_mov_b64 exec, vcc
@@ -217,7 +217,7 @@ define amdgpu_gfx_whole_wave i32 @single_use_of_active(i1 %active, i32 %a, i32 %
 ; GISEL64-NEXT:    s_wait_bvhcnt 0x0
 ; GISEL64-NEXT:    s_wait_kmcnt 0x0
 ; GISEL64-NEXT:    s_xor_saveexec_b64 vcc, -1
-; GISEL64-NEXT:    s_clause 0x1
+; GISEL64-NEXT:    s_clause 0x1 ; 8-byte Folded Spill
 ; GISEL64-NEXT:    scratch_store_b32 off, v0, s32
 ; GISEL64-NEXT:    scratch_store_b32 off, v1, s32 offset:4
 ; GISEL64-NEXT:    s_mov_b64 exec, -1
@@ -226,7 +226,7 @@ define amdgpu_gfx_whole_wave i32 @single_use_of_active(i1 %active, i32 %a, i32 %
 ; GISEL64-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GISEL64-NEXT:    v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
 ; GISEL64-NEXT:    s_xor_b64 exec, vcc, -1
-; GISEL64-NEXT:    s_clause 0x1
+; GISEL64-NEXT:    s_clause 0x1 ; 8-byte Folded Reload
 ; GISEL64-NEXT:    scratch_load_b32 v0, off, s32
 ; GISEL64-NEXT:    scratch_load_b32 v1, off, s32 offset:4
 ; GISEL64-NEXT:    s_mov_b64 exec, vcc
@@ -238,7 +238,7 @@ define amdgpu_gfx_whole_wave i32 @single_use_of_active(i1 %active, i32 %a, i32 %
 ; GFX1250-DAGISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-DAGISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-DAGISEL-NEXT:    s_xor_saveexec_b32 vcc_lo, -1
-; GFX1250-DAGISEL-NEXT:    s_clause 0x1
+; GFX1250-DAGISEL-NEXT:    s_clause 0x1 ; 8-byte Folded Spill
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v0, s32
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v1, s32 offset:4
 ; GFX1250-DAGISEL-NEXT:    s_wait_xcnt 0x0
@@ -247,7 +247,7 @@ define amdgpu_gfx_whole_wave i32 @single_use_of_active(i1 %active, i32 %a, i32 %
 ; GFX1250-DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-DAGISEL-NEXT:    v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
 ; GFX1250-DAGISEL-NEXT:    s_xor_b32 exec_lo, vcc_lo, -1
-; GFX1250-DAGISEL-NEXT:    s_clause 0x1
+; GFX1250-DAGISEL-NEXT:    s_clause 0x1 ; 8-byte Folded Reload
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v0, off, s32
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v1, off, s32 offset:4
 ; GFX1250-DAGISEL-NEXT:    s_wait_xcnt 0x0
@@ -362,7 +362,7 @@ define amdgpu_gfx_whole_wave i32 @csr(i1 %active, i32 %a, i32 %b) {
 ; DAGISEL-NEXT:    s_wait_bvhcnt 0x0
 ; DAGISEL-NEXT:    s_wait_kmcnt 0x0
 ; DAGISEL-NEXT:    s_xor_saveexec_b32 vcc_lo, -1
-; DAGISEL-NEXT:    s_clause 0x3
+; DAGISEL-NEXT:    s_clause 0x3 ; 16-byte Folded Spill
 ; DAGISEL-NEXT:    scratch_store_b32 off, v2, s32
 ; DAGISEL-NEXT:    scratch_store_b32 off, v0, s32 offset:4
 ; DAGISEL-NEXT:    scratch_store_b32 off, v1, s32 offset:8
@@ -383,7 +383,7 @@ define amdgpu_gfx_whole_wave i32 @csr(i1 %active, i32 %a, i32 %b) {
 ; DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; DAGISEL-NEXT:    v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
 ; DAGISEL-NEXT:    s_xor_b32 exec_lo, vcc_lo, -1
-; DAGISEL-NEXT:    s_clause 0x3
+; DAGISEL-NEXT:    s_clause 0x3 ; 16-byte Folded Reload
 ; DAGISEL-NEXT:    scratch_load_b32 v2, off, s32
 ; DAGISEL-NEXT:    scratch_load_b32 v0, off, s32 offset:4
 ; DAGISEL-NEXT:    scratch_load_b32 v1, off, s32 offset:8
@@ -401,7 +401,7 @@ define amdgpu_gfx_whole_wave i32 @csr(i1 %active, i32 %a, i32 %b) {
 ; GISEL-NEXT:    s_wait_bvhcnt 0x0
 ; GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GISEL-NEXT:    s_xor_saveexec_b32 vcc_lo, -1
-; GISEL-NEXT:    s_clause 0x3
+; GISEL-NEXT:    s_clause 0x3 ; 16-byte Folded Spill
 ; GISEL-NEXT:    scratch_store_b32 off, v2, s32
 ; GISEL-NEXT:    scratch_store_b32 off, v0, s32 offset:4
 ; GISEL-NEXT:    scratch_store_b32 off, v1, s32 offset:8
@@ -422,7 +422,7 @@ define amdgpu_gfx_whole_wave i32 @csr(i1 %active, i32 %a, i32 %b) {
 ; GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GISEL-NEXT:    v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
 ; GISEL-NEXT:    s_xor_b32 exec_lo, vcc_lo, -1
-; GISEL-NEXT:    s_clause 0x3
+; GISEL-NEXT:    s_clause 0x3 ; 16-byte Folded Reload
 ; GISEL-NEXT:    scratch_load_b32 v2, off, s32
 ; GISEL-NEXT:    scratch_load_b32 v0, off, s32 offset:4
 ; GISEL-NEXT:    scratch_load_b32 v1, off, s32 offset:8
@@ -440,7 +440,7 @@ define amdgpu_gfx_whole_wave i32 @csr(i1 %active, i32 %a, i32 %b) {
 ; DAGISEL64-NEXT:    s_wait_bvhcnt 0x0
 ; DAGISEL64-NEXT:    s_wait_kmcnt 0x0
 ; DAGISEL64-NEXT:    s_xor_saveexec_b64 vcc, -1
-; DAGISEL64-NEXT:    s_clause 0x3
+; DAGISEL64-NEXT:    s_clause 0x3 ; 16-byte Folded Spill
 ; DAGISEL64-NEXT:    scratch_store_b32 off, v2, s32
 ; DAGISEL64-NEXT:    scratch_store_b32 off, v0, s32 offset:4
 ; DAGISEL64-NEXT:    scratch_store_b32 off, v1, s32 offset:8
@@ -462,7 +462,7 @@ define amdgpu_gfx_whole_wave i32 @csr(i1 %active, i32 %a, i32 %b) {
 ; DAGISEL64-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; DAGISEL64-NEXT:    v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
 ; DAGISEL64-NEXT:    s_xor_b64 exec, vcc, -1
-; DAGISEL64-NEXT:    s_clause 0x3
+; DAGISEL64-NEXT:    s_clause 0x3 ; 16-byte Folded Reload
 ; DAGISEL64-NEXT:    scratch_load_b32 v2, off, s32
 ; DAGISEL64-NEXT:    scratch_load_b32 v0, off, s32 offset:4
 ; DAGISEL64-NEXT:    scratch_load_b32 v1, off, s32 offset:8
@@ -480,7 +480,7 @@ define amdgpu_gfx_whole_wave i32 @csr(i1 %active, i32 %a, i32 %b) {
 ; GISEL64-NEXT:    s_wait_bvhcnt 0x0
 ; GISEL64-NEXT:    s_wait_kmcnt 0x0
 ; GISEL64-NEXT:    s_xor_saveexec_b64 vcc, -1
-; GISEL64-NEXT:    s_clause 0x3
+; GISEL64-NEXT:    s_clause 0x3 ; 16-byte Folded Spill
 ; GISEL64-NEXT:    scratch_store_b32 off, v2, s32
 ; GISEL64-NEXT:    scratch_store_b32 off, v0, s32 offset:4
 ; GISEL64-NEXT:    scratch_store_b32 off, v1, s32 offset:8
@@ -502,7 +502,7 @@ define amdgpu_gfx_whole_wave i32 @csr(i1 %active, i32 %a, i32 %b) {
 ; GISEL64-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GISEL64-NEXT:    v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
 ; GISEL64-NEXT:    s_xor_b64 exec, vcc, -1
-; GISEL64-NEXT:    s_clause 0x3
+; GISEL64-NEXT:    s_clause 0x3 ; 16-byte Folded Reload
 ; GISEL64-NEXT:    scratch_load_b32 v2, off, s32
 ; GISEL64-NEXT:    scratch_load_b32 v0, off, s32 offset:4
 ; GISEL64-NEXT:    scratch_load_b32 v1, off, s32 offset:8
@@ -517,7 +517,7 @@ define amdgpu_gfx_whole_wave i32 @csr(i1 %active, i32 %a, i32 %b) {
 ; GFX1250-DAGISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-DAGISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-DAGISEL-NEXT:    s_xor_saveexec_b32 vcc_lo, -1
-; GFX1250-DAGISEL-NEXT:    s_clause 0x3
+; GFX1250-DAGISEL-NEXT:    s_clause 0x3 ; 16-byte Folded Spill
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v2, s32
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v0, s32 offset:4
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v1, s32 offset:8
@@ -540,7 +540,7 @@ define amdgpu_gfx_whole_wave i32 @csr(i1 %active, i32 %a, i32 %b) {
 ; GFX1250-DAGISEL-NEXT:    v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
 ; GFX1250-DAGISEL-NEXT:    s_wait_xcnt 0x0
 ; GFX1250-DAGISEL-NEXT:    s_xor_b32 exec_lo, vcc_lo, -1
-; GFX1250-DAGISEL-NEXT:    s_clause 0x3
+; GFX1250-DAGISEL-NEXT:    s_clause 0x3 ; 16-byte Folded Reload
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v2, off, s32
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v0, off, s32 offset:4
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v1, off, s32 offset:8
@@ -906,7 +906,7 @@ define amdgpu_gfx_whole_wave i32 @multiple_blocks(i1 %active, i32 %a, i32 %b) {
 ; DAGISEL-NEXT:    s_wait_bvhcnt 0x0
 ; DAGISEL-NEXT:    s_wait_kmcnt 0x0
 ; DAGISEL-NEXT:    s_xor_saveexec_b32 vcc_lo, -1
-; DAGISEL-NEXT:    s_clause 0x1
+; DAGISEL-NEXT:    s_clause 0x1 ; 8-byte Folded Spill
 ; DAGISEL-NEXT:    scratch_store_b32 off, v0, s32
 ; DAGISEL-NEXT:    scratch_store_b32 off, v1, s32 offset:4
 ; DAGISEL-NEXT:    s_mov_b32 exec_lo, -1
@@ -921,7 +921,7 @@ define amdgpu_gfx_whole_wave i32 @multiple_blocks(i1 %active, i32 %a, i32 %b) {
 ; DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; DAGISEL-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
 ; DAGISEL-NEXT:    s_xor_b32 exec_lo, vcc_lo, -1
-; DAGISEL-NEXT:    s_clause 0x1
+; DAGISEL-NEXT:    s_clause 0x1 ; 8-byte Folded Reload
 ; DAGISEL-NEXT:    scratch_load_b32 v0, off, s32
 ; DAGISEL-NEXT:    scratch_load_b32 v1, off, s32 offset:4
 ; DAGISEL-NEXT:    s_mov_b32 exec_lo, vcc_lo
@@ -936,7 +936,7 @@ define amdgpu_gfx_whole_wave i32 @multiple_blocks(i1 %active, i32 %a, i32 %b) {
 ; GISEL-NEXT:    s_wait_bvhcnt 0x0
 ; GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GISEL-NEXT:    s_xor_saveexec_b32 vcc_lo, -1
-; GISEL-NEXT:    s_clause 0x1
+; GISEL-NEXT:    s_clause 0x1 ; 8-byte Folded Spill
 ; GISEL-NEXT:    scratch_store_b32 off, v0, s32
 ; GISEL-NEXT:    scratch_store_b32 off, v1, s32 offset:4
 ; GISEL-NEXT:    s_mov_b32 exec_lo, -1
@@ -951,7 +951,7 @@ define amdgpu_gfx_whole_wave i32 @multiple_blocks(i1 %active, i32 %a, i32 %b) {
 ; GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GISEL-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
 ; GISEL-NEXT:    s_xor_b32 exec_lo, vcc_lo, -1
-; GISEL-NEXT:    s_clause 0x1
+; GISEL-NEXT:    s_clause 0x1 ; 8-byte Folded Reload
 ; GISEL-NEXT:    scratch_load_b32 v0, off, s32
 ; GISEL-NEXT:    scratch_load_b32 v1, off, s32 offset:4
 ; GISEL-NEXT:    s_mov_b32 exec_lo, vcc_lo
@@ -966,7 +966,7 @@ define amdgpu_gfx_whole_wave i32 @multiple_blocks(i1 %active, i32 %a, i32 %b) {
 ; DAGISEL64-NEXT:    s_wait_bvhcnt 0x0
 ; DAGISEL64-NEXT:    s_wait_kmcnt 0x0
 ; DAGISEL64-NEXT:    s_xor_saveexec_b64 vcc, -1
-; DAGISEL64-NEXT:    s_clause 0x1
+; DAGISEL64-NEXT:    s_clause 0x1 ; 8-byte Folded Spill
 ; DAGISEL64-NEXT:    scratch_store_b32 off, v0, s32
 ; DAGISEL64-NEXT:    scratch_store_b32 off, v1, s32 offset:4
 ; DAGISEL64-NEXT:    s_mov_b64 exec, -1
@@ -981,7 +981,7 @@ define amdgpu_gfx_whole_wave i32 @multiple_blocks(i1 %active, i32 %a, i32 %b) {
 ; DAGISEL64-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; DAGISEL64-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; DAGISEL64-NEXT:    s_xor_b64 exec, vcc, -1
-; DAGISEL64-NEXT:    s_clause 0x1
+; DAGISEL64-NEXT:    s_clause 0x1 ; 8-byte Folded Reload
 ; DAGISEL64-NEXT:    scratch_load_b32 v0, off, s32
 ; DAGISEL64-NEXT:    scratch_load_b32 v1, off, s32 offset:4
 ; DAGISEL64-NEXT:    s_mov_b64 exec, vcc
@@ -996,7 +996,7 @@ define amdgpu_gfx_whole_wave i32 @multiple_blocks(i1 %active, i32 %a, i32 %b) {
 ; GISEL64-NEXT:    s_wait_bvhcnt 0x0
 ; GISEL64-NEXT:    s_wait_kmcnt 0x0
 ; GISEL64-NEXT:    s_xor_saveexec_b64 vcc, -1
-; GISEL64-NEXT:    s_clause 0x1
+; GISEL64-NEXT:    s_clause 0x1 ; 8-byte Folded Spill
 ; GISEL64-NEXT:    scratch_store_b32 off, v0, s32
 ; GISEL64-NEXT:    scratch_store_b32 off, v1, s32 offset:4
 ; GISEL64-NEXT:    s_mov_b64 exec, -1
@@ -1011,7 +1011,7 @@ define amdgpu_gfx_whole_wave i32 @multiple_blocks(i1 %active, i32 %a, i32 %b) {
 ; GISEL64-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GISEL64-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc
 ; GISEL64-NEXT:    s_xor_b64 exec, vcc, -1
-; GISEL64-NEXT:    s_clause 0x1
+; GISEL64-NEXT:    s_clause 0x1 ; 8-byte Folded Reload
 ; GISEL64-NEXT:    scratch_load_b32 v0, off, s32
 ; GISEL64-NEXT:    scratch_load_b32 v1, off, s32 offset:4
 ; GISEL64-NEXT:    s_mov_b64 exec, vcc
@@ -1023,7 +1023,7 @@ define amdgpu_gfx_whole_wave i32 @multiple_blocks(i1 %active, i32 %a, i32 %b) {
 ; GFX1250-DAGISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-DAGISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-DAGISEL-NEXT:    s_xor_saveexec_b32 vcc_lo, -1
-; GFX1250-DAGISEL-NEXT:    s_clause 0x1
+; GFX1250-DAGISEL-NEXT:    s_clause 0x1 ; 8-byte Folded Spill
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v0, s32
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v1, s32 offset:4
 ; GFX1250-DAGISEL-NEXT:    s_wait_xcnt 0x0
@@ -1038,7 +1038,7 @@ define amdgpu_gfx_whole_wave i32 @multiple_blocks(i1 %active, i32 %a, i32 %b) {
 ; GFX1250-DAGISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX1250-DAGISEL-NEXT:    v_cndmask_b32_e32 v0, v1, v0, vcc_lo
 ; GFX1250-DAGISEL-NEXT:    s_xor_b32 exec_lo, vcc_lo, -1
-; GFX1250-DAGISEL-NEXT:    s_clause 0x1
+; GFX1250-DAGISEL-NEXT:    s_clause 0x1 ; 8-byte Folded Reload
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v0, off, s32
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v1, off, s32 offset:4
 ; GFX1250-DAGISEL-NEXT:    s_wait_xcnt 0x0
@@ -1067,7 +1067,7 @@ define amdgpu_gfx_whole_wave i64 @ret_64(i1 %active, i64 %a, i64 %b) {
 ; DAGISEL-NEXT:    s_wait_bvhcnt 0x0
 ; DAGISEL-NEXT:    s_wait_kmcnt 0x0
 ; DAGISEL-NEXT:    s_xor_saveexec_b32 vcc_lo, -1
-; DAGISEL-NEXT:    s_clause 0x3
+; DAGISEL-NEXT:    s_clause 0x3 ; 16-byte Folded Spill
 ; DAGISEL-NEXT:    scratch_store_b32 off, v0, s32
 ; DAGISEL-NEXT:    scratch_store_b32 off, v1, s32 offset:4
 ; DAGISEL-NEXT:    scratch_store_b32 off, v2, s32 offset:8
@@ -1080,7 +1080,7 @@ define amdgpu_gfx_whole_wave i64 @ret_64(i1 %active, i64 %a, i64 %b) {
 ; DAGISEL-NEXT:    v_mov_b32_dpp v0, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
 ; DAGISEL-NEXT:    v_mov_b32_dpp v1, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
 ; DAGISEL-NEXT:    s_xor_b32 exec_lo, vcc_lo, -1
-; DAGISEL-NEXT:    s_clause 0x3
+; DAGISEL-NEXT:    s_clause 0x3 ; 16-byte Folded Reload
 ; DAGISEL-NEXT:    scratch_load_b32 v0, off, s32
 ; DAGISEL-NEXT:    scratch_load_b32 v1, off, s32 offset:4
 ; DAGISEL-NEXT:    scratch_load_b32 v2, off, s32 offset:8
@@ -1097,7 +1097,7 @@ define amdgpu_gfx_whole_wave i64 @ret_64(i1 %active, i64 %a, i64 %b) {
 ; GISEL-NEXT:    s_wait_bvhcnt 0x0
 ; GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GISEL-NEXT:    s_xor_saveexec_b32 vcc_lo, -1
-; GISEL-NEXT:    s_clause 0x3
+; GISEL-NEXT:    s_clause 0x3 ; 16-byte Folded Spill
 ; GISEL-NEXT:    scratch_store_b32 off, v0, s32
 ; GISEL-NEXT:    scratch_store_b32 off, v1, s32 offset:4
 ; GISEL-NEXT:    scratch_store_b32 off, v2, s32 offset:8
@@ -1110,7 +1110,7 @@ define amdgpu_gfx_whole_wave i64 @ret_64(i1 %active, i64 %a, i64 %b) {
 ; GISEL-NEXT:    v_mov_b32_dpp v0, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
 ; GISEL-NEXT:    v_mov_b32_dpp v1, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
 ; GISEL-NEXT:    s_xor_b32 exec_lo, vcc_lo, -1
-; GISEL-NEXT:    s_clause 0x3
+; GISEL-NEXT:    s_clause 0x3 ; 16-byte Folded Reload
 ; GISEL-NEXT:    scratch_load_b32 v0, off, s32
 ; GISEL-NEXT:    scratch_load_b32 v1, off, s32 offset:4
 ; GISEL-NEXT:    scratch_load_b32 v2, off, s32 offset:8
@@ -1127,7 +1127,7 @@ define amdgpu_gfx_whole_wave i64 @ret_64(i1 %active, i64 %a, i64 %b) {
 ; DAGISEL64-NEXT:    s_wait_bvhcnt 0x0
 ; DAGISEL64-NEXT:    s_wait_kmcnt 0x0
 ; DAGISEL64-NEXT:    s_xor_saveexec_b64 vcc, -1
-; DAGISEL64-NEXT:    s_clause 0x3
+; DAGISEL64-NEXT:    s_clause 0x3 ; 16-byte Folded Spill
 ; DAGISEL64-NEXT:    scratch_store_b32 off, v0, s32
 ; DAGISEL64-NEXT:    scratch_store_b32 off, v1, s32 offset:4
 ; DAGISEL64-NEXT:    scratch_store_b32 off, v2, s32 offset:8
@@ -1142,7 +1142,7 @@ define amdgpu_gfx_whole_wave i64 @ret_64(i1 %active, i64 %a, i64 %b) {
 ; DAGISEL64-NEXT:    v_mov_b32_dpp v0, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
 ; DAGISEL64-NEXT:    v_mov_b32_dpp v1, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
 ; DAGISEL64-NEXT:    s_xor_b64 exec, vcc, -1
-; DAGISEL64-NEXT:    s_clause 0x3
+; DAGISEL64-NEXT:    s_clause 0x3 ; 16-byte Folded Reload
 ; DAGISEL64-NEXT:    scratch_load_b32 v0, off, s32
 ; DAGISEL64-NEXT:    scratch_load_b32 v1, off, s32 offset:4
 ; DAGISEL64-NEXT:    scratch_load_b32 v2, off, s32 offset:8
@@ -1159,7 +1159,7 @@ define amdgpu_gfx_whole_wave i64 @ret_64(i1 %active, i64 %a, i64 %b) {
 ; GISEL64-NEXT:    s_wait_bvhcnt 0x0
 ; GISEL64-NEXT:    s_wait_kmcnt 0x0
 ; GISEL64-NEXT:    s_xor_saveexec_b64 vcc, -1
-; GISEL64-NEXT:    s_clause 0x3
+; GISEL64-NEXT:    s_clause 0x3 ; 16-byte Folded Spill
 ; GISEL64-NEXT:    scratch_store_b32 off, v0, s32
 ; GISEL64-NEXT:    scratch_store_b32 off, v1, s32 offset:4
 ; GISEL64-NEXT:    scratch_store_b32 off, v2, s32 offset:8
@@ -1174,7 +1174,7 @@ define amdgpu_gfx_whole_wave i64 @ret_64(i1 %active, i64 %a, i64 %b) {
 ; GISEL64-NEXT:    v_mov_b32_dpp v0, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
 ; GISEL64-NEXT:    v_mov_b32_dpp v1, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
 ; GISEL64-NEXT:    s_xor_b64 exec, vcc, -1
-; GISEL64-NEXT:    s_clause 0x3
+; GISEL64-NEXT:    s_clause 0x3 ; 16-byte Folded Reload
 ; GISEL64-NEXT:    scratch_load_b32 v0, off, s32
 ; GISEL64-NEXT:    scratch_load_b32 v1, off, s32 offset:4
 ; GISEL64-NEXT:    scratch_load_b32 v2, off, s32 offset:8
@@ -1188,7 +1188,7 @@ define amdgpu_gfx_whole_wave i64 @ret_64(i1 %active, i64 %a, i64 %b) {
 ; GFX1250-DAGISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-DAGISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-DAGISEL-NEXT:    s_xor_saveexec_b32 vcc_lo, -1
-; GFX1250-DAGISEL-NEXT:    s_clause 0x3
+; GFX1250-DAGISEL-NEXT:    s_clause 0x3 ; 16-byte Folded Spill
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v0, s32
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v1, s32 offset:4
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v2, s32 offset:8
@@ -1201,7 +1201,7 @@ define amdgpu_gfx_whole_wave i64 @ret_64(i1 %active, i64 %a, i64 %b) {
 ; GFX1250-DAGISEL-NEXT:    v_mov_b32_dpp v0, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
 ; GFX1250-DAGISEL-NEXT:    v_mov_b32_dpp v1, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1
 ; GFX1250-DAGISEL-NEXT:    s_xor_b32 exec_lo, vcc_lo, -1
-; GFX1250-DAGISEL-NEXT:    s_clause 0x3
+; GFX1250-DAGISEL-NEXT:    s_clause 0x3 ; 16-byte Folded Reload
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v0, off, s32
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v1, off, s32 offset:4
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v2, off, s32 offset:8
@@ -1225,7 +1225,7 @@ define amdgpu_gfx_whole_wave void @inreg_args(i1 %active, i32 inreg %i32, <4 x i
 ; DAGISEL-NEXT:    s_wait_bvhcnt 0x0
 ; DAGISEL-NEXT:    s_wait_kmcnt 0x0
 ; DAGISEL-NEXT:    s_xor_saveexec_b32 s0, -1
-; DAGISEL-NEXT:    s_clause 0x5
+; DAGISEL-NEXT:    s_clause 0x5 ; 24-byte Folded Spill
 ; DAGISEL-NEXT:    scratch_store_b32 off, v0, s32
 ; DAGISEL-NEXT:    scratch_store_b32 off, v1, s32 offset:4
 ; DAGISEL-NEXT:    scratch_store_b32 off, v2, s32 offset:8
@@ -1242,7 +1242,7 @@ define amdgpu_gfx_whole_wave void @inreg_args(i1 %active, i32 inreg %i32, <4 x i
 ; DAGISEL-NEXT:    scratch_store_b32 off, v5, s11
 ; DAGISEL-NEXT:    s_wait_alu 0xfffe
 ; DAGISEL-NEXT:    s_xor_b32 exec_lo, s0, -1
-; DAGISEL-NEXT:    s_clause 0x5
+; DAGISEL-NEXT:    s_clause 0x5 ; 24-byte Folded Reload
 ; DAGISEL-NEXT:    scratch_load_b32 v0, off, s32
 ; DAGISEL-NEXT:    scratch_load_b32 v1, off, s32 offset:4
 ; DAGISEL-NEXT:    scratch_load_b32 v2, off, s32 offset:8
@@ -1261,7 +1261,7 @@ define amdgpu_gfx_whole_wave void @inreg_args(i1 %active, i32 inreg %i32, <4 x i
 ; GISEL-NEXT:    s_wait_bvhcnt 0x0
 ; GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GISEL-NEXT:    s_xor_saveexec_b32 s34, -1
-; GISEL-NEXT:    s_clause 0x5
+; GISEL-NEXT:    s_clause 0x5 ; 24-byte Folded Spill
 ; GISEL-NEXT:    scratch_store_b32 off, v0, s32
 ; GISEL-NEXT:    scratch_store_b32 off, v1, s32 offset:4
 ; GISEL-NEXT:    scratch_store_b32 off, v2, s32 offset:8
@@ -1283,7 +1283,7 @@ define amdgpu_gfx_whole_wave void @inreg_args(i1 %active, i32 inreg %i32, <4 x i
 ; GISEL-NEXT:    scratch_store_b128 off, v[0:3], s11
 ; GISEL-NEXT:    scratch_store_b32 off, v5, s11
 ; GISEL-NEXT:    s_xor_b32 exec_lo, s34, -1
-; GISEL-NEXT:    s_clause 0x5
+; GISEL-NEXT:    s_clause 0x5 ; 24-byte Folded Reload
 ; GISEL-NEXT:    scratch_load_b32 v0, off, s32
 ; GISEL-NEXT:    scratch_load_b32 v1, off, s32 offset:4
 ; GISEL-NEXT:    scratch_load_b32 v2, off, s32 offset:8
@@ -1302,7 +1302,7 @@ define amdgpu_gfx_whole_wave void @inreg_args(i1 %active, i32 inreg %i32, <4 x i
 ; DAGISEL64-NEXT:    s_wait_bvhcnt 0x0
 ; DAGISEL64-NEXT:    s_wait_kmcnt 0x0
 ; DAGISEL64-NEXT:    s_xor_saveexec_b64 s[0:1], -1
-; DAGISEL64-NEXT:    s_clause 0x5
+; DAGISEL64-NEXT:    s_clause 0x5 ; 24-byte Folded Spill
 ; DAGISEL64-NEXT:    scratch_store_b32 off, v0, s32
 ; DAGISEL64-NEXT:    scratch_store_b32 off, v1, s32 offset:4
 ; DAGISEL64-NEXT:    scratch_store_b32 off, v2, s32 offset:8
@@ -1322,7 +1322,7 @@ define amdgpu_gfx_whole_wave void @inreg_args(i1 %active, i32 inreg %i32, <4 x i
 ; DAGISEL64-NEXT:    scratch_store_b32 off, v5, s11
 ; DAGISEL64-NEXT:    s_wait_alu 0xfffe
 ; DAGISEL64-NEXT:    s_xor_b64 exec, s[0:1], -1
-; DAGISEL64-NEXT:    s_clause 0x5
+; DAGISEL64-NEXT:    s_clause 0x5 ; 24-byte Folded Reload
 ; DAGISEL64-NEXT:    scratch_load_b32 v0, off, s32
 ; DAGISEL64-NEXT:    scratch_load_b32 v1, off, s32 offset:4
 ; DAGISEL64-NEXT:    scratch_load_b32 v2, off, s32 offset:8
@@ -1341,7 +1341,7 @@ define amdgpu_gfx_whole_wave void @inreg_args(i1 %active, i32 inreg %i32, <4 x i
 ; GISEL64-NEXT:    s_wait_bvhcnt 0x0
 ; GISEL64-NEXT:    s_wait_kmcnt 0x0
 ; GISEL64-NEXT:    s_xor_saveexec_b64 s[34:35], -1
-; GISEL64-NEXT:    s_clause 0x5
+; GISEL64-NEXT:    s_clause 0x5 ; 24-byte Folded Spill
 ; GISEL64-NEXT:    scratch_store_b32 off, v0, s32
 ; GISEL64-NEXT:    scratch_store_b32 off, v1, s32 offset:4
 ; GISEL64-NEXT:    scratch_store_b32 off, v2, s32 offset:8
@@ -1365,7 +1365,7 @@ define amdgpu_gfx_whole_wave void @inreg_args(i1 %active, i32 inreg %i32, <4 x i
 ; GISEL64-NEXT:    scratch_store_b128 off, v[0:3], s11
 ; GISEL64-NEXT:    scratch_store_b32 off, v5, s11
 ; GISEL64-NEXT:    s_xor_b64 exec, s[34:35], -1
-; GISEL64-NEXT:    s_clause 0x5
+; GISEL64-NEXT:    s_clause 0x5 ; 24-byte Folded Reload
 ; GISEL64-NEXT:    scratch_load_b32 v0, off, s32
 ; GISEL64-NEXT:    scratch_load_b32 v1, off, s32 offset:4
 ; GISEL64-NEXT:    scratch_load_b32 v2, off, s32 offset:8
@@ -1381,7 +1381,7 @@ define amdgpu_gfx_whole_wave void @inreg_args(i1 %active, i32 inreg %i32, <4 x i
 ; GFX1250-DAGISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-DAGISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-DAGISEL-NEXT:    s_xor_saveexec_b32 s0, -1
-; GFX1250-DAGISEL-NEXT:    s_clause 0x5
+; GFX1250-DAGISEL-NEXT:    s_clause 0x5 ; 24-byte Folded Spill
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v0, s32
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v1, s32 offset:4
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v2, s32 offset:8
@@ -1399,7 +1399,7 @@ define amdgpu_gfx_whole_wave void @inreg_args(i1 %active, i32 inreg %i32, <4 x i
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v5, s11
 ; GFX1250-DAGISEL-NEXT:    s_wait_xcnt 0x0
 ; GFX1250-DAGISEL-NEXT:    s_xor_b32 exec_lo, s0, -1
-; GFX1250-DAGISEL-NEXT:    s_clause 0x5
+; GFX1250-DAGISEL-NEXT:    s_clause 0x5 ; 24-byte Folded Reload
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v0, off, s32
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v1, off, s32 offset:4
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v2, off, s32 offset:8
@@ -1429,7 +1429,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2
 ; DAGISEL-NEXT:    s_mov_b32 s0, s33
 ; DAGISEL-NEXT:    s_mov_b32 s33, s32
 ; DAGISEL-NEXT:    s_xor_saveexec_b32 s4, -1
-; DAGISEL-NEXT:    s_clause 0x1f
+; DAGISEL-NEXT:    s_clause 0x1f ; 128-byte Folded Spill
 ; DAGISEL-NEXT:    scratch_store_b32 off, v0, s33 offset:4
 ; DAGISEL-NEXT:    scratch_store_b32 off, v1, s33 offset:8
 ; DAGISEL-NEXT:    scratch_store_b32 off, v2, s33 offset:12
@@ -1462,7 +1462,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2
 ; DAGISEL-NEXT:    scratch_store_b32 off, v29, s33 offset:120
 ; DAGISEL-NEXT:    scratch_store_b32 off, v30, s33 offset:124
 ; DAGISEL-NEXT:    scratch_store_b32 off, v31, s33 offset:128
-; DAGISEL-NEXT:    s_clause 0x1f
+; DAGISEL-NEXT:    s_clause 0x1f ; 128-byte Folded Spill
 ; DAGISEL-NEXT:    scratch_store_b32 off, v32, s33 offset:132
 ; DAGISEL-NEXT:    scratch_store_b32 off, v33, s33 offset:136
 ; DAGISEL-NEXT:    scratch_store_b32 off, v34, s33 offset:140
@@ -1495,7 +1495,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2
 ; DAGISEL-NEXT:    scratch_store_b32 off, v85, s33 offset:248
 ; DAGISEL-NEXT:    scratch_store_b32 off, v86, s33 offset:252
 ; DAGISEL-NEXT:    scratch_store_b32 off, v87, s33 offset:256
-; DAGISEL-NEXT:    s_clause 0x1f
+; DAGISEL-NEXT:    s_clause 0x1f ; 128-byte Folded Spill
 ; DAGISEL-NEXT:    scratch_store_b32 off, v96, s33 offset:260
 ; DAGISEL-NEXT:    scratch_store_b32 off, v97, s33 offset:264
 ; DAGISEL-NEXT:    scratch_store_b32 off, v98, s33 offset:268
@@ -1528,7 +1528,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2
 ; DAGISEL-NEXT:    scratch_store_b32 off, v149, s33 offset:376
 ; DAGISEL-NEXT:    scratch_store_b32 off, v150, s33 offset:380
 ; DAGISEL-NEXT:    scratch_store_b32 off, v151, s33 offset:384
-; DAGISEL-NEXT:    s_clause 0x1f
+; DAGISEL-NEXT:    s_clause 0x1f ; 128-byte Folded Spill
 ; DAGISEL-NEXT:    scratch_store_b32 off, v160, s33 offset:388
 ; DAGISEL-NEXT:    scratch_store_b32 off, v161, s33 offset:392
 ; DAGISEL-NEXT:    scratch_store_b32 off, v162, s33 offset:396
@@ -1561,7 +1561,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2
 ; DAGISEL-NEXT:    scratch_store_b32 off, v213, s33 offset:504
 ; DAGISEL-NEXT:    scratch_store_b32 off, v214, s33 offset:508
 ; DAGISEL-NEXT:    scratch_store_b32 off, v215, s33 offset:512
-; DAGISEL-NEXT:    s_clause 0xf
+; DAGISEL-NEXT:    s_clause 0xf ; 64-byte Folded Spill
 ; DAGISEL-NEXT:    scratch_store_b32 off, v224, s33 offset:516
 ; DAGISEL-NEXT:    scratch_store_b32 off, v225, s33 offset:520
 ; DAGISEL-NEXT:    scratch_store_b32 off, v226, s33 offset:524
@@ -1600,7 +1600,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2
 ; DAGISEL-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; DAGISEL-NEXT:    s_mov_b32 s32, s33
 ; DAGISEL-NEXT:    s_xor_b32 exec_lo, s4, -1
-; DAGISEL-NEXT:    s_clause 0x1f
+; DAGISEL-NEXT:    s_clause 0x1f ; 128-byte Folded Reload
 ; DAGISEL-NEXT:    scratch_load_b32 v0, off, s33 offset:4
 ; DAGISEL-NEXT:    scratch_load_b32 v1, off, s33 offset:8
 ; DAGISEL-NEXT:    scratch_load_b32 v2, off, s33 offset:12
@@ -1633,7 +1633,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2
 ; DAGISEL-NEXT:    scratch_load_b32 v29, off, s33 offset:120
 ; DAGISEL-NEXT:    scratch_load_b32 v30, off, s33 offset:124
 ; DAGISEL-NEXT:    scratch_load_b32 v31, off, s33 offset:128
-; DAGISEL-NEXT:    s_clause 0x1f
+; DAGISEL-NEXT:    s_clause 0x1f ; 128-byte Folded Reload
 ; DAGISEL-NEXT:    scratch_load_b32 v32, off, s33 offset:132
 ; DAGISEL-NEXT:    scratch_load_b32 v33, off, s33 offset:136
 ; DAGISEL-NEXT:    scratch_load_b32 v34, off, s33 offset:140
@@ -1666,7 +1666,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2
 ; DAGISEL-NEXT:    scratch_load_b32 v85, off, s33 offset:248
 ; DAGISEL-NEXT:    scratch_load_b32 v86, off, s33 offset:252
 ; DAGISEL-NEXT:    scratch_load_b32 v87, off, s33 offset:256
-; DAGISEL-NEXT:    s_clause 0x1f
+; DAGISEL-NEXT:    s_clause 0x1f ; 128-byte Folded Reload
 ; DAGISEL-NEXT:    scratch_load_b32 v96, off, s33 offset:260
 ; DAGISEL-NEXT:    scratch_load_b32 v97, off, s33 offset:264
 ; DAGISEL-NEXT:    scratch_load_b32 v98, off, s33 offset:268
@@ -1699,7 +1699,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2
 ; DAGISEL-NEXT:    scratch_load_b32 v149, off, s33 offset:376
 ; DAGISEL-NEXT:    scratch_load_b32 v150, off, s33 offset:380
 ; DAGISEL-NEXT:    scratch_load_b32 v151, off, s33 offset:384
-; DAGISEL-NEXT:    s_clause 0x1f
+; DAGISEL-NEXT:    s_clause 0x1f ; 128-byte Folded Reload
 ; DAGISEL-NEXT:    scratch_load_b32 v160, off, s33 offset:388
 ; DAGISEL-NEXT:    scratch_load_b32 v161, off, s33 offset:392
 ; DAGISEL-NEXT:    scratch_load_b32 v162, off, s33 offset:396
@@ -1732,7 +1732,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2
 ; DAGISEL-NEXT:    scratch_load_b32 v213, off, s33 offset:504
 ; DAGISEL-NEXT:    scratch_load_b32 v214, off, s33 offset:508
 ; DAGISEL-NEXT:    scratch_load_b32 v215, off, s33 offset:512
-; DAGISEL-NEXT:    s_clause 0xf
+; DAGISEL-NEXT:    s_clause 0xf ; 64-byte Folded Reload
 ; DAGISEL-NEXT:    scratch_load_b32 v224, off, s33 offset:516
 ; DAGISEL-NEXT:    scratch_load_b32 v225, off, s33 offset:520
 ; DAGISEL-NEXT:    scratch_load_b32 v226, off, s33 offset:524
@@ -1765,7 +1765,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2
 ; GISEL-NEXT:    s_mov_b32 s0, s33
 ; GISEL-NEXT:    s_mov_b32 s33, s32
 ; GISEL-NEXT:    s_xor_saveexec_b32 s4, -1
-; GISEL-NEXT:    s_clause 0x1f
+; GISEL-NEXT:    s_clause 0x1f ; 128-byte Folded Spill
 ; GISEL-NEXT:    scratch_store_b32 off, v0, s33 offset:4
 ; GISEL-NEXT:    scratch_store_b32 off, v1, s33 offset:8
 ; GISEL-NEXT:    scratch_store_b32 off, v2, s33 offset:12
@@ -1798,7 +1798,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2
 ; GISEL-NEXT:    scratch_store_b32 off, v29, s33 offset:120
 ; GISEL-NEXT:    scratch_store_b32 off, v30, s33 offset:124
 ; GISEL-NEXT:    scratch_store_b32 off, v31, s33 offset:128
-; GISEL-NEXT:    s_clause 0x1f
+; GISEL-NEXT:    s_clause 0x1f ; 128-byte Folded Spill
 ; GISEL-NEXT:    scratch_store_b32 off, v32, s33 offset:132
 ; GISEL-NEXT:    scratch_store_b32 off, v33, s33 offset:136
 ; GISEL-NEXT:    scratch_store_b32 off, v34, s33 offset:140
@@ -1831,7 +1831,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2
 ; GISEL-NEXT:    scratch_store_b32 off, v85, s33 offset:248
 ; GISEL-NEXT:    scratch_store_b32 off, v86, s33 offset:252
 ; GISEL-NEXT:    scratch_store_b32 off, v87, s33 offset:256
-; GISEL-NEXT:    s_clause 0x1f
+; GISEL-NEXT:    s_clause 0x1f ; 128-byte Folded Spill
 ; GISEL-NEXT:    scratch_store_b32 off, v96, s33 offset:260
 ; GISEL-NEXT:    scratch_store_b32 off, v97, s33 offset:264
 ; GISEL-NEXT:    scratch_store_b32 off, v98, s33 offset:268
@@ -1864,7 +1864,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2
 ; GISEL-NEXT:    scratch_store_b32 off, v149, s33 offset:376
 ; GISEL-NEXT:    scratch_store_b32 off, v150, s33 offset:380
 ; GISEL-NEXT:    scratch_store_b32 off, v151, s33 offset:384
-; GISEL-NEXT:    s_clause 0x1f
+; GISEL-NEXT:    s_clause 0x1f ; 128-byte Folded Spill
 ; GISEL-NEXT:    scratch_store_b32 off, v160, s33 offset:388
 ; GISEL-NEXT:    scratch_store_b32 off, v161, s33 offset:392
 ; GISEL-NEXT:    scratch_store_b32 off, v162, s33 offset:396
@@ -1897,7 +1897,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2
 ; GISEL-NEXT:    scratch_store_b32 off, v213, s33 offset:504
 ; GISEL-NEXT:    scratch_store_b32 off, v214, s33 offset:508
 ; GISEL-NEXT:    scratch_store_b32 off, v215, s33 offset:512
-; GISEL-NEXT:    s_clause 0xf
+; GISEL-NEXT:    s_clause 0xf ; 64-byte Folded Spill
 ; GISEL-NEXT:    scratch_store_b32 off, v224, s33 offset:516
 ; GISEL-NEXT:    scratch_store_b32 off, v225, s33 offset:520
 ; GISEL-NEXT:    scratch_store_b32 off, v226, s33 offset:524
@@ -1936,7 +1936,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2
 ; GISEL-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GISEL-NEXT:    s_mov_b32 s32, s33
 ; GISEL-NEXT:    s_xor_b32 exec_lo, s4, -1
-; GISEL-NEXT:    s_clause 0x1f
+; GISEL-NEXT:    s_clause 0x1f ; 128-byte Folded Reload
 ; GISEL-NEXT:    scratch_load_b32 v0, off, s33 offset:4
 ; GISEL-NEXT:    scratch_load_b32 v1, off, s33 offset:8
 ; GISEL-NEXT:    scratch_load_b32 v2, off, s33 offset:12
@@ -1969,7 +1969,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2
 ; GISEL-NEXT:    scratch_load_b32 v29, off, s33 offset:120
 ; GISEL-NEXT:    scratch_load_b32 v30, off, s33 offset:124
 ; GISEL-NEXT:    scratch_load_b32 v31, off, s33 offset:128
-; GISEL-NEXT:    s_clause 0x1f
+; GISEL-NEXT:    s_clause 0x1f ; 128-byte Folded Reload
 ; GISEL-NEXT:    scratch_load_b32 v32, off, s33 offset:132
 ; GISEL-NEXT:    scratch_load_b32 v33, off, s33 offset:136
 ; GISEL-NEXT:    scratch_load_b32 v34, off, s33 offset:140
@@ -2002,7 +2002,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2
 ; GISEL-NEXT:    scratch_load_b32 v85, off, s33 offset:248
 ; GISEL-NEXT:    scratch_load_b32 v86, off, s33 offset:252
 ; GISEL-NEXT:    scratch_load_b32 v87, off, s33 offset:256
-; GISEL-NEXT:    s_clause 0x1f
+; GISEL-NEXT:    s_clause 0x1f ; 128-byte Folded Reload
 ; GISEL-NEXT:    scratch_load_b32 v96, off, s33 offset:260
 ; GISEL-NEXT:    scratch_load_b32 v97, off, s33 offset:264
 ; GISEL-NEXT:    scratch_load_b32 v98, off, s33 offset:268
@@ -2035,7 +2035,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2
 ; GISEL-NEXT:    scratch_load_b32 v149, off, s33 offset:376
 ; GISEL-NEXT:    scratch_load_b32 v150, off, s33 offset:380
 ; GISEL-NEXT:    scratch_load_b32 v151, off, s33 offset:384
-; GISEL-NEXT:    s_clause 0x1f
+; GISEL-NEXT:    s_clause 0x1f ; 128-byte Folded Reload
 ; GISEL-NEXT:    scratch_load_b32 v160, off, s33 offset:388
 ; GISEL-NEXT:    scratch_load_b32 v161, off, s33 offset:392
 ; GISEL-NEXT:    scratch_load_b32 v162, off, s33 offset:396
@@ -2068,7 +2068,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2
 ; GISEL-NEXT:    scratch_load_b32 v213, off, s33 offset:504
 ; GISEL-NEXT:    scratch_load_b32 v214, off, s33 offset:508
 ; GISEL-NEXT:    scratch_load_b32 v215, off, s33 offset:512
-; GISEL-NEXT:    s_clause 0xf
+; GISEL-NEXT:    s_clause 0xf ; 64-byte Folded Reload
 ; GISEL-NEXT:    scratch_load_b32 v224, off, s33 offset:516
 ; GISEL-NEXT:    scratch_load_b32 v225, off, s33 offset:520
 ; GISEL-NEXT:    scratch_load_b32 v226, off, s33 offset:524
@@ -2101,7 +2101,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2
 ; DAGISEL64-NEXT:    s_mov_b32 s0, s33
 ; DAGISEL64-NEXT:    s_mov_b32 s33, s32
 ; DAGISEL64-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; DAGISEL64-NEXT:    s_clause 0x1f
+; DAGISEL64-NEXT:    s_clause 0x1f ; 128-byte Folded Spill
 ; DAGISEL64-NEXT:    scratch_store_b32 off, v0, s33 offset:4
 ; DAGISEL64-NEXT:    scratch_store_b32 off, v1, s33 offset:8
 ; DAGISEL64-NEXT:    scratch_store_b32 off, v2, s33 offset:12
@@ -2134,7 +2134,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2
 ; DAGISEL64-NEXT:    scratch_store_b32 off, v29, s33 offset:120
 ; DAGISEL64-NEXT:    scratch_store_b32 off, v30, s33 offset:124
 ; DAGISEL64-NEXT:    scratch_store_b32 off, v31, s33 offset:128
-; DAGISEL64-NEXT:    s_clause 0x1f
+; DAGISEL64-NEXT:    s_clause 0x1f ; 128-byte Folded Spill
 ; DAGISEL64-NEXT:    scratch_store_b32 off, v32, s33 offset:132
 ; DAGISEL64-NEXT:    scratch_store_b32 off, v33, s33 offset:136
 ; DAGISEL64-NEXT:    scratch_store_b32 off, v34, s33 offset:140
@@ -2167,7 +2167,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2
 ; DAGISEL64-NEXT:    scratch_store_b32 off, v85, s33 offset:248
 ; DAGISEL64-NEXT:    scratch_store_b32 off, v86, s33 offset:252
 ; DAGISEL64-NEXT:    scratch_store_b32 off, v87, s33 offset:256
-; DAGISEL64-NEXT:    s_clause 0x1f
+; DAGISEL64-NEXT:    s_clause 0x1f ; 128-byte Folded Spill
 ; DAGISEL64-NEXT:    scratch_store_b32 off, v96, s33 offset:260
 ; DAGISEL64-NEXT:    scratch_store_b32 off, v97, s33 offset:264
 ; DAGISEL64-NEXT:    scratch_store_b32 off, v98, s33 offset:268
@@ -2200,7 +2200,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2
 ; DAGISEL64-NEXT:    scratch_store_b32 off, v149, s33 offset:376
 ; DAGISEL64-NEXT:    scratch_store_b32 off, v150, s33 offset:380
 ; DAGISEL64-NEXT:    scratch_store_b32 off, v151, s33 offset:384
-; DAGISEL64-NEXT:    s_clause 0x1f
+; DAGISEL64-NEXT:    s_clause 0x1f ; 128-byte Folded Spill
 ; DAGISEL64-NEXT:    scratch_store_b32 off, v160, s33 offset:388
 ; DAGISEL64-NEXT:    scratch_store_b32 off, v161, s33 offset:392
 ; DAGISEL64-NEXT:    scratch_store_b32 off, v162, s33 offset:396
@@ -2233,7 +2233,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2
 ; DAGISEL64-NEXT:    scratch_store_b32 off, v213, s33 offset:504
 ; DAGISEL64-NEXT:    scratch_store_b32 off, v214, s33 offset:508
 ; DAGISEL64-NEXT:    scratch_store_b32 off, v215, s33 offset:512
-; DAGISEL64-NEXT:    s_clause 0xf
+; DAGISEL64-NEXT:    s_clause 0xf ; 64-byte Folded Spill
 ; DAGISEL64-NEXT:    scratch_store_b32 off, v224, s33 offset:516
 ; DAGISEL64-NEXT:    scratch_store_b32 off, v225, s33 offset:520
 ; DAGISEL64-NEXT:    scratch_store_b32 off, v226, s33 offset:524
@@ -2274,7 +2274,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2
 ; DAGISEL64-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; DAGISEL64-NEXT:    s_mov_b32 s32, s33
 ; DAGISEL64-NEXT:    s_xor_b64 exec, s[4:5], -1
-; DAGISEL64-NEXT:    s_clause 0x1f
+; DAGISEL64-NEXT:    s_clause 0x1f ; 128-byte Folded Reload
 ; DAGISEL64-NEXT:    scratch_load_b32 v0, off, s33 offset:4
 ; DAGISEL64-NEXT:    scratch_load_b32 v1, off, s33 offset:8
 ; DAGISEL64-NEXT:    scratch_load_b32 v2, off, s33 offset:12
@@ -2307,7 +2307,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2
 ; DAGISEL64-NEXT:    scratch_load_b32 v29, off, s33 offset:120
 ; DAGISEL64-NEXT:    scratch_load_b32 v30, off, s33 offset:124
 ; DAGISEL64-NEXT:    scratch_load_b32 v31, off, s33 offset:128
-; DAGISEL64-NEXT:    s_clause 0x1f
+; DAGISEL64-NEXT:    s_clause 0x1f ; 128-byte Folded Reload
 ; DAGISEL64-NEXT:    scratch_load_b32 v32, off, s33 offset:132
 ; DAGISEL64-NEXT:    scratch_load_b32 v33, off, s33 offset:136
 ; DAGISEL64-NEXT:    scratch_load_b32 v34, off, s33 offset:140
@@ -2340,7 +2340,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2
 ; DAGISEL64-NEXT:    scratch_load_b32 v85, off, s33 offset:248
 ; DAGISEL64-NEXT:    scratch_load_b32 v86, off, s33 offset:252
 ; DAGISEL64-NEXT:    scratch_load_b32 v87, off, s33 offset:256
-; DAGISEL64-NEXT:    s_clause 0x1f
+; DAGISEL64-NEXT:    s_clause 0x1f ; 128-byte Folded Reload
 ; DAGISEL64-NEXT:    scratch_load_b32 v96, off, s33 offset:260
 ; DAGISEL64-NEXT:    scratch_load_b32 v97, off, s33 offset:264
 ; DAGISEL64-NEXT:    scratch_load_b32 v98, off, s33 offset:268
@@ -2373,7 +2373,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2
 ; DAGISEL64-NEXT:    scratch_load_b32 v149, off, s33 offset:376
 ; DAGISEL64-NEXT:    scratch_load_b32 v150, off, s33 offset:380
 ; DAGISEL64-NEXT:    scratch_load_b32 v151, off, s33 offset:384
-; DAGISEL64-NEXT:    s_clause 0x1f
+; DAGISEL64-NEXT:    s_clause 0x1f ; 128-byte Folded Reload
 ; DAGISEL64-NEXT:    scratch_load_b32 v160, off, s33 offset:388
 ; DAGISEL64-NEXT:    scratch_load_b32 v161, off, s33 offset:392
 ; DAGISEL64-NEXT:    scratch_load_b32 v162, off, s33 offset:396
@@ -2406,7 +2406,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2
 ; DAGISEL64-NEXT:    scratch_load_b32 v213, off, s33 offset:504
 ; DAGISEL64-NEXT:    scratch_load_b32 v214, off, s33 offset:508
 ; DAGISEL64-NEXT:    scratch_load_b32 v215, off, s33 offset:512
-; DAGISEL64-NEXT:    s_clause 0xf
+; DAGISEL64-NEXT:    s_clause 0xf ; 64-byte Folded Reload
 ; DAGISEL64-NEXT:    scratch_load_b32 v224, off, s33 offset:516
 ; DAGISEL64-NEXT:    scratch_load_b32 v225, off, s33 offset:520
 ; DAGISEL64-NEXT:    scratch_load_b32 v226, off, s33 offset:524
@@ -2439,7 +2439,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2
 ; GISEL64-NEXT:    s_mov_b32 s0, s33
 ; GISEL64-NEXT:    s_mov_b32 s33, s32
 ; GISEL64-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GISEL64-NEXT:    s_clause 0x1f
+; GISEL64-NEXT:    s_clause 0x1f ; 128-byte Folded Spill
 ; GISEL64-NEXT:    scratch_store_b32 off, v0, s33 offset:4
 ; GISEL64-NEXT:    scratch_store_b32 off, v1, s33 offset:8
 ; GISEL64-NEXT:    scratch_store_b32 off, v2, s33 offset:12
@@ -2472,7 +2472,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2
 ; GISEL64-NEXT:    scratch_store_b32 off, v29, s33 offset:120
 ; GISEL64-NEXT:    scratch_store_b32 off, v30, s33 offset:124
 ; GISEL64-NEXT:    scratch_store_b32 off, v31, s33 offset:128
-; GISEL64-NEXT:    s_clause 0x1f
+; GISEL64-NEXT:    s_clause 0x1f ; 128-byte Folded Spill
 ; GISEL64-NEXT:    scratch_store_b32 off, v32, s33 offset:132
 ; GISEL64-NEXT:    scratch_store_b32 off, v33, s33 offset:136
 ; GISEL64-NEXT:    scratch_store_b32 off, v34, s33 offset:140
@@ -2505,7 +2505,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2
 ; GISEL64-NEXT:    scratch_store_b32 off, v85, s33 offset:248
 ; GISEL64-NEXT:    scratch_store_b32 off, v86, s33 offset:252
 ; GISEL64-NEXT:    scratch_store_b32 off, v87, s33 offset:256
-; GISEL64-NEXT:    s_clause 0x1f
+; GISEL64-NEXT:    s_clause 0x1f ; 128-byte Folded Spill
 ; GISEL64-NEXT:    scratch_store_b32 off, v96, s33 offset:260
 ; GISEL64-NEXT:    scratch_store_b32 off, v97, s33 offset:264
 ; GISEL64-NEXT:    scratch_store_b32 off, v98, s33 offset:268
@@ -2538,7 +2538,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2
 ; GISEL64-NEXT:    scratch_store_b32 off, v149, s33 offset:376
 ; GISEL64-NEXT:    scratch_store_b32 off, v150, s33 offset:380
 ; GISEL64-NEXT:    scratch_store_b32 off, v151, s33 offset:384
-; GISEL64-NEXT:    s_clause 0x1f
+; GISEL64-NEXT:    s_clause 0x1f ; 128-byte Folded Spill
 ; GISEL64-NEXT:    scratch_store_b32 off, v160, s33 offset:388
 ; GISEL64-NEXT:    scratch_store_b32 off, v161, s33 offset:392
 ; GISEL64-NEXT:    scratch_store_b32 off, v162, s33 offset:396
@@ -2571,7 +2571,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2
 ; GISEL64-NEXT:    scratch_store_b32 off, v213, s33 offset:504
 ; GISEL64-NEXT:    scratch_store_b32 off, v214, s33 offset:508
 ; GISEL64-NEXT:    scratch_store_b32 off, v215, s33 offset:512
-; GISEL64-NEXT:    s_clause 0xf
+; GISEL64-NEXT:    s_clause 0xf ; 64-byte Folded Spill
 ; GISEL64-NEXT:    scratch_store_b32 off, v224, s33 offset:516
 ; GISEL64-NEXT:    scratch_store_b32 off, v225, s33 offset:520
 ; GISEL64-NEXT:    scratch_store_b32 off, v226, s33 offset:524
@@ -2612,7 +2612,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2
 ; GISEL64-NEXT:    scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload
 ; GISEL64-NEXT:    s_mov_b32 s32, s33
 ; GISEL64-NEXT:    s_xor_b64 exec, s[4:5], -1
-; GISEL64-NEXT:    s_clause 0x1f
+; GISEL64-NEXT:    s_clause 0x1f ; 128-byte Folded Reload
 ; GISEL64-NEXT:    scratch_load_b32 v0, off, s33 offset:4
 ; GISEL64-NEXT:    scratch_load_b32 v1, off, s33 offset:8
 ; GISEL64-NEXT:    scratch_load_b32 v2, off, s33 offset:12
@@ -2645,7 +2645,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2
 ; GISEL64-NEXT:    scratch_load_b32 v29, off, s33 offset:120
 ; GISEL64-NEXT:    scratch_load_b32 v30, off, s33 offset:124
 ; GISEL64-NEXT:    scratch_load_b32 v31, off, s33 offset:128
-; GISEL64-NEXT:    s_clause 0x1f
+; GISEL64-NEXT:    s_clause 0x1f ; 128-byte Folded Reload
 ; GISEL64-NEXT:    scratch_load_b32 v32, off, s33 offset:132
 ; GISEL64-NEXT:    scratch_load_b32 v33, off, s33 offset:136
 ; GISEL64-NEXT:    scratch_load_b32 v34, off, s33 offset:140
@@ -2678,7 +2678,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2
 ; GISEL64-NEXT:    scratch_load_b32 v85, off, s33 offset:248
 ; GISEL64-NEXT:    scratch_load_b32 v86, off, s33 offset:252
 ; GISEL64-NEXT:    scratch_load_b32 v87, off, s33 offset:256
-; GISEL64-NEXT:    s_clause 0x1f
+; GISEL64-NEXT:    s_clause 0x1f ; 128-byte Folded Reload
 ; GISEL64-NEXT:    scratch_load_b32 v96, off, s33 offset:260
 ; GISEL64-NEXT:    scratch_load_b32 v97, off, s33 offset:264
 ; GISEL64-NEXT:    scratch_load_b32 v98, off, s33 offset:268
@@ -2711,7 +2711,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2
 ; GISEL64-NEXT:    scratch_load_b32 v149, off, s33 offset:376
 ; GISEL64-NEXT:    scratch_load_b32 v150, off, s33 offset:380
 ; GISEL64-NEXT:    scratch_load_b32 v151, off, s33 offset:384
-; GISEL64-NEXT:    s_clause 0x1f
+; GISEL64-NEXT:    s_clause 0x1f ; 128-byte Folded Reload
 ; GISEL64-NEXT:    scratch_load_b32 v160, off, s33 offset:388
 ; GISEL64-NEXT:    scratch_load_b32 v161, off, s33 offset:392
 ; GISEL64-NEXT:    scratch_load_b32 v162, off, s33 offset:396
@@ -2744,7 +2744,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2
 ; GISEL64-NEXT:    scratch_load_b32 v213, off, s33 offset:504
 ; GISEL64-NEXT:    scratch_load_b32 v214, off, s33 offset:508
 ; GISEL64-NEXT:    scratch_load_b32 v215, off, s33 offset:512
-; GISEL64-NEXT:    s_clause 0xf
+; GISEL64-NEXT:    s_clause 0xf ; 64-byte Folded Reload
 ; GISEL64-NEXT:    scratch_load_b32 v224, off, s33 offset:516
 ; GISEL64-NEXT:    scratch_load_b32 v225, off, s33 offset:520
 ; GISEL64-NEXT:    scratch_load_b32 v226, off, s33 offset:524
@@ -2774,7 +2774,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2
 ; GFX1250-DAGISEL-NEXT:    s_mov_b32 s0, s33
 ; GFX1250-DAGISEL-NEXT:    s_mov_b32 s33, s32
 ; GFX1250-DAGISEL-NEXT:    s_xor_saveexec_b32 s4, -1
-; GFX1250-DAGISEL-NEXT:    s_clause 0x3e
+; GFX1250-DAGISEL-NEXT:    s_clause 0x3e ; 252-byte Folded Spill
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v0, s33 offset:4
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v1, s33 offset:8
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v2, s33 offset:12
@@ -2838,7 +2838,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v84, s33 offset:244
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v85, s33 offset:248
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v86, s33 offset:252
-; GFX1250-DAGISEL-NEXT:    s_clause 0x3e
+; GFX1250-DAGISEL-NEXT:    s_clause 0x3e ; 252-byte Folded Spill
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v87, s33 offset:256
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v96, s33 offset:260
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v97, s33 offset:264
@@ -2902,7 +2902,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v211, s33 offset:496
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v212, s33 offset:500
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v213, s33 offset:504
-; GFX1250-DAGISEL-NEXT:    s_clause 0x3e
+; GFX1250-DAGISEL-NEXT:    s_clause 0x3e ; 252-byte Folded Spill
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v214, s33 offset:508
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v215, s33 offset:512
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v224, s33 offset:516
@@ -2967,7 +2967,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v42 /*v298*/, s33 offset:748
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v43 /*v299*/, s33 offset:752
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v44 /*v300*/, s33 offset:756
-; GFX1250-DAGISEL-NEXT:    s_clause 0x3e
+; GFX1250-DAGISEL-NEXT:    s_clause 0x3e ; 252-byte Folded Spill
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v45 /*v301*/, s33 offset:760
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v46 /*v302*/, s33 offset:764
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v47 /*v303*/, s33 offset:768
@@ -3031,7 +3031,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v105 /*v361*/, s33 offset:1000
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v106 /*v362*/, s33 offset:1004
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v107 /*v363*/, s33 offset:1008
-; GFX1250-DAGISEL-NEXT:    s_clause 0x3e
+; GFX1250-DAGISEL-NEXT:    s_clause 0x3e ; 252-byte Folded Spill
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v108 /*v364*/, s33 offset:1012
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v109 /*v365*/, s33 offset:1016
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v110 /*v366*/, s33 offset:1020
@@ -3095,7 +3095,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v168 /*v424*/, s33 offset:1252
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v169 /*v425*/, s33 offset:1256
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v170 /*v426*/, s33 offset:1260
-; GFX1250-DAGISEL-NEXT:    s_clause 0x3e
+; GFX1250-DAGISEL-NEXT:    s_clause 0x3e ; 252-byte Folded Spill
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v171 /*v427*/, s33 offset:1264
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v172 /*v428*/, s33 offset:1268
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v173 /*v429*/, s33 offset:1272
@@ -3159,7 +3159,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v231 /*v487*/, s33 offset:1504
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v232 /*v488*/, s33 offset:1508
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v233 /*v489*/, s33 offset:1512
-; GFX1250-DAGISEL-NEXT:    s_clause 0x3e
+; GFX1250-DAGISEL-NEXT:    s_clause 0x3e ; 252-byte Folded Spill
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v234 /*v490*/, s33 offset:1516
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v235 /*v491*/, s33 offset:1520
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v236 /*v492*/, s33 offset:1524
@@ -3224,7 +3224,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v38 /*v550*/, s33 offset:1756
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v39 /*v551*/, s33 offset:1760
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v40 /*v552*/, s33 offset:1764
-; GFX1250-DAGISEL-NEXT:    s_clause 0x3e
+; GFX1250-DAGISEL-NEXT:    s_clause 0x3e ; 252-byte Folded Spill
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v41 /*v553*/, s33 offset:1768
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v42 /*v554*/, s33 offset:1772
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v43 /*v555*/, s33 offset:1776
@@ -3288,7 +3288,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v101 /*v613*/, s33 offset:2008
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v102 /*v614*/, s33 offset:2012
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v103 /*v615*/, s33 offset:2016
-; GFX1250-DAGISEL-NEXT:    s_clause 0x3e
+; GFX1250-DAGISEL-NEXT:    s_clause 0x3e ; 252-byte Folded Spill
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v104 /*v616*/, s33 offset:2020
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v105 /*v617*/, s33 offset:2024
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v106 /*v618*/, s33 offset:2028
@@ -3352,7 +3352,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v164 /*v676*/, s33 offset:2260
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v165 /*v677*/, s33 offset:2264
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v166 /*v678*/, s33 offset:2268
-; GFX1250-DAGISEL-NEXT:    s_clause 0x3e
+; GFX1250-DAGISEL-NEXT:    s_clause 0x3e ; 252-byte Folded Spill
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v167 /*v679*/, s33 offset:2272
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v168 /*v680*/, s33 offset:2276
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v169 /*v681*/, s33 offset:2280
@@ -3416,7 +3416,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v227 /*v739*/, s33 offset:2512
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v228 /*v740*/, s33 offset:2516
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v229 /*v741*/, s33 offset:2520
-; GFX1250-DAGISEL-NEXT:    s_clause 0x3e
+; GFX1250-DAGISEL-NEXT:    s_clause 0x3e ; 252-byte Folded Spill
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v230 /*v742*/, s33 offset:2524
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v231 /*v743*/, s33 offset:2528
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v232 /*v744*/, s33 offset:2532
@@ -3481,7 +3481,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v34 /*v802*/, s33 offset:2764
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v35 /*v803*/, s33 offset:2768
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v36 /*v804*/, s33 offset:2772
-; GFX1250-DAGISEL-NEXT:    s_clause 0x3e
+; GFX1250-DAGISEL-NEXT:    s_clause 0x3e ; 252-byte Folded Spill
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v37 /*v805*/, s33 offset:2776
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v38 /*v806*/, s33 offset:2780
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v39 /*v807*/, s33 offset:2784
@@ -3545,7 +3545,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v97 /*v865*/, s33 offset:3016
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v98 /*v866*/, s33 offset:3020
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v99 /*v867*/, s33 offset:3024
-; GFX1250-DAGISEL-NEXT:    s_clause 0x3e
+; GFX1250-DAGISEL-NEXT:    s_clause 0x3e ; 252-byte Folded Spill
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v100 /*v868*/, s33 offset:3028
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v101 /*v869*/, s33 offset:3032
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v102 /*v870*/, s33 offset:3036
@@ -3609,7 +3609,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v160 /*v928*/, s33 offset:3268
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v161 /*v929*/, s33 offset:3272
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v162 /*v930*/, s33 offset:3276
-; GFX1250-DAGISEL-NEXT:    s_clause 0x3e
+; GFX1250-DAGISEL-NEXT:    s_clause 0x3e ; 252-byte Folded Spill
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v163 /*v931*/, s33 offset:3280
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v164 /*v932*/, s33 offset:3284
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v165 /*v933*/, s33 offset:3288
@@ -3673,7 +3673,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v223 /*v991*/, s33 offset:3520
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v224 /*v992*/, s33 offset:3524
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v225 /*v993*/, s33 offset:3528
-; GFX1250-DAGISEL-NEXT:    s_clause 0x1d
+; GFX1250-DAGISEL-NEXT:    s_clause 0x1d ; 120-byte Folded Spill
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v226 /*v994*/, s33 offset:3532
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v227 /*v995*/, s33 offset:3536
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v228 /*v996*/, s33 offset:3540
@@ -3727,7 +3727,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2
 ; GFX1250-DAGISEL-NEXT:    s_mov_b32 s32, s33
 ; GFX1250-DAGISEL-NEXT:    s_wait_xcnt 0x0
 ; GFX1250-DAGISEL-NEXT:    s_xor_b32 exec_lo, s4, -1
-; GFX1250-DAGISEL-NEXT:    s_clause 0x3e
+; GFX1250-DAGISEL-NEXT:    s_clause 0x3e ; 252-byte Folded Reload
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v0, off, s33 offset:4
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v1, off, s33 offset:8
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v2, off, s33 offset:12
@@ -3791,7 +3791,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v84, off, s33 offset:244
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v85, off, s33 offset:248
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v86, off, s33 offset:252
-; GFX1250-DAGISEL-NEXT:    s_clause 0x3e
+; GFX1250-DAGISEL-NEXT:    s_clause 0x3e ; 252-byte Folded Reload
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v87, off, s33 offset:256
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v96, off, s33 offset:260
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v97, off, s33 offset:264
@@ -3855,7 +3855,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v211, off, s33 offset:496
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v212, off, s33 offset:500
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v213, off, s33 offset:504
-; GFX1250-DAGISEL-NEXT:    s_clause 0x3e
+; GFX1250-DAGISEL-NEXT:    s_clause 0x3e ; 252-byte Folded Reload
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v214, off, s33 offset:508
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v215, off, s33 offset:512
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v224, off, s33 offset:516
@@ -3920,7 +3920,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v42 /*v298*/, off, s33 offset:748
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v43 /*v299*/, off, s33 offset:752
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v44 /*v300*/, off, s33 offset:756
-; GFX1250-DAGISEL-NEXT:    s_clause 0x3e
+; GFX1250-DAGISEL-NEXT:    s_clause 0x3e ; 252-byte Folded Reload
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v45 /*v301*/, off, s33 offset:760
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v46 /*v302*/, off, s33 offset:764
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v47 /*v303*/, off, s33 offset:768
@@ -3984,7 +3984,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v105 /*v361*/, off, s33 offset:1000
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v106 /*v362*/, off, s33 offset:1004
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v107 /*v363*/, off, s33 offset:1008
-; GFX1250-DAGISEL-NEXT:    s_clause 0x3e
+; GFX1250-DAGISEL-NEXT:    s_clause 0x3e ; 252-byte Folded Reload
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v108 /*v364*/, off, s33 offset:1012
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v109 /*v365*/, off, s33 offset:1016
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v110 /*v366*/, off, s33 offset:1020
@@ -4048,7 +4048,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v168 /*v424*/, off, s33 offset:1252
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v169 /*v425*/, off, s33 offset:1256
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v170 /*v426*/, off, s33 offset:1260
-; GFX1250-DAGISEL-NEXT:    s_clause 0x3e
+; GFX1250-DAGISEL-NEXT:    s_clause 0x3e ; 252-byte Folded Reload
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v171 /*v427*/, off, s33 offset:1264
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v172 /*v428*/, off, s33 offset:1268
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v173 /*v429*/, off, s33 offset:1272
@@ -4112,7 +4112,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v231 /*v487*/, off, s33 offset:1504
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v232 /*v488*/, off, s33 offset:1508
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v233 /*v489*/, off, s33 offset:1512
-; GFX1250-DAGISEL-NEXT:    s_clause 0x3e
+; GFX1250-DAGISEL-NEXT:    s_clause 0x3e ; 252-byte Folded Reload
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v234 /*v490*/, off, s33 offset:1516
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v235 /*v491*/, off, s33 offset:1520
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v236 /*v492*/, off, s33 offset:1524
@@ -4177,7 +4177,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v38 /*v550*/, off, s33 offset:1756
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v39 /*v551*/, off, s33 offset:1760
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v40 /*v552*/, off, s33 offset:1764
-; GFX1250-DAGISEL-NEXT:    s_clause 0x3e
+; GFX1250-DAGISEL-NEXT:    s_clause 0x3e ; 252-byte Folded Reload
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v41 /*v553*/, off, s33 offset:1768
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v42 /*v554*/, off, s33 offset:1772
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v43 /*v555*/, off, s33 offset:1776
@@ -4241,7 +4241,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v101 /*v613*/, off, s33 offset:2008
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v102 /*v614*/, off, s33 offset:2012
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v103 /*v615*/, off, s33 offset:2016
-; GFX1250-DAGISEL-NEXT:    s_clause 0x3e
+; GFX1250-DAGISEL-NEXT:    s_clause 0x3e ; 252-byte Folded Reload
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v104 /*v616*/, off, s33 offset:2020
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v105 /*v617*/, off, s33 offset:2024
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v106 /*v618*/, off, s33 offset:2028
@@ -4305,7 +4305,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v164 /*v676*/, off, s33 offset:2260
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v165 /*v677*/, off, s33 offset:2264
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v166 /*v678*/, off, s33 offset:2268
-; GFX1250-DAGISEL-NEXT:    s_clause 0x3e
+; GFX1250-DAGISEL-NEXT:    s_clause 0x3e ; 252-byte Folded Reload
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v167 /*v679*/, off, s33 offset:2272
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v168 /*v680*/, off, s33 offset:2276
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v169 /*v681*/, off, s33 offset:2280
@@ -4369,7 +4369,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v227 /*v739*/, off, s33 offset:2512
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v228 /*v740*/, off, s33 offset:2516
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v229 /*v741*/, off, s33 offset:2520
-; GFX1250-DAGISEL-NEXT:    s_clause 0x3e
+; GFX1250-DAGISEL-NEXT:    s_clause 0x3e ; 252-byte Folded Reload
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v230 /*v742*/, off, s33 offset:2524
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v231 /*v743*/, off, s33 offset:2528
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v232 /*v744*/, off, s33 offset:2532
@@ -4434,7 +4434,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v34 /*v802*/, off, s33 offset:2764
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v35 /*v803*/, off, s33 offset:2768
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v36 /*v804*/, off, s33 offset:2772
-; GFX1250-DAGISEL-NEXT:    s_clause 0x3e
+; GFX1250-DAGISEL-NEXT:    s_clause 0x3e ; 252-byte Folded Reload
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v37 /*v805*/, off, s33 offset:2776
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v38 /*v806*/, off, s33 offset:2780
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v39 /*v807*/, off, s33 offset:2784
@@ -4498,7 +4498,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v97 /*v865*/, off, s33 offset:3016
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v98 /*v866*/, off, s33 offset:3020
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v99 /*v867*/, off, s33 offset:3024
-; GFX1250-DAGISEL-NEXT:    s_clause 0x3e
+; GFX1250-DAGISEL-NEXT:    s_clause 0x3e ; 252-byte Folded Reload
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v100 /*v868*/, off, s33 offset:3028
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v101 /*v869*/, off, s33 offset:3032
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v102 /*v870*/, off, s33 offset:3036
@@ -4562,7 +4562,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v160 /*v928*/, off, s33 offset:3268
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v161 /*v929*/, off, s33 offset:3272
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v162 /*v930*/, off, s33 offset:3276
-; GFX1250-DAGISEL-NEXT:    s_clause 0x3e
+; GFX1250-DAGISEL-NEXT:    s_clause 0x3e ; 252-byte Folded Reload
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v163 /*v931*/, off, s33 offset:3280
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v164 /*v932*/, off, s33 offset:3284
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v165 /*v933*/, off, s33 offset:3288
@@ -4626,7 +4626,7 @@ define amdgpu_gfx_whole_wave <2 x half> @call_gfx_from_whole_wave(i1 %active, <2
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v223 /*v991*/, off, s33 offset:3520
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v224 /*v992*/, off, s33 offset:3524
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v225 /*v993*/, off, s33 offset:3528
-; GFX1250-DAGISEL-NEXT:    s_clause 0x1d
+; GFX1250-DAGISEL-NEXT:    s_clause 0x1d ; 120-byte Folded Reload
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v226 /*v994*/, off, s33 offset:3532
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v227 /*v995*/, off, s33 offset:3536
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v228 /*v996*/, off, s33 offset:3540
@@ -4677,7 +4677,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ
 ; DAGISEL-NEXT:    s_wait_bvhcnt 0x0
 ; DAGISEL-NEXT:    s_wait_kmcnt 0x0
 ; DAGISEL-NEXT:    s_xor_saveexec_b32 s0, -1
-; DAGISEL-NEXT:    s_clause 0x1f
+; DAGISEL-NEXT:    s_clause 0x1f ; 128-byte Folded Spill
 ; DAGISEL-NEXT:    scratch_store_b32 off, v0, s32
 ; DAGISEL-NEXT:    scratch_store_b32 off, v1, s32 offset:4
 ; DAGISEL-NEXT:    scratch_store_b32 off, v2, s32 offset:8
@@ -4710,7 +4710,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ
 ; DAGISEL-NEXT:    scratch_store_b32 off, v29, s32 offset:116
 ; DAGISEL-NEXT:    scratch_store_b32 off, v30, s32 offset:120
 ; DAGISEL-NEXT:    scratch_store_b32 off, v31, s32 offset:124
-; DAGISEL-NEXT:    s_clause 0x1f
+; DAGISEL-NEXT:    s_clause 0x1f ; 128-byte Folded Spill
 ; DAGISEL-NEXT:    scratch_store_b32 off, v32, s32 offset:128
 ; DAGISEL-NEXT:    scratch_store_b32 off, v33, s32 offset:132
 ; DAGISEL-NEXT:    scratch_store_b32 off, v34, s32 offset:136
@@ -4743,7 +4743,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ
 ; DAGISEL-NEXT:    scratch_store_b32 off, v85, s32 offset:244
 ; DAGISEL-NEXT:    scratch_store_b32 off, v86, s32 offset:248
 ; DAGISEL-NEXT:    scratch_store_b32 off, v87, s32 offset:252
-; DAGISEL-NEXT:    s_clause 0x1f
+; DAGISEL-NEXT:    s_clause 0x1f ; 128-byte Folded Spill
 ; DAGISEL-NEXT:    scratch_store_b32 off, v96, s32 offset:256
 ; DAGISEL-NEXT:    scratch_store_b32 off, v97, s32 offset:260
 ; DAGISEL-NEXT:    scratch_store_b32 off, v98, s32 offset:264
@@ -4776,7 +4776,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ
 ; DAGISEL-NEXT:    scratch_store_b32 off, v149, s32 offset:372
 ; DAGISEL-NEXT:    scratch_store_b32 off, v150, s32 offset:376
 ; DAGISEL-NEXT:    scratch_store_b32 off, v151, s32 offset:380
-; DAGISEL-NEXT:    s_clause 0x1f
+; DAGISEL-NEXT:    s_clause 0x1f ; 128-byte Folded Spill
 ; DAGISEL-NEXT:    scratch_store_b32 off, v160, s32 offset:384
 ; DAGISEL-NEXT:    scratch_store_b32 off, v161, s32 offset:388
 ; DAGISEL-NEXT:    scratch_store_b32 off, v162, s32 offset:392
@@ -4809,7 +4809,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ
 ; DAGISEL-NEXT:    scratch_store_b32 off, v213, s32 offset:500
 ; DAGISEL-NEXT:    scratch_store_b32 off, v214, s32 offset:504
 ; DAGISEL-NEXT:    scratch_store_b32 off, v215, s32 offset:508
-; DAGISEL-NEXT:    s_clause 0xf
+; DAGISEL-NEXT:    s_clause 0xf ; 64-byte Folded Spill
 ; DAGISEL-NEXT:    scratch_store_b32 off, v224, s32 offset:512
 ; DAGISEL-NEXT:    scratch_store_b32 off, v225, s32 offset:516
 ; DAGISEL-NEXT:    scratch_store_b32 off, v226, s32 offset:520
@@ -4833,7 +4833,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ
 ; DAGISEL-NEXT:    v_swap_b32 v0, v1
 ; DAGISEL-NEXT:    s_wait_alu 0xfffe
 ; DAGISEL-NEXT:    s_xor_b32 exec_lo, s0, -1
-; DAGISEL-NEXT:    s_clause 0x1f
+; DAGISEL-NEXT:    s_clause 0x1f ; 128-byte Folded Reload
 ; DAGISEL-NEXT:    scratch_load_b32 v0, off, s32
 ; DAGISEL-NEXT:    scratch_load_b32 v1, off, s32 offset:4
 ; DAGISEL-NEXT:    scratch_load_b32 v2, off, s32 offset:8
@@ -4866,7 +4866,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ
 ; DAGISEL-NEXT:    scratch_load_b32 v29, off, s32 offset:116
 ; DAGISEL-NEXT:    scratch_load_b32 v30, off, s32 offset:120
 ; DAGISEL-NEXT:    scratch_load_b32 v31, off, s32 offset:124
-; DAGISEL-NEXT:    s_clause 0x1f
+; DAGISEL-NEXT:    s_clause 0x1f ; 128-byte Folded Reload
 ; DAGISEL-NEXT:    scratch_load_b32 v32, off, s32 offset:128
 ; DAGISEL-NEXT:    scratch_load_b32 v33, off, s32 offset:132
 ; DAGISEL-NEXT:    scratch_load_b32 v34, off, s32 offset:136
@@ -4899,7 +4899,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ
 ; DAGISEL-NEXT:    scratch_load_b32 v85, off, s32 offset:244
 ; DAGISEL-NEXT:    scratch_load_b32 v86, off, s32 offset:248
 ; DAGISEL-NEXT:    scratch_load_b32 v87, off, s32 offset:252
-; DAGISEL-NEXT:    s_clause 0x1f
+; DAGISEL-NEXT:    s_clause 0x1f ; 128-byte Folded Reload
 ; DAGISEL-NEXT:    scratch_load_b32 v96, off, s32 offset:256
 ; DAGISEL-NEXT:    scratch_load_b32 v97, off, s32 offset:260
 ; DAGISEL-NEXT:    scratch_load_b32 v98, off, s32 offset:264
@@ -4932,7 +4932,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ
 ; DAGISEL-NEXT:    scratch_load_b32 v149, off, s32 offset:372
 ; DAGISEL-NEXT:    scratch_load_b32 v150, off, s32 offset:376
 ; DAGISEL-NEXT:    scratch_load_b32 v151, off, s32 offset:380
-; DAGISEL-NEXT:    s_clause 0x1f
+; DAGISEL-NEXT:    s_clause 0x1f ; 128-byte Folded Reload
 ; DAGISEL-NEXT:    scratch_load_b32 v160, off, s32 offset:384
 ; DAGISEL-NEXT:    scratch_load_b32 v161, off, s32 offset:388
 ; DAGISEL-NEXT:    scratch_load_b32 v162, off, s32 offset:392
@@ -4965,7 +4965,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ
 ; DAGISEL-NEXT:    scratch_load_b32 v213, off, s32 offset:500
 ; DAGISEL-NEXT:    scratch_load_b32 v214, off, s32 offset:504
 ; DAGISEL-NEXT:    scratch_load_b32 v215, off, s32 offset:508
-; DAGISEL-NEXT:    s_clause 0xf
+; DAGISEL-NEXT:    s_clause 0xf ; 64-byte Folded Reload
 ; DAGISEL-NEXT:    scratch_load_b32 v224, off, s32 offset:512
 ; DAGISEL-NEXT:    scratch_load_b32 v225, off, s32 offset:516
 ; DAGISEL-NEXT:    scratch_load_b32 v226, off, s32 offset:520
@@ -4993,7 +4993,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ
 ; GISEL-NEXT:    s_wait_bvhcnt 0x0
 ; GISEL-NEXT:    s_wait_kmcnt 0x0
 ; GISEL-NEXT:    s_xor_saveexec_b32 s0, -1
-; GISEL-NEXT:    s_clause 0x1f
+; GISEL-NEXT:    s_clause 0x1f ; 128-byte Folded Spill
 ; GISEL-NEXT:    scratch_store_b32 off, v0, s32
 ; GISEL-NEXT:    scratch_store_b32 off, v1, s32 offset:4
 ; GISEL-NEXT:    scratch_store_b32 off, v2, s32 offset:8
@@ -5026,7 +5026,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ
 ; GISEL-NEXT:    scratch_store_b32 off, v29, s32 offset:116
 ; GISEL-NEXT:    scratch_store_b32 off, v30, s32 offset:120
 ; GISEL-NEXT:    scratch_store_b32 off, v31, s32 offset:124
-; GISEL-NEXT:    s_clause 0x1f
+; GISEL-NEXT:    s_clause 0x1f ; 128-byte Folded Spill
 ; GISEL-NEXT:    scratch_store_b32 off, v32, s32 offset:128
 ; GISEL-NEXT:    scratch_store_b32 off, v33, s32 offset:132
 ; GISEL-NEXT:    scratch_store_b32 off, v34, s32 offset:136
@@ -5059,7 +5059,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ
 ; GISEL-NEXT:    scratch_store_b32 off, v85, s32 offset:244
 ; GISEL-NEXT:    scratch_store_b32 off, v86, s32 offset:248
 ; GISEL-NEXT:    scratch_store_b32 off, v87, s32 offset:252
-; GISEL-NEXT:    s_clause 0x1f
+; GISEL-NEXT:    s_clause 0x1f ; 128-byte Folded Spill
 ; GISEL-NEXT:    scratch_store_b32 off, v96, s32 offset:256
 ; GISEL-NEXT:    scratch_store_b32 off, v97, s32 offset:260
 ; GISEL-NEXT:    scratch_store_b32 off, v98, s32 offset:264
@@ -5092,7 +5092,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ
 ; GISEL-NEXT:    scratch_store_b32 off, v149, s32 offset:372
 ; GISEL-NEXT:    scratch_store_b32 off, v150, s32 offset:376
 ; GISEL-NEXT:    scratch_store_b32 off, v151, s32 offset:380
-; GISEL-NEXT:    s_clause 0x1f
+; GISEL-NEXT:    s_clause 0x1f ; 128-byte Folded Spill
 ; GISEL-NEXT:    scratch_store_b32 off, v160, s32 offset:384
 ; GISEL-NEXT:    scratch_store_b32 off, v161, s32 offset:388
 ; GISEL-NEXT:    scratch_store_b32 off, v162, s32 offset:392
@@ -5125,7 +5125,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ
 ; GISEL-NEXT:    scratch_store_b32 off, v213, s32 offset:500
 ; GISEL-NEXT:    scratch_store_b32 off, v214, s32 offset:504
 ; GISEL-NEXT:    scratch_store_b32 off, v215, s32 offset:508
-; GISEL-NEXT:    s_clause 0xf
+; GISEL-NEXT:    s_clause 0xf ; 64-byte Folded Spill
 ; GISEL-NEXT:    scratch_store_b32 off, v224, s32 offset:512
 ; GISEL-NEXT:    scratch_store_b32 off, v225, s32 offset:516
 ; GISEL-NEXT:    scratch_store_b32 off, v226, s32 offset:520
@@ -5149,7 +5149,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ
 ; GISEL-NEXT:    s_mov_b32 s37, gfx_callee at abs32@hi
 ; GISEL-NEXT:    s_wait_alu 0xfffe
 ; GISEL-NEXT:    s_xor_b32 exec_lo, s0, -1
-; GISEL-NEXT:    s_clause 0x1f
+; GISEL-NEXT:    s_clause 0x1f ; 128-byte Folded Reload
 ; GISEL-NEXT:    scratch_load_b32 v0, off, s32
 ; GISEL-NEXT:    scratch_load_b32 v1, off, s32 offset:4
 ; GISEL-NEXT:    scratch_load_b32 v2, off, s32 offset:8
@@ -5182,7 +5182,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ
 ; GISEL-NEXT:    scratch_load_b32 v29, off, s32 offset:116
 ; GISEL-NEXT:    scratch_load_b32 v30, off, s32 offset:120
 ; GISEL-NEXT:    scratch_load_b32 v31, off, s32 offset:124
-; GISEL-NEXT:    s_clause 0x1f
+; GISEL-NEXT:    s_clause 0x1f ; 128-byte Folded Reload
 ; GISEL-NEXT:    scratch_load_b32 v32, off, s32 offset:128
 ; GISEL-NEXT:    scratch_load_b32 v33, off, s32 offset:132
 ; GISEL-NEXT:    scratch_load_b32 v34, off, s32 offset:136
@@ -5215,7 +5215,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ
 ; GISEL-NEXT:    scratch_load_b32 v85, off, s32 offset:244
 ; GISEL-NEXT:    scratch_load_b32 v86, off, s32 offset:248
 ; GISEL-NEXT:    scratch_load_b32 v87, off, s32 offset:252
-; GISEL-NEXT:    s_clause 0x1f
+; GISEL-NEXT:    s_clause 0x1f ; 128-byte Folded Reload
 ; GISEL-NEXT:    scratch_load_b32 v96, off, s32 offset:256
 ; GISEL-NEXT:    scratch_load_b32 v97, off, s32 offset:260
 ; GISEL-NEXT:    scratch_load_b32 v98, off, s32 offset:264
@@ -5248,7 +5248,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ
 ; GISEL-NEXT:    scratch_load_b32 v149, off, s32 offset:372
 ; GISEL-NEXT:    scratch_load_b32 v150, off, s32 offset:376
 ; GISEL-NEXT:    scratch_load_b32 v151, off, s32 offset:380
-; GISEL-NEXT:    s_clause 0x1f
+; GISEL-NEXT:    s_clause 0x1f ; 128-byte Folded Reload
 ; GISEL-NEXT:    scratch_load_b32 v160, off, s32 offset:384
 ; GISEL-NEXT:    scratch_load_b32 v161, off, s32 offset:388
 ; GISEL-NEXT:    scratch_load_b32 v162, off, s32 offset:392
@@ -5281,7 +5281,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ
 ; GISEL-NEXT:    scratch_load_b32 v213, off, s32 offset:500
 ; GISEL-NEXT:    scratch_load_b32 v214, off, s32 offset:504
 ; GISEL-NEXT:    scratch_load_b32 v215, off, s32 offset:508
-; GISEL-NEXT:    s_clause 0xf
+; GISEL-NEXT:    s_clause 0xf ; 64-byte Folded Reload
 ; GISEL-NEXT:    scratch_load_b32 v224, off, s32 offset:512
 ; GISEL-NEXT:    scratch_load_b32 v225, off, s32 offset:516
 ; GISEL-NEXT:    scratch_load_b32 v226, off, s32 offset:520
@@ -5309,7 +5309,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ
 ; DAGISEL64-NEXT:    s_wait_bvhcnt 0x0
 ; DAGISEL64-NEXT:    s_wait_kmcnt 0x0
 ; DAGISEL64-NEXT:    s_xor_saveexec_b64 s[0:1], -1
-; DAGISEL64-NEXT:    s_clause 0x1f
+; DAGISEL64-NEXT:    s_clause 0x1f ; 128-byte Folded Spill
 ; DAGISEL64-NEXT:    scratch_store_b32 off, v0, s32
 ; DAGISEL64-NEXT:    scratch_store_b32 off, v1, s32 offset:4
 ; DAGISEL64-NEXT:    scratch_store_b32 off, v2, s32 offset:8
@@ -5342,7 +5342,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ
 ; DAGISEL64-NEXT:    scratch_store_b32 off, v29, s32 offset:116
 ; DAGISEL64-NEXT:    scratch_store_b32 off, v30, s32 offset:120
 ; DAGISEL64-NEXT:    scratch_store_b32 off, v31, s32 offset:124
-; DAGISEL64-NEXT:    s_clause 0x1f
+; DAGISEL64-NEXT:    s_clause 0x1f ; 128-byte Folded Spill
 ; DAGISEL64-NEXT:    scratch_store_b32 off, v32, s32 offset:128
 ; DAGISEL64-NEXT:    scratch_store_b32 off, v33, s32 offset:132
 ; DAGISEL64-NEXT:    scratch_store_b32 off, v34, s32 offset:136
@@ -5375,7 +5375,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ
 ; DAGISEL64-NEXT:    scratch_store_b32 off, v85, s32 offset:244
 ; DAGISEL64-NEXT:    scratch_store_b32 off, v86, s32 offset:248
 ; DAGISEL64-NEXT:    scratch_store_b32 off, v87, s32 offset:252
-; DAGISEL64-NEXT:    s_clause 0x1f
+; DAGISEL64-NEXT:    s_clause 0x1f ; 128-byte Folded Spill
 ; DAGISEL64-NEXT:    scratch_store_b32 off, v96, s32 offset:256
 ; DAGISEL64-NEXT:    scratch_store_b32 off, v97, s32 offset:260
 ; DAGISEL64-NEXT:    scratch_store_b32 off, v98, s32 offset:264
@@ -5408,7 +5408,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ
 ; DAGISEL64-NEXT:    scratch_store_b32 off, v149, s32 offset:372
 ; DAGISEL64-NEXT:    scratch_store_b32 off, v150, s32 offset:376
 ; DAGISEL64-NEXT:    scratch_store_b32 off, v151, s32 offset:380
-; DAGISEL64-NEXT:    s_clause 0x1f
+; DAGISEL64-NEXT:    s_clause 0x1f ; 128-byte Folded Spill
 ; DAGISEL64-NEXT:    scratch_store_b32 off, v160, s32 offset:384
 ; DAGISEL64-NEXT:    scratch_store_b32 off, v161, s32 offset:388
 ; DAGISEL64-NEXT:    scratch_store_b32 off, v162, s32 offset:392
@@ -5441,7 +5441,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ
 ; DAGISEL64-NEXT:    scratch_store_b32 off, v213, s32 offset:500
 ; DAGISEL64-NEXT:    scratch_store_b32 off, v214, s32 offset:504
 ; DAGISEL64-NEXT:    scratch_store_b32 off, v215, s32 offset:508
-; DAGISEL64-NEXT:    s_clause 0xf
+; DAGISEL64-NEXT:    s_clause 0xf ; 64-byte Folded Spill
 ; DAGISEL64-NEXT:    scratch_store_b32 off, v224, s32 offset:512
 ; DAGISEL64-NEXT:    scratch_store_b32 off, v225, s32 offset:516
 ; DAGISEL64-NEXT:    scratch_store_b32 off, v226, s32 offset:520
@@ -5465,7 +5465,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ
 ; DAGISEL64-NEXT:    v_swap_b32 v0, v1
 ; DAGISEL64-NEXT:    s_wait_alu 0xfffe
 ; DAGISEL64-NEXT:    s_xor_b64 exec, s[0:1], -1
-; DAGISEL64-NEXT:    s_clause 0x1f
+; DAGISEL64-NEXT:    s_clause 0x1f ; 128-byte Folded Reload
 ; DAGISEL64-NEXT:    scratch_load_b32 v0, off, s32
 ; DAGISEL64-NEXT:    scratch_load_b32 v1, off, s32 offset:4
 ; DAGISEL64-NEXT:    scratch_load_b32 v2, off, s32 offset:8
@@ -5498,7 +5498,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ
 ; DAGISEL64-NEXT:    scratch_load_b32 v29, off, s32 offset:116
 ; DAGISEL64-NEXT:    scratch_load_b32 v30, off, s32 offset:120
 ; DAGISEL64-NEXT:    scratch_load_b32 v31, off, s32 offset:124
-; DAGISEL64-NEXT:    s_clause 0x1f
+; DAGISEL64-NEXT:    s_clause 0x1f ; 128-byte Folded Reload
 ; DAGISEL64-NEXT:    scratch_load_b32 v32, off, s32 offset:128
 ; DAGISEL64-NEXT:    scratch_load_b32 v33, off, s32 offset:132
 ; DAGISEL64-NEXT:    scratch_load_b32 v34, off, s32 offset:136
@@ -5531,7 +5531,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ
 ; DAGISEL64-NEXT:    scratch_load_b32 v85, off, s32 offset:244
 ; DAGISEL64-NEXT:    scratch_load_b32 v86, off, s32 offset:248
 ; DAGISEL64-NEXT:    scratch_load_b32 v87, off, s32 offset:252
-; DAGISEL64-NEXT:    s_clause 0x1f
+; DAGISEL64-NEXT:    s_clause 0x1f ; 128-byte Folded Reload
 ; DAGISEL64-NEXT:    scratch_load_b32 v96, off, s32 offset:256
 ; DAGISEL64-NEXT:    scratch_load_b32 v97, off, s32 offset:260
 ; DAGISEL64-NEXT:    scratch_load_b32 v98, off, s32 offset:264
@@ -5564,7 +5564,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ
 ; DAGISEL64-NEXT:    scratch_load_b32 v149, off, s32 offset:372
 ; DAGISEL64-NEXT:    scratch_load_b32 v150, off, s32 offset:376
 ; DAGISEL64-NEXT:    scratch_load_b32 v151, off, s32 offset:380
-; DAGISEL64-NEXT:    s_clause 0x1f
+; DAGISEL64-NEXT:    s_clause 0x1f ; 128-byte Folded Reload
 ; DAGISEL64-NEXT:    scratch_load_b32 v160, off, s32 offset:384
 ; DAGISEL64-NEXT:    scratch_load_b32 v161, off, s32 offset:388
 ; DAGISEL64-NEXT:    scratch_load_b32 v162, off, s32 offset:392
@@ -5597,7 +5597,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ
 ; DAGISEL64-NEXT:    scratch_load_b32 v213, off, s32 offset:500
 ; DAGISEL64-NEXT:    scratch_load_b32 v214, off, s32 offset:504
 ; DAGISEL64-NEXT:    scratch_load_b32 v215, off, s32 offset:508
-; DAGISEL64-NEXT:    s_clause 0xf
+; DAGISEL64-NEXT:    s_clause 0xf ; 64-byte Folded Reload
 ; DAGISEL64-NEXT:    scratch_load_b32 v224, off, s32 offset:512
 ; DAGISEL64-NEXT:    scratch_load_b32 v225, off, s32 offset:516
 ; DAGISEL64-NEXT:    scratch_load_b32 v226, off, s32 offset:520
@@ -5625,7 +5625,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ
 ; GISEL64-NEXT:    s_wait_bvhcnt 0x0
 ; GISEL64-NEXT:    s_wait_kmcnt 0x0
 ; GISEL64-NEXT:    s_xor_saveexec_b64 s[0:1], -1
-; GISEL64-NEXT:    s_clause 0x1f
+; GISEL64-NEXT:    s_clause 0x1f ; 128-byte Folded Spill
 ; GISEL64-NEXT:    scratch_store_b32 off, v0, s32
 ; GISEL64-NEXT:    scratch_store_b32 off, v1, s32 offset:4
 ; GISEL64-NEXT:    scratch_store_b32 off, v2, s32 offset:8
@@ -5658,7 +5658,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ
 ; GISEL64-NEXT:    scratch_store_b32 off, v29, s32 offset:116
 ; GISEL64-NEXT:    scratch_store_b32 off, v30, s32 offset:120
 ; GISEL64-NEXT:    scratch_store_b32 off, v31, s32 offset:124
-; GISEL64-NEXT:    s_clause 0x1f
+; GISEL64-NEXT:    s_clause 0x1f ; 128-byte Folded Spill
 ; GISEL64-NEXT:    scratch_store_b32 off, v32, s32 offset:128
 ; GISEL64-NEXT:    scratch_store_b32 off, v33, s32 offset:132
 ; GISEL64-NEXT:    scratch_store_b32 off, v34, s32 offset:136
@@ -5691,7 +5691,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ
 ; GISEL64-NEXT:    scratch_store_b32 off, v85, s32 offset:244
 ; GISEL64-NEXT:    scratch_store_b32 off, v86, s32 offset:248
 ; GISEL64-NEXT:    scratch_store_b32 off, v87, s32 offset:252
-; GISEL64-NEXT:    s_clause 0x1f
+; GISEL64-NEXT:    s_clause 0x1f ; 128-byte Folded Spill
 ; GISEL64-NEXT:    scratch_store_b32 off, v96, s32 offset:256
 ; GISEL64-NEXT:    scratch_store_b32 off, v97, s32 offset:260
 ; GISEL64-NEXT:    scratch_store_b32 off, v98, s32 offset:264
@@ -5724,7 +5724,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ
 ; GISEL64-NEXT:    scratch_store_b32 off, v149, s32 offset:372
 ; GISEL64-NEXT:    scratch_store_b32 off, v150, s32 offset:376
 ; GISEL64-NEXT:    scratch_store_b32 off, v151, s32 offset:380
-; GISEL64-NEXT:    s_clause 0x1f
+; GISEL64-NEXT:    s_clause 0x1f ; 128-byte Folded Spill
 ; GISEL64-NEXT:    scratch_store_b32 off, v160, s32 offset:384
 ; GISEL64-NEXT:    scratch_store_b32 off, v161, s32 offset:388
 ; GISEL64-NEXT:    scratch_store_b32 off, v162, s32 offset:392
@@ -5757,7 +5757,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ
 ; GISEL64-NEXT:    scratch_store_b32 off, v213, s32 offset:500
 ; GISEL64-NEXT:    scratch_store_b32 off, v214, s32 offset:504
 ; GISEL64-NEXT:    scratch_store_b32 off, v215, s32 offset:508
-; GISEL64-NEXT:    s_clause 0xf
+; GISEL64-NEXT:    s_clause 0xf ; 64-byte Folded Spill
 ; GISEL64-NEXT:    scratch_store_b32 off, v224, s32 offset:512
 ; GISEL64-NEXT:    scratch_store_b32 off, v225, s32 offset:516
 ; GISEL64-NEXT:    scratch_store_b32 off, v226, s32 offset:520
@@ -5781,7 +5781,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ
 ; GISEL64-NEXT:    s_mov_b32 s37, gfx_callee at abs32@hi
 ; GISEL64-NEXT:    s_wait_alu 0xfffe
 ; GISEL64-NEXT:    s_xor_b64 exec, s[0:1], -1
-; GISEL64-NEXT:    s_clause 0x1f
+; GISEL64-NEXT:    s_clause 0x1f ; 128-byte Folded Reload
 ; GISEL64-NEXT:    scratch_load_b32 v0, off, s32
 ; GISEL64-NEXT:    scratch_load_b32 v1, off, s32 offset:4
 ; GISEL64-NEXT:    scratch_load_b32 v2, off, s32 offset:8
@@ -5814,7 +5814,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ
 ; GISEL64-NEXT:    scratch_load_b32 v29, off, s32 offset:116
 ; GISEL64-NEXT:    scratch_load_b32 v30, off, s32 offset:120
 ; GISEL64-NEXT:    scratch_load_b32 v31, off, s32 offset:124
-; GISEL64-NEXT:    s_clause 0x1f
+; GISEL64-NEXT:    s_clause 0x1f ; 128-byte Folded Reload
 ; GISEL64-NEXT:    scratch_load_b32 v32, off, s32 offset:128
 ; GISEL64-NEXT:    scratch_load_b32 v33, off, s32 offset:132
 ; GISEL64-NEXT:    scratch_load_b32 v34, off, s32 offset:136
@@ -5847,7 +5847,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ
 ; GISEL64-NEXT:    scratch_load_b32 v85, off, s32 offset:244
 ; GISEL64-NEXT:    scratch_load_b32 v86, off, s32 offset:248
 ; GISEL64-NEXT:    scratch_load_b32 v87, off, s32 offset:252
-; GISEL64-NEXT:    s_clause 0x1f
+; GISEL64-NEXT:    s_clause 0x1f ; 128-byte Folded Reload
 ; GISEL64-NEXT:    scratch_load_b32 v96, off, s32 offset:256
 ; GISEL64-NEXT:    scratch_load_b32 v97, off, s32 offset:260
 ; GISEL64-NEXT:    scratch_load_b32 v98, off, s32 offset:264
@@ -5880,7 +5880,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ
 ; GISEL64-NEXT:    scratch_load_b32 v149, off, s32 offset:372
 ; GISEL64-NEXT:    scratch_load_b32 v150, off, s32 offset:376
 ; GISEL64-NEXT:    scratch_load_b32 v151, off, s32 offset:380
-; GISEL64-NEXT:    s_clause 0x1f
+; GISEL64-NEXT:    s_clause 0x1f ; 128-byte Folded Reload
 ; GISEL64-NEXT:    scratch_load_b32 v160, off, s32 offset:384
 ; GISEL64-NEXT:    scratch_load_b32 v161, off, s32 offset:388
 ; GISEL64-NEXT:    scratch_load_b32 v162, off, s32 offset:392
@@ -5913,7 +5913,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ
 ; GISEL64-NEXT:    scratch_load_b32 v213, off, s32 offset:500
 ; GISEL64-NEXT:    scratch_load_b32 v214, off, s32 offset:504
 ; GISEL64-NEXT:    scratch_load_b32 v215, off, s32 offset:508
-; GISEL64-NEXT:    s_clause 0xf
+; GISEL64-NEXT:    s_clause 0xf ; 64-byte Folded Reload
 ; GISEL64-NEXT:    scratch_load_b32 v224, off, s32 offset:512
 ; GISEL64-NEXT:    scratch_load_b32 v225, off, s32 offset:516
 ; GISEL64-NEXT:    scratch_load_b32 v226, off, s32 offset:520
@@ -5938,7 +5938,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ
 ; GFX1250-DAGISEL-NEXT:    s_wait_loadcnt_dscnt 0x0
 ; GFX1250-DAGISEL-NEXT:    s_wait_kmcnt 0x0
 ; GFX1250-DAGISEL-NEXT:    s_xor_saveexec_b32 s0, -1
-; GFX1250-DAGISEL-NEXT:    s_clause 0x3e
+; GFX1250-DAGISEL-NEXT:    s_clause 0x3e ; 252-byte Folded Spill
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v0, s32
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v1, s32 offset:4
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v2, s32 offset:8
@@ -6002,7 +6002,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v84, s32 offset:240
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v85, s32 offset:244
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v86, s32 offset:248
-; GFX1250-DAGISEL-NEXT:    s_clause 0x3e
+; GFX1250-DAGISEL-NEXT:    s_clause 0x3e ; 252-byte Folded Spill
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v87, s32 offset:252
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v96, s32 offset:256
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v97, s32 offset:260
@@ -6066,7 +6066,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v211, s32 offset:492
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v212, s32 offset:496
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v213, s32 offset:500
-; GFX1250-DAGISEL-NEXT:    s_clause 0x3e
+; GFX1250-DAGISEL-NEXT:    s_clause 0x3e ; 252-byte Folded Spill
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v214, s32 offset:504
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v215, s32 offset:508
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v224, s32 offset:512
@@ -6131,7 +6131,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v42 /*v298*/, s32 offset:744
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v43 /*v299*/, s32 offset:748
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v44 /*v300*/, s32 offset:752
-; GFX1250-DAGISEL-NEXT:    s_clause 0x3e
+; GFX1250-DAGISEL-NEXT:    s_clause 0x3e ; 252-byte Folded Spill
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v45 /*v301*/, s32 offset:756
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v46 /*v302*/, s32 offset:760
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v47 /*v303*/, s32 offset:764
@@ -6195,7 +6195,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v105 /*v361*/, s32 offset:996
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v106 /*v362*/, s32 offset:1000
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v107 /*v363*/, s32 offset:1004
-; GFX1250-DAGISEL-NEXT:    s_clause 0x3e
+; GFX1250-DAGISEL-NEXT:    s_clause 0x3e ; 252-byte Folded Spill
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v108 /*v364*/, s32 offset:1008
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v109 /*v365*/, s32 offset:1012
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v110 /*v366*/, s32 offset:1016
@@ -6259,7 +6259,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v168 /*v424*/, s32 offset:1248
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v169 /*v425*/, s32 offset:1252
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v170 /*v426*/, s32 offset:1256
-; GFX1250-DAGISEL-NEXT:    s_clause 0x3e
+; GFX1250-DAGISEL-NEXT:    s_clause 0x3e ; 252-byte Folded Spill
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v171 /*v427*/, s32 offset:1260
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v172 /*v428*/, s32 offset:1264
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v173 /*v429*/, s32 offset:1268
@@ -6323,7 +6323,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v231 /*v487*/, s32 offset:1500
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v232 /*v488*/, s32 offset:1504
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v233 /*v489*/, s32 offset:1508
-; GFX1250-DAGISEL-NEXT:    s_clause 0x3e
+; GFX1250-DAGISEL-NEXT:    s_clause 0x3e ; 252-byte Folded Spill
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v234 /*v490*/, s32 offset:1512
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v235 /*v491*/, s32 offset:1516
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v236 /*v492*/, s32 offset:1520
@@ -6388,7 +6388,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v38 /*v550*/, s32 offset:1752
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v39 /*v551*/, s32 offset:1756
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v40 /*v552*/, s32 offset:1760
-; GFX1250-DAGISEL-NEXT:    s_clause 0x3e
+; GFX1250-DAGISEL-NEXT:    s_clause 0x3e ; 252-byte Folded Spill
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v41 /*v553*/, s32 offset:1764
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v42 /*v554*/, s32 offset:1768
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v43 /*v555*/, s32 offset:1772
@@ -6452,7 +6452,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v101 /*v613*/, s32 offset:2004
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v102 /*v614*/, s32 offset:2008
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v103 /*v615*/, s32 offset:2012
-; GFX1250-DAGISEL-NEXT:    s_clause 0x3e
+; GFX1250-DAGISEL-NEXT:    s_clause 0x3e ; 252-byte Folded Spill
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v104 /*v616*/, s32 offset:2016
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v105 /*v617*/, s32 offset:2020
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v106 /*v618*/, s32 offset:2024
@@ -6516,7 +6516,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v164 /*v676*/, s32 offset:2256
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v165 /*v677*/, s32 offset:2260
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v166 /*v678*/, s32 offset:2264
-; GFX1250-DAGISEL-NEXT:    s_clause 0x3e
+; GFX1250-DAGISEL-NEXT:    s_clause 0x3e ; 252-byte Folded Spill
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v167 /*v679*/, s32 offset:2268
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v168 /*v680*/, s32 offset:2272
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v169 /*v681*/, s32 offset:2276
@@ -6580,7 +6580,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v227 /*v739*/, s32 offset:2508
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v228 /*v740*/, s32 offset:2512
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v229 /*v741*/, s32 offset:2516
-; GFX1250-DAGISEL-NEXT:    s_clause 0x3e
+; GFX1250-DAGISEL-NEXT:    s_clause 0x3e ; 252-byte Folded Spill
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v230 /*v742*/, s32 offset:2520
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v231 /*v743*/, s32 offset:2524
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v232 /*v744*/, s32 offset:2528
@@ -6645,7 +6645,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v34 /*v802*/, s32 offset:2760
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v35 /*v803*/, s32 offset:2764
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v36 /*v804*/, s32 offset:2768
-; GFX1250-DAGISEL-NEXT:    s_clause 0x3e
+; GFX1250-DAGISEL-NEXT:    s_clause 0x3e ; 252-byte Folded Spill
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v37 /*v805*/, s32 offset:2772
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v38 /*v806*/, s32 offset:2776
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v39 /*v807*/, s32 offset:2780
@@ -6709,7 +6709,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v97 /*v865*/, s32 offset:3012
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v98 /*v866*/, s32 offset:3016
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v99 /*v867*/, s32 offset:3020
-; GFX1250-DAGISEL-NEXT:    s_clause 0x3e
+; GFX1250-DAGISEL-NEXT:    s_clause 0x3e ; 252-byte Folded Spill
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v100 /*v868*/, s32 offset:3024
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v101 /*v869*/, s32 offset:3028
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v102 /*v870*/, s32 offset:3032
@@ -6773,7 +6773,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v160 /*v928*/, s32 offset:3264
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v161 /*v929*/, s32 offset:3268
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v162 /*v930*/, s32 offset:3272
-; GFX1250-DAGISEL-NEXT:    s_clause 0x3e
+; GFX1250-DAGISEL-NEXT:    s_clause 0x3e ; 252-byte Folded Spill
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v163 /*v931*/, s32 offset:3276
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v164 /*v932*/, s32 offset:3280
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v165 /*v933*/, s32 offset:3284
@@ -6837,7 +6837,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v223 /*v991*/, s32 offset:3516
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v224 /*v992*/, s32 offset:3520
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v225 /*v993*/, s32 offset:3524
-; GFX1250-DAGISEL-NEXT:    s_clause 0x1d
+; GFX1250-DAGISEL-NEXT:    s_clause 0x1d ; 120-byte Folded Spill
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v226 /*v994*/, s32 offset:3528
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v227 /*v995*/, s32 offset:3532
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v228 /*v996*/, s32 offset:3536
@@ -6875,7 +6875,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ
 ; GFX1250-DAGISEL-NEXT:    s_set_vgpr_msb 0xc00 ; msbs: dst=0 src0=0 src1=0 src2=0
 ; GFX1250-DAGISEL-NEXT:    v_swap_b32 v0, v1
 ; GFX1250-DAGISEL-NEXT:    s_xor_b32 exec_lo, s0, -1
-; GFX1250-DAGISEL-NEXT:    s_clause 0x3e
+; GFX1250-DAGISEL-NEXT:    s_clause 0x3e ; 252-byte Folded Reload
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v0, off, s32
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v1, off, s32 offset:4
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v2, off, s32 offset:8
@@ -6939,7 +6939,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v84, off, s32 offset:240
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v85, off, s32 offset:244
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v86, off, s32 offset:248
-; GFX1250-DAGISEL-NEXT:    s_clause 0x3e
+; GFX1250-DAGISEL-NEXT:    s_clause 0x3e ; 252-byte Folded Reload
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v87, off, s32 offset:252
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v96, off, s32 offset:256
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v97, off, s32 offset:260
@@ -7003,7 +7003,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v211, off, s32 offset:492
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v212, off, s32 offset:496
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v213, off, s32 offset:500
-; GFX1250-DAGISEL-NEXT:    s_clause 0x3e
+; GFX1250-DAGISEL-NEXT:    s_clause 0x3e ; 252-byte Folded Reload
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v214, off, s32 offset:504
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v215, off, s32 offset:508
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v224, off, s32 offset:512
@@ -7068,7 +7068,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v42 /*v298*/, off, s32 offset:744
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v43 /*v299*/, off, s32 offset:748
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v44 /*v300*/, off, s32 offset:752
-; GFX1250-DAGISEL-NEXT:    s_clause 0x3e
+; GFX1250-DAGISEL-NEXT:    s_clause 0x3e ; 252-byte Folded Reload
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v45 /*v301*/, off, s32 offset:756
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v46 /*v302*/, off, s32 offset:760
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v47 /*v303*/, off, s32 offset:764
@@ -7132,7 +7132,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v105 /*v361*/, off, s32 offset:996
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v106 /*v362*/, off, s32 offset:1000
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v107 /*v363*/, off, s32 offset:1004
-; GFX1250-DAGISEL-NEXT:    s_clause 0x3e
+; GFX1250-DAGISEL-NEXT:    s_clause 0x3e ; 252-byte Folded Reload
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v108 /*v364*/, off, s32 offset:1008
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v109 /*v365*/, off, s32 offset:1012
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v110 /*v366*/, off, s32 offset:1016
@@ -7196,7 +7196,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v168 /*v424*/, off, s32 offset:1248
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v169 /*v425*/, off, s32 offset:1252
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v170 /*v426*/, off, s32 offset:1256
-; GFX1250-DAGISEL-NEXT:    s_clause 0x3e
+; GFX1250-DAGISEL-NEXT:    s_clause 0x3e ; 252-byte Folded Reload
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v171 /*v427*/, off, s32 offset:1260
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v172 /*v428*/, off, s32 offset:1264
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v173 /*v429*/, off, s32 offset:1268
@@ -7260,7 +7260,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v231 /*v487*/, off, s32 offset:1500
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v232 /*v488*/, off, s32 offset:1504
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v233 /*v489*/, off, s32 offset:1508
-; GFX1250-DAGISEL-NEXT:    s_clause 0x3e
+; GFX1250-DAGISEL-NEXT:    s_clause 0x3e ; 252-byte Folded Reload
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v234 /*v490*/, off, s32 offset:1512
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v235 /*v491*/, off, s32 offset:1516
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v236 /*v492*/, off, s32 offset:1520
@@ -7325,7 +7325,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v38 /*v550*/, off, s32 offset:1752
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v39 /*v551*/, off, s32 offset:1756
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v40 /*v552*/, off, s32 offset:1760
-; GFX1250-DAGISEL-NEXT:    s_clause 0x3e
+; GFX1250-DAGISEL-NEXT:    s_clause 0x3e ; 252-byte Folded Reload
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v41 /*v553*/, off, s32 offset:1764
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v42 /*v554*/, off, s32 offset:1768
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v43 /*v555*/, off, s32 offset:1772
@@ -7389,7 +7389,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v101 /*v613*/, off, s32 offset:2004
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v102 /*v614*/, off, s32 offset:2008
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v103 /*v615*/, off, s32 offset:2012
-; GFX1250-DAGISEL-NEXT:    s_clause 0x3e
+; GFX1250-DAGISEL-NEXT:    s_clause 0x3e ; 252-byte Folded Reload
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v104 /*v616*/, off, s32 offset:2016
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v105 /*v617*/, off, s32 offset:2020
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v106 /*v618*/, off, s32 offset:2024
@@ -7453,7 +7453,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v164 /*v676*/, off, s32 offset:2256
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v165 /*v677*/, off, s32 offset:2260
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v166 /*v678*/, off, s32 offset:2264
-; GFX1250-DAGISEL-NEXT:    s_clause 0x3e
+; GFX1250-DAGISEL-NEXT:    s_clause 0x3e ; 252-byte Folded Reload
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v167 /*v679*/, off, s32 offset:2268
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v168 /*v680*/, off, s32 offset:2272
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v169 /*v681*/, off, s32 offset:2276
@@ -7517,7 +7517,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v227 /*v739*/, off, s32 offset:2508
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v228 /*v740*/, off, s32 offset:2512
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v229 /*v741*/, off, s32 offset:2516
-; GFX1250-DAGISEL-NEXT:    s_clause 0x3e
+; GFX1250-DAGISEL-NEXT:    s_clause 0x3e ; 252-byte Folded Reload
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v230 /*v742*/, off, s32 offset:2520
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v231 /*v743*/, off, s32 offset:2524
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v232 /*v744*/, off, s32 offset:2528
@@ -7582,7 +7582,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v34 /*v802*/, off, s32 offset:2760
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v35 /*v803*/, off, s32 offset:2764
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v36 /*v804*/, off, s32 offset:2768
-; GFX1250-DAGISEL-NEXT:    s_clause 0x3e
+; GFX1250-DAGISEL-NEXT:    s_clause 0x3e ; 252-byte Folded Reload
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v37 /*v805*/, off, s32 offset:2772
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v38 /*v806*/, off, s32 offset:2776
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v39 /*v807*/, off, s32 offset:2780
@@ -7646,7 +7646,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v97 /*v865*/, off, s32 offset:3012
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v98 /*v866*/, off, s32 offset:3016
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v99 /*v867*/, off, s32 offset:3020
-; GFX1250-DAGISEL-NEXT:    s_clause 0x3e
+; GFX1250-DAGISEL-NEXT:    s_clause 0x3e ; 252-byte Folded Reload
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v100 /*v868*/, off, s32 offset:3024
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v101 /*v869*/, off, s32 offset:3028
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v102 /*v870*/, off, s32 offset:3032
@@ -7710,7 +7710,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v160 /*v928*/, off, s32 offset:3264
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v161 /*v929*/, off, s32 offset:3268
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v162 /*v930*/, off, s32 offset:3272
-; GFX1250-DAGISEL-NEXT:    s_clause 0x3e
+; GFX1250-DAGISEL-NEXT:    s_clause 0x3e ; 252-byte Folded Reload
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v163 /*v931*/, off, s32 offset:3276
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v164 /*v932*/, off, s32 offset:3280
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v165 /*v933*/, off, s32 offset:3284
@@ -7774,7 +7774,7 @@ define amdgpu_gfx_whole_wave <2 x half> @tail_call_gfx_from_whole_wave(i1 %activ
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v223 /*v991*/, off, s32 offset:3516
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v224 /*v992*/, off, s32 offset:3520
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v225 /*v993*/, off, s32 offset:3524
-; GFX1250-DAGISEL-NEXT:    s_clause 0x1d
+; GFX1250-DAGISEL-NEXT:    s_clause 0x1d ; 120-byte Folded Reload
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v226 /*v994*/, off, s32 offset:3528
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v227 /*v995*/, off, s32 offset:3532
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v228 /*v996*/, off, s32 offset:3536
@@ -7860,8 +7860,9 @@ define amdgpu_cs void @call_from_entry(<8 x float> %x, ptr %p) {
 ;
 ; GFX1250-DAGISEL-LABEL: call_from_entry:
 ; GFX1250-DAGISEL:       ; %bb.0:
-; GFX1250-DAGISEL-NEXT:    s_mov_b64 s[0:1], callee at abs64
 ; GFX1250-DAGISEL-NEXT:    s_mov_b32 s32, 0
+; GFX1250-DAGISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
+; GFX1250-DAGISEL-NEXT:    s_mov_b64 s[0:1], callee at abs64
 ; GFX1250-DAGISEL-NEXT:    v_dual_mov_b32 v41, v9 :: v_dual_mov_b32 v40, v8
 ; GFX1250-DAGISEL-NEXT:    s_swap_pc_i64 s[30:31], s[0:1]
 ; GFX1250-DAGISEL-NEXT:    flat_store_b32 v[40:41], v0
@@ -7882,7 +7883,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float>
 ; DAGISEL-NEXT:    s_mov_b32 s0, s33
 ; DAGISEL-NEXT:    s_mov_b32 s33, s32
 ; DAGISEL-NEXT:    s_xor_saveexec_b32 s4, -1
-; DAGISEL-NEXT:    s_clause 0x1f
+; DAGISEL-NEXT:    s_clause 0x1f ; 128-byte Folded Spill
 ; DAGISEL-NEXT:    scratch_store_b32 off, v0, s33 offset:4
 ; DAGISEL-NEXT:    scratch_store_b32 off, v1, s33 offset:8
 ; DAGISEL-NEXT:    scratch_store_b32 off, v2, s33 offset:12
@@ -7915,7 +7916,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float>
 ; DAGISEL-NEXT:    scratch_store_b32 off, v29, s33 offset:120
 ; DAGISEL-NEXT:    scratch_store_b32 off, v30, s33 offset:124
 ; DAGISEL-NEXT:    scratch_store_b32 off, v31, s33 offset:128
-; DAGISEL-NEXT:    s_clause 0x1f
+; DAGISEL-NEXT:    s_clause 0x1f ; 128-byte Folded Spill
 ; DAGISEL-NEXT:    scratch_store_b32 off, v32, s33 offset:132
 ; DAGISEL-NEXT:    scratch_store_b32 off, v33, s33 offset:136
 ; DAGISEL-NEXT:    scratch_store_b32 off, v34, s33 offset:140
@@ -7948,7 +7949,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float>
 ; DAGISEL-NEXT:    scratch_store_b32 off, v85, s33 offset:256
 ; DAGISEL-NEXT:    scratch_store_b32 off, v86, s33 offset:260
 ; DAGISEL-NEXT:    scratch_store_b32 off, v87, s33 offset:264
-; DAGISEL-NEXT:    s_clause 0x1f
+; DAGISEL-NEXT:    s_clause 0x1f ; 128-byte Folded Spill
 ; DAGISEL-NEXT:    scratch_store_b32 off, v96, s33 offset:268
 ; DAGISEL-NEXT:    scratch_store_b32 off, v97, s33 offset:272
 ; DAGISEL-NEXT:    scratch_store_b32 off, v98, s33 offset:276
@@ -7981,7 +7982,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float>
 ; DAGISEL-NEXT:    scratch_store_b32 off, v149, s33 offset:384
 ; DAGISEL-NEXT:    scratch_store_b32 off, v150, s33 offset:388
 ; DAGISEL-NEXT:    scratch_store_b32 off, v151, s33 offset:392
-; DAGISEL-NEXT:    s_clause 0x1f
+; DAGISEL-NEXT:    s_clause 0x1f ; 128-byte Folded Spill
 ; DAGISEL-NEXT:    scratch_store_b32 off, v160, s33 offset:396
 ; DAGISEL-NEXT:    scratch_store_b32 off, v161, s33 offset:400
 ; DAGISEL-NEXT:    scratch_store_b32 off, v162, s33 offset:404
@@ -8014,7 +8015,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float>
 ; DAGISEL-NEXT:    scratch_store_b32 off, v213, s33 offset:512
 ; DAGISEL-NEXT:    scratch_store_b32 off, v214, s33 offset:516
 ; DAGISEL-NEXT:    scratch_store_b32 off, v215, s33 offset:520
-; DAGISEL-NEXT:    s_clause 0xf
+; DAGISEL-NEXT:    s_clause 0xf ; 64-byte Folded Spill
 ; DAGISEL-NEXT:    scratch_store_b32 off, v224, s33 offset:524
 ; DAGISEL-NEXT:    scratch_store_b32 off, v225, s33 offset:528
 ; DAGISEL-NEXT:    scratch_store_b32 off, v226, s33 offset:532
@@ -8032,7 +8033,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float>
 ; DAGISEL-NEXT:    scratch_store_b32 off, v246, s33 offset:580
 ; DAGISEL-NEXT:    scratch_store_b32 off, v247, s33 offset:584
 ; DAGISEL-NEXT:    s_mov_b32 exec_lo, -1
-; DAGISEL-NEXT:    s_clause 0x2
+; DAGISEL-NEXT:    s_clause 0x2 ; 12-byte Folded Spill
 ; DAGISEL-NEXT:    scratch_store_b32 off, v42, s33
 ; DAGISEL-NEXT:    scratch_store_b32 off, v40, s33 offset:164
 ; DAGISEL-NEXT:    scratch_store_b32 off, v41, s33 offset:168
@@ -8052,13 +8053,13 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float>
 ; DAGISEL-NEXT:    v_readlane_b32 s30, v42, 1
 ; DAGISEL-NEXT:    v_readlane_b32 s4, v42, 0
 ; DAGISEL-NEXT:    v_readlane_b32 s0, v42, 3
-; DAGISEL-NEXT:    s_clause 0x2
+; DAGISEL-NEXT:    s_clause 0x2 ; 12-byte Folded Reload
 ; DAGISEL-NEXT:    scratch_load_b32 v42, off, s33
 ; DAGISEL-NEXT:    scratch_load_b32 v40, off, s33 offset:164
 ; DAGISEL-NEXT:    scratch_load_b32 v41, off, s33 offset:168
 ; DAGISEL-NEXT:    s_mov_b32 s32, s33
 ; DAGISEL-NEXT:    s_xor_b32 exec_lo, s4, -1
-; DAGISEL-NEXT:    s_clause 0x1f
+; DAGISEL-NEXT:    s_clause 0x1f ; 128-byte Folded Reload
 ; DAGISEL-NEXT:    scratch_load_b32 v0, off, s33 offset:4
 ; DAGISEL-NEXT:    scratch_load_b32 v1, off, s33 offset:8
 ; DAGISEL-NEXT:    scratch_load_b32 v2, off, s33 offset:12
@@ -8091,7 +8092,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float>
 ; DAGISEL-NEXT:    scratch_load_b32 v29, off, s33 offset:120
 ; DAGISEL-NEXT:    scratch_load_b32 v30, off, s33 offset:124
 ; DAGISEL-NEXT:    scratch_load_b32 v31, off, s33 offset:128
-; DAGISEL-NEXT:    s_clause 0x1f
+; DAGISEL-NEXT:    s_clause 0x1f ; 128-byte Folded Reload
 ; DAGISEL-NEXT:    scratch_load_b32 v32, off, s33 offset:132
 ; DAGISEL-NEXT:    scratch_load_b32 v33, off, s33 offset:136
 ; DAGISEL-NEXT:    scratch_load_b32 v34, off, s33 offset:140
@@ -8124,7 +8125,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float>
 ; DAGISEL-NEXT:    scratch_load_b32 v85, off, s33 offset:256
 ; DAGISEL-NEXT:    scratch_load_b32 v86, off, s33 offset:260
 ; DAGISEL-NEXT:    scratch_load_b32 v87, off, s33 offset:264
-; DAGISEL-NEXT:    s_clause 0x1f
+; DAGISEL-NEXT:    s_clause 0x1f ; 128-byte Folded Reload
 ; DAGISEL-NEXT:    scratch_load_b32 v96, off, s33 offset:268
 ; DAGISEL-NEXT:    scratch_load_b32 v97, off, s33 offset:272
 ; DAGISEL-NEXT:    scratch_load_b32 v98, off, s33 offset:276
@@ -8157,7 +8158,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float>
 ; DAGISEL-NEXT:    scratch_load_b32 v149, off, s33 offset:384
 ; DAGISEL-NEXT:    scratch_load_b32 v150, off, s33 offset:388
 ; DAGISEL-NEXT:    scratch_load_b32 v151, off, s33 offset:392
-; DAGISEL-NEXT:    s_clause 0x1f
+; DAGISEL-NEXT:    s_clause 0x1f ; 128-byte Folded Reload
 ; DAGISEL-NEXT:    scratch_load_b32 v160, off, s33 offset:396
 ; DAGISEL-NEXT:    scratch_load_b32 v161, off, s33 offset:400
 ; DAGISEL-NEXT:    scratch_load_b32 v162, off, s33 offset:404
@@ -8190,7 +8191,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float>
 ; DAGISEL-NEXT:    scratch_load_b32 v213, off, s33 offset:512
 ; DAGISEL-NEXT:    scratch_load_b32 v214, off, s33 offset:516
 ; DAGISEL-NEXT:    scratch_load_b32 v215, off, s33 offset:520
-; DAGISEL-NEXT:    s_clause 0xf
+; DAGISEL-NEXT:    s_clause 0xf ; 64-byte Folded Reload
 ; DAGISEL-NEXT:    scratch_load_b32 v224, off, s33 offset:524
 ; DAGISEL-NEXT:    scratch_load_b32 v225, off, s33 offset:528
 ; DAGISEL-NEXT:    scratch_load_b32 v226, off, s33 offset:532
@@ -8223,7 +8224,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float>
 ; GISEL-NEXT:    s_mov_b32 s0, s33
 ; GISEL-NEXT:    s_mov_b32 s33, s32
 ; GISEL-NEXT:    s_xor_saveexec_b32 s4, -1
-; GISEL-NEXT:    s_clause 0x1f
+; GISEL-NEXT:    s_clause 0x1f ; 128-byte Folded Spill
 ; GISEL-NEXT:    scratch_store_b32 off, v0, s33 offset:4
 ; GISEL-NEXT:    scratch_store_b32 off, v1, s33 offset:8
 ; GISEL-NEXT:    scratch_store_b32 off, v2, s33 offset:12
@@ -8256,7 +8257,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float>
 ; GISEL-NEXT:    scratch_store_b32 off, v29, s33 offset:120
 ; GISEL-NEXT:    scratch_store_b32 off, v30, s33 offset:124
 ; GISEL-NEXT:    scratch_store_b32 off, v31, s33 offset:128
-; GISEL-NEXT:    s_clause 0x1f
+; GISEL-NEXT:    s_clause 0x1f ; 128-byte Folded Spill
 ; GISEL-NEXT:    scratch_store_b32 off, v32, s33 offset:132
 ; GISEL-NEXT:    scratch_store_b32 off, v33, s33 offset:136
 ; GISEL-NEXT:    scratch_store_b32 off, v34, s33 offset:140
@@ -8289,7 +8290,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float>
 ; GISEL-NEXT:    scratch_store_b32 off, v85, s33 offset:256
 ; GISEL-NEXT:    scratch_store_b32 off, v86, s33 offset:260
 ; GISEL-NEXT:    scratch_store_b32 off, v87, s33 offset:264
-; GISEL-NEXT:    s_clause 0x1f
+; GISEL-NEXT:    s_clause 0x1f ; 128-byte Folded Spill
 ; GISEL-NEXT:    scratch_store_b32 off, v96, s33 offset:268
 ; GISEL-NEXT:    scratch_store_b32 off, v97, s33 offset:272
 ; GISEL-NEXT:    scratch_store_b32 off, v98, s33 offset:276
@@ -8322,7 +8323,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float>
 ; GISEL-NEXT:    scratch_store_b32 off, v149, s33 offset:384
 ; GISEL-NEXT:    scratch_store_b32 off, v150, s33 offset:388
 ; GISEL-NEXT:    scratch_store_b32 off, v151, s33 offset:392
-; GISEL-NEXT:    s_clause 0x1f
+; GISEL-NEXT:    s_clause 0x1f ; 128-byte Folded Spill
 ; GISEL-NEXT:    scratch_store_b32 off, v160, s33 offset:396
 ; GISEL-NEXT:    scratch_store_b32 off, v161, s33 offset:400
 ; GISEL-NEXT:    scratch_store_b32 off, v162, s33 offset:404
@@ -8355,7 +8356,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float>
 ; GISEL-NEXT:    scratch_store_b32 off, v213, s33 offset:512
 ; GISEL-NEXT:    scratch_store_b32 off, v214, s33 offset:516
 ; GISEL-NEXT:    scratch_store_b32 off, v215, s33 offset:520
-; GISEL-NEXT:    s_clause 0xf
+; GISEL-NEXT:    s_clause 0xf ; 64-byte Folded Spill
 ; GISEL-NEXT:    scratch_store_b32 off, v224, s33 offset:524
 ; GISEL-NEXT:    scratch_store_b32 off, v225, s33 offset:528
 ; GISEL-NEXT:    scratch_store_b32 off, v226, s33 offset:532
@@ -8373,7 +8374,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float>
 ; GISEL-NEXT:    scratch_store_b32 off, v246, s33 offset:580
 ; GISEL-NEXT:    scratch_store_b32 off, v247, s33 offset:584
 ; GISEL-NEXT:    s_mov_b32 exec_lo, -1
-; GISEL-NEXT:    s_clause 0x2
+; GISEL-NEXT:    s_clause 0x2 ; 12-byte Folded Spill
 ; GISEL-NEXT:    scratch_store_b32 off, v42, s33
 ; GISEL-NEXT:    scratch_store_b32 off, v40, s33 offset:164
 ; GISEL-NEXT:    scratch_store_b32 off, v41, s33 offset:168
@@ -8393,13 +8394,13 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float>
 ; GISEL-NEXT:    v_readlane_b32 s30, v42, 1
 ; GISEL-NEXT:    v_readlane_b32 s4, v42, 0
 ; GISEL-NEXT:    v_readlane_b32 s0, v42, 3
-; GISEL-NEXT:    s_clause 0x2
+; GISEL-NEXT:    s_clause 0x2 ; 12-byte Folded Reload
 ; GISEL-NEXT:    scratch_load_b32 v42, off, s33
 ; GISEL-NEXT:    scratch_load_b32 v40, off, s33 offset:164
 ; GISEL-NEXT:    scratch_load_b32 v41, off, s33 offset:168
 ; GISEL-NEXT:    s_mov_b32 s32, s33
 ; GISEL-NEXT:    s_xor_b32 exec_lo, s4, -1
-; GISEL-NEXT:    s_clause 0x1f
+; GISEL-NEXT:    s_clause 0x1f ; 128-byte Folded Reload
 ; GISEL-NEXT:    scratch_load_b32 v0, off, s33 offset:4
 ; GISEL-NEXT:    scratch_load_b32 v1, off, s33 offset:8
 ; GISEL-NEXT:    scratch_load_b32 v2, off, s33 offset:12
@@ -8432,7 +8433,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float>
 ; GISEL-NEXT:    scratch_load_b32 v29, off, s33 offset:120
 ; GISEL-NEXT:    scratch_load_b32 v30, off, s33 offset:124
 ; GISEL-NEXT:    scratch_load_b32 v31, off, s33 offset:128
-; GISEL-NEXT:    s_clause 0x1f
+; GISEL-NEXT:    s_clause 0x1f ; 128-byte Folded Reload
 ; GISEL-NEXT:    scratch_load_b32 v32, off, s33 offset:132
 ; GISEL-NEXT:    scratch_load_b32 v33, off, s33 offset:136
 ; GISEL-NEXT:    scratch_load_b32 v34, off, s33 offset:140
@@ -8465,7 +8466,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float>
 ; GISEL-NEXT:    scratch_load_b32 v85, off, s33 offset:256
 ; GISEL-NEXT:    scratch_load_b32 v86, off, s33 offset:260
 ; GISEL-NEXT:    scratch_load_b32 v87, off, s33 offset:264
-; GISEL-NEXT:    s_clause 0x1f
+; GISEL-NEXT:    s_clause 0x1f ; 128-byte Folded Reload
 ; GISEL-NEXT:    scratch_load_b32 v96, off, s33 offset:268
 ; GISEL-NEXT:    scratch_load_b32 v97, off, s33 offset:272
 ; GISEL-NEXT:    scratch_load_b32 v98, off, s33 offset:276
@@ -8498,7 +8499,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float>
 ; GISEL-NEXT:    scratch_load_b32 v149, off, s33 offset:384
 ; GISEL-NEXT:    scratch_load_b32 v150, off, s33 offset:388
 ; GISEL-NEXT:    scratch_load_b32 v151, off, s33 offset:392
-; GISEL-NEXT:    s_clause 0x1f
+; GISEL-NEXT:    s_clause 0x1f ; 128-byte Folded Reload
 ; GISEL-NEXT:    scratch_load_b32 v160, off, s33 offset:396
 ; GISEL-NEXT:    scratch_load_b32 v161, off, s33 offset:400
 ; GISEL-NEXT:    scratch_load_b32 v162, off, s33 offset:404
@@ -8531,7 +8532,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float>
 ; GISEL-NEXT:    scratch_load_b32 v213, off, s33 offset:512
 ; GISEL-NEXT:    scratch_load_b32 v214, off, s33 offset:516
 ; GISEL-NEXT:    scratch_load_b32 v215, off, s33 offset:520
-; GISEL-NEXT:    s_clause 0xf
+; GISEL-NEXT:    s_clause 0xf ; 64-byte Folded Reload
 ; GISEL-NEXT:    scratch_load_b32 v224, off, s33 offset:524
 ; GISEL-NEXT:    scratch_load_b32 v225, off, s33 offset:528
 ; GISEL-NEXT:    scratch_load_b32 v226, off, s33 offset:532
@@ -8564,7 +8565,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float>
 ; DAGISEL64-NEXT:    s_mov_b32 s0, s33
 ; DAGISEL64-NEXT:    s_mov_b32 s33, s32
 ; DAGISEL64-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; DAGISEL64-NEXT:    s_clause 0x1f
+; DAGISEL64-NEXT:    s_clause 0x1f ; 128-byte Folded Spill
 ; DAGISEL64-NEXT:    scratch_store_b32 off, v0, s33 offset:4
 ; DAGISEL64-NEXT:    scratch_store_b32 off, v1, s33 offset:8
 ; DAGISEL64-NEXT:    scratch_store_b32 off, v2, s33 offset:12
@@ -8597,7 +8598,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float>
 ; DAGISEL64-NEXT:    scratch_store_b32 off, v29, s33 offset:120
 ; DAGISEL64-NEXT:    scratch_store_b32 off, v30, s33 offset:124
 ; DAGISEL64-NEXT:    scratch_store_b32 off, v31, s33 offset:128
-; DAGISEL64-NEXT:    s_clause 0x1f
+; DAGISEL64-NEXT:    s_clause 0x1f ; 128-byte Folded Spill
 ; DAGISEL64-NEXT:    scratch_store_b32 off, v32, s33 offset:132
 ; DAGISEL64-NEXT:    scratch_store_b32 off, v33, s33 offset:136
 ; DAGISEL64-NEXT:    scratch_store_b32 off, v34, s33 offset:140
@@ -8630,7 +8631,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float>
 ; DAGISEL64-NEXT:    scratch_store_b32 off, v85, s33 offset:256
 ; DAGISEL64-NEXT:    scratch_store_b32 off, v86, s33 offset:260
 ; DAGISEL64-NEXT:    scratch_store_b32 off, v87, s33 offset:264
-; DAGISEL64-NEXT:    s_clause 0x1f
+; DAGISEL64-NEXT:    s_clause 0x1f ; 128-byte Folded Spill
 ; DAGISEL64-NEXT:    scratch_store_b32 off, v96, s33 offset:268
 ; DAGISEL64-NEXT:    scratch_store_b32 off, v97, s33 offset:272
 ; DAGISEL64-NEXT:    scratch_store_b32 off, v98, s33 offset:276
@@ -8663,7 +8664,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float>
 ; DAGISEL64-NEXT:    scratch_store_b32 off, v149, s33 offset:384
 ; DAGISEL64-NEXT:    scratch_store_b32 off, v150, s33 offset:388
 ; DAGISEL64-NEXT:    scratch_store_b32 off, v151, s33 offset:392
-; DAGISEL64-NEXT:    s_clause 0x1f
+; DAGISEL64-NEXT:    s_clause 0x1f ; 128-byte Folded Spill
 ; DAGISEL64-NEXT:    scratch_store_b32 off, v160, s33 offset:396
 ; DAGISEL64-NEXT:    scratch_store_b32 off, v161, s33 offset:400
 ; DAGISEL64-NEXT:    scratch_store_b32 off, v162, s33 offset:404
@@ -8696,7 +8697,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float>
 ; DAGISEL64-NEXT:    scratch_store_b32 off, v213, s33 offset:512
 ; DAGISEL64-NEXT:    scratch_store_b32 off, v214, s33 offset:516
 ; DAGISEL64-NEXT:    scratch_store_b32 off, v215, s33 offset:520
-; DAGISEL64-NEXT:    s_clause 0xf
+; DAGISEL64-NEXT:    s_clause 0xf ; 64-byte Folded Spill
 ; DAGISEL64-NEXT:    scratch_store_b32 off, v224, s33 offset:524
 ; DAGISEL64-NEXT:    scratch_store_b32 off, v225, s33 offset:528
 ; DAGISEL64-NEXT:    scratch_store_b32 off, v226, s33 offset:532
@@ -8714,7 +8715,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float>
 ; DAGISEL64-NEXT:    scratch_store_b32 off, v246, s33 offset:580
 ; DAGISEL64-NEXT:    scratch_store_b32 off, v247, s33 offset:584
 ; DAGISEL64-NEXT:    s_mov_b64 exec, -1
-; DAGISEL64-NEXT:    s_clause 0x2
+; DAGISEL64-NEXT:    s_clause 0x2 ; 12-byte Folded Spill
 ; DAGISEL64-NEXT:    scratch_store_b32 off, v42, s33
 ; DAGISEL64-NEXT:    scratch_store_b32 off, v40, s33 offset:164
 ; DAGISEL64-NEXT:    scratch_store_b32 off, v41, s33 offset:168
@@ -8737,13 +8738,13 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float>
 ; DAGISEL64-NEXT:    v_readlane_b32 s5, v42, 1
 ; DAGISEL64-NEXT:    v_readlane_b32 s4, v42, 0
 ; DAGISEL64-NEXT:    v_readlane_b32 s0, v42, 4
-; DAGISEL64-NEXT:    s_clause 0x2
+; DAGISEL64-NEXT:    s_clause 0x2 ; 12-byte Folded Reload
 ; DAGISEL64-NEXT:    scratch_load_b32 v42, off, s33
 ; DAGISEL64-NEXT:    scratch_load_b32 v40, off, s33 offset:164
 ; DAGISEL64-NEXT:    scratch_load_b32 v41, off, s33 offset:168
 ; DAGISEL64-NEXT:    s_mov_b32 s32, s33
 ; DAGISEL64-NEXT:    s_xor_b64 exec, s[4:5], -1
-; DAGISEL64-NEXT:    s_clause 0x1f
+; DAGISEL64-NEXT:    s_clause 0x1f ; 128-byte Folded Reload
 ; DAGISEL64-NEXT:    scratch_load_b32 v0, off, s33 offset:4
 ; DAGISEL64-NEXT:    scratch_load_b32 v1, off, s33 offset:8
 ; DAGISEL64-NEXT:    scratch_load_b32 v2, off, s33 offset:12
@@ -8776,7 +8777,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float>
 ; DAGISEL64-NEXT:    scratch_load_b32 v29, off, s33 offset:120
 ; DAGISEL64-NEXT:    scratch_load_b32 v30, off, s33 offset:124
 ; DAGISEL64-NEXT:    scratch_load_b32 v31, off, s33 offset:128
-; DAGISEL64-NEXT:    s_clause 0x1f
+; DAGISEL64-NEXT:    s_clause 0x1f ; 128-byte Folded Reload
 ; DAGISEL64-NEXT:    scratch_load_b32 v32, off, s33 offset:132
 ; DAGISEL64-NEXT:    scratch_load_b32 v33, off, s33 offset:136
 ; DAGISEL64-NEXT:    scratch_load_b32 v34, off, s33 offset:140
@@ -8809,7 +8810,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float>
 ; DAGISEL64-NEXT:    scratch_load_b32 v85, off, s33 offset:256
 ; DAGISEL64-NEXT:    scratch_load_b32 v86, off, s33 offset:260
 ; DAGISEL64-NEXT:    scratch_load_b32 v87, off, s33 offset:264
-; DAGISEL64-NEXT:    s_clause 0x1f
+; DAGISEL64-NEXT:    s_clause 0x1f ; 128-byte Folded Reload
 ; DAGISEL64-NEXT:    scratch_load_b32 v96, off, s33 offset:268
 ; DAGISEL64-NEXT:    scratch_load_b32 v97, off, s33 offset:272
 ; DAGISEL64-NEXT:    scratch_load_b32 v98, off, s33 offset:276
@@ -8842,7 +8843,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float>
 ; DAGISEL64-NEXT:    scratch_load_b32 v149, off, s33 offset:384
 ; DAGISEL64-NEXT:    scratch_load_b32 v150, off, s33 offset:388
 ; DAGISEL64-NEXT:    scratch_load_b32 v151, off, s33 offset:392
-; DAGISEL64-NEXT:    s_clause 0x1f
+; DAGISEL64-NEXT:    s_clause 0x1f ; 128-byte Folded Reload
 ; DAGISEL64-NEXT:    scratch_load_b32 v160, off, s33 offset:396
 ; DAGISEL64-NEXT:    scratch_load_b32 v161, off, s33 offset:400
 ; DAGISEL64-NEXT:    scratch_load_b32 v162, off, s33 offset:404
@@ -8875,7 +8876,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float>
 ; DAGISEL64-NEXT:    scratch_load_b32 v213, off, s33 offset:512
 ; DAGISEL64-NEXT:    scratch_load_b32 v214, off, s33 offset:516
 ; DAGISEL64-NEXT:    scratch_load_b32 v215, off, s33 offset:520
-; DAGISEL64-NEXT:    s_clause 0xf
+; DAGISEL64-NEXT:    s_clause 0xf ; 64-byte Folded Reload
 ; DAGISEL64-NEXT:    scratch_load_b32 v224, off, s33 offset:524
 ; DAGISEL64-NEXT:    scratch_load_b32 v225, off, s33 offset:528
 ; DAGISEL64-NEXT:    scratch_load_b32 v226, off, s33 offset:532
@@ -8908,7 +8909,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float>
 ; GISEL64-NEXT:    s_mov_b32 s0, s33
 ; GISEL64-NEXT:    s_mov_b32 s33, s32
 ; GISEL64-NEXT:    s_xor_saveexec_b64 s[4:5], -1
-; GISEL64-NEXT:    s_clause 0x1f
+; GISEL64-NEXT:    s_clause 0x1f ; 128-byte Folded Spill
 ; GISEL64-NEXT:    scratch_store_b32 off, v0, s33 offset:4
 ; GISEL64-NEXT:    scratch_store_b32 off, v1, s33 offset:8
 ; GISEL64-NEXT:    scratch_store_b32 off, v2, s33 offset:12
@@ -8941,7 +8942,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float>
 ; GISEL64-NEXT:    scratch_store_b32 off, v29, s33 offset:120
 ; GISEL64-NEXT:    scratch_store_b32 off, v30, s33 offset:124
 ; GISEL64-NEXT:    scratch_store_b32 off, v31, s33 offset:128
-; GISEL64-NEXT:    s_clause 0x1f
+; GISEL64-NEXT:    s_clause 0x1f ; 128-byte Folded Spill
 ; GISEL64-NEXT:    scratch_store_b32 off, v32, s33 offset:132
 ; GISEL64-NEXT:    scratch_store_b32 off, v33, s33 offset:136
 ; GISEL64-NEXT:    scratch_store_b32 off, v34, s33 offset:140
@@ -8974,7 +8975,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float>
 ; GISEL64-NEXT:    scratch_store_b32 off, v85, s33 offset:256
 ; GISEL64-NEXT:    scratch_store_b32 off, v86, s33 offset:260
 ; GISEL64-NEXT:    scratch_store_b32 off, v87, s33 offset:264
-; GISEL64-NEXT:    s_clause 0x1f
+; GISEL64-NEXT:    s_clause 0x1f ; 128-byte Folded Spill
 ; GISEL64-NEXT:    scratch_store_b32 off, v96, s33 offset:268
 ; GISEL64-NEXT:    scratch_store_b32 off, v97, s33 offset:272
 ; GISEL64-NEXT:    scratch_store_b32 off, v98, s33 offset:276
@@ -9007,7 +9008,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float>
 ; GISEL64-NEXT:    scratch_store_b32 off, v149, s33 offset:384
 ; GISEL64-NEXT:    scratch_store_b32 off, v150, s33 offset:388
 ; GISEL64-NEXT:    scratch_store_b32 off, v151, s33 offset:392
-; GISEL64-NEXT:    s_clause 0x1f
+; GISEL64-NEXT:    s_clause 0x1f ; 128-byte Folded Spill
 ; GISEL64-NEXT:    scratch_store_b32 off, v160, s33 offset:396
 ; GISEL64-NEXT:    scratch_store_b32 off, v161, s33 offset:400
 ; GISEL64-NEXT:    scratch_store_b32 off, v162, s33 offset:404
@@ -9040,7 +9041,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float>
 ; GISEL64-NEXT:    scratch_store_b32 off, v213, s33 offset:512
 ; GISEL64-NEXT:    scratch_store_b32 off, v214, s33 offset:516
 ; GISEL64-NEXT:    scratch_store_b32 off, v215, s33 offset:520
-; GISEL64-NEXT:    s_clause 0xf
+; GISEL64-NEXT:    s_clause 0xf ; 64-byte Folded Spill
 ; GISEL64-NEXT:    scratch_store_b32 off, v224, s33 offset:524
 ; GISEL64-NEXT:    scratch_store_b32 off, v225, s33 offset:528
 ; GISEL64-NEXT:    scratch_store_b32 off, v226, s33 offset:532
@@ -9058,7 +9059,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float>
 ; GISEL64-NEXT:    scratch_store_b32 off, v246, s33 offset:580
 ; GISEL64-NEXT:    scratch_store_b32 off, v247, s33 offset:584
 ; GISEL64-NEXT:    s_mov_b64 exec, -1
-; GISEL64-NEXT:    s_clause 0x2
+; GISEL64-NEXT:    s_clause 0x2 ; 12-byte Folded Spill
 ; GISEL64-NEXT:    scratch_store_b32 off, v42, s33
 ; GISEL64-NEXT:    scratch_store_b32 off, v40, s33 offset:164
 ; GISEL64-NEXT:    scratch_store_b32 off, v41, s33 offset:168
@@ -9081,13 +9082,13 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float>
 ; GISEL64-NEXT:    v_readlane_b32 s5, v42, 1
 ; GISEL64-NEXT:    v_readlane_b32 s4, v42, 0
 ; GISEL64-NEXT:    v_readlane_b32 s0, v42, 4
-; GISEL64-NEXT:    s_clause 0x2
+; GISEL64-NEXT:    s_clause 0x2 ; 12-byte Folded Reload
 ; GISEL64-NEXT:    scratch_load_b32 v42, off, s33
 ; GISEL64-NEXT:    scratch_load_b32 v40, off, s33 offset:164
 ; GISEL64-NEXT:    scratch_load_b32 v41, off, s33 offset:168
 ; GISEL64-NEXT:    s_mov_b32 s32, s33
 ; GISEL64-NEXT:    s_xor_b64 exec, s[4:5], -1
-; GISEL64-NEXT:    s_clause 0x1f
+; GISEL64-NEXT:    s_clause 0x1f ; 128-byte Folded Reload
 ; GISEL64-NEXT:    scratch_load_b32 v0, off, s33 offset:4
 ; GISEL64-NEXT:    scratch_load_b32 v1, off, s33 offset:8
 ; GISEL64-NEXT:    scratch_load_b32 v2, off, s33 offset:12
@@ -9120,7 +9121,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float>
 ; GISEL64-NEXT:    scratch_load_b32 v29, off, s33 offset:120
 ; GISEL64-NEXT:    scratch_load_b32 v30, off, s33 offset:124
 ; GISEL64-NEXT:    scratch_load_b32 v31, off, s33 offset:128
-; GISEL64-NEXT:    s_clause 0x1f
+; GISEL64-NEXT:    s_clause 0x1f ; 128-byte Folded Reload
 ; GISEL64-NEXT:    scratch_load_b32 v32, off, s33 offset:132
 ; GISEL64-NEXT:    scratch_load_b32 v33, off, s33 offset:136
 ; GISEL64-NEXT:    scratch_load_b32 v34, off, s33 offset:140
@@ -9153,7 +9154,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float>
 ; GISEL64-NEXT:    scratch_load_b32 v85, off, s33 offset:256
 ; GISEL64-NEXT:    scratch_load_b32 v86, off, s33 offset:260
 ; GISEL64-NEXT:    scratch_load_b32 v87, off, s33 offset:264
-; GISEL64-NEXT:    s_clause 0x1f
+; GISEL64-NEXT:    s_clause 0x1f ; 128-byte Folded Reload
 ; GISEL64-NEXT:    scratch_load_b32 v96, off, s33 offset:268
 ; GISEL64-NEXT:    scratch_load_b32 v97, off, s33 offset:272
 ; GISEL64-NEXT:    scratch_load_b32 v98, off, s33 offset:276
@@ -9186,7 +9187,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float>
 ; GISEL64-NEXT:    scratch_load_b32 v149, off, s33 offset:384
 ; GISEL64-NEXT:    scratch_load_b32 v150, off, s33 offset:388
 ; GISEL64-NEXT:    scratch_load_b32 v151, off, s33 offset:392
-; GISEL64-NEXT:    s_clause 0x1f
+; GISEL64-NEXT:    s_clause 0x1f ; 128-byte Folded Reload
 ; GISEL64-NEXT:    scratch_load_b32 v160, off, s33 offset:396
 ; GISEL64-NEXT:    scratch_load_b32 v161, off, s33 offset:400
 ; GISEL64-NEXT:    scratch_load_b32 v162, off, s33 offset:404
@@ -9219,7 +9220,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float>
 ; GISEL64-NEXT:    scratch_load_b32 v213, off, s33 offset:512
 ; GISEL64-NEXT:    scratch_load_b32 v214, off, s33 offset:516
 ; GISEL64-NEXT:    scratch_load_b32 v215, off, s33 offset:520
-; GISEL64-NEXT:    s_clause 0xf
+; GISEL64-NEXT:    s_clause 0xf ; 64-byte Folded Reload
 ; GISEL64-NEXT:    scratch_load_b32 v224, off, s33 offset:524
 ; GISEL64-NEXT:    scratch_load_b32 v225, off, s33 offset:528
 ; GISEL64-NEXT:    scratch_load_b32 v226, off, s33 offset:532
@@ -9249,7 +9250,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float>
 ; GFX1250-DAGISEL-NEXT:    s_mov_b32 s0, s33
 ; GFX1250-DAGISEL-NEXT:    s_mov_b32 s33, s32
 ; GFX1250-DAGISEL-NEXT:    s_xor_saveexec_b32 s4, -1
-; GFX1250-DAGISEL-NEXT:    s_clause 0x3e
+; GFX1250-DAGISEL-NEXT:    s_clause 0x3e ; 252-byte Folded Spill
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v0, s33 offset:4
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v1, s33 offset:8
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v2, s33 offset:12
@@ -9313,7 +9314,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float>
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v84, s33 offset:252
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v85, s33 offset:256
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v86, s33 offset:260
-; GFX1250-DAGISEL-NEXT:    s_clause 0x3e
+; GFX1250-DAGISEL-NEXT:    s_clause 0x3e ; 252-byte Folded Spill
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v87, s33 offset:264
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v96, s33 offset:268
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v97, s33 offset:272
@@ -9377,7 +9378,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float>
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v211, s33 offset:504
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v212, s33 offset:508
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v213, s33 offset:512
-; GFX1250-DAGISEL-NEXT:    s_clause 0x3e
+; GFX1250-DAGISEL-NEXT:    s_clause 0x3e ; 252-byte Folded Spill
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v214, s33 offset:516
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v215, s33 offset:520
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v224, s33 offset:524
@@ -9442,7 +9443,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float>
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v42 /*v298*/, s33 offset:756
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v43 /*v299*/, s33 offset:760
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v44 /*v300*/, s33 offset:764
-; GFX1250-DAGISEL-NEXT:    s_clause 0x3e
+; GFX1250-DAGISEL-NEXT:    s_clause 0x3e ; 252-byte Folded Spill
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v45 /*v301*/, s33 offset:768
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v46 /*v302*/, s33 offset:772
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v47 /*v303*/, s33 offset:776
@@ -9506,7 +9507,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float>
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v105 /*v361*/, s33 offset:1008
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v106 /*v362*/, s33 offset:1012
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v107 /*v363*/, s33 offset:1016
-; GFX1250-DAGISEL-NEXT:    s_clause 0x3e
+; GFX1250-DAGISEL-NEXT:    s_clause 0x3e ; 252-byte Folded Spill
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v108 /*v364*/, s33 offset:1020
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v109 /*v365*/, s33 offset:1024
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v110 /*v366*/, s33 offset:1028
@@ -9570,7 +9571,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float>
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v168 /*v424*/, s33 offset:1260
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v169 /*v425*/, s33 offset:1264
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v170 /*v426*/, s33 offset:1268
-; GFX1250-DAGISEL-NEXT:    s_clause 0x3e
+; GFX1250-DAGISEL-NEXT:    s_clause 0x3e ; 252-byte Folded Spill
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v171 /*v427*/, s33 offset:1272
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v172 /*v428*/, s33 offset:1276
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v173 /*v429*/, s33 offset:1280
@@ -9634,7 +9635,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float>
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v231 /*v487*/, s33 offset:1512
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v232 /*v488*/, s33 offset:1516
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v233 /*v489*/, s33 offset:1520
-; GFX1250-DAGISEL-NEXT:    s_clause 0x3e
+; GFX1250-DAGISEL-NEXT:    s_clause 0x3e ; 252-byte Folded Spill
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v234 /*v490*/, s33 offset:1524
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v235 /*v491*/, s33 offset:1528
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v236 /*v492*/, s33 offset:1532
@@ -9699,7 +9700,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float>
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v38 /*v550*/, s33 offset:1764
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v39 /*v551*/, s33 offset:1768
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v40 /*v552*/, s33 offset:1772
-; GFX1250-DAGISEL-NEXT:    s_clause 0x3e
+; GFX1250-DAGISEL-NEXT:    s_clause 0x3e ; 252-byte Folded Spill
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v41 /*v553*/, s33 offset:1776
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v42 /*v554*/, s33 offset:1780
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v43 /*v555*/, s33 offset:1784
@@ -9763,7 +9764,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float>
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v101 /*v613*/, s33 offset:2016
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v102 /*v614*/, s33 offset:2020
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v103 /*v615*/, s33 offset:2024
-; GFX1250-DAGISEL-NEXT:    s_clause 0x3e
+; GFX1250-DAGISEL-NEXT:    s_clause 0x3e ; 252-byte Folded Spill
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v104 /*v616*/, s33 offset:2028
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v105 /*v617*/, s33 offset:2032
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v106 /*v618*/, s33 offset:2036
@@ -9827,7 +9828,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float>
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v164 /*v676*/, s33 offset:2268
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v165 /*v677*/, s33 offset:2272
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v166 /*v678*/, s33 offset:2276
-; GFX1250-DAGISEL-NEXT:    s_clause 0x3e
+; GFX1250-DAGISEL-NEXT:    s_clause 0x3e ; 252-byte Folded Spill
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v167 /*v679*/, s33 offset:2280
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v168 /*v680*/, s33 offset:2284
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v169 /*v681*/, s33 offset:2288
@@ -9891,7 +9892,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float>
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v227 /*v739*/, s33 offset:2520
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v228 /*v740*/, s33 offset:2524
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v229 /*v741*/, s33 offset:2528
-; GFX1250-DAGISEL-NEXT:    s_clause 0x3e
+; GFX1250-DAGISEL-NEXT:    s_clause 0x3e ; 252-byte Folded Spill
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v230 /*v742*/, s33 offset:2532
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v231 /*v743*/, s33 offset:2536
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v232 /*v744*/, s33 offset:2540
@@ -9956,7 +9957,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float>
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v34 /*v802*/, s33 offset:2772
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v35 /*v803*/, s33 offset:2776
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v36 /*v804*/, s33 offset:2780
-; GFX1250-DAGISEL-NEXT:    s_clause 0x3e
+; GFX1250-DAGISEL-NEXT:    s_clause 0x3e ; 252-byte Folded Spill
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v37 /*v805*/, s33 offset:2784
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v38 /*v806*/, s33 offset:2788
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v39 /*v807*/, s33 offset:2792
@@ -10020,7 +10021,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float>
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v97 /*v865*/, s33 offset:3024
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v98 /*v866*/, s33 offset:3028
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v99 /*v867*/, s33 offset:3032
-; GFX1250-DAGISEL-NEXT:    s_clause 0x3e
+; GFX1250-DAGISEL-NEXT:    s_clause 0x3e ; 252-byte Folded Spill
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v100 /*v868*/, s33 offset:3036
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v101 /*v869*/, s33 offset:3040
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v102 /*v870*/, s33 offset:3044
@@ -10084,7 +10085,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float>
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v160 /*v928*/, s33 offset:3276
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v161 /*v929*/, s33 offset:3280
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v162 /*v930*/, s33 offset:3284
-; GFX1250-DAGISEL-NEXT:    s_clause 0x3e
+; GFX1250-DAGISEL-NEXT:    s_clause 0x3e ; 252-byte Folded Spill
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v163 /*v931*/, s33 offset:3288
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v164 /*v932*/, s33 offset:3292
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v165 /*v933*/, s33 offset:3296
@@ -10148,7 +10149,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float>
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v223 /*v991*/, s33 offset:3528
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v224 /*v992*/, s33 offset:3532
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v225 /*v993*/, s33 offset:3536
-; GFX1250-DAGISEL-NEXT:    s_clause 0x1d
+; GFX1250-DAGISEL-NEXT:    s_clause 0x1d ; 120-byte Folded Spill
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v226 /*v994*/, s33 offset:3540
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v227 /*v995*/, s33 offset:3544
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v228 /*v996*/, s33 offset:3548
@@ -10182,7 +10183,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float>
 ; GFX1250-DAGISEL-NEXT:    s_wait_xcnt 0x0
 ; GFX1250-DAGISEL-NEXT:    s_mov_b32 exec_lo, -1
 ; GFX1250-DAGISEL-NEXT:    s_set_vgpr_msb 0xc00 ; msbs: dst=0 src0=0 src1=0 src2=0
-; GFX1250-DAGISEL-NEXT:    s_clause 0x2
+; GFX1250-DAGISEL-NEXT:    s_clause 0x2 ; 12-byte Folded Spill
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v42, s33
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v40, s33 offset:164
 ; GFX1250-DAGISEL-NEXT:    scratch_store_b32 off, v41, s33 offset:168
@@ -10201,14 +10202,14 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float>
 ; GFX1250-DAGISEL-NEXT:    v_readlane_b32 s30, v42, 1
 ; GFX1250-DAGISEL-NEXT:    v_readlane_b32 s4, v42, 0
 ; GFX1250-DAGISEL-NEXT:    v_readlane_b32 s0, v42, 3
-; GFX1250-DAGISEL-NEXT:    s_clause 0x2
+; GFX1250-DAGISEL-NEXT:    s_clause 0x2 ; 12-byte Folded Reload
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v42, off, s33
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v40, off, s33 offset:164
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v41, off, s33 offset:168
 ; GFX1250-DAGISEL-NEXT:    s_mov_b32 s32, s33
 ; GFX1250-DAGISEL-NEXT:    s_wait_xcnt 0x0
 ; GFX1250-DAGISEL-NEXT:    s_xor_b32 exec_lo, s4, -1
-; GFX1250-DAGISEL-NEXT:    s_clause 0x3e
+; GFX1250-DAGISEL-NEXT:    s_clause 0x3e ; 252-byte Folded Reload
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v0, off, s33 offset:4
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v1, off, s33 offset:8
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v2, off, s33 offset:12
@@ -10272,7 +10273,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float>
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v84, off, s33 offset:252
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v85, off, s33 offset:256
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v86, off, s33 offset:260
-; GFX1250-DAGISEL-NEXT:    s_clause 0x3e
+; GFX1250-DAGISEL-NEXT:    s_clause 0x3e ; 252-byte Folded Reload
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v87, off, s33 offset:264
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v96, off, s33 offset:268
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v97, off, s33 offset:272
@@ -10336,7 +10337,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float>
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v211, off, s33 offset:504
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v212, off, s33 offset:508
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v213, off, s33 offset:512
-; GFX1250-DAGISEL-NEXT:    s_clause 0x3e
+; GFX1250-DAGISEL-NEXT:    s_clause 0x3e ; 252-byte Folded Reload
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v214, off, s33 offset:516
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v215, off, s33 offset:520
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v224, off, s33 offset:524
@@ -10401,7 +10402,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float>
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v42 /*v298*/, off, s33 offset:756
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v43 /*v299*/, off, s33 offset:760
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v44 /*v300*/, off, s33 offset:764
-; GFX1250-DAGISEL-NEXT:    s_clause 0x3e
+; GFX1250-DAGISEL-NEXT:    s_clause 0x3e ; 252-byte Folded Reload
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v45 /*v301*/, off, s33 offset:768
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v46 /*v302*/, off, s33 offset:772
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v47 /*v303*/, off, s33 offset:776
@@ -10465,7 +10466,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float>
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v105 /*v361*/, off, s33 offset:1008
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v106 /*v362*/, off, s33 offset:1012
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v107 /*v363*/, off, s33 offset:1016
-; GFX1250-DAGISEL-NEXT:    s_clause 0x3e
+; GFX1250-DAGISEL-NEXT:    s_clause 0x3e ; 252-byte Folded Reload
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v108 /*v364*/, off, s33 offset:1020
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v109 /*v365*/, off, s33 offset:1024
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v110 /*v366*/, off, s33 offset:1028
@@ -10529,7 +10530,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float>
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v168 /*v424*/, off, s33 offset:1260
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v169 /*v425*/, off, s33 offset:1264
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v170 /*v426*/, off, s33 offset:1268
-; GFX1250-DAGISEL-NEXT:    s_clause 0x3e
+; GFX1250-DAGISEL-NEXT:    s_clause 0x3e ; 252-byte Folded Reload
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v171 /*v427*/, off, s33 offset:1272
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v172 /*v428*/, off, s33 offset:1276
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v173 /*v429*/, off, s33 offset:1280
@@ -10593,7 +10594,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float>
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v231 /*v487*/, off, s33 offset:1512
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v232 /*v488*/, off, s33 offset:1516
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v233 /*v489*/, off, s33 offset:1520
-; GFX1250-DAGISEL-NEXT:    s_clause 0x3e
+; GFX1250-DAGISEL-NEXT:    s_clause 0x3e ; 252-byte Folded Reload
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v234 /*v490*/, off, s33 offset:1524
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v235 /*v491*/, off, s33 offset:1528
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v236 /*v492*/, off, s33 offset:1532
@@ -10658,7 +10659,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float>
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v38 /*v550*/, off, s33 offset:1764
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v39 /*v551*/, off, s33 offset:1768
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v40 /*v552*/, off, s33 offset:1772
-; GFX1250-DAGISEL-NEXT:    s_clause 0x3e
+; GFX1250-DAGISEL-NEXT:    s_clause 0x3e ; 252-byte Folded Reload
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v41 /*v553*/, off, s33 offset:1776
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v42 /*v554*/, off, s33 offset:1780
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v43 /*v555*/, off, s33 offset:1784
@@ -10722,7 +10723,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float>
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v101 /*v613*/, off, s33 offset:2016
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v102 /*v614*/, off, s33 offset:2020
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v103 /*v615*/, off, s33 offset:2024
-; GFX1250-DAGISEL-NEXT:    s_clause 0x3e
+; GFX1250-DAGISEL-NEXT:    s_clause 0x3e ; 252-byte Folded Reload
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v104 /*v616*/, off, s33 offset:2028
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v105 /*v617*/, off, s33 offset:2032
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v106 /*v618*/, off, s33 offset:2036
@@ -10786,7 +10787,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float>
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v164 /*v676*/, off, s33 offset:2268
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v165 /*v677*/, off, s33 offset:2272
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v166 /*v678*/, off, s33 offset:2276
-; GFX1250-DAGISEL-NEXT:    s_clause 0x3e
+; GFX1250-DAGISEL-NEXT:    s_clause 0x3e ; 252-byte Folded Reload
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v167 /*v679*/, off, s33 offset:2280
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v168 /*v680*/, off, s33 offset:2284
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v169 /*v681*/, off, s33 offset:2288
@@ -10850,7 +10851,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float>
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v227 /*v739*/, off, s33 offset:2520
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v228 /*v740*/, off, s33 offset:2524
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v229 /*v741*/, off, s33 offset:2528
-; GFX1250-DAGISEL-NEXT:    s_clause 0x3e
+; GFX1250-DAGISEL-NEXT:    s_clause 0x3e ; 252-byte Folded Reload
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v230 /*v742*/, off, s33 offset:2532
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v231 /*v743*/, off, s33 offset:2536
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v232 /*v744*/, off, s33 offset:2540
@@ -10915,7 +10916,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float>
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v34 /*v802*/, off, s33 offset:2772
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v35 /*v803*/, off, s33 offset:2776
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v36 /*v804*/, off, s33 offset:2780
-; GFX1250-DAGISEL-NEXT:    s_clause 0x3e
+; GFX1250-DAGISEL-NEXT:    s_clause 0x3e ; 252-byte Folded Reload
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v37 /*v805*/, off, s33 offset:2784
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v38 /*v806*/, off, s33 offset:2788
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v39 /*v807*/, off, s33 offset:2792
@@ -10979,7 +10980,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float>
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v97 /*v865*/, off, s33 offset:3024
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v98 /*v866*/, off, s33 offset:3028
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v99 /*v867*/, off, s33 offset:3032
-; GFX1250-DAGISEL-NEXT:    s_clause 0x3e
+; GFX1250-DAGISEL-NEXT:    s_clause 0x3e ; 252-byte Folded Reload
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v100 /*v868*/, off, s33 offset:3036
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v101 /*v869*/, off, s33 offset:3040
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v102 /*v870*/, off, s33 offset:3044
@@ -11043,7 +11044,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float>
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v160 /*v928*/, off, s33 offset:3276
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v161 /*v929*/, off, s33 offset:3280
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v162 /*v930*/, off, s33 offset:3284
-; GFX1250-DAGISEL-NEXT:    s_clause 0x3e
+; GFX1250-DAGISEL-NEXT:    s_clause 0x3e ; 252-byte Folded Reload
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v163 /*v931*/, off, s33 offset:3288
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v164 /*v932*/, off, s33 offset:3292
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v165 /*v933*/, off, s33 offset:3296
@@ -11107,7 +11108,7 @@ define amdgpu_gfx_whole_wave void @call_from_whole_wave(i1 %unused, <8 x float>
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v223 /*v991*/, off, s33 offset:3528
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v224 /*v992*/, off, s33 offset:3532
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v225 /*v993*/, off, s33 offset:3536
-; GFX1250-DAGISEL-NEXT:    s_clause 0x1d
+; GFX1250-DAGISEL-NEXT:    s_clause 0x1d ; 120-byte Folded Reload
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v226 /*v994*/, off, s33 offset:3540
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v227 /*v995*/, off, s33 offset:3544
 ; GFX1250-DAGISEL-NEXT:    scratch_load_b32 v228 /*v996*/, off, s33 offset:3548

diff  --git a/llvm/test/CodeGen/AMDGPU/workgroup-id-in-arch-sgprs.ll b/llvm/test/CodeGen/AMDGPU/workgroup-id-in-arch-sgprs.ll
index 6636eb544343b..59968bc206ebb 100644
--- a/llvm/test/CodeGen/AMDGPU/workgroup-id-in-arch-sgprs.ll
+++ b/llvm/test/CodeGen/AMDGPU/workgroup-id-in-arch-sgprs.ll
@@ -27,6 +27,7 @@ define amdgpu_kernel void @workgroup_id_x(ptr addrspace(1) %ptrx) {
 ;
 ; GFX1250-SDAG-LABEL: workgroup_id_x:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-SDAG-NEXT:    s_bfe_u32 s2, ttmp6, 0x4000c
 ; GFX1250-SDAG-NEXT:    s_and_b32 s3, ttmp6, 15
@@ -45,6 +46,7 @@ define amdgpu_kernel void @workgroup_id_x(ptr addrspace(1) %ptrx) {
 ;
 ; GFX1250-GISEL-LABEL: workgroup_id_x:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    s_load_b64 s[0:1], s[4:5], 0x0
 ; GFX1250-GISEL-NEXT:    s_bfe_u32 s2, ttmp6, 0x4000c
 ; GFX1250-GISEL-NEXT:    s_and_b32 s3, ttmp6, 15
@@ -101,6 +103,7 @@ define amdgpu_kernel void @workgroup_id_xy(ptr addrspace(1) %ptrx, ptr addrspace
 ;
 ; GFX1250-SDAG-LABEL: workgroup_id_xy:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
 ; GFX1250-SDAG-NEXT:    s_bfe_u32 s6, ttmp6, 0x40010
 ; GFX1250-SDAG-NEXT:    s_wait_xcnt 0x0
@@ -128,6 +131,7 @@ define amdgpu_kernel void @workgroup_id_xy(ptr addrspace(1) %ptrx, ptr addrspace
 ;
 ; GFX1250-GISEL-LABEL: workgroup_id_xy:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    s_bfe_u32 s6, ttmp6, 0x4000c
 ; GFX1250-GISEL-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
 ; GFX1250-GISEL-NEXT:    s_add_co_i32 s6, s6, 1
@@ -210,6 +214,7 @@ define amdgpu_kernel void @workgroup_id_xyz(ptr addrspace(1) %ptrx, ptr addrspac
 ;
 ; GFX1250-SDAG-LABEL: workgroup_id_xyz:
 ; GFX1250-SDAG:       ; %bb.0:
+; GFX1250-SDAG-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-SDAG-NEXT:    s_bfe_u32 s0, ttmp6, 0x40014
 ; GFX1250-SDAG-NEXT:    s_lshr_b32 s8, ttmp7, 16
 ; GFX1250-SDAG-NEXT:    s_add_co_i32 s9, s0, 1
@@ -248,6 +253,7 @@ define amdgpu_kernel void @workgroup_id_xyz(ptr addrspace(1) %ptrx, ptr addrspac
 ;
 ; GFX1250-GISEL-LABEL: workgroup_id_xyz:
 ; GFX1250-GISEL:       ; %bb.0:
+; GFX1250-GISEL-NEXT:    s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
 ; GFX1250-GISEL-NEXT:    s_bfe_u32 s0, ttmp6, 0x4000c
 ; GFX1250-GISEL-NEXT:    s_and_b32 s1, ttmp6, 15
 ; GFX1250-GISEL-NEXT:    s_add_co_i32 s0, s0, 1


        


More information about the llvm-commits mailing list