[llvm] [AMDGPU] Add few missing gfx1250 codegen tests. NFC (PR #155314)

Stanislav Mekhanoshin via llvm-commits llvm-commits at lists.llvm.org
Mon Aug 25 14:48:36 PDT 2025


https://github.com/rampitec created https://github.com/llvm/llvm-project/pull/155314

None

>From ceb42172adc7ff72a429a0501d4b40f5ab601bc1 Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin <Stanislav.Mekhanoshin at amd.com>
Date: Mon, 25 Aug 2025 14:46:34 -0700
Subject: [PATCH] [AMDGPU] Add few missing gfx1250 codegen tests. NFC

---
 .../CodeGen/AMDGPU/calling-conventions.ll     | 1181 ++++++++++++++++-
 .../AMDGPU/hard-clauses-load-monitor.mir      |   38 +
 .../AMDGPU/insert_vector_elt.v2bf16.ll        |  321 +++++
 llvm/test/CodeGen/AMDGPU/lshl-add-u64.ll      |   30 +-
 .../CodeGen/AMDGPU/scale-offset-scratch.ll    |    6 +-
 5 files changed, 1553 insertions(+), 23 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/hard-clauses-load-monitor.mir

diff --git a/llvm/test/CodeGen/AMDGPU/calling-conventions.ll b/llvm/test/CodeGen/AMDGPU/calling-conventions.ll
index 2db7b28c7de97..ddd3b1520bf5e 100644
--- a/llvm/test/CodeGen/AMDGPU/calling-conventions.ll
+++ b/llvm/test/CodeGen/AMDGPU/calling-conventions.ll
@@ -3,6 +3,8 @@
 ; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX11,GFX11-TRUE16 %s
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX11,GFX11-FAKE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -mattr=+real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX1250,GFX1250-TRUE16 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -mattr=-real-true16 < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX1250,GFX1250-FAKE16 %s
 
 ; Make sure we don't crash or assert on spir_kernel calling convention.
 
@@ -34,6 +36,14 @@ define spir_kernel void @kernel(ptr addrspace(1) %out) {
 ; GFX11-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX11-NEXT:    global_store_b32 v0, v0, s[0:1]
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: kernel:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-NEXT:    v_mov_b32_e32 v0, 0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    global_store_b32 v0, v0, s[0:1]
+; GFX1250-NEXT:    s_endpgm
 entry:
   store i32 0, ptr addrspace(1) %out
   ret void
@@ -70,6 +80,16 @@ define amdgpu_ps half @ps_ret_cc_f16(half %arg0) {
 ; GFX11-FAKE16:       ; %bb.0:
 ; GFX11-FAKE16-NEXT:    v_add_f16_e32 v0, 1.0, v0
 ; GFX11-FAKE16-NEXT:    ; return to shader part epilog
+;
+; GFX1250-TRUE16-LABEL: ps_ret_cc_f16:
+; GFX1250-TRUE16:       ; %bb.0:
+; GFX1250-TRUE16-NEXT:    v_add_f16_e32 v0.l, 1.0, v0.l
+; GFX1250-TRUE16-NEXT:    ; return to shader part epilog
+;
+; GFX1250-FAKE16-LABEL: ps_ret_cc_f16:
+; GFX1250-FAKE16:       ; %bb.0:
+; GFX1250-FAKE16-NEXT:    v_add_f16_e32 v0, 1.0, v0
+; GFX1250-FAKE16-NEXT:    ; return to shader part epilog
   %add = fadd half %arg0, 1.0
   ret half %add
 }
@@ -96,26 +116,71 @@ define amdgpu_ps half @ps_ret_cc_inreg_f16(half inreg %arg0) {
 ; GFX11-FAKE16:       ; %bb.0:
 ; GFX11-FAKE16-NEXT:    v_add_f16_e64 v0, s0, 1.0
 ; GFX11-FAKE16-NEXT:    ; return to shader part epilog
+;
+; GFX1250-LABEL: ps_ret_cc_inreg_f16:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_add_f16 s0, s0, 1.0
+; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_3)
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-NEXT:    ; return to shader part epilog
   %add = fadd half %arg0, 1.0
   ret half %add
 }
 
 define fastcc float @fastcc(float %arg0) #0 {
-; GCN-LABEL: fastcc:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_add_f32_e32 v0, 4.0, v0
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SI-LABEL: fastcc:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    v_add_f32_e32 v0, 4.0, v0
+; SI-NEXT:    s_setpc_b64 s[30:31]
+;
+; VI-LABEL: fastcc:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    v_add_f32_e32 v0, 4.0, v0
+; VI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: fastcc:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_add_f32_e32 v0, 4.0, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: fastcc:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_add_f32_e32 v0, 4.0, v0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
   %add = fadd float %arg0, 4.0
   ret float %add
 }
 
 define coldcc float @coldcc(float %arg0) #0 {
-; GCN-LABEL: coldcc:
-; GCN:       ; %bb.0:
-; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_add_f32_e32 v0, 4.0, v0
-; GCN-NEXT:    s_setpc_b64 s[30:31]
+; SI-LABEL: coldcc:
+; SI:       ; %bb.0:
+; SI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SI-NEXT:    v_add_f32_e32 v0, 4.0, v0
+; SI-NEXT:    s_setpc_b64 s[30:31]
+;
+; VI-LABEL: coldcc:
+; VI:       ; %bb.0:
+; VI-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT:    v_add_f32_e32 v0, 4.0, v0
+; VI-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: coldcc:
+; GFX11:       ; %bb.0:
+; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT:    v_add_f32_e32 v0, 4.0, v0
+; GFX11-NEXT:    s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: coldcc:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_add_f32_e32 v0, 4.0, v0
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
  %add = fadd float %arg0, 4.0
  ret float %add
 }
@@ -209,6 +274,23 @@ define amdgpu_kernel void @call_coldcc() #0 {
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX11-NEXT:    global_store_b32 v[0:1], v0, off
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: call_coldcc:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1250-NEXT:    s_get_pc_i64 s[6:7]
+; GFX1250-NEXT:    s_add_nc_u64 s[6:7], s[6:7], coldcc@gotpcrel+4
+; GFX1250-NEXT:    v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, 1.0
+; GFX1250-NEXT:    s_load_b64 s[12:13], s[6:7], 0x0
+; GFX1250-NEXT:    s_add_nc_u64 s[8:9], s[4:5], 36
+; GFX1250-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1250-NEXT:    s_mov_b32 s32, 0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    s_swap_pc_i64 s[30:31], s[12:13]
+; GFX1250-NEXT:    global_store_b32 v[0:1], v0, off
+; GFX1250-NEXT:    s_endpgm
   %val = call coldcc float @coldcc(float 1.0)
   store float %val, ptr addrspace(1) poison
   ret void
@@ -303,6 +385,23 @@ define amdgpu_kernel void @call_fastcc() #0 {
 ; GFX11-NEXT:    s_swappc_b64 s[30:31], s[16:17]
 ; GFX11-NEXT:    global_store_b32 v[0:1], v0, off
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: call_fastcc:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_mov_b64 s[10:11], s[6:7]
+; GFX1250-NEXT:    s_get_pc_i64 s[6:7]
+; GFX1250-NEXT:    s_add_nc_u64 s[6:7], s[6:7], fastcc@gotpcrel+4
+; GFX1250-NEXT:    v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, 1.0
+; GFX1250-NEXT:    s_load_b64 s[12:13], s[6:7], 0x0
+; GFX1250-NEXT:    s_add_nc_u64 s[8:9], s[4:5], 36
+; GFX1250-NEXT:    s_mov_b64 s[4:5], s[0:1]
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    s_mov_b64 s[6:7], s[2:3]
+; GFX1250-NEXT:    s_mov_b32 s32, 0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    s_swap_pc_i64 s[30:31], s[12:13]
+; GFX1250-NEXT:    global_store_b32 v[0:1], v0, off
+; GFX1250-NEXT:    s_endpgm
   %val = call fastcc float @fastcc(float 1.0)
   store float %val, ptr addrspace(1) poison
   ret void
@@ -331,6 +430,16 @@ define amdgpu_cs half @cs_mesa(half %arg0) {
 ; GFX11-FAKE16:       ; %bb.0:
 ; GFX11-FAKE16-NEXT:    v_add_f16_e32 v0, 1.0, v0
 ; GFX11-FAKE16-NEXT:    ; return to shader part epilog
+;
+; GFX1250-TRUE16-LABEL: cs_mesa:
+; GFX1250-TRUE16:       ; %bb.0:
+; GFX1250-TRUE16-NEXT:    v_add_f16_e32 v0.l, 1.0, v0.l
+; GFX1250-TRUE16-NEXT:    ; return to shader part epilog
+;
+; GFX1250-FAKE16-LABEL: cs_mesa:
+; GFX1250-FAKE16:       ; %bb.0:
+; GFX1250-FAKE16-NEXT:    v_add_f16_e32 v0, 1.0, v0
+; GFX1250-FAKE16-NEXT:    ; return to shader part epilog
   %add = fadd half %arg0, 1.0
   ret half %add
 }
@@ -358,6 +467,16 @@ define amdgpu_ps half @ps_mesa_f16(half %arg0) {
 ; GFX11-FAKE16:       ; %bb.0:
 ; GFX11-FAKE16-NEXT:    v_add_f16_e32 v0, 1.0, v0
 ; GFX11-FAKE16-NEXT:    ; return to shader part epilog
+;
+; GFX1250-TRUE16-LABEL: ps_mesa_f16:
+; GFX1250-TRUE16:       ; %bb.0:
+; GFX1250-TRUE16-NEXT:    v_add_f16_e32 v0.l, 1.0, v0.l
+; GFX1250-TRUE16-NEXT:    ; return to shader part epilog
+;
+; GFX1250-FAKE16-LABEL: ps_mesa_f16:
+; GFX1250-FAKE16:       ; %bb.0:
+; GFX1250-FAKE16-NEXT:    v_add_f16_e32 v0, 1.0, v0
+; GFX1250-FAKE16-NEXT:    ; return to shader part epilog
   %add = fadd half %arg0, 1.0
   ret half %add
 }
@@ -385,6 +504,16 @@ define amdgpu_vs half @vs_mesa(half %arg0) {
 ; GFX11-FAKE16:       ; %bb.0:
 ; GFX11-FAKE16-NEXT:    v_add_f16_e32 v0, 1.0, v0
 ; GFX11-FAKE16-NEXT:    ; return to shader part epilog
+;
+; GFX1250-TRUE16-LABEL: vs_mesa:
+; GFX1250-TRUE16:       ; %bb.0:
+; GFX1250-TRUE16-NEXT:    v_add_f16_e32 v0.l, 1.0, v0.l
+; GFX1250-TRUE16-NEXT:    ; return to shader part epilog
+;
+; GFX1250-FAKE16-LABEL: vs_mesa:
+; GFX1250-FAKE16:       ; %bb.0:
+; GFX1250-FAKE16-NEXT:    v_add_f16_e32 v0, 1.0, v0
+; GFX1250-FAKE16-NEXT:    ; return to shader part epilog
   %add = fadd half %arg0, 1.0
   ret half %add
 }
@@ -412,6 +541,16 @@ define amdgpu_gs half @gs_mesa(half %arg0) {
 ; GFX11-FAKE16:       ; %bb.0:
 ; GFX11-FAKE16-NEXT:    v_add_f16_e32 v0, 1.0, v0
 ; GFX11-FAKE16-NEXT:    ; return to shader part epilog
+;
+; GFX1250-TRUE16-LABEL: gs_mesa:
+; GFX1250-TRUE16:       ; %bb.0:
+; GFX1250-TRUE16-NEXT:    v_add_f16_e32 v0.l, 1.0, v0.l
+; GFX1250-TRUE16-NEXT:    ; return to shader part epilog
+;
+; GFX1250-FAKE16-LABEL: gs_mesa:
+; GFX1250-FAKE16:       ; %bb.0:
+; GFX1250-FAKE16-NEXT:    v_add_f16_e32 v0, 1.0, v0
+; GFX1250-FAKE16-NEXT:    ; return to shader part epilog
   %add = fadd half %arg0, 1.0
   ret half %add
 }
@@ -439,6 +578,16 @@ define amdgpu_hs half @hs_mesa(half %arg0) {
 ; GFX11-FAKE16:       ; %bb.0:
 ; GFX11-FAKE16-NEXT:    v_add_f16_e32 v0, 1.0, v0
 ; GFX11-FAKE16-NEXT:    ; return to shader part epilog
+;
+; GFX1250-TRUE16-LABEL: hs_mesa:
+; GFX1250-TRUE16:       ; %bb.0:
+; GFX1250-TRUE16-NEXT:    v_add_f16_e32 v0.l, 1.0, v0.l
+; GFX1250-TRUE16-NEXT:    ; return to shader part epilog
+;
+; GFX1250-FAKE16-LABEL: hs_mesa:
+; GFX1250-FAKE16:       ; %bb.0:
+; GFX1250-FAKE16-NEXT:    v_add_f16_e32 v0, 1.0, v0
+; GFX1250-FAKE16-NEXT:    ; return to shader part epilog
   %add = fadd half %arg0, 1.0
   ret half %add
 }
@@ -468,6 +617,11 @@ define amdgpu_ps <2 x half> @ps_mesa_v2f16(<2 x half> %arg0) {
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    v_pk_add_f16 v0, v0, 1.0 op_sel_hi:[1,0]
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; GFX1250-LABEL: ps_mesa_v2f16:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    v_pk_add_f16 v0, v0, 1.0 op_sel_hi:[1,0]
+; GFX1250-NEXT:    ; return to shader part epilog
   %add = fadd <2 x half> %arg0, <half 1.0, half 1.0>
   ret <2 x half> %add
 }
@@ -497,6 +651,11 @@ define amdgpu_ps <2 x half> @ps_mesa_inreg_v2f16(<2 x half> inreg %arg0) {
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    v_pk_add_f16 v0, s0, 1.0 op_sel_hi:[1,0]
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; GFX1250-LABEL: ps_mesa_inreg_v2f16:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    v_pk_add_f16 v0, s0, 1.0 op_sel_hi:[1,0]
+; GFX1250-NEXT:    ; return to shader part epilog
   %add = fadd <2 x half> %arg0, <half 1.0, half 1.0>
   ret <2 x half> %add
 }
@@ -528,6 +687,12 @@ define amdgpu_ps void @ps_mesa_v2i16(<2 x i16> %arg0) {
 ; GFX11-NEXT:    v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0]
 ; GFX11-NEXT:    global_store_b32 v[0:1], v0, off
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: ps_mesa_v2i16:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    v_pk_add_u16 v0, v0, 1 op_sel_hi:[1,0]
+; GFX1250-NEXT:    global_store_b32 v[0:1], v0, off
+; GFX1250-NEXT:    s_endpgm
   %add = add <2 x i16> %arg0, <i16 1, i16 1>
   store <2 x i16> %add, ptr addrspace(1) poison
   ret void
@@ -563,6 +728,12 @@ define amdgpu_ps void @ps_mesa_inreg_v2i16(<2 x i16> inreg %arg0) {
 ; GFX11-NEXT:    v_pk_add_u16 v0, s0, 1 op_sel_hi:[1,0]
 ; GFX11-NEXT:    global_store_b32 v[0:1], v0, off
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: ps_mesa_inreg_v2i16:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    v_pk_add_u16 v0, s0, 1 op_sel_hi:[1,0]
+; GFX1250-NEXT:    global_store_b32 v[0:1], v0, off
+; GFX1250-NEXT:    s_endpgm
   %add = add <2 x i16> %arg0, <i16 1, i16 1>
   store <2 x i16> %add, ptr addrspace(1) poison
   ret void
@@ -603,6 +774,12 @@ define amdgpu_ps <4 x half> @ps_mesa_v4f16(<4 x half> %arg0) {
 ; GFX11-NEXT:    v_pk_add_f16 v0, v0, 1.0 op_sel_hi:[1,0]
 ; GFX11-NEXT:    v_pk_add_f16 v1, v1, 1.0 op_sel_hi:[1,0]
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; GFX1250-LABEL: ps_mesa_v4f16:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    v_pk_add_f16 v0, v0, 1.0 op_sel_hi:[1,0]
+; GFX1250-NEXT:    v_pk_add_f16 v1, v1, 1.0 op_sel_hi:[1,0]
+; GFX1250-NEXT:    ; return to shader part epilog
   %add = fadd <4 x half> %arg0, <half 1.0, half 1.0, half 1.0, half 1.0>
   ret <4 x half> %add
 }
@@ -644,6 +821,12 @@ define amdgpu_ps <4 x half> @ps_mesa_inreg_v4f16(<4 x half> inreg %arg0) {
 ; GFX11-NEXT:    v_pk_add_f16 v0, s0, 1.0 op_sel_hi:[1,0]
 ; GFX11-NEXT:    v_pk_add_f16 v1, s1, 1.0 op_sel_hi:[1,0]
 ; GFX11-NEXT:    ; return to shader part epilog
+;
+; GFX1250-LABEL: ps_mesa_inreg_v4f16:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    v_pk_add_f16 v0, s0, 1.0 op_sel_hi:[1,0]
+; GFX1250-NEXT:    v_pk_add_f16 v1, s1, 1.0 op_sel_hi:[1,0]
+; GFX1250-NEXT:    ; return to shader part epilog
   %add = fadd <4 x half> %arg0, <half 1.0, half 1.0, half 1.0, half 1.0>
   ret <4 x half> %add
 }
@@ -685,6 +868,17 @@ define amdgpu_ps void @ps_mesa_inreg_v3i32(<3 x i32> inreg %arg0) {
 ; GFX11-NEXT:    v_mov_b32_e32 v2, s2
 ; GFX11-NEXT:    global_store_b96 v[0:1], v[0:2], off
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: ps_mesa_inreg_v3i32:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_add_co_i32 s2, s2, 3
+; GFX1250-NEXT:    s_add_co_i32 s0, s0, 1
+; GFX1250-NEXT:    s_add_co_i32 s1, s1, 2
+; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    global_store_b96 v[0:1], v[0:2], off
+; GFX1250-NEXT:    s_endpgm
   %add = add <3 x i32> %arg0, <i32 1, i32 2, i32 3>
   store <3 x i32> %add, ptr addrspace(1) poison
   ret void
@@ -717,6 +911,17 @@ define amdgpu_ps void @ps_mesa_inreg_v3f32(<3 x float> inreg %arg0) {
 ; GFX11-NEXT:    v_add_f32_e64 v0, s0, 1.0
 ; GFX11-NEXT:    global_store_b96 v[0:1], v[0:2], off
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: ps_mesa_inreg_v3f32:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_add_f32 s0, s0, 1.0
+; GFX1250-NEXT:    s_add_f32 s1, s1, 2.0
+; GFX1250-NEXT:    s_add_f32 s2, s2, 4.0
+; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_2) | instskip(NEXT) | instid1(SALU_CYCLE_2)
+; GFX1250-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s2
+; GFX1250-NEXT:    global_store_b96 v[0:1], v[0:2], off
+; GFX1250-NEXT:    s_endpgm
   %add = fadd <3 x float> %arg0, <float 1.0, float 2.0, float 4.0>
   store <3 x float> %add, ptr addrspace(1) poison
   ret void
@@ -772,6 +977,22 @@ define amdgpu_ps void @ps_mesa_inreg_v5i32(<5 x i32> inreg %arg0) {
 ; GFX11-NEXT:    global_store_b32 v[0:1], v4, off
 ; GFX11-NEXT:    global_store_b128 v[0:1], v[0:3], off
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: ps_mesa_inreg_v5i32:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_add_co_i32 s3, s3, 4
+; GFX1250-NEXT:    s_add_co_i32 s2, s2, 3
+; GFX1250-NEXT:    s_add_co_i32 s1, s1, 2
+; GFX1250-NEXT:    s_add_co_i32 s4, s4, 5
+; GFX1250-NEXT:    s_add_co_i32 s0, s0, 1
+; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT:    v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v0, s0
+; GFX1250-NEXT:    v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v3, s3
+; GFX1250-NEXT:    s_clause 0x1
+; GFX1250-NEXT:    global_store_b32 v[0:1], v4, off
+; GFX1250-NEXT:    global_store_b128 v[0:1], v[0:3], off
+; GFX1250-NEXT:    s_endpgm
   %add = add <5 x i32> %arg0, <i32 1, i32 2, i32 3, i32 4, i32 5>
   store <5 x i32> %add, ptr addrspace(1) poison
   ret void
@@ -813,6 +1034,22 @@ define amdgpu_ps void @ps_mesa_inreg_v5f32(<5 x float> inreg %arg0) {
 ; GFX11-NEXT:    global_store_b32 v[0:1], v4, off
 ; GFX11-NEXT:    global_store_b128 v[0:1], v[0:3], off
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: ps_mesa_inreg_v5f32:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_add_f32 s3, s3, -1.0
+; GFX1250-NEXT:    s_add_f32 s4, s4, 0.5
+; GFX1250-NEXT:    s_add_f32 s0, s0, 1.0
+; GFX1250-NEXT:    s_add_f32 s1, s1, 2.0
+; GFX1250-NEXT:    s_add_f32 s2, s2, 4.0
+; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_2)
+; GFX1250-NEXT:    v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v0, s0
+; GFX1250-NEXT:    v_dual_mov_b32 v1, s1 :: v_dual_mov_b32 v2, s2
+; GFX1250-NEXT:    v_mov_b32_e32 v3, s3
+; GFX1250-NEXT:    s_clause 0x1
+; GFX1250-NEXT:    global_store_b32 v[0:1], v4, off
+; GFX1250-NEXT:    global_store_b128 v[0:1], v[0:3], off
+; GFX1250-NEXT:    s_endpgm
   %add = fadd <5 x float> %arg0, <float 1.0, float 2.0, float 4.0, float -1.0, float 0.5>
   store <5 x float> %add, ptr addrspace(1) poison
   ret void
@@ -845,6 +1082,13 @@ define amdgpu_ps void @ps_mesa_v3i32(<3 x i32> %arg0) {
 ; GFX11-NEXT:    v_add_nc_u32_e32 v0, 1, v0
 ; GFX11-NEXT:    global_store_b96 v[0:1], v[0:2], off
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: ps_mesa_v3i32:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    v_dual_add_nc_u32 v2, 3, v2 :: v_dual_add_nc_u32 v1, 2, v1
+; GFX1250-NEXT:    v_add_nc_u32_e32 v0, 1, v0
+; GFX1250-NEXT:    global_store_b96 v[0:1], v[0:2], off
+; GFX1250-NEXT:    s_endpgm
   %add = add <3 x i32> %arg0, <i32 1, i32 2, i32 3>
   store <3 x i32> %add, ptr addrspace(1) poison
   ret void
@@ -876,6 +1120,13 @@ define amdgpu_ps void @ps_mesa_v3f32(<3 x float> %arg0) {
 ; GFX11-NEXT:    v_add_f32_e32 v0, 1.0, v0
 ; GFX11-NEXT:    global_store_b96 v[0:1], v[0:2], off
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: ps_mesa_v3f32:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    v_dual_add_f32 v2, 4.0, v2 :: v_dual_add_f32 v1, 2.0, v1
+; GFX1250-NEXT:    v_add_f32_e32 v0, 1.0, v0
+; GFX1250-NEXT:    global_store_b96 v[0:1], v[0:2], off
+; GFX1250-NEXT:    s_endpgm
   %add = fadd <3 x float> %arg0, <float 1.0, float 2.0, float 4.0>
   store <3 x float> %add, ptr addrspace(1) poison
   ret void
@@ -917,6 +1168,16 @@ define amdgpu_ps void @ps_mesa_v5i32(<5 x i32> %arg0) {
 ; GFX11-NEXT:    global_store_b32 v[0:1], v4, off
 ; GFX11-NEXT:    global_store_b128 v[0:1], v[0:3], off
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: ps_mesa_v5i32:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    v_dual_add_nc_u32 v3, 4, v3 :: v_dual_add_nc_u32 v2, 3, v2
+; GFX1250-NEXT:    v_dual_add_nc_u32 v1, 2, v1 :: v_dual_add_nc_u32 v4, 5, v4
+; GFX1250-NEXT:    v_add_nc_u32_e32 v0, 1, v0
+; GFX1250-NEXT:    s_clause 0x1
+; GFX1250-NEXT:    global_store_b32 v[0:1], v4, off
+; GFX1250-NEXT:    global_store_b128 v[0:1], v[0:3], off
+; GFX1250-NEXT:    s_endpgm
   %add = add <5 x i32> %arg0, <i32 1, i32 2, i32 3, i32 4, i32 5>
   store <5 x i32> %add, ptr addrspace(1) poison
   ret void
@@ -956,6 +1217,16 @@ define amdgpu_ps void @ps_mesa_v5f32(<5 x float> %arg0) {
 ; GFX11-NEXT:    global_store_b32 v[0:1], v4, off
 ; GFX11-NEXT:    global_store_b128 v[0:1], v[0:3], off
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: ps_mesa_v5f32:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    v_dual_add_f32 v3, -1.0, v3 :: v_dual_add_f32 v2, 4.0, v2
+; GFX1250-NEXT:    v_dual_add_f32 v1, 2.0, v1 :: v_dual_add_f32 v4, 0.5, v4
+; GFX1250-NEXT:    v_add_f32_e32 v0, 1.0, v0
+; GFX1250-NEXT:    s_clause 0x1
+; GFX1250-NEXT:    global_store_b32 v[0:1], v4, off
+; GFX1250-NEXT:    global_store_b128 v[0:1], v[0:3], off
+; GFX1250-NEXT:    s_endpgm
   %add = fadd <5 x float> %arg0, <float 1.0, float 2.0, float 4.0, float -1.0, float 0.5>
   store <5 x float> %add, ptr addrspace(1) poison
   ret void
@@ -987,6 +1258,18 @@ define amdgpu_ps void @ps_mesa_i16(i16 %arg0) {
 ; GFX11-FAKE16-NEXT:    v_add_nc_u16 v0, v0, v0
 ; GFX11-FAKE16-NEXT:    global_store_b16 v[0:1], v0, off
 ; GFX11-FAKE16-NEXT:    s_endpgm
+;
+; GFX1250-TRUE16-LABEL: ps_mesa_i16:
+; GFX1250-TRUE16:       ; %bb.0:
+; GFX1250-TRUE16-NEXT:    v_add_nc_u16 v0.l, v0.l, v0.l
+; GFX1250-TRUE16-NEXT:    flat_store_b16 v[0:1], v0
+; GFX1250-TRUE16-NEXT:    s_endpgm
+;
+; GFX1250-FAKE16-LABEL: ps_mesa_i16:
+; GFX1250-FAKE16:       ; %bb.0:
+; GFX1250-FAKE16-NEXT:    v_add_nc_u16 v0, v0, v0
+; GFX1250-FAKE16-NEXT:    global_store_b16 v[0:1], v0, off
+; GFX1250-FAKE16-NEXT:    s_endpgm
   %add = add i16 %arg0, %arg0
   store i16 %add, ptr addrspace(1) poison
   ret void
@@ -1016,6 +1299,14 @@ define amdgpu_ps void @ps_mesa_inreg_i16(i16 inreg %arg0) {
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-NEXT:    global_store_b16 v[0:1], v0, off
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: ps_mesa_inreg_i16:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_add_co_i32 s0, s0, s0
+; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-NEXT:    global_store_b16 v[0:1], v0, off
+; GFX1250-NEXT:    s_endpgm
   %add = add i16 %arg0, %arg0
   store i16 %add, ptr addrspace(1) poison
   ret void
@@ -1059,6 +1350,16 @@ define amdgpu_kernel void @amd_kernel_i8(i8 %arg0) {
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-NEXT:    global_store_b8 v[0:1], v0, off
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: amd_kernel_i8:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x24
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    s_add_co_i32 s0, s0, s0
+; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX1250-NEXT:    s_endpgm
 entry:
   %add = add i8 %arg0, %arg0
   store i8 %add, ptr addrspace(1) poison
@@ -1114,6 +1415,22 @@ define amdgpu_kernel void @amd_kernel_v2i8(<2 x i8> %arg0) {
 ; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0
 ; GFX11-NEXT:    global_store_b16 v[0:1], v2, off
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: amd_kernel_v2i8:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x24
+; GFX1250-NEXT:    v_mov_b64_e32 v[0:1], 0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    s_bfe_u32 s1, s0, 0x80008
+; GFX1250-NEXT:    s_add_co_i32 s0, s0, s0
+; GFX1250-NEXT:    s_add_co_i32 s1, s1, s1
+; GFX1250-NEXT:    s_and_b32 s0, s0, 0xff
+; GFX1250-NEXT:    s_lshl_b32 s1, s1, 8
+; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1250-NEXT:    s_or_b32 s0, s0, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    global_store_b16 v[0:1], v2, off
+; GFX1250-NEXT:    s_endpgm
 entry:
   %add = add <2 x i8> %arg0, %arg0
   store <2 x i8> %add, ptr addrspace(1) null
@@ -1199,6 +1516,32 @@ define amdgpu_kernel void @amd_kernel_v4i8(<4 x i8> %arg0) {
 ; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s0
 ; GFX11-NEXT:    global_store_b32 v[0:1], v2, off
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: amd_kernel_v4i8:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x24
+; GFX1250-NEXT:    v_mov_b64_e32 v[0:1], 0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    s_lshr_b32 s1, s0, 16
+; GFX1250-NEXT:    s_lshr_b32 s2, s0, 24
+; GFX1250-NEXT:    s_add_co_i32 s3, s0, s0
+; GFX1250-NEXT:    s_bfe_u32 s0, s0, 0x80008
+; GFX1250-NEXT:    s_add_co_i32 s2, s2, s2
+; GFX1250-NEXT:    s_add_co_i32 s0, s0, s0
+; GFX1250-NEXT:    s_add_co_i32 s1, s1, s1
+; GFX1250-NEXT:    s_and_b32 s3, s3, 0xff
+; GFX1250-NEXT:    s_lshl_b32 s0, s0, 8
+; GFX1250-NEXT:    s_lshl_b32 s2, s2, 8
+; GFX1250-NEXT:    s_and_b32 s1, s1, 0xff
+; GFX1250-NEXT:    s_or_b32 s0, s3, s0
+; GFX1250-NEXT:    s_or_b32 s1, s1, s2
+; GFX1250-NEXT:    s_and_b32 s0, s0, 0xffff
+; GFX1250-NEXT:    s_lshl_b32 s1, s1, 16
+; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1250-NEXT:    s_or_b32 s0, s0, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v2, s0
+; GFX1250-NEXT:    global_store_b32 v[0:1], v2, off
+; GFX1250-NEXT:    s_endpgm
 entry:
   %add = add <4 x i8> %arg0, %arg0
   store <4 x i8> %add, ptr addrspace(1) null
@@ -1271,6 +1614,27 @@ define amdgpu_kernel void @amd_kernel_v3i8(<3 x i8> %arg0) {
 ; GFX11-NEXT:    global_store_b8 v[0:1], v4, off
 ; GFX11-NEXT:    global_store_b16 v[2:3], v5, off
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: amd_kernel_v3i8:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b32 s0, s[4:5], 0x24
+; GFX1250-NEXT:    v_mov_b64_e32 v[0:1], 2
+; GFX1250-NEXT:    v_mov_b64_e32 v[2:3], 0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    s_bfe_u32 s2, s0, 0x80008
+; GFX1250-NEXT:    s_lshr_b32 s1, s0, 16
+; GFX1250-NEXT:    s_add_co_i32 s0, s0, s0
+; GFX1250-NEXT:    s_add_co_i32 s2, s2, s2
+; GFX1250-NEXT:    s_and_b32 s0, s0, 0xff
+; GFX1250-NEXT:    s_lshl_b32 s2, s2, 8
+; GFX1250-NEXT:    s_add_co_i32 s1, s1, s1
+; GFX1250-NEXT:    s_or_b32 s0, s0, s2
+; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT:    v_dual_mov_b32 v4, s1 :: v_dual_mov_b32 v5, s0
+; GFX1250-NEXT:    s_clause 0x1
+; GFX1250-NEXT:    global_store_b8 v[0:1], v4, off
+; GFX1250-NEXT:    global_store_b16 v[2:3], v5, off
+; GFX1250-NEXT:    s_endpgm
 entry:
   %add = add <3 x i8> %arg0, %arg0
   store <3 x i8> %add, ptr addrspace(1) null
@@ -1370,6 +1734,36 @@ define amdgpu_kernel void @amd_kernel_v5i8(<5 x i8> %arg0) {
 ; GFX11-NEXT:    global_store_b8 v[0:1], v4, off
 ; GFX11-NEXT:    global_store_b32 v[2:3], v5, off
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: amd_kernel_v5i8:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-NEXT:    v_mov_b64_e32 v[0:1], 4
+; GFX1250-NEXT:    v_mov_b64_e32 v[2:3], 0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    s_lshr_b32 s2, s0, 16
+; GFX1250-NEXT:    s_lshr_b32 s3, s0, 24
+; GFX1250-NEXT:    s_add_co_i32 s4, s0, s0
+; GFX1250-NEXT:    s_bfe_u32 s0, s0, 0x80008
+; GFX1250-NEXT:    s_add_co_i32 s3, s3, s3
+; GFX1250-NEXT:    s_add_co_i32 s0, s0, s0
+; GFX1250-NEXT:    s_add_co_i32 s2, s2, s2
+; GFX1250-NEXT:    s_and_b32 s4, s4, 0xff
+; GFX1250-NEXT:    s_lshl_b32 s0, s0, 8
+; GFX1250-NEXT:    s_lshl_b32 s3, s3, 8
+; GFX1250-NEXT:    s_and_b32 s2, s2, 0xff
+; GFX1250-NEXT:    s_or_b32 s0, s4, s0
+; GFX1250-NEXT:    s_or_b32 s2, s2, s3
+; GFX1250-NEXT:    s_and_b32 s0, s0, 0xffff
+; GFX1250-NEXT:    s_lshl_b32 s2, s2, 16
+; GFX1250-NEXT:    s_add_co_i32 s1, s1, s1
+; GFX1250-NEXT:    s_or_b32 s0, s0, s2
+; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT:    v_dual_mov_b32 v4, s1 :: v_dual_mov_b32 v5, s0
+; GFX1250-NEXT:    s_clause 0x1
+; GFX1250-NEXT:    global_store_b8 v[0:1], v4, off
+; GFX1250-NEXT:    global_store_b32 v[2:3], v5, off
+; GFX1250-NEXT:    s_endpgm
 entry:
   %add = add <5 x i8> %arg0, %arg0
   store <5 x i8> %add, ptr addrspace(1) null
@@ -1505,6 +1899,48 @@ define amdgpu_kernel void @amd_kernel_v8i8(<8 x i8> %arg0) {
 ; GFX11-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, 0
 ; GFX11-NEXT:    global_store_b64 v[2:3], v[0:1], off
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: amd_kernel_v8i8:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-NEXT:    v_mov_b64_e32 v[0:1], 0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    s_lshr_b32 s2, s0, 16
+; GFX1250-NEXT:    s_lshr_b32 s3, s0, 24
+; GFX1250-NEXT:    s_lshr_b32 s4, s1, 16
+; GFX1250-NEXT:    s_lshr_b32 s5, s1, 24
+; GFX1250-NEXT:    s_bfe_u32 s6, s0, 0x80008
+; GFX1250-NEXT:    s_bfe_u32 s7, s1, 0x80008
+; GFX1250-NEXT:    s_add_co_i32 s1, s1, s1
+; GFX1250-NEXT:    s_add_co_i32 s0, s0, s0
+; GFX1250-NEXT:    s_add_co_i32 s7, s7, s7
+; GFX1250-NEXT:    s_add_co_i32 s5, s5, s5
+; GFX1250-NEXT:    s_add_co_i32 s4, s4, s4
+; GFX1250-NEXT:    s_add_co_i32 s6, s6, s6
+; GFX1250-NEXT:    s_add_co_i32 s3, s3, s3
+; GFX1250-NEXT:    s_add_co_i32 s2, s2, s2
+; GFX1250-NEXT:    s_and_b32 s1, s1, 0xff
+; GFX1250-NEXT:    s_and_b32 s0, s0, 0xff
+; GFX1250-NEXT:    s_lshl_b32 s7, s7, 8
+; GFX1250-NEXT:    s_lshl_b32 s5, s5, 8
+; GFX1250-NEXT:    s_and_b32 s4, s4, 0xff
+; GFX1250-NEXT:    s_lshl_b32 s6, s6, 8
+; GFX1250-NEXT:    s_lshl_b32 s3, s3, 8
+; GFX1250-NEXT:    s_and_b32 s2, s2, 0xff
+; GFX1250-NEXT:    s_or_b32 s1, s1, s7
+; GFX1250-NEXT:    s_or_b32 s4, s4, s5
+; GFX1250-NEXT:    s_or_b32 s0, s0, s6
+; GFX1250-NEXT:    s_or_b32 s2, s2, s3
+; GFX1250-NEXT:    s_and_b32 s1, s1, 0xffff
+; GFX1250-NEXT:    s_lshl_b32 s3, s4, 16
+; GFX1250-NEXT:    s_and_b32 s0, s0, 0xffff
+; GFX1250-NEXT:    s_lshl_b32 s2, s2, 16
+; GFX1250-NEXT:    s_or_b32 s1, s1, s3
+; GFX1250-NEXT:    s_or_b32 s0, s0, s2
+; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT:    v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0
+; GFX1250-NEXT:    global_store_b64 v[0:1], v[2:3], off
+; GFX1250-NEXT:    s_endpgm
 entry:
   %add = add <8 x i8> %arg0, %arg0
   store <8 x i8> %add, ptr addrspace(1) null
@@ -1740,6 +2176,81 @@ define amdgpu_kernel void @amd_kernel_v16i8(<16 x i8> %arg0) {
 ; GFX11-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v5, 0
 ; GFX11-NEXT:    global_store_b128 v[4:5], v[0:3], off
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: amd_kernel_v16i8:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT:    v_mov_b64_e32 v[4:5], 0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    s_lshr_b32 s6, s1, 16
+; GFX1250-NEXT:    s_lshr_b32 s7, s1, 24
+; GFX1250-NEXT:    s_lshr_b32 s8, s2, 16
+; GFX1250-NEXT:    s_lshr_b32 s9, s2, 24
+; GFX1250-NEXT:    s_lshr_b32 s10, s3, 16
+; GFX1250-NEXT:    s_lshr_b32 s11, s3, 24
+; GFX1250-NEXT:    s_lshr_b32 s4, s0, 16
+; GFX1250-NEXT:    s_lshr_b32 s5, s0, 24
+; GFX1250-NEXT:    s_bfe_u32 s12, s0, 0x80008
+; GFX1250-NEXT:    s_bfe_u32 s13, s1, 0x80008
+; GFX1250-NEXT:    s_bfe_u32 s14, s2, 0x80008
+; GFX1250-NEXT:    s_bfe_u32 s15, s3, 0x80008
+; GFX1250-NEXT:    s_add_co_i32 s11, s11, s11
+; GFX1250-NEXT:    s_add_co_i32 s10, s10, s10
+; GFX1250-NEXT:    s_add_co_i32 s9, s9, s9
+; GFX1250-NEXT:    s_add_co_i32 s8, s8, s8
+; GFX1250-NEXT:    s_add_co_i32 s7, s7, s7
+; GFX1250-NEXT:    s_add_co_i32 s6, s6, s6
+; GFX1250-NEXT:    s_add_co_i32 s3, s3, s3
+; GFX1250-NEXT:    s_add_co_i32 s2, s2, s2
+; GFX1250-NEXT:    s_add_co_i32 s15, s15, s15
+; GFX1250-NEXT:    s_add_co_i32 s14, s14, s14
+; GFX1250-NEXT:    s_lshl_b32 s11, s11, 8
+; GFX1250-NEXT:    s_and_b32 s10, s10, 0xff
+; GFX1250-NEXT:    s_lshl_b32 s9, s9, 8
+; GFX1250-NEXT:    s_and_b32 s8, s8, 0xff
+; GFX1250-NEXT:    s_add_co_i32 s1, s1, s1
+; GFX1250-NEXT:    s_add_co_i32 s13, s13, s13
+; GFX1250-NEXT:    s_lshl_b32 s7, s7, 8
+; GFX1250-NEXT:    s_and_b32 s6, s6, 0xff
+; GFX1250-NEXT:    s_add_co_i32 s0, s0, s0
+; GFX1250-NEXT:    s_add_co_i32 s12, s12, s12
+; GFX1250-NEXT:    s_add_co_i32 s5, s5, s5
+; GFX1250-NEXT:    s_add_co_i32 s4, s4, s4
+; GFX1250-NEXT:    s_and_b32 s3, s3, 0xff
+; GFX1250-NEXT:    s_and_b32 s2, s2, 0xff
+; GFX1250-NEXT:    s_lshl_b32 s15, s15, 8
+; GFX1250-NEXT:    s_or_b32 s10, s10, s11
+; GFX1250-NEXT:    s_lshl_b32 s11, s14, 8
+; GFX1250-NEXT:    s_or_b32 s8, s8, s9
+; GFX1250-NEXT:    s_and_b32 s1, s1, 0xff
+; GFX1250-NEXT:    s_lshl_b32 s9, s13, 8
+; GFX1250-NEXT:    s_or_b32 s6, s6, s7
+; GFX1250-NEXT:    s_and_b32 s0, s0, 0xff
+; GFX1250-NEXT:    s_lshl_b32 s7, s12, 8
+; GFX1250-NEXT:    s_lshl_b32 s5, s5, 8
+; GFX1250-NEXT:    s_and_b32 s4, s4, 0xff
+; GFX1250-NEXT:    s_or_b32 s3, s3, s15
+; GFX1250-NEXT:    s_or_b32 s2, s2, s11
+; GFX1250-NEXT:    s_or_b32 s1, s1, s9
+; GFX1250-NEXT:    s_or_b32 s0, s0, s7
+; GFX1250-NEXT:    s_or_b32 s4, s4, s5
+; GFX1250-NEXT:    s_and_b32 s3, s3, 0xffff
+; GFX1250-NEXT:    s_lshl_b32 s10, s10, 16
+; GFX1250-NEXT:    s_and_b32 s2, s2, 0xffff
+; GFX1250-NEXT:    s_lshl_b32 s8, s8, 16
+; GFX1250-NEXT:    s_and_b32 s1, s1, 0xffff
+; GFX1250-NEXT:    s_and_b32 s0, s0, 0xffff
+; GFX1250-NEXT:    s_lshl_b32 s4, s4, 16
+; GFX1250-NEXT:    s_lshl_b32 s5, s6, 16
+; GFX1250-NEXT:    s_or_b32 s3, s3, s10
+; GFX1250-NEXT:    s_or_b32 s2, s2, s8
+; GFX1250-NEXT:    s_or_b32 s0, s0, s4
+; GFX1250-NEXT:    s_or_b32 s1, s1, s5
+; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT:    v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1
+; GFX1250-NEXT:    v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3
+; GFX1250-NEXT:    global_store_b128 v[4:5], v[0:3], off
+; GFX1250-NEXT:    s_endpgm
 entry:
   %add = add <16 x i8> %arg0, %arg0
   store <16 x i8> %add, ptr addrspace(1) null
@@ -2186,6 +2697,149 @@ define amdgpu_kernel void @amd_kernel_v32i8(<32 x i8> %arg0) {
 ; GFX11-NEXT:    global_store_b128 v[8:9], v[0:3], off
 ; GFX11-NEXT:    global_store_b128 v[10:11], v[4:7], off
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: amd_kernel_v32i8:
+; GFX1250:       ; %bb.0: ; %entry
+; GFX1250-NEXT:    s_load_b256 s[0:7], s[4:5], 0x24
+; GFX1250-NEXT:    v_mov_b64_e32 v[8:9], 16
+; GFX1250-NEXT:    v_mov_b64_e32 v[10:11], 0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    s_lshr_b32 s16, s0, 16
+; GFX1250-NEXT:    s_lshr_b32 s17, s0, 24
+; GFX1250-NEXT:    s_lshr_b32 s20, s2, 16
+; GFX1250-NEXT:    s_lshr_b32 s21, s2, 24
+; GFX1250-NEXT:    s_lshr_b32 s14, s7, 16
+; GFX1250-NEXT:    s_lshr_b32 s15, s7, 24
+; GFX1250-NEXT:    s_bfe_u32 s27, s7, 0x80008
+; GFX1250-NEXT:    s_add_co_i32 s17, s17, s17
+; GFX1250-NEXT:    s_add_co_i32 s16, s16, s16
+; GFX1250-NEXT:    s_lshr_b32 s18, s1, 16
+; GFX1250-NEXT:    s_lshr_b32 s19, s1, 24
+; GFX1250-NEXT:    s_lshr_b32 s22, s3, 16
+; GFX1250-NEXT:    s_lshr_b32 s23, s3, 24
+; GFX1250-NEXT:    s_bfe_u32 s29, s1, 0x80008
+; GFX1250-NEXT:    s_bfe_u32 s30, s3, 0x80008
+; GFX1250-NEXT:    s_add_co_i32 s21, s21, s21
+; GFX1250-NEXT:    s_add_co_i32 s20, s20, s20
+; GFX1250-NEXT:    s_lshl_b32 s17, s17, 8
+; GFX1250-NEXT:    s_and_b32 s16, s16, 0xff
+; GFX1250-NEXT:    s_add_co_i32 s7, s7, s7
+; GFX1250-NEXT:    s_add_co_i32 s27, s27, s27
+; GFX1250-NEXT:    s_add_co_i32 s15, s15, s15
+; GFX1250-NEXT:    s_add_co_i32 s14, s14, s14
+; GFX1250-NEXT:    s_add_co_i32 s3, s3, s3
+; GFX1250-NEXT:    s_add_co_i32 s30, s30, s30
+; GFX1250-NEXT:    s_add_co_i32 s23, s23, s23
+; GFX1250-NEXT:    s_add_co_i32 s22, s22, s22
+; GFX1250-NEXT:    s_lshl_b32 s21, s21, 8
+; GFX1250-NEXT:    s_and_b32 s20, s20, 0xff
+; GFX1250-NEXT:    s_add_co_i32 s1, s1, s1
+; GFX1250-NEXT:    s_add_co_i32 s29, s29, s29
+; GFX1250-NEXT:    s_add_co_i32 s19, s19, s19
+; GFX1250-NEXT:    s_add_co_i32 s18, s18, s18
+; GFX1250-NEXT:    s_lshr_b32 s10, s5, 16
+; GFX1250-NEXT:    s_lshr_b32 s11, s5, 24
+; GFX1250-NEXT:    s_lshr_b32 s12, s6, 16
+; GFX1250-NEXT:    s_lshr_b32 s13, s6, 24
+; GFX1250-NEXT:    s_or_b32 s16, s16, s17
+; GFX1250-NEXT:    s_and_b32 s7, s7, 0xff
+; GFX1250-NEXT:    s_lshl_b32 s17, s27, 8
+; GFX1250-NEXT:    s_lshl_b32 s15, s15, 8
+; GFX1250-NEXT:    s_and_b32 s14, s14, 0xff
+; GFX1250-NEXT:    s_and_b32 s3, s3, 0xff
+; GFX1250-NEXT:    s_lshl_b32 s30, s30, 8
+; GFX1250-NEXT:    s_lshl_b32 s23, s23, 8
+; GFX1250-NEXT:    s_and_b32 s22, s22, 0xff
+; GFX1250-NEXT:    s_or_b32 s20, s20, s21
+; GFX1250-NEXT:    s_and_b32 s1, s1, 0xff
+; GFX1250-NEXT:    s_lshl_b32 s21, s29, 8
+; GFX1250-NEXT:    s_lshl_b32 s19, s19, 8
+; GFX1250-NEXT:    s_and_b32 s18, s18, 0xff
+; GFX1250-NEXT:    s_lshr_b32 s8, s4, 16
+; GFX1250-NEXT:    s_lshr_b32 s9, s4, 24
+; GFX1250-NEXT:    s_bfe_u32 s24, s4, 0x80008
+; GFX1250-NEXT:    s_bfe_u32 s25, s5, 0x80008
+; GFX1250-NEXT:    s_bfe_u32 s26, s6, 0x80008
+; GFX1250-NEXT:    s_or_b32 s7, s7, s17
+; GFX1250-NEXT:    s_or_b32 s14, s14, s15
+; GFX1250-NEXT:    s_add_co_i32 s13, s13, s13
+; GFX1250-NEXT:    s_add_co_i32 s12, s12, s12
+; GFX1250-NEXT:    s_add_co_i32 s11, s11, s11
+; GFX1250-NEXT:    s_add_co_i32 s10, s10, s10
+; GFX1250-NEXT:    s_bfe_u32 s28, s0, 0x80008
+; GFX1250-NEXT:    s_or_b32 s3, s3, s30
+; GFX1250-NEXT:    s_or_b32 s22, s22, s23
+; GFX1250-NEXT:    s_bfe_u32 s23, s2, 0x80008
+; GFX1250-NEXT:    s_or_b32 s1, s1, s21
+; GFX1250-NEXT:    s_or_b32 s18, s18, s19
+; GFX1250-NEXT:    s_and_b32 s7, s7, 0xffff
+; GFX1250-NEXT:    s_lshl_b32 s14, s14, 16
+; GFX1250-NEXT:    s_add_co_i32 s6, s6, s6
+; GFX1250-NEXT:    s_add_co_i32 s26, s26, s26
+; GFX1250-NEXT:    s_lshl_b32 s13, s13, 8
+; GFX1250-NEXT:    s_and_b32 s12, s12, 0xff
+; GFX1250-NEXT:    s_add_co_i32 s5, s5, s5
+; GFX1250-NEXT:    s_add_co_i32 s25, s25, s25
+; GFX1250-NEXT:    s_lshl_b32 s11, s11, 8
+; GFX1250-NEXT:    s_and_b32 s10, s10, 0xff
+; GFX1250-NEXT:    s_add_co_i32 s4, s4, s4
+; GFX1250-NEXT:    s_add_co_i32 s24, s24, s24
+; GFX1250-NEXT:    s_add_co_i32 s9, s9, s9
+; GFX1250-NEXT:    s_add_co_i32 s8, s8, s8
+; GFX1250-NEXT:    s_and_b32 s3, s3, 0xffff
+; GFX1250-NEXT:    s_lshl_b32 s22, s22, 16
+; GFX1250-NEXT:    s_add_co_i32 s2, s2, s2
+; GFX1250-NEXT:    s_add_co_i32 s23, s23, s23
+; GFX1250-NEXT:    s_and_b32 s1, s1, 0xffff
+; GFX1250-NEXT:    s_lshl_b32 s18, s18, 16
+; GFX1250-NEXT:    s_add_co_i32 s0, s0, s0
+; GFX1250-NEXT:    s_add_co_i32 s28, s28, s28
+; GFX1250-NEXT:    s_or_b32 s7, s7, s14
+; GFX1250-NEXT:    s_and_b32 s6, s6, 0xff
+; GFX1250-NEXT:    s_lshl_b32 s14, s26, 8
+; GFX1250-NEXT:    s_or_b32 s12, s12, s13
+; GFX1250-NEXT:    s_and_b32 s5, s5, 0xff
+; GFX1250-NEXT:    s_lshl_b32 s13, s25, 8
+; GFX1250-NEXT:    s_or_b32 s10, s10, s11
+; GFX1250-NEXT:    s_and_b32 s4, s4, 0xff
+; GFX1250-NEXT:    s_lshl_b32 s11, s24, 8
+; GFX1250-NEXT:    s_lshl_b32 s9, s9, 8
+; GFX1250-NEXT:    s_and_b32 s8, s8, 0xff
+; GFX1250-NEXT:    s_or_b32 s3, s3, s22
+; GFX1250-NEXT:    s_and_b32 s2, s2, 0xff
+; GFX1250-NEXT:    s_lshl_b32 s22, s23, 8
+; GFX1250-NEXT:    s_or_b32 s1, s1, s18
+; GFX1250-NEXT:    s_and_b32 s0, s0, 0xff
+; GFX1250-NEXT:    s_lshl_b32 s18, s28, 8
+; GFX1250-NEXT:    s_or_b32 s6, s6, s14
+; GFX1250-NEXT:    s_or_b32 s5, s5, s13
+; GFX1250-NEXT:    s_or_b32 s4, s4, s11
+; GFX1250-NEXT:    s_or_b32 s8, s8, s9
+; GFX1250-NEXT:    s_or_b32 s2, s2, s22
+; GFX1250-NEXT:    s_or_b32 s0, s0, s18
+; GFX1250-NEXT:    s_and_b32 s6, s6, 0xffff
+; GFX1250-NEXT:    s_lshl_b32 s12, s12, 16
+; GFX1250-NEXT:    s_and_b32 s5, s5, 0xffff
+; GFX1250-NEXT:    s_and_b32 s4, s4, 0xffff
+; GFX1250-NEXT:    s_lshl_b32 s8, s8, 16
+; GFX1250-NEXT:    s_lshl_b32 s9, s10, 16
+; GFX1250-NEXT:    s_and_b32 s2, s2, 0xffff
+; GFX1250-NEXT:    s_lshl_b32 s20, s20, 16
+; GFX1250-NEXT:    s_and_b32 s0, s0, 0xffff
+; GFX1250-NEXT:    s_lshl_b32 s16, s16, 16
+; GFX1250-NEXT:    s_or_b32 s6, s6, s12
+; GFX1250-NEXT:    s_or_b32 s4, s4, s8
+; GFX1250-NEXT:    s_or_b32 s5, s5, s9
+; GFX1250-NEXT:    s_or_b32 s2, s2, s20
+; GFX1250-NEXT:    s_or_b32 s0, s0, s16
+; GFX1250-NEXT:    v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX1250-NEXT:    v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
+; GFX1250-NEXT:    v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1
+; GFX1250-NEXT:    v_dual_mov_b32 v6, s2 :: v_dual_mov_b32 v7, s3
+; GFX1250-NEXT:    s_clause 0x1
+; GFX1250-NEXT:    global_store_b128 v[8:9], v[0:3], off
+; GFX1250-NEXT:    global_store_b128 v[10:11], v[4:7], off
+; GFX1250-NEXT:    s_endpgm
 entry:
   %add = add <32 x i8> %arg0, %arg0
   store <32 x i8> %add, ptr addrspace(1) null
@@ -2212,6 +2866,12 @@ define amdgpu_cs void @amdgpu_cs_i1(i1 %arg0) {
 ; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX11-NEXT:    global_store_b8 v[0:1], v0, off
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: amdgpu_cs_i1:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX1250-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX1250-NEXT:    s_endpgm
   store i1 %arg0, ptr addrspace(1) poison
   ret void
 }
@@ -2330,6 +2990,56 @@ define amdgpu_cs void @amdgpu_cs_v8i1(<8 x i1> %arg0) {
 ; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX11-FAKE16-NEXT:    global_store_b8 v[0:1], v0, off
 ; GFX11-FAKE16-NEXT:    s_endpgm
+;
+; GFX1250-TRUE16-LABEL: amdgpu_cs_v8i1:
+; GFX1250-TRUE16:       ; %bb.0:
+; GFX1250-TRUE16-NEXT:    v_lshlrev_b16 v2.h, 3, v3.l
+; GFX1250-TRUE16-NEXT:    v_and_b16 v3.l, v6.l, 1
+; GFX1250-TRUE16-NEXT:    v_and_b16 v2.l, v2.l, 1
+; GFX1250-TRUE16-NEXT:    v_lshlrev_b16 v1.h, 3, v7.l
+; GFX1250-TRUE16-NEXT:    v_lshlrev_b16 v3.h, 1, v5.l
+; GFX1250-TRUE16-NEXT:    v_lshlrev_b16 v1.l, 1, v1.l
+; GFX1250-TRUE16-NEXT:    v_lshlrev_b16 v3.l, 2, v3.l
+; GFX1250-TRUE16-NEXT:    v_lshlrev_b16 v2.l, 2, v2.l
+; GFX1250-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1250-TRUE16-NEXT:    v_bitop3_b16 v0.l, v0.l, v1.l, 1 bitop3:0xec
+; GFX1250-TRUE16-NEXT:    v_or_b16 v0.h, v1.h, v3.l
+; GFX1250-TRUE16-NEXT:    v_bitop3_b16 v1.h, v4.l, v3.h, 1 bitop3:0xec
+; GFX1250-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-TRUE16-NEXT:    v_or_b16 v2.l, v2.h, v2.l
+; GFX1250-TRUE16-NEXT:    v_bitop3_b16 v0.h, v1.h, v0.h, 3 bitop3:0xec
+; GFX1250-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-TRUE16-NEXT:    v_bitop3_b16 v0.l, v0.l, v2.l, 3 bitop3:0xec
+; GFX1250-TRUE16-NEXT:    v_lshlrev_b16 v0.h, 4, v0.h
+; GFX1250-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-TRUE16-NEXT:    v_bitop3_b16 v0.l, v0.l, v0.h, 15 bitop3:0xec
+; GFX1250-TRUE16-NEXT:    flat_store_b8 v[0:1], v0
+; GFX1250-TRUE16-NEXT:    s_endpgm
+;
+; GFX1250-FAKE16-LABEL: amdgpu_cs_v8i1:
+; GFX1250-FAKE16:       ; %bb.0:
+; GFX1250-FAKE16-NEXT:    v_and_b32_e32 v6, 1, v6
+; GFX1250-FAKE16-NEXT:    v_and_b32_e32 v2, 1, v2
+; GFX1250-FAKE16-NEXT:    v_lshlrev_b16 v7, 3, v7
+; GFX1250-FAKE16-NEXT:    v_lshlrev_b16 v5, 1, v5
+; GFX1250-FAKE16-NEXT:    v_lshlrev_b16 v3, 3, v3
+; GFX1250-FAKE16-NEXT:    v_lshlrev_b16 v6, 2, v6
+; GFX1250-FAKE16-NEXT:    v_lshlrev_b16 v2, 2, v2
+; GFX1250-FAKE16-NEXT:    v_lshlrev_b16 v1, 1, v1
+; GFX1250-FAKE16-NEXT:    v_bitop3_b16 v4, v4, v5, 1 bitop3:0xec
+; GFX1250-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1250-FAKE16-NEXT:    v_or_b32_e32 v6, v7, v6
+; GFX1250-FAKE16-NEXT:    v_or_b32_e32 v2, v3, v2
+; GFX1250-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1250-FAKE16-NEXT:    v_bitop3_b16 v0, v0, v1, 1 bitop3:0xec
+; GFX1250-FAKE16-NEXT:    v_bitop3_b16 v1, v4, v6, 3 bitop3:0xec
+; GFX1250-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-FAKE16-NEXT:    v_bitop3_b16 v0, v0, v2, 3 bitop3:0xec
+; GFX1250-FAKE16-NEXT:    v_lshlrev_b16 v1, 4, v1
+; GFX1250-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-FAKE16-NEXT:    v_bitop3_b16 v0, v0, v1, 15 bitop3:0xec
+; GFX1250-FAKE16-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX1250-FAKE16-NEXT:    s_endpgm
   store <8 x i1> %arg0, ptr addrspace(1) poison
   ret void
 }
@@ -2545,6 +3255,94 @@ define amdgpu_cs void @amdgpu_cs_v16i1(<16 x i1> %arg0) {
 ; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX11-FAKE16-NEXT:    global_store_b16 v[0:1], v0, off
 ; GFX11-FAKE16-NEXT:    s_endpgm
+;
+; GFX1250-TRUE16-LABEL: amdgpu_cs_v16i1:
+; GFX1250-TRUE16:       ; %bb.0:
+; GFX1250-TRUE16-NEXT:    v_and_b16 v2.h, v6.l, 1
+; GFX1250-TRUE16-NEXT:    v_lshlrev_b16 v1.h, 3, v7.l
+; GFX1250-TRUE16-NEXT:    v_and_b16 v4.h, v10.l, 1
+; GFX1250-TRUE16-NEXT:    v_and_b16 v2.l, v2.l, 1
+; GFX1250-TRUE16-NEXT:    v_and_b16 v3.h, v8.l, 1
+; GFX1250-TRUE16-NEXT:    v_lshlrev_b16 v2.h, 2, v2.h
+; GFX1250-TRUE16-NEXT:    v_lshlrev_b16 v5.h, 3, v11.l
+; GFX1250-TRUE16-NEXT:    v_lshlrev_b16 v4.h, 2, v4.h
+; GFX1250-TRUE16-NEXT:    v_lshlrev_b16 v6.l, 3, v15.l
+; GFX1250-TRUE16-NEXT:    v_lshlrev_b16 v6.h, 1, v13.l
+; GFX1250-TRUE16-NEXT:    v_or_b16 v0.h, v1.h, v2.h
+; GFX1250-TRUE16-NEXT:    v_lshlrev_b16 v1.h, 1, v5.l
+; GFX1250-TRUE16-NEXT:    v_and_b16 v5.l, v14.l, 1
+; GFX1250-TRUE16-NEXT:    v_lshlrev_b16 v2.h, 1, v9.l
+; GFX1250-TRUE16-NEXT:    v_lshlrev_b16 v3.l, 3, v3.l
+; GFX1250-TRUE16-NEXT:    v_lshlrev_b16 v2.l, 2, v2.l
+; GFX1250-TRUE16-NEXT:    v_lshlrev_b16 v1.l, 1, v1.l
+; GFX1250-TRUE16-NEXT:    v_lshlrev_b16 v5.l, 2, v5.l
+; GFX1250-TRUE16-NEXT:    v_bitop3_b16 v1.h, v4.l, v1.h, 1 bitop3:0xec
+; GFX1250-TRUE16-NEXT:    v_bitop3_b16 v2.h, v3.h, 3, v2.h bitop3:0xc8
+; GFX1250-TRUE16-NEXT:    v_or_b16 v3.h, v5.h, v4.h
+; GFX1250-TRUE16-NEXT:    v_bitop3_b16 v4.h, v12.l, v6.h, 1 bitop3:0xec
+; GFX1250-TRUE16-NEXT:    v_or_b16 v4.l, v6.l, v5.l
+; GFX1250-TRUE16-NEXT:    v_or_b16 v2.l, v3.l, v2.l
+; GFX1250-TRUE16-NEXT:    v_bitop3_b16 v0.l, v0.l, v1.l, 1 bitop3:0xec
+; GFX1250-TRUE16-NEXT:    v_bitop3_b16 v0.h, v1.h, v0.h, 3 bitop3:0xec
+; GFX1250-TRUE16-NEXT:    v_bitop3_b16 v1.l, v2.h, 15, v3.h bitop3:0xc8
+; GFX1250-TRUE16-NEXT:    v_bitop3_b16 v1.h, v4.h, v4.l, 3 bitop3:0xec
+; GFX1250-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1250-TRUE16-NEXT:    v_bitop3_b16 v0.l, v0.l, v2.l, 3 bitop3:0xec
+; GFX1250-TRUE16-NEXT:    v_lshlrev_b16 v0.h, 4, v0.h
+; GFX1250-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1250-TRUE16-NEXT:    v_lshlrev_b16 v1.l, 8, v1.l
+; GFX1250-TRUE16-NEXT:    v_lshlrev_b16 v1.h, 12, v1.h
+; GFX1250-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-TRUE16-NEXT:    v_bitop3_b16 v0.l, v0.l, v0.h, 15 bitop3:0xec
+; GFX1250-TRUE16-NEXT:    v_or_b16 v0.h, v1.h, v1.l
+; GFX1250-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-TRUE16-NEXT:    v_bitop3_b16 v0.l, v0.l, v0.h, 0xff bitop3:0xec
+; GFX1250-TRUE16-NEXT:    flat_store_b16 v[0:1], v0
+; GFX1250-TRUE16-NEXT:    s_endpgm
+;
+; GFX1250-FAKE16-LABEL: amdgpu_cs_v16i1:
+; GFX1250-FAKE16:       ; %bb.0:
+; GFX1250-FAKE16-NEXT:    v_and_b32_e32 v6, 1, v6
+; GFX1250-FAKE16-NEXT:    v_lshlrev_b16 v7, 3, v7
+; GFX1250-FAKE16-NEXT:    v_and_b32_e32 v2, 1, v2
+; GFX1250-FAKE16-NEXT:    v_lshlrev_b16 v5, 1, v5
+; GFX1250-FAKE16-NEXT:    v_lshlrev_b16 v9, 1, v9
+; GFX1250-FAKE16-NEXT:    v_lshlrev_b16 v6, 2, v6
+; GFX1250-FAKE16-NEXT:    v_and_b32_e32 v8, 1, v8
+; GFX1250-FAKE16-NEXT:    v_lshlrev_b16 v11, 3, v11
+; GFX1250-FAKE16-NEXT:    v_lshlrev_b16 v13, 1, v13
+; GFX1250-FAKE16-NEXT:    v_lshlrev_b16 v3, 3, v3
+; GFX1250-FAKE16-NEXT:    v_or_b32_e32 v6, v7, v6
+; GFX1250-FAKE16-NEXT:    v_and_b32_e32 v7, 1, v10
+; GFX1250-FAKE16-NEXT:    v_and_b32_e32 v10, 1, v14
+; GFX1250-FAKE16-NEXT:    v_lshlrev_b16 v14, 3, v15
+; GFX1250-FAKE16-NEXT:    v_lshlrev_b16 v2, 2, v2
+; GFX1250-FAKE16-NEXT:    v_lshlrev_b16 v1, 1, v1
+; GFX1250-FAKE16-NEXT:    v_lshlrev_b16 v7, 2, v7
+; GFX1250-FAKE16-NEXT:    v_lshlrev_b16 v10, 2, v10
+; GFX1250-FAKE16-NEXT:    v_bitop3_b16 v4, v4, v5, 1 bitop3:0xec
+; GFX1250-FAKE16-NEXT:    v_bitop3_b16 v5, v8, 3, v9 bitop3:0xc8
+; GFX1250-FAKE16-NEXT:    v_bitop3_b16 v9, v12, v13, 1 bitop3:0xec
+; GFX1250-FAKE16-NEXT:    v_or_b32_e32 v7, v11, v7
+; GFX1250-FAKE16-NEXT:    v_or_b32_e32 v8, v14, v10
+; GFX1250-FAKE16-NEXT:    v_or_b32_e32 v2, v3, v2
+; GFX1250-FAKE16-NEXT:    v_bitop3_b16 v0, v0, v1, 1 bitop3:0xec
+; GFX1250-FAKE16-NEXT:    v_bitop3_b16 v1, v4, v6, 3 bitop3:0xec
+; GFX1250-FAKE16-NEXT:    v_bitop3_b16 v3, v5, 15, v7 bitop3:0xc8
+; GFX1250-FAKE16-NEXT:    v_bitop3_b16 v4, v9, v8, 3 bitop3:0xec
+; GFX1250-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1250-FAKE16-NEXT:    v_bitop3_b16 v0, v0, v2, 3 bitop3:0xec
+; GFX1250-FAKE16-NEXT:    v_lshlrev_b16 v1, 4, v1
+; GFX1250-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1250-FAKE16-NEXT:    v_lshlrev_b16 v2, 8, v3
+; GFX1250-FAKE16-NEXT:    v_lshlrev_b16 v3, 12, v4
+; GFX1250-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-FAKE16-NEXT:    v_bitop3_b16 v0, v0, v1, 15 bitop3:0xec
+; GFX1250-FAKE16-NEXT:    v_or_b32_e32 v1, v3, v2
+; GFX1250-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-FAKE16-NEXT:    v_bitop3_b16 v0, v0, v1, 0xff bitop3:0xec
+; GFX1250-FAKE16-NEXT:    global_store_b16 v[0:1], v0, off
+; GFX1250-FAKE16-NEXT:    s_endpgm
   store <16 x i1> %arg0, ptr addrspace(1) poison
   ret void
 }
@@ -2945,6 +3743,170 @@ define amdgpu_cs void @amdgpu_cs_v32i1(<32 x i1> %arg0) {
 ; GFX11-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX11-FAKE16-NEXT:    global_store_b32 v[0:1], v0, off
 ; GFX11-FAKE16-NEXT:    s_endpgm
+;
+; GFX1250-TRUE16-LABEL: amdgpu_cs_v32i1:
+; GFX1250-TRUE16:       ; %bb.0:
+; GFX1250-TRUE16-NEXT:    v_lshlrev_b16 v18.h, 3, v19.l
+; GFX1250-TRUE16-NEXT:    v_and_b16 v19.l, v22.l, 1
+; GFX1250-TRUE16-NEXT:    v_and_b16 v18.l, v18.l, 1
+; GFX1250-TRUE16-NEXT:    v_lshlrev_b16 v17.h, 3, v23.l
+; GFX1250-TRUE16-NEXT:    v_lshlrev_b16 v19.h, 1, v21.l
+; GFX1250-TRUE16-NEXT:    v_lshlrev_b16 v17.l, 1, v17.l
+; GFX1250-TRUE16-NEXT:    v_lshlrev_b16 v19.l, 2, v19.l
+; GFX1250-TRUE16-NEXT:    v_and_b16 v3.h, v6.l, 1
+; GFX1250-TRUE16-NEXT:    v_lshlrev_b16 v18.l, 2, v18.l
+; GFX1250-TRUE16-NEXT:    v_lshlrev_b16 v1.h, 3, v7.l
+; GFX1250-TRUE16-NEXT:    v_bitop3_b16 v16.l, v16.l, v17.l, 1 bitop3:0xec
+; GFX1250-TRUE16-NEXT:    v_or_b16 v16.h, v17.h, v19.l
+; GFX1250-TRUE16-NEXT:    v_bitop3_b16 v17.h, v20.l, v19.h, 1 bitop3:0xec
+; GFX1250-TRUE16-NEXT:    v_and_b16 v17.l, v26.l, 1
+; GFX1250-TRUE16-NEXT:    v_lshlrev_b16 v3.h, 2, v3.h
+; GFX1250-TRUE16-NEXT:    v_or_b16 v18.l, v18.h, v18.l
+; GFX1250-TRUE16-NEXT:    v_and_b16 v18.h, v24.l, 1
+; GFX1250-TRUE16-NEXT:    v_bitop3_b16 v16.h, v17.h, v16.h, 3 bitop3:0xec
+; GFX1250-TRUE16-NEXT:    v_lshlrev_b16 v17.h, 1, v25.l
+; GFX1250-TRUE16-NEXT:    v_lshlrev_b16 v19.l, 3, v27.l
+; GFX1250-TRUE16-NEXT:    v_lshlrev_b16 v17.l, 2, v17.l
+; GFX1250-TRUE16-NEXT:    v_and_b16 v19.h, v30.l, 1
+; GFX1250-TRUE16-NEXT:    v_or_b16 v0.h, v1.h, v3.h
+; GFX1250-TRUE16-NEXT:    v_lshlrev_b16 v1.h, 1, v5.l
+; GFX1250-TRUE16-NEXT:    v_and_b16 v5.l, v10.l, 1
+; GFX1250-TRUE16-NEXT:    v_and_b16 v5.h, v14.l, 1
+; GFX1250-TRUE16-NEXT:    v_bitop3_b16 v17.h, v18.h, 3, v17.h bitop3:0xc8
+; GFX1250-TRUE16-NEXT:    v_or_b16 v17.l, v19.l, v17.l
+; GFX1250-TRUE16-NEXT:    v_lshlrev_b16 v18.h, 3, v31.l
+; GFX1250-TRUE16-NEXT:    v_lshlrev_b16 v19.l, 2, v19.h
+; GFX1250-TRUE16-NEXT:    v_lshlrev_b16 v19.h, 1, v29.l
+; GFX1250-TRUE16-NEXT:    v_and_b16 v2.l, v2.l, 1
+; GFX1250-TRUE16-NEXT:    v_lshlrev_b16 v3.h, 1, v9.l
+; GFX1250-TRUE16-NEXT:    v_and_b16 v4.h, v8.l, 1
+; GFX1250-TRUE16-NEXT:    v_lshlrev_b16 v6.l, 3, v11.l
+; GFX1250-TRUE16-NEXT:    v_lshlrev_b16 v5.l, 2, v5.l
+; GFX1250-TRUE16-NEXT:    v_lshlrev_b16 v6.h, 3, v15.l
+; GFX1250-TRUE16-NEXT:    v_lshlrev_b16 v5.h, 2, v5.h
+; GFX1250-TRUE16-NEXT:    v_lshlrev_b16 v7.l, 1, v13.l
+; GFX1250-TRUE16-NEXT:    v_bitop3_b16 v17.l, v17.h, 15, v17.l bitop3:0xc8
+; GFX1250-TRUE16-NEXT:    v_bitop3_b16 v16.l, v16.l, v18.l, 3 bitop3:0xec
+; GFX1250-TRUE16-NEXT:    v_or_b16 v17.h, v18.h, v19.l
+; GFX1250-TRUE16-NEXT:    v_bitop3_b16 v18.l, v28.l, v19.h, 1 bitop3:0xec
+; GFX1250-TRUE16-NEXT:    v_lshlrev_b16 v3.l, 3, v3.l
+; GFX1250-TRUE16-NEXT:    v_lshlrev_b16 v2.l, 2, v2.l
+; GFX1250-TRUE16-NEXT:    v_lshlrev_b16 v1.l, 1, v1.l
+; GFX1250-TRUE16-NEXT:    v_bitop3_b16 v1.h, v4.l, v1.h, 1 bitop3:0xec
+; GFX1250-TRUE16-NEXT:    v_bitop3_b16 v3.h, v4.h, 3, v3.h bitop3:0xc8
+; GFX1250-TRUE16-NEXT:    v_or_b16 v4.l, v6.l, v5.l
+; GFX1250-TRUE16-NEXT:    v_or_b16 v4.h, v6.h, v5.h
+; GFX1250-TRUE16-NEXT:    v_bitop3_b16 v5.l, v12.l, v7.l, 1 bitop3:0xec
+; GFX1250-TRUE16-NEXT:    v_bitop3_b16 v2.h, v18.l, v17.h, 3 bitop3:0xec
+; GFX1250-TRUE16-NEXT:    v_or_b16 v2.l, v3.l, v2.l
+; GFX1250-TRUE16-NEXT:    v_bitop3_b16 v0.l, v0.l, v1.l, 1 bitop3:0xec
+; GFX1250-TRUE16-NEXT:    v_bitop3_b16 v0.h, v1.h, v0.h, 3 bitop3:0xec
+; GFX1250-TRUE16-NEXT:    v_bitop3_b16 v1.l, v3.h, 15, v4.l bitop3:0xc8
+; GFX1250-TRUE16-NEXT:    v_bitop3_b16 v1.h, v5.l, v4.h, 3 bitop3:0xec
+; GFX1250-TRUE16-NEXT:    v_lshlrev_b16 v16.h, 4, v16.h
+; GFX1250-TRUE16-NEXT:    v_lshlrev_b16 v17.l, 8, v17.l
+; GFX1250-TRUE16-NEXT:    v_lshlrev_b16 v2.h, 12, v2.h
+; GFX1250-TRUE16-NEXT:    v_bitop3_b16 v0.l, v0.l, v2.l, 3 bitop3:0xec
+; GFX1250-TRUE16-NEXT:    v_lshlrev_b16 v0.h, 4, v0.h
+; GFX1250-TRUE16-NEXT:    v_lshlrev_b16 v1.l, 8, v1.l
+; GFX1250-TRUE16-NEXT:    v_lshlrev_b16 v1.h, 12, v1.h
+; GFX1250-TRUE16-NEXT:    v_bitop3_b16 v2.l, v16.l, v16.h, 15 bitop3:0xec
+; GFX1250-TRUE16-NEXT:    v_or_b16 v2.h, v2.h, v17.l
+; GFX1250-TRUE16-NEXT:    v_bitop3_b16 v0.l, v0.l, v0.h, 15 bitop3:0xec
+; GFX1250-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX1250-TRUE16-NEXT:    v_or_b16 v0.h, v1.h, v1.l
+; GFX1250-TRUE16-NEXT:    v_mov_b16_e32 v1.l, 0
+; GFX1250-TRUE16-NEXT:    v_bitop3_b16 v1.h, v2.l, v2.h, 0xff bitop3:0xec
+; GFX1250-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1250-TRUE16-NEXT:    v_bitop3_b16 v0.l, v0.l, v0.h, 0xff bitop3:0xec
+; GFX1250-TRUE16-NEXT:    v_mov_b16_e32 v0.h, v1.l
+; GFX1250-TRUE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-TRUE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX1250-TRUE16-NEXT:    global_store_b32 v[0:1], v0, off
+; GFX1250-TRUE16-NEXT:    s_endpgm
+;
+; GFX1250-FAKE16-LABEL: amdgpu_cs_v32i1:
+; GFX1250-FAKE16:       ; %bb.0:
+; GFX1250-FAKE16-NEXT:    v_and_b32_e32 v2, 1, v2
+; GFX1250-FAKE16-NEXT:    v_and_b32_e32 v6, 1, v6
+; GFX1250-FAKE16-NEXT:    v_lshlrev_b16 v3, 3, v3
+; GFX1250-FAKE16-NEXT:    v_lshlrev_b16 v7, 3, v7
+; GFX1250-FAKE16-NEXT:    v_lshlrev_b16 v5, 1, v5
+; GFX1250-FAKE16-NEXT:    v_lshlrev_b16 v2, 2, v2
+; GFX1250-FAKE16-NEXT:    v_lshlrev_b16 v6, 2, v6
+; GFX1250-FAKE16-NEXT:    v_lshlrev_b16 v1, 1, v1
+; GFX1250-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1250-FAKE16-NEXT:    v_bitop3_b16 v4, v4, v5, 1 bitop3:0xec
+; GFX1250-FAKE16-NEXT:    v_or_b32_e32 v2, v3, v2
+; GFX1250-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4)
+; GFX1250-FAKE16-NEXT:    v_or_b32_e32 v3, v7, v6
+; GFX1250-FAKE16-NEXT:    v_and_b32_e32 v6, 1, v10
+; GFX1250-FAKE16-NEXT:    v_lshlrev_b16 v5, 1, v9
+; GFX1250-FAKE16-NEXT:    v_and_b32_e32 v7, 1, v8
+; GFX1250-FAKE16-NEXT:    v_lshlrev_b16 v8, 3, v11
+; GFX1250-FAKE16-NEXT:    v_bitop3_b16 v3, v4, v3, 3 bitop3:0xec
+; GFX1250-FAKE16-NEXT:    v_lshlrev_b16 v6, 2, v6
+; GFX1250-FAKE16-NEXT:    v_bitop3_b16 v0, v0, v1, 1 bitop3:0xec
+; GFX1250-FAKE16-NEXT:    v_bitop3_b16 v4, v7, 3, v5 bitop3:0xc8
+; GFX1250-FAKE16-NEXT:    v_and_b32_e32 v1, 1, v14
+; GFX1250-FAKE16-NEXT:    v_lshlrev_b16 v7, 3, v23
+; GFX1250-FAKE16-NEXT:    v_or_b32_e32 v5, v8, v6
+; GFX1250-FAKE16-NEXT:    v_lshlrev_b16 v6, 3, v15
+; GFX1250-FAKE16-NEXT:    v_bitop3_b16 v0, v0, v2, 3 bitop3:0xec
+; GFX1250-FAKE16-NEXT:    v_lshlrev_b16 v1, 2, v1
+; GFX1250-FAKE16-NEXT:    v_lshlrev_b16 v2, 1, v13
+; GFX1250-FAKE16-NEXT:    v_bitop3_b16 v4, v4, 15, v5 bitop3:0xc8
+; GFX1250-FAKE16-NEXT:    v_and_b32_e32 v5, 1, v22
+; GFX1250-FAKE16-NEXT:    v_and_b32_e32 v9, 1, v26
+; GFX1250-FAKE16-NEXT:    v_and_b32_e32 v11, 1, v30
+; GFX1250-FAKE16-NEXT:    v_or_b32_e32 v1, v6, v1
+; GFX1250-FAKE16-NEXT:    v_bitop3_b16 v2, v12, v2, 1 bitop3:0xec
+; GFX1250-FAKE16-NEXT:    v_lshlrev_b16 v5, 2, v5
+; GFX1250-FAKE16-NEXT:    v_and_b32_e32 v6, 1, v18
+; GFX1250-FAKE16-NEXT:    v_lshlrev_b16 v10, 1, v25
+; GFX1250-FAKE16-NEXT:    v_and_b32_e32 v12, 1, v24
+; GFX1250-FAKE16-NEXT:    v_lshlrev_b16 v13, 3, v27
+; GFX1250-FAKE16-NEXT:    v_or_b32_e32 v5, v7, v5
+; GFX1250-FAKE16-NEXT:    v_lshlrev_b16 v7, 1, v21
+; GFX1250-FAKE16-NEXT:    v_lshlrev_b16 v9, 2, v9
+; GFX1250-FAKE16-NEXT:    v_lshlrev_b16 v14, 3, v31
+; GFX1250-FAKE16-NEXT:    v_lshlrev_b16 v11, 2, v11
+; GFX1250-FAKE16-NEXT:    v_lshlrev_b16 v15, 1, v29
+; GFX1250-FAKE16-NEXT:    v_bitop3_b16 v1, v2, v1, 3 bitop3:0xec
+; GFX1250-FAKE16-NEXT:    v_lshlrev_b16 v2, 3, v19
+; GFX1250-FAKE16-NEXT:    v_lshlrev_b16 v6, 2, v6
+; GFX1250-FAKE16-NEXT:    v_lshlrev_b16 v8, 1, v17
+; GFX1250-FAKE16-NEXT:    v_bitop3_b16 v7, v20, v7, 1 bitop3:0xec
+; GFX1250-FAKE16-NEXT:    v_bitop3_b16 v10, v12, 3, v10 bitop3:0xc8
+; GFX1250-FAKE16-NEXT:    v_or_b32_e32 v9, v13, v9
+; GFX1250-FAKE16-NEXT:    v_or_b32_e32 v11, v14, v11
+; GFX1250-FAKE16-NEXT:    v_bitop3_b16 v12, v28, v15, 1 bitop3:0xec
+; GFX1250-FAKE16-NEXT:    v_or_b32_e32 v2, v2, v6
+; GFX1250-FAKE16-NEXT:    v_bitop3_b16 v6, v16, v8, 1 bitop3:0xec
+; GFX1250-FAKE16-NEXT:    v_bitop3_b16 v5, v7, v5, 3 bitop3:0xec
+; GFX1250-FAKE16-NEXT:    v_bitop3_b16 v7, v10, 15, v9 bitop3:0xc8
+; GFX1250-FAKE16-NEXT:    v_bitop3_b16 v8, v12, v11, 3 bitop3:0xec
+; GFX1250-FAKE16-NEXT:    v_lshlrev_b16 v3, 4, v3
+; GFX1250-FAKE16-NEXT:    v_lshlrev_b16 v4, 8, v4
+; GFX1250-FAKE16-NEXT:    v_lshlrev_b16 v1, 12, v1
+; GFX1250-FAKE16-NEXT:    v_bitop3_b16 v2, v6, v2, 3 bitop3:0xec
+; GFX1250-FAKE16-NEXT:    v_lshlrev_b16 v5, 4, v5
+; GFX1250-FAKE16-NEXT:    v_lshlrev_b16 v6, 8, v7
+; GFX1250-FAKE16-NEXT:    v_lshlrev_b16 v7, 12, v8
+; GFX1250-FAKE16-NEXT:    v_bitop3_b16 v0, v0, v3, 15 bitop3:0xec
+; GFX1250-FAKE16-NEXT:    v_or_b32_e32 v1, v1, v4
+; GFX1250-FAKE16-NEXT:    v_bitop3_b16 v2, v2, v5, 15 bitop3:0xec
+; GFX1250-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1250-FAKE16-NEXT:    v_or_b32_e32 v3, v7, v6
+; GFX1250-FAKE16-NEXT:    v_bitop3_b16 v0, v0, v1, 0xff bitop3:0xec
+; GFX1250-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-FAKE16-NEXT:    v_bitop3_b16 v1, v2, v3, 0xff bitop3:0xec
+; GFX1250-FAKE16-NEXT:    v_and_b32_e32 v0, 0xffff, v0
+; GFX1250-FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-FAKE16-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX1250-FAKE16-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX1250-FAKE16-NEXT:    global_store_b32 v[0:1], v0, off
+; GFX1250-FAKE16-NEXT:    s_endpgm
   store <32 x i1> %arg0, ptr addrspace(1) poison
   ret void
 }
@@ -2973,6 +3935,14 @@ define amdgpu_cs void @amdgpu_cs_inreg_i1(i1 inreg %arg0) {
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-NEXT:    global_store_b8 v[0:1], v0, off
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: amdgpu_cs_inreg_i1:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_and_b32 s0, s0, 1
+; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX1250-NEXT:    s_endpgm
   store i1 %arg0, ptr addrspace(1) poison
   ret void
 }
@@ -3061,6 +4031,34 @@ define amdgpu_cs void @amdgpu_cs_inreg_v8i1(<8 x i1> inreg %arg0) {
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-NEXT:    global_store_b8 v[0:1], v0, off
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: amdgpu_cs_inreg_v8i1:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_and_b32 s6, s6, 1
+; GFX1250-NEXT:    s_lshl_b32 s5, s5, 1
+; GFX1250-NEXT:    s_and_b32 s4, s4, 1
+; GFX1250-NEXT:    s_and_b32 s2, s2, 1
+; GFX1250-NEXT:    s_lshl_b32 s1, s1, 1
+; GFX1250-NEXT:    s_and_b32 s0, s0, 1
+; GFX1250-NEXT:    s_lshl_b32 s7, s7, 3
+; GFX1250-NEXT:    s_lshl_b32 s6, s6, 2
+; GFX1250-NEXT:    s_or_b32 s4, s4, s5
+; GFX1250-NEXT:    s_lshl_b32 s3, s3, 3
+; GFX1250-NEXT:    s_lshl_b32 s2, s2, 2
+; GFX1250-NEXT:    s_or_b32 s0, s0, s1
+; GFX1250-NEXT:    s_or_b32 s5, s7, s6
+; GFX1250-NEXT:    s_and_b32 s4, s4, 3
+; GFX1250-NEXT:    s_or_b32 s1, s3, s2
+; GFX1250-NEXT:    s_and_b32 s0, s0, 3
+; GFX1250-NEXT:    s_or_b32 s2, s4, s5
+; GFX1250-NEXT:    s_or_b32 s0, s0, s1
+; GFX1250-NEXT:    s_lshl_b32 s1, s2, 4
+; GFX1250-NEXT:    s_and_b32 s0, s0, 15
+; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1250-NEXT:    s_or_b32 s0, s0, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX1250-NEXT:    s_endpgm
   store <8 x i1> %arg0, ptr addrspace(1) poison
   ret void
 }
@@ -3221,6 +4219,58 @@ define amdgpu_cs void @amdgpu_cs_inreg_v16i1(<16 x i1> inreg %arg0) {
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-NEXT:    global_store_b16 v[0:1], v0, off
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: amdgpu_cs_inreg_v16i1:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_and_b32 s10, s10, 1
+; GFX1250-NEXT:    s_lshl_b32 s9, s9, 1
+; GFX1250-NEXT:    s_and_b32 s8, s8, 1
+; GFX1250-NEXT:    s_and_b32 s6, s6, 1
+; GFX1250-NEXT:    s_lshl_b32 s5, s5, 1
+; GFX1250-NEXT:    s_and_b32 s4, s4, 1
+; GFX1250-NEXT:    s_and_b32 s2, s2, 1
+; GFX1250-NEXT:    s_lshl_b32 s1, s1, 1
+; GFX1250-NEXT:    s_and_b32 s0, s0, 1
+; GFX1250-NEXT:    s_and_b32 s14, s14, 1
+; GFX1250-NEXT:    s_lshl_b32 s13, s13, 1
+; GFX1250-NEXT:    s_and_b32 s12, s12, 1
+; GFX1250-NEXT:    s_lshl_b32 s11, s11, 3
+; GFX1250-NEXT:    s_lshl_b32 s10, s10, 2
+; GFX1250-NEXT:    s_or_b32 s8, s8, s9
+; GFX1250-NEXT:    s_lshl_b32 s7, s7, 3
+; GFX1250-NEXT:    s_lshl_b32 s6, s6, 2
+; GFX1250-NEXT:    s_or_b32 s4, s4, s5
+; GFX1250-NEXT:    s_lshl_b32 s3, s3, 3
+; GFX1250-NEXT:    s_lshl_b32 s2, s2, 2
+; GFX1250-NEXT:    s_or_b32 s0, s0, s1
+; GFX1250-NEXT:    s_lshl_b32 s15, s15, 3
+; GFX1250-NEXT:    s_lshl_b32 s14, s14, 2
+; GFX1250-NEXT:    s_or_b32 s12, s12, s13
+; GFX1250-NEXT:    s_or_b32 s9, s11, s10
+; GFX1250-NEXT:    s_and_b32 s8, s8, 3
+; GFX1250-NEXT:    s_or_b32 s5, s7, s6
+; GFX1250-NEXT:    s_and_b32 s4, s4, 3
+; GFX1250-NEXT:    s_or_b32 s1, s3, s2
+; GFX1250-NEXT:    s_and_b32 s0, s0, 3
+; GFX1250-NEXT:    s_or_b32 s13, s15, s14
+; GFX1250-NEXT:    s_and_b32 s12, s12, 3
+; GFX1250-NEXT:    s_or_b32 s8, s8, s9
+; GFX1250-NEXT:    s_or_b32 s2, s4, s5
+; GFX1250-NEXT:    s_or_b32 s0, s0, s1
+; GFX1250-NEXT:    s_or_b32 s10, s12, s13
+; GFX1250-NEXT:    s_and_b32 s8, s8, 15
+; GFX1250-NEXT:    s_lshl_b32 s1, s2, 4
+; GFX1250-NEXT:    s_and_b32 s0, s0, 15
+; GFX1250-NEXT:    s_lshl_b32 s9, s10, 12
+; GFX1250-NEXT:    s_lshl_b32 s2, s8, 8
+; GFX1250-NEXT:    s_or_b32 s0, s0, s1
+; GFX1250-NEXT:    s_or_b32 s1, s9, s2
+; GFX1250-NEXT:    s_and_b32 s0, s0, 0xff
+; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1250-NEXT:    s_or_b32 s0, s0, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-NEXT:    global_store_b16 v[0:1], v0, off
+; GFX1250-NEXT:    s_endpgm
   store <16 x i1> %arg0, ptr addrspace(1) poison
   ret void
 }
@@ -3525,6 +4575,106 @@ define amdgpu_cs void @amdgpu_cs_inreg_v32i1(<32 x i1> inreg %arg0) {
 ; GFX11-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX11-NEXT:    global_store_b32 v[0:1], v0, off
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: amdgpu_cs_inreg_v32i1:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_and_b32 s10, s10, 1
+; GFX1250-NEXT:    s_lshl_b32 s9, s9, 1
+; GFX1250-NEXT:    s_and_b32 s8, s8, 1
+; GFX1250-NEXT:    s_and_b32 s14, s14, 1
+; GFX1250-NEXT:    s_lshl_b32 s13, s13, 1
+; GFX1250-NEXT:    s_and_b32 s12, s12, 1
+; GFX1250-NEXT:    s_lshl_b32 s11, s11, 3
+; GFX1250-NEXT:    s_lshl_b32 s10, s10, 2
+; GFX1250-NEXT:    s_or_b32 s8, s8, s9
+; GFX1250-NEXT:    s_and_b32 s6, s6, 1
+; GFX1250-NEXT:    s_lshl_b32 s5, s5, 1
+; GFX1250-NEXT:    s_and_b32 s4, s4, 1
+; GFX1250-NEXT:    s_and_b32 s2, s2, 1
+; GFX1250-NEXT:    s_lshl_b32 s1, s1, 1
+; GFX1250-NEXT:    s_and_b32 s0, s0, 1
+; GFX1250-NEXT:    s_lshl_b32 s15, s15, 3
+; GFX1250-NEXT:    s_lshl_b32 s14, s14, 2
+; GFX1250-NEXT:    s_or_b32 s12, s12, s13
+; GFX1250-NEXT:    s_or_b32 s9, s11, s10
+; GFX1250-NEXT:    s_and_b32 s8, s8, 3
+; GFX1250-NEXT:    s_lshl_b32 s7, s7, 3
+; GFX1250-NEXT:    s_lshl_b32 s6, s6, 2
+; GFX1250-NEXT:    s_or_b32 s4, s4, s5
+; GFX1250-NEXT:    s_lshl_b32 s3, s3, 3
+; GFX1250-NEXT:    s_lshl_b32 s2, s2, 2
+; GFX1250-NEXT:    s_or_b32 s0, s0, s1
+; GFX1250-NEXT:    s_or_b32 s13, s15, s14
+; GFX1250-NEXT:    s_and_b32 s12, s12, 3
+; GFX1250-NEXT:    s_or_b32 s8, s8, s9
+; GFX1250-NEXT:    s_or_b32 s5, s7, s6
+; GFX1250-NEXT:    s_and_b32 s4, s4, 3
+; GFX1250-NEXT:    s_or_b32 s1, s3, s2
+; GFX1250-NEXT:    s_and_b32 s0, s0, 3
+; GFX1250-NEXT:    s_or_b32 s10, s12, s13
+; GFX1250-NEXT:    s_and_b32 s8, s8, 15
+; GFX1250-NEXT:    s_or_b32 s2, s4, s5
+; GFX1250-NEXT:    s_or_b32 s0, s0, s1
+; GFX1250-NEXT:    s_lshl_b32 s9, s10, 12
+; GFX1250-NEXT:    s_lshl_b32 s1, s2, 4
+; GFX1250-NEXT:    s_and_b32 s0, s0, 15
+; GFX1250-NEXT:    s_lshl_b32 s2, s8, 8
+; GFX1250-NEXT:    s_and_b32 s3, s30, 1
+; GFX1250-NEXT:    s_lshl_b32 s4, s29, 1
+; GFX1250-NEXT:    s_and_b32 s5, s28, 1
+; GFX1250-NEXT:    s_or_b32 s0, s0, s1
+; GFX1250-NEXT:    s_or_b32 s1, s9, s2
+; GFX1250-NEXT:    s_lshl_b32 s2, s31, 3
+; GFX1250-NEXT:    s_lshl_b32 s3, s3, 2
+; GFX1250-NEXT:    s_or_b32 s4, s5, s4
+; GFX1250-NEXT:    s_and_b32 s5, s26, 1
+; GFX1250-NEXT:    s_lshl_b32 s6, s25, 1
+; GFX1250-NEXT:    s_and_b32 s7, s24, 1
+; GFX1250-NEXT:    s_or_b32 s2, s2, s3
+; GFX1250-NEXT:    s_and_b32 s3, s4, 3
+; GFX1250-NEXT:    s_lshl_b32 s4, s27, 3
+; GFX1250-NEXT:    s_lshl_b32 s5, s5, 2
+; GFX1250-NEXT:    s_or_b32 s6, s7, s6
+; GFX1250-NEXT:    s_or_b32 s4, s4, s5
+; GFX1250-NEXT:    s_and_b32 s5, s6, 3
+; GFX1250-NEXT:    s_or_b32 s2, s3, s2
+; GFX1250-NEXT:    s_or_b32 s3, s5, s4
+; GFX1250-NEXT:    s_and_b32 s5, s22, 1
+; GFX1250-NEXT:    s_lshl_b32 s6, s21, 1
+; GFX1250-NEXT:    s_and_b32 s7, s20, 1
+; GFX1250-NEXT:    s_lshl_b32 s4, s23, 3
+; GFX1250-NEXT:    s_lshl_b32 s5, s5, 2
+; GFX1250-NEXT:    s_or_b32 s6, s7, s6
+; GFX1250-NEXT:    s_and_b32 s7, s18, 1
+; GFX1250-NEXT:    s_lshl_b32 s8, s17, 1
+; GFX1250-NEXT:    s_and_b32 s9, s16, 1
+; GFX1250-NEXT:    s_or_b32 s4, s4, s5
+; GFX1250-NEXT:    s_and_b32 s5, s6, 3
+; GFX1250-NEXT:    s_lshl_b32 s6, s19, 3
+; GFX1250-NEXT:    s_lshl_b32 s7, s7, 2
+; GFX1250-NEXT:    s_or_b32 s8, s9, s8
+; GFX1250-NEXT:    s_or_b32 s6, s6, s7
+; GFX1250-NEXT:    s_and_b32 s7, s8, 3
+; GFX1250-NEXT:    s_or_b32 s4, s5, s4
+; GFX1250-NEXT:    s_or_b32 s5, s7, s6
+; GFX1250-NEXT:    s_and_b32 s3, s3, 15
+; GFX1250-NEXT:    s_lshl_b32 s4, s4, 4
+; GFX1250-NEXT:    s_and_b32 s5, s5, 15
+; GFX1250-NEXT:    s_lshl_b32 s2, s2, 12
+; GFX1250-NEXT:    s_lshl_b32 s3, s3, 8
+; GFX1250-NEXT:    s_or_b32 s4, s5, s4
+; GFX1250-NEXT:    s_and_b32 s0, s0, 0xff
+; GFX1250-NEXT:    s_or_b32 s2, s2, s3
+; GFX1250-NEXT:    s_and_b32 s3, s4, 0xff
+; GFX1250-NEXT:    s_or_b32 s0, s0, s1
+; GFX1250-NEXT:    s_or_b32 s1, s3, s2
+; GFX1250-NEXT:    s_and_b32 s0, s0, 0xffff
+; GFX1250-NEXT:    s_lshl_b32 s1, s1, 16
+; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1250-NEXT:    s_or_b32 s0, s0, s1
+; GFX1250-NEXT:    v_mov_b32_e32 v0, s0
+; GFX1250-NEXT:    global_store_b32 v[0:1], v0, off
+; GFX1250-NEXT:    s_endpgm
   store <32 x i1> %arg0, ptr addrspace(1) poison
   ret void
 }
@@ -3549,6 +4699,12 @@ define amdgpu_cs void @amdgpu_cs_i1_sext(i1 signext %arg0) {
 ; GFX11-NEXT:    v_and_b32_e32 v0, 1, v0
 ; GFX11-NEXT:    global_store_b8 v[0:1], v0, off
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: amdgpu_cs_i1_sext:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    v_and_b32_e32 v0, 1, v0
+; GFX1250-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX1250-NEXT:    s_endpgm
   store i1 %arg0, ptr addrspace(1) poison
   ret void
 }
@@ -3570,6 +4726,11 @@ define amdgpu_cs void @amdgpu_cs_i1_zext(i1 zeroext %arg0) {
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    global_store_b8 v[0:1], v0, off
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: amdgpu_cs_i1_zext:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    global_store_b8 v[0:1], v0, off
+; GFX1250-NEXT:    s_endpgm
   store i1 %arg0, ptr addrspace(1) poison
   ret void
 }
diff --git a/llvm/test/CodeGen/AMDGPU/hard-clauses-load-monitor.mir b/llvm/test/CodeGen/AMDGPU/hard-clauses-load-monitor.mir
new file mode 100644
index 0000000000000..db4b946be25bb
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/hard-clauses-load-monitor.mir
@@ -0,0 +1,38 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -run-pass si-insert-hard-clauses %s -o - | FileCheck %s -check-prefixes=GCN
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1250 -run-pass si-insert-hard-clauses %s -o - | FileCheck %s -check-prefixes=GCN
+
+---
+name: async_load_flat_monitor_load
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr1, $vgpr2
+    ; GCN-LABEL: name: async_load_flat_monitor_load
+    ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: GLOBAL_LOAD_ASYNC_TO_LDS_B32 $vgpr2, $vgpr0_vgpr1, 0, 0, implicit-def $asynccnt, implicit $exec, implicit $asynccnt
+    ; GCN-NEXT: $vgpr0 = FLAT_LOAD_MONITOR_B32 $vgpr0_vgpr1, 8, 0, implicit $exec, implicit $flat_scr
+    GLOBAL_LOAD_ASYNC_TO_LDS_B32 $vgpr2, $vgpr0_vgpr1, 0, 0, implicit-def $asynccnt, implicit $exec, implicit $asynccnt
+    $vgpr0 = FLAT_LOAD_MONITOR_B32 $vgpr0_vgpr1, 8, 0, implicit $exec, implicit $flat_scr
+...
+
+---
+name: flat_monitor_loads_flat_load
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr1
+    ; GCN-LABEL: name: flat_monitor_loads_flat_load
+    ; GCN: liveins: $vgpr0, $vgpr1
+    ; GCN-NEXT: {{  $}}
+    ; GCN-NEXT: BUNDLE implicit-def $vgpr4, implicit-def $vgpr3, implicit-def $vgpr0, implicit $vgpr0_vgpr1, implicit $exec, implicit $flat_scr {
+    ; GCN-NEXT:   S_CLAUSE 2
+    ; GCN-NEXT:   $vgpr4 = FLAT_LOAD_MONITOR_B32 $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
+    ; GCN-NEXT:   $vgpr3 = FLAT_LOAD_MONITOR_B32 $vgpr0_vgpr1, 8, 0, implicit $exec, implicit $flat_scr
+    ; GCN-NEXT:   $vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
+    ; GCN-NEXT: }
+    $vgpr4 = FLAT_LOAD_MONITOR_B32 $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
+    $vgpr3 = FLAT_LOAD_MONITOR_B32 $vgpr0_vgpr1, 8, 0, implicit $exec, implicit $flat_scr
+    $vgpr0 = FLAT_LOAD_DWORD $vgpr0_vgpr1, 0, 0, implicit $exec, implicit $flat_scr
+...
diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll
index d8c983a081b98..b81fdd36530da 100644
--- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2bf16.ll
@@ -3,6 +3,7 @@
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tonga < %s | FileCheck -check-prefix=VI %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX900 %s
 ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 < %s | FileCheck -check-prefix=GFX942 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 < %s | FileCheck -check-prefix=GFX1250 %s
 
 define amdgpu_kernel void @s_insertelement_v2bf16_0(ptr addrspace(1) %out, ptr addrspace(4) %vec.ptr) #0 {
 ; SI-LABEL: s_insertelement_v2bf16_0:
@@ -61,6 +62,19 @@ define amdgpu_kernel void @s_insertelement_v2bf16_0(ptr addrspace(1) %out, ptr a
 ; GFX942-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX942-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX942-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: s_insertelement_v2bf16_0:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[2:3], 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    s_lshr_b32 s2, s2, 16
+; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX1250-NEXT:    s_pack_ll_b32_b16 s2, 0x40a0, s2
+; GFX1250-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX1250-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT:    s_endpgm
   %vec = load <2 x bfloat>, ptr addrspace(4) %vec.ptr
   %vecins = insertelement <2 x bfloat> %vec, bfloat 5.000000e+00, i32 0
   store <2 x bfloat> %vecins, ptr addrspace(1) %out
@@ -122,6 +136,18 @@ define amdgpu_kernel void @s_insertelement_v2bf16_1(ptr addrspace(1) %out, ptr a
 ; GFX942-NEXT:    v_mov_b32_e32 v1, s2
 ; GFX942-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX942-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: s_insertelement_v2bf16_1:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    s_load_b32 s2, s[2:3], 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    s_pack_ll_b32_b16 s2, s2, 0x40a0
+; GFX1250-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT:    v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX1250-NEXT:    global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT:    s_endpgm
   %vec = load <2 x bfloat>, ptr addrspace(4) %vec.ptr
   %vecins = insertelement <2 x bfloat> %vec, bfloat 5.000000e+00, i32 1
   store <2 x bfloat> %vecins, ptr addrspace(1) %out
@@ -193,6 +219,19 @@ define amdgpu_kernel void @v_insertelement_v2bf16_0(ptr addrspace(1) %out, ptr a
 ; GFX942-NEXT:    v_bfi_b32 v1, s2, v2, v1
 ; GFX942-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX942-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: v_insertelement_v2bf16_0:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
+; GFX1250-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    global_load_b32 v1, v0, s[2:3] scale_offset
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    s_movk_i32 s2, 0x40a0
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    v_bfi_b32 v1, 0xffff, s2, v1
+; GFX1250-NEXT:    global_store_b32 v0, v1, s[0:1] scale_offset
+; GFX1250-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
   %in.gep = getelementptr inbounds <2 x bfloat>, ptr addrspace(1) %in, i64 %tid.ext
@@ -266,6 +305,17 @@ define amdgpu_kernel void @v_insertelement_v2bf16_0_inlineimm(ptr addrspace(1) %
 ; GFX942-NEXT:    v_bfi_b32 v1, s2, 53, v1
 ; GFX942-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX942-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: v_insertelement_v2bf16_0_inlineimm:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
+; GFX1250-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    global_load_b32 v1, v0, s[2:3] scale_offset
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    v_bfi_b32 v1, 0xffff, 53, v1
+; GFX1250-NEXT:    global_store_b32 v0, v1, s[0:1] scale_offset
+; GFX1250-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
   %in.gep = getelementptr inbounds <2 x bfloat>, ptr addrspace(1) %in, i64 %tid.ext
@@ -341,6 +391,19 @@ define amdgpu_kernel void @v_insertelement_v2bf16_1(ptr addrspace(1) %out, ptr a
 ; GFX942-NEXT:    v_perm_b32 v1, s2, v1, v2
 ; GFX942-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX942-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: v_insertelement_v2bf16_1:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
+; GFX1250-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    global_load_b32 v1, v0, s[2:3] scale_offset
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    s_movk_i32 s2, 0x40a0
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    v_perm_b32 v1, s2, v1, 0x5040100
+; GFX1250-NEXT:    global_store_b32 v0, v1, s[0:1] scale_offset
+; GFX1250-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
   %in.gep = getelementptr inbounds <2 x bfloat>, ptr addrspace(1) %in, i64 %tid.ext
@@ -414,6 +477,17 @@ define amdgpu_kernel void @v_insertelement_v2bf16_1_inlineimm(ptr addrspace(1) %
 ; GFX942-NEXT:    v_perm_b32 v1, 35, v1, v2
 ; GFX942-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX942-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: v_insertelement_v2bf16_1_inlineimm:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
+; GFX1250-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    global_load_b32 v1, v0, s[2:3] scale_offset
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    v_perm_b32 v1, 35, v1, 0x5040100
+; GFX1250-NEXT:    global_store_b32 v0, v1, s[0:1] scale_offset
+; GFX1250-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
   %in.gep = getelementptr inbounds <2 x bfloat>, ptr addrspace(1) %in, i64 %tid.ext
@@ -514,6 +588,25 @@ define amdgpu_kernel void @v_insertelement_v2bf16_dynamic_vgpr(ptr addrspace(1)
 ; GFX942-NEXT:    v_bfi_b32 v1, v1, s2, v2
 ; GFX942-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX942-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: v_insertelement_v2bf16_dynamic_vgpr:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_clause 0x1
+; GFX1250-NEXT:    s_load_b64 s[6:7], s[4:5], 0x10
+; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
+; GFX1250-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    s_clause 0x1
+; GFX1250-NEXT:    global_load_b32 v1, v0, s[6:7] scale_offset
+; GFX1250-NEXT:    global_load_b32 v2, v0, s[2:3] scale_offset
+; GFX1250-NEXT:    s_wait_loadcnt 0x1
+; GFX1250-NEXT:    v_lshlrev_b32_e32 v1, 4, v1
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX1250-NEXT:    v_lshlrev_b32_e64 v1, v1, 0xffff
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    v_bitop3_b32 v1, 0x12341234, v2, v1 bitop3:0xe4
+; GFX1250-NEXT:    global_store_b32 v0, v1, s[0:1] scale_offset
+; GFX1250-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
   %in.gep = getelementptr inbounds <2 x bfloat>, ptr addrspace(1) %in, i64 %tid.ext
@@ -597,6 +690,19 @@ define amdgpu_kernel void @v_insertelement_v4bf16_0(ptr addrspace(1) %out, ptr a
 ; GFX942-NEXT:    v_bfi_b32 v0, s2, v3, v0
 ; GFX942-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX942-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: v_insertelement_v4bf16_0:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
+; GFX1250-NEXT:    v_and_b32_e32 v2, 0x3ff, v0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    s_load_b32 s4, s[4:5], 0x30
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    global_load_b64 v[0:1], v2, s[2:3] scale_offset
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    v_bfi_b32 v0, 0xffff, s4, v0
+; GFX1250-NEXT:    global_store_b64 v2, v[0:1], s[0:1] scale_offset
+; GFX1250-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
   %in.gep = getelementptr inbounds <4 x bfloat>, ptr addrspace(1) %in, i64 %tid.ext
@@ -678,6 +784,19 @@ define amdgpu_kernel void @v_insertelement_v4bf16_1(ptr addrspace(1) %out, ptr a
 ; GFX942-NEXT:    v_perm_b32 v0, s6, v0, v3
 ; GFX942-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX942-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: v_insertelement_v4bf16_1:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
+; GFX1250-NEXT:    v_and_b32_e32 v2, 0x3ff, v0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    s_load_b32 s4, s[4:5], 0x10
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    global_load_b64 v[0:1], v2, s[2:3] scale_offset
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    v_perm_b32 v0, s4, v0, 0x5040100
+; GFX1250-NEXT:    global_store_b64 v2, v[0:1], s[0:1] scale_offset
+; GFX1250-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
   %in.gep = getelementptr inbounds <4 x bfloat>, ptr addrspace(1) %in, i64 %tid.ext
@@ -761,6 +880,19 @@ define amdgpu_kernel void @v_insertelement_v4bf16_2(ptr addrspace(1) %out, ptr a
 ; GFX942-NEXT:    v_bfi_b32 v1, s2, v3, v1
 ; GFX942-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX942-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: v_insertelement_v4bf16_2:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
+; GFX1250-NEXT:    v_and_b32_e32 v2, 0x3ff, v0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    s_load_b32 s4, s[4:5], 0x30
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    global_load_b64 v[0:1], v2, s[2:3] scale_offset
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    v_bfi_b32 v1, 0xffff, s4, v1
+; GFX1250-NEXT:    global_store_b64 v2, v[0:1], s[0:1] scale_offset
+; GFX1250-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
   %in.gep = getelementptr inbounds <4 x bfloat>, ptr addrspace(1) %in, i64 %tid.ext
@@ -842,6 +974,19 @@ define amdgpu_kernel void @v_insertelement_v4bf16_3(ptr addrspace(1) %out, ptr a
 ; GFX942-NEXT:    v_perm_b32 v1, s6, v1, v3
 ; GFX942-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX942-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: v_insertelement_v4bf16_3:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
+; GFX1250-NEXT:    v_and_b32_e32 v2, 0x3ff, v0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    s_load_b32 s4, s[4:5], 0x10
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    global_load_b64 v[0:1], v2, s[2:3] scale_offset
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    v_perm_b32 v1, s4, v1, 0x5040100
+; GFX1250-NEXT:    global_store_b64 v2, v[0:1], s[0:1] scale_offset
+; GFX1250-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
   %in.gep = getelementptr inbounds <4 x bfloat>, ptr addrspace(1) %in, i64 %tid.ext
@@ -946,6 +1091,24 @@ define amdgpu_kernel void @v_insertelement_v4bf16_dynamic_sgpr(ptr addrspace(1)
 ; GFX942-NEXT:    v_bfi_b32 v0, s2, v4, v0
 ; GFX942-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
 ; GFX942-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: v_insertelement_v4bf16_dynamic_sgpr:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
+; GFX1250-NEXT:    v_and_b32_e32 v2, 0x3ff, v0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    s_load_b64 s[4:5], s[4:5], 0x10
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    global_load_b64 v[0:1], v2, s[2:3] scale_offset
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    s_lshl_b32 s2, s5, 4
+; GFX1250-NEXT:    s_pack_ll_b32_b16 s4, s4, s4
+; GFX1250-NEXT:    s_lshl_b64 s[2:3], 0xffff, s2
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    v_bfi_b32 v1, s3, s4, v1
+; GFX1250-NEXT:    v_bfi_b32 v0, s2, s4, v0
+; GFX1250-NEXT:    global_store_b64 v2, v[0:1], s[0:1] scale_offset
+; GFX1250-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
   %in.gep = getelementptr inbounds <4 x bfloat>, ptr addrspace(1) %in, i64 %tid.ext
@@ -1028,6 +1191,19 @@ define amdgpu_kernel void @v_insertelement_v8bf16_3(ptr addrspace(1) %out, ptr a
 ; GFX942-NEXT:    v_perm_b32 v1, s6, v1, v5
 ; GFX942-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: v_insertelement_v8bf16_3:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
+; GFX1250-NEXT:    v_and_b32_e32 v4, 0x3ff, v0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    s_load_b32 s4, s[4:5], 0x10
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    global_load_b128 v[0:3], v4, s[2:3] scale_offset
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    v_perm_b32 v1, s4, v1, 0x5040100
+; GFX1250-NEXT:    global_store_b128 v4, v[0:3], s[0:1] scale_offset
+; GFX1250-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
   %in.gep = getelementptr inbounds <8 x bfloat>, ptr addrspace(1) %in, i64 %tid.ext
@@ -1244,6 +1420,50 @@ define amdgpu_kernel void @v_insertelement_v8bf16_dynamic(ptr addrspace(1) %out,
 ; GFX942-NEXT:    v_perm_b32 v0, v0, v9, s14
 ; GFX942-NEXT:    global_store_dwordx4 v4, v[0:3], s[16:17]
 ; GFX942-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: v_insertelement_v8bf16_dynamic:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
+; GFX1250-NEXT:    v_and_b32_e32 v4, 0x3ff, v0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    s_load_b64 s[4:5], s[4:5], 0x10
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    global_load_b128 v[0:3], v4, s[2:3] scale_offset
+; GFX1250-NEXT:    s_cmp_eq_u32 s5, 6
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    s_cmp_eq_u32 s5, 7
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v5, v3, s4, s2
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    s_cmp_eq_u32 s5, 4
+; GFX1250-NEXT:    v_dual_lshrrev_b32 v3, 16, v3 :: v_dual_lshrrev_b32 v6, 16, v2
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    s_cmp_eq_u32 s5, 5
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, v2, s4, s3
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    s_cmp_eq_u32 s5, 2
+; GFX1250-NEXT:    v_dual_lshrrev_b32 v7, 16, v1 :: v_dual_lshrrev_b32 v8, 16, v0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v3, v3, s4, s2
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    s_cmp_eq_u32 s5, 3
+; GFX1250-NEXT:    v_cndmask_b32_e64 v1, v1, s4, s2
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    s_cmp_eq_u32 s5, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v7, v7, s4, s2
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    s_cmp_eq_u32 s5, 1
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, v0, s4, s2
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_cndmask_b32_e64 v6, v6, s4, s3
+; GFX1250-NEXT:    v_cndmask_b32_e64 v8, v8, s4, s2
+; GFX1250-NEXT:    v_perm_b32 v3, v3, v5, 0x5040100
+; GFX1250-NEXT:    v_perm_b32 v1, v7, v1, 0x5040100
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1250-NEXT:    v_perm_b32 v2, v6, v2, 0x5040100
+; GFX1250-NEXT:    v_perm_b32 v0, v8, v0, 0x5040100
+; GFX1250-NEXT:    global_store_b128 v4, v[0:3], s[0:1] scale_offset
+; GFX1250-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
   %in.gep = getelementptr inbounds <8 x bfloat>, ptr addrspace(1) %in, i64 %tid.ext
@@ -1342,6 +1562,26 @@ define amdgpu_kernel void @v_insertelement_v16bf16_3(ptr addrspace(1) %out, ptr
 ; GFX942-NEXT:    global_store_dwordx4 v4, v[6:9], s[0:1] offset:16
 ; GFX942-NEXT:    global_store_dwordx4 v4, v[0:3], s[0:1]
 ; GFX942-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: v_insertelement_v16bf16_3:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
+; GFX1250-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    s_load_b32 s4, s[4:5], 0x10
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT:    v_lshlrev_b32_e32 v8, 5, v0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    s_clause 0x1
+; GFX1250-NEXT:    global_load_b128 v[0:3], v8, s[2:3]
+; GFX1250-NEXT:    global_load_b128 v[4:7], v8, s[2:3] offset:16
+; GFX1250-NEXT:    s_wait_loadcnt 0x1
+; GFX1250-NEXT:    v_perm_b32 v1, s4, v1, 0x5040100
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    s_clause 0x1
+; GFX1250-NEXT:    global_store_b128 v8, v[4:7], s[0:1] offset:16
+; GFX1250-NEXT:    global_store_b128 v8, v[0:3], s[0:1]
+; GFX1250-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
   %in.gep = getelementptr inbounds <16 x bfloat>, ptr addrspace(1) %in, i64 %tid.ext
@@ -1715,6 +1955,87 @@ define amdgpu_kernel void @v_insertelement_v16bf16_dynamic(ptr addrspace(1) %out
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[4:7], s[36:37] offset:16
 ; GFX942-NEXT:    global_store_dwordx4 v8, v[0:3], s[36:37]
 ; GFX942-NEXT:    s_endpgm
+;
+; GFX1250-LABEL: v_insertelement_v16bf16_dynamic:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_load_b128 s[0:3], s[4:5], 0x0
+; GFX1250-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    s_load_b64 s[4:5], s[4:5], 0x10
+; GFX1250-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT:    v_lshlrev_b32_e32 v8, 5, v0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    s_clause 0x1
+; GFX1250-NEXT:    global_load_b128 v[0:3], v8, s[2:3]
+; GFX1250-NEXT:    global_load_b128 v[4:7], v8, s[2:3] offset:16
+; GFX1250-NEXT:    s_cmp_eq_u32 s5, 6
+; GFX1250-NEXT:    s_wait_xcnt 0x0
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    s_cmp_eq_u32 s5, 7
+; GFX1250-NEXT:    s_wait_loadcnt 0x1
+; GFX1250-NEXT:    v_cndmask_b32_e64 v9, v3, s4, s2
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    s_cmp_eq_u32 s5, 4
+; GFX1250-NEXT:    v_lshrrev_b32_e32 v3, 16, v3
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    s_cmp_eq_u32 s5, 5
+; GFX1250-NEXT:    v_dual_lshrrev_b32 v10, 16, v2 :: v_dual_lshrrev_b32 v11, 16, v1
+; GFX1250-NEXT:    v_cndmask_b32_e64 v2, v2, s4, s3
+; GFX1250-NEXT:    s_cselect_b32 s3, -1, 0
+; GFX1250-NEXT:    s_cmp_eq_u32 s5, 2
+; GFX1250-NEXT:    v_cndmask_b32_e64 v3, v3, s4, s2
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    s_cmp_eq_u32 s5, 3
+; GFX1250-NEXT:    v_cndmask_b32_e64 v1, v1, s4, s2
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    s_cmp_eq_u32 s5, 0
+; GFX1250-NEXT:    s_wait_loadcnt 0x0
+; GFX1250-NEXT:    v_dual_lshrrev_b32 v12, 16, v0 :: v_dual_lshrrev_b32 v13, 16, v7
+; GFX1250-NEXT:    v_cndmask_b32_e64 v11, v11, s4, s2
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    s_cmp_eq_u32 s5, 1
+; GFX1250-NEXT:    v_cndmask_b32_e64 v0, v0, s4, s2
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    s_cmp_eq_u32 s5, 14
+; GFX1250-NEXT:    v_cndmask_b32_e64 v10, v10, s4, s3
+; GFX1250-NEXT:    v_perm_b32 v3, v3, v9, 0x5040100
+; GFX1250-NEXT:    v_cndmask_b32_e64 v9, v12, s4, s2
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    s_cmp_eq_u32 s5, 15
+; GFX1250-NEXT:    v_cndmask_b32_e64 v7, v7, s4, s2
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    s_cmp_eq_u32 s5, 12
+; GFX1250-NEXT:    v_dual_lshrrev_b32 v14, 16, v6 :: v_dual_lshrrev_b32 v15, 16, v5
+; GFX1250-NEXT:    v_perm_b32 v2, v10, v2, 0x5040100
+; GFX1250-NEXT:    v_cndmask_b32_e64 v10, v13, s4, s2
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    s_cmp_eq_u32 s5, 13
+; GFX1250-NEXT:    v_cndmask_b32_e64 v6, v6, s4, s2
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    s_cmp_eq_u32 s5, 10
+; GFX1250-NEXT:    v_cndmask_b32_e64 v12, v14, s4, s2
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    s_cmp_eq_u32 s5, 11
+; GFX1250-NEXT:    v_cndmask_b32_e64 v5, v5, s4, s2
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    s_cmp_eq_u32 s5, 8
+; GFX1250-NEXT:    v_lshrrev_b32_e32 v16, 16, v4
+; GFX1250-NEXT:    v_cndmask_b32_e64 v13, v15, s4, s2
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    s_cmp_eq_u32 s5, 9
+; GFX1250-NEXT:    v_cndmask_b32_e64 v4, v4, s4, s2
+; GFX1250-NEXT:    s_cselect_b32 s2, -1, 0
+; GFX1250-NEXT:    v_perm_b32 v7, v10, v7, 0x5040100
+; GFX1250-NEXT:    v_cndmask_b32_e64 v14, v16, s4, s2
+; GFX1250-NEXT:    v_perm_b32 v6, v12, v6, 0x5040100
+; GFX1250-NEXT:    v_perm_b32 v5, v13, v5, 0x5040100
+; GFX1250-NEXT:    v_perm_b32 v1, v11, v1, 0x5040100
+; GFX1250-NEXT:    v_perm_b32 v0, v9, v0, 0x5040100
+; GFX1250-NEXT:    v_perm_b32 v4, v14, v4, 0x5040100
+; GFX1250-NEXT:    s_clause 0x1
+; GFX1250-NEXT:    global_store_b128 v8, v[4:7], s[0:1] offset:16
+; GFX1250-NEXT:    global_store_b128 v8, v[0:3], s[0:1]
+; GFX1250-NEXT:    s_endpgm
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
   %in.gep = getelementptr inbounds <16 x bfloat>, ptr addrspace(1) %in, i64 %tid.ext
diff --git a/llvm/test/CodeGen/AMDGPU/lshl-add-u64.ll b/llvm/test/CodeGen/AMDGPU/lshl-add-u64.ll
index 6e92677a68f06..247a0a9a64b33 100644
--- a/llvm/test/CodeGen/AMDGPU/lshl-add-u64.ll
+++ b/llvm/test/CodeGen/AMDGPU/lshl-add-u64.ll
@@ -1,4 +1,5 @@
-; RUN: llc -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck -check-prefixes=GCN,GFX942 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GCN,GFX1250 %s
 
 define i64 @lshl_add_u64_v1v(i64 %v, i64 %a) {
 ; GCN-LABEL: lshl_add_u64_v1v:
@@ -19,7 +20,9 @@ define i64 @lshl_add_u64_v4v(i64 %v, i64 %a) {
 define i64 @lshl_add_u64_v5v(i64 %v, i64 %a) {
 ; GCN-LABEL: lshl_add_u64_v5v:
 ; GCN:      v_lshlrev_b64
-; GCN-NEXT: v_lshl_add_u64 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 0, v[{{[0-9:]+}}]
+; GFX942-NEXT: v_lshl_add_u64 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 0, v[{{[0-9:]+}}]
+; GFX1250-NEXT: s_delay_alu
+; GFX1250-NEXT: v_add_nc_u64_e32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}]
   %shl = shl i64 %v, 5
   %add = add i64 %shl, %a
   ret i64 %add
@@ -28,7 +31,9 @@ define i64 @lshl_add_u64_v5v(i64 %v, i64 %a) {
 define i64 @lshl_add_u64_vvv(i64 %v, i64 %s, i64 %a) {
 ; GCN-LABEL: lshl_add_u64_vvv:
 ; GCN:      v_lshlrev_b64
-; GCN-NEXT: v_lshl_add_u64 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 0, v[{{[0-9:]+}}]
+; GFX942-NEXT: v_lshl_add_u64 v[{{[0-9:]+}}], v[{{[0-9:]+}}], 0, v[{{[0-9:]+}}]
+; GFX1250-NEXT: s_delay_alu
+; GFX1250-NEXT: v_add_nc_u64_e32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}]
   %shl = shl i64 %v, %s
   %add = add i64 %shl, %a
   ret i64 %add
@@ -57,8 +62,9 @@ define amdgpu_kernel void @lshl_add_u64_v2s(i64 %a) {
 define amdgpu_kernel void @lshl_add_u64_s2s(i64 %v, i64 %a) {
 ; GCN-LABEL: lshl_add_u64_s2s:
 ; GCN:    s_lshl_b64
-; GCN:    s_add_u32
-; GCN:    s_addc_u32
+; GFX942: s_add_u32
+; GFX942: s_addc_u32
+; GFX1250: s_add_nc_u64
   %shl = shl i64 %v, 2
   %add = add i64 %shl, %a
   store i64 %add, ptr poison
@@ -67,14 +73,16 @@ define amdgpu_kernel void @lshl_add_u64_s2s(i64 %v, i64 %a) {
 
 define i64 @add_u64_vv(i64 %v, i64 %a) {
 ; GCN-LABEL: add_u64_vv:
-; GCN: v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
+; GFX942: v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
+; GFX1250: v_add_nc_u64_e32 v[0:1], v[0:1], v[2:3]
   %add = add i64 %v, %a
   ret i64 %add
 }
 
 define amdgpu_kernel void @add_u64_sv(i64 %v) {
 ; GCN-LABEL: add_u64_sv:
-; GCN: v_lshl_add_u64 v[0:1], s[0:1], 0, v[0:1]
+; GFX942: v_lshl_add_u64 v[0:1], s[0:1], 0, v[0:1]
+; GFX1250: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1]
   %a = load i64, ptr poison
   %add = add i64 %v, %a
   store i64 %add, ptr poison
@@ -83,7 +91,8 @@ define amdgpu_kernel void @add_u64_sv(i64 %v) {
 
 define amdgpu_kernel void @add_u64_vs(i64 %a) {
 ; GCN-LABEL: add_u64_vs:
-; GCN: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
+; GFX942: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
+; GFX1250: v_add_nc_u64_e32 v[0:1], s[0:1], v[0:1]
   %v = load i64, ptr poison
   %add = add i64 %v, %a
   store i64 %add, ptr poison
@@ -92,8 +101,9 @@ define amdgpu_kernel void @add_u64_vs(i64 %a) {
 
 define amdgpu_kernel void @add_u64_ss(i64 %v, i64 %a) {
 ; GCN-LABEL: add_u64_ss:
-; GCN: s_add_u32
-; GCN: s_addc_u32 s1, s1, s3
+; GFX942: s_add_u32
+; GFX942: s_addc_u32 s1, s1, s3
+; GFX1250: s_add_nc_u64 s[0:1], s[0:1], s[2:3]
   %add = add i64 %v, %a
   store i64 %add, ptr poison
   ret void
diff --git a/llvm/test/CodeGen/AMDGPU/scale-offset-scratch.ll b/llvm/test/CodeGen/AMDGPU/scale-offset-scratch.ll
index 27ecc837ea732..b13fd0deb66e9 100644
--- a/llvm/test/CodeGen/AMDGPU/scale-offset-scratch.ll
+++ b/llvm/test/CodeGen/AMDGPU/scale-offset-scratch.ll
@@ -284,7 +284,7 @@ define amdgpu_ps void @scratch_store_b32_idxprom(ptr addrspace(5) align 4 inreg
 ; GCN-LABEL: scratch_store_b32_idxprom:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    v_mov_b32_e32 v1, 1.0
-; GCN-NEXT:    scratch_store_b32 v0, v1, s0 scale_offset
+; GCN-NEXT:    scratch_store_b32 v0, v1, s0 scale_offset scope:SCOPE_SE
 ; GCN-NEXT:    s_endpgm
 entry:
   %idxprom = zext i32 %idx to i64
@@ -297,7 +297,7 @@ define amdgpu_ps void @scratch_store_b16_idxprom(ptr addrspace(5) align 2 inreg
 ; GCN-LABEL: scratch_store_b16_idxprom:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    v_mov_b32_e32 v1, 1
-; GCN-NEXT:    scratch_store_b16 v0, v1, s0 scale_offset
+; GCN-NEXT:    scratch_store_b16 v0, v1, s0 scale_offset scope:SCOPE_SE
 ; GCN-NEXT:    s_endpgm
 entry:
   %idxprom = zext i32 %idx to i64
@@ -310,7 +310,7 @@ define amdgpu_ps void @scratch_store_b64_idxprom(ptr addrspace(5) align 4 inreg
 ; GCN-LABEL: scratch_store_b64_idxprom:
 ; GCN:       ; %bb.0: ; %entry
 ; GCN-NEXT:    v_mov_b64_e32 v[2:3], 1.0
-; GCN-NEXT:    scratch_store_b64 v0, v[2:3], s0 scale_offset
+; GCN-NEXT:    scratch_store_b64 v0, v[2:3], s0 scale_offset scope:SCOPE_SE
 ; GCN-NEXT:    s_endpgm
 entry:
   %idxprom = zext i32 %idx to i64



More information about the llvm-commits mailing list