[llvm] efd8143 - [AMDGPU] gfx1250 codegen load tests update. NFC. (#155305)
via llvm-commits
llvm-commits at lists.llvm.org
Mon Aug 25 15:09:44 PDT 2025
Author: Stanislav Mekhanoshin
Date: 2025-08-25T15:09:41-07:00
New Revision: efd8143c24dcd3cff6d325d875fdf3359b3b9ca9
URL: https://github.com/llvm/llvm-project/commit/efd8143c24dcd3cff6d325d875fdf3359b3b9ca9
DIFF: https://github.com/llvm/llvm-project/commit/efd8143c24dcd3cff6d325d875fdf3359b3b9ca9.diff
LOG: [AMDGPU] gfx1250 codegen load tests update. NFC. (#155305)
Added:
Modified:
llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll
llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-load.mir
llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-split-scalar-load-metadata.mir
llvm/test/CodeGen/AMDGPU/load-constant-i1.ll
llvm/test/CodeGen/AMDGPU/load-constant-i32.ll
Removed:
################################################################################
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll
index 6bb104311a4d8..ab8d8c192187f 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll
@@ -1,6 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=+unaligned-access-mode < %s | FileCheck -check-prefixes=GFX12,GFX12-UNALIGNED %s
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -mattr=-unaligned-access-mode < %s | FileCheck -check-prefixes=GFX12,GFX12-NOUNALIGNED %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1250 -mattr=+unaligned-access-mode < %s | FileCheck -check-prefixes=GFX12,GFX1250,GFX1250-UNALIGNED %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1250 -mattr=-unaligned-access-mode < %s | FileCheck -check-prefixes=GFX12,GFX1250,GFX1250-NOUNALIGNED %s
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -mattr=+unaligned-access-mode < %s | FileCheck -check-prefixes=GCN,GFX9,GFX9-UNALIGNED %s
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -mattr=-unaligned-access-mode < %s | FileCheck -check-prefixes=GCN,GFX9,GFX9-NOUNALIGNED %s
; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=hawaii -mattr=+unaligned-access-mode < %s | FileCheck -check-prefixes=GCN,GFX7,GFX7-UNALIGNED %s
@@ -64,6 +66,52 @@ define <3 x i32> @v_load_constant_v3i32_align1(ptr addrspace(4) %ptr) {
; GFX12-NOUNALIGNED-NEXT: v_or3_b32 v2, v8, v9, v7
; GFX12-NOUNALIGNED-NEXT: s_setpc_b64 s[30:31]
;
+; GFX1250-UNALIGNED-LABEL: v_load_constant_v3i32_align1:
+; GFX1250-UNALIGNED: ; %bb.0:
+; GFX1250-UNALIGNED-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-UNALIGNED-NEXT: s_wait_kmcnt 0x0
+; GFX1250-UNALIGNED-NEXT: global_load_b96 v[0:2], v[0:1], off
+; GFX1250-UNALIGNED-NEXT: s_wait_loadcnt 0x0
+; GFX1250-UNALIGNED-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX1250-NOUNALIGNED-LABEL: v_load_constant_v3i32_align1:
+; GFX1250-NOUNALIGNED: ; %bb.0:
+; GFX1250-NOUNALIGNED-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NOUNALIGNED-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NOUNALIGNED-NEXT: s_clause 0xb
+; GFX1250-NOUNALIGNED-NEXT: global_load_u8 v2, v[0:1], off
+; GFX1250-NOUNALIGNED-NEXT: global_load_u8 v3, v[0:1], off offset:1
+; GFX1250-NOUNALIGNED-NEXT: global_load_u8 v4, v[0:1], off offset:2
+; GFX1250-NOUNALIGNED-NEXT: global_load_u8 v5, v[0:1], off offset:3
+; GFX1250-NOUNALIGNED-NEXT: global_load_u8 v6, v[0:1], off offset:4
+; GFX1250-NOUNALIGNED-NEXT: global_load_u8 v7, v[0:1], off offset:5
+; GFX1250-NOUNALIGNED-NEXT: global_load_u8 v8, v[0:1], off offset:6
+; GFX1250-NOUNALIGNED-NEXT: global_load_u8 v9, v[0:1], off offset:7
+; GFX1250-NOUNALIGNED-NEXT: global_load_u8 v10, v[0:1], off offset:8
+; GFX1250-NOUNALIGNED-NEXT: global_load_u8 v11, v[0:1], off offset:9
+; GFX1250-NOUNALIGNED-NEXT: global_load_u8 v12, v[0:1], off offset:11
+; GFX1250-NOUNALIGNED-NEXT: global_load_u8 v0, v[0:1], off offset:10
+; GFX1250-NOUNALIGNED-NEXT: s_wait_loadcnt 0xa
+; GFX1250-NOUNALIGNED-NEXT: s_wait_xcnt 0x0
+; GFX1250-NOUNALIGNED-NEXT: v_lshl_or_b32 v1, v3, 8, v2
+; GFX1250-NOUNALIGNED-NEXT: s_wait_loadcnt 0x8
+; GFX1250-NOUNALIGNED-NEXT: v_dual_lshlrev_b32 v3, 16, v4 :: v_dual_lshlrev_b32 v2, 24, v5
+; GFX1250-NOUNALIGNED-NEXT: s_wait_loadcnt 0x6
+; GFX1250-NOUNALIGNED-NEXT: v_lshl_or_b32 v4, v7, 8, v6
+; GFX1250-NOUNALIGNED-NEXT: s_wait_loadcnt 0x4
+; GFX1250-NOUNALIGNED-NEXT: v_dual_lshlrev_b32 v6, 16, v8 :: v_dual_lshlrev_b32 v5, 24, v9
+; GFX1250-NOUNALIGNED-NEXT: s_wait_loadcnt 0x2
+; GFX1250-NOUNALIGNED-NEXT: v_lshl_or_b32 v7, v11, 8, v10
+; GFX1250-NOUNALIGNED-NEXT: s_wait_loadcnt 0x1
+; GFX1250-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v8, 24, v12
+; GFX1250-NOUNALIGNED-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v9, 16, v0
+; GFX1250-NOUNALIGNED-NEXT: v_or3_b32 v0, v2, v3, v1
+; GFX1250-NOUNALIGNED-NEXT: v_or3_b32 v1, v5, v6, v4
+; GFX1250-NOUNALIGNED-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX1250-NOUNALIGNED-NEXT: v_or3_b32 v2, v8, v9, v7
+; GFX1250-NOUNALIGNED-NEXT: s_set_pc_i64 s[30:31]
+;
; GFX9-UNALIGNED-LABEL: v_load_constant_v3i32_align1:
; GFX9-UNALIGNED: ; %bb.0:
; GFX9-UNALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -256,6 +304,34 @@ define <3 x i32> @v_load_constant_v3i32_align2(ptr addrspace(4) %ptr) {
; GFX12-NOUNALIGNED-NEXT: v_lshl_or_b32 v2, v7, 16, v6
; GFX12-NOUNALIGNED-NEXT: s_setpc_b64 s[30:31]
;
+; GFX1250-UNALIGNED-LABEL: v_load_constant_v3i32_align2:
+; GFX1250-UNALIGNED: ; %bb.0:
+; GFX1250-UNALIGNED-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-UNALIGNED-NEXT: s_wait_kmcnt 0x0
+; GFX1250-UNALIGNED-NEXT: global_load_b96 v[0:2], v[0:1], off
+; GFX1250-UNALIGNED-NEXT: s_wait_loadcnt 0x0
+; GFX1250-UNALIGNED-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX1250-NOUNALIGNED-LABEL: v_load_constant_v3i32_align2:
+; GFX1250-NOUNALIGNED: ; %bb.0:
+; GFX1250-NOUNALIGNED-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NOUNALIGNED-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NOUNALIGNED-NEXT: s_clause 0x5
+; GFX1250-NOUNALIGNED-NEXT: global_load_u16 v2, v[0:1], off
+; GFX1250-NOUNALIGNED-NEXT: global_load_u16 v3, v[0:1], off offset:2
+; GFX1250-NOUNALIGNED-NEXT: global_load_u16 v4, v[0:1], off offset:4
+; GFX1250-NOUNALIGNED-NEXT: global_load_u16 v5, v[0:1], off offset:6
+; GFX1250-NOUNALIGNED-NEXT: global_load_u16 v6, v[0:1], off offset:8
+; GFX1250-NOUNALIGNED-NEXT: global_load_u16 v7, v[0:1], off offset:10
+; GFX1250-NOUNALIGNED-NEXT: s_wait_loadcnt 0x4
+; GFX1250-NOUNALIGNED-NEXT: s_wait_xcnt 0x0
+; GFX1250-NOUNALIGNED-NEXT: v_lshl_or_b32 v0, v3, 16, v2
+; GFX1250-NOUNALIGNED-NEXT: s_wait_loadcnt 0x2
+; GFX1250-NOUNALIGNED-NEXT: v_lshl_or_b32 v1, v5, 16, v4
+; GFX1250-NOUNALIGNED-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NOUNALIGNED-NEXT: v_lshl_or_b32 v2, v7, 16, v6
+; GFX1250-NOUNALIGNED-NEXT: s_set_pc_i64 s[30:31]
+;
; GFX9-UNALIGNED-LABEL: v_load_constant_v3i32_align2:
; GFX9-UNALIGNED: ; %bb.0:
; GFX9-UNALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -346,16 +422,35 @@ define <3 x i32> @v_load_constant_v3i32_align2(ptr addrspace(4) %ptr) {
}
define <3 x i32> @v_load_constant_v3i32_align4(ptr addrspace(4) %ptr) {
-; GFX12-LABEL: v_load_constant_v3i32_align4:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b96 v[0:2], v[0:1], off
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-UNALIGNED-LABEL: v_load_constant_v3i32_align4:
+; GFX12-UNALIGNED: ; %bb.0:
+; GFX12-UNALIGNED-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-UNALIGNED-NEXT: s_wait_expcnt 0x0
+; GFX12-UNALIGNED-NEXT: s_wait_samplecnt 0x0
+; GFX12-UNALIGNED-NEXT: s_wait_bvhcnt 0x0
+; GFX12-UNALIGNED-NEXT: s_wait_kmcnt 0x0
+; GFX12-UNALIGNED-NEXT: global_load_b96 v[0:2], v[0:1], off
+; GFX12-UNALIGNED-NEXT: s_wait_loadcnt 0x0
+; GFX12-UNALIGNED-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-NOUNALIGNED-LABEL: v_load_constant_v3i32_align4:
+; GFX12-NOUNALIGNED: ; %bb.0:
+; GFX12-NOUNALIGNED-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NOUNALIGNED-NEXT: s_wait_expcnt 0x0
+; GFX12-NOUNALIGNED-NEXT: s_wait_samplecnt 0x0
+; GFX12-NOUNALIGNED-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NOUNALIGNED-NEXT: s_wait_kmcnt 0x0
+; GFX12-NOUNALIGNED-NEXT: global_load_b96 v[0:2], v[0:1], off
+; GFX12-NOUNALIGNED-NEXT: s_wait_loadcnt 0x0
+; GFX12-NOUNALIGNED-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: v_load_constant_v3i32_align4:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_b96 v[0:2], v[0:1], off
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
;
; GFX9-LABEL: v_load_constant_v3i32_align4:
; GFX9: ; %bb.0:
@@ -392,16 +487,35 @@ define <3 x i32> @v_load_constant_v3i32_align4(ptr addrspace(4) %ptr) {
}
define i96 @v_load_constant_i96_align8(ptr addrspace(4) %ptr) {
-; GFX12-LABEL: v_load_constant_i96_align8:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b96 v[0:2], v[0:1], off
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-UNALIGNED-LABEL: v_load_constant_i96_align8:
+; GFX12-UNALIGNED: ; %bb.0:
+; GFX12-UNALIGNED-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-UNALIGNED-NEXT: s_wait_expcnt 0x0
+; GFX12-UNALIGNED-NEXT: s_wait_samplecnt 0x0
+; GFX12-UNALIGNED-NEXT: s_wait_bvhcnt 0x0
+; GFX12-UNALIGNED-NEXT: s_wait_kmcnt 0x0
+; GFX12-UNALIGNED-NEXT: global_load_b96 v[0:2], v[0:1], off
+; GFX12-UNALIGNED-NEXT: s_wait_loadcnt 0x0
+; GFX12-UNALIGNED-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-NOUNALIGNED-LABEL: v_load_constant_i96_align8:
+; GFX12-NOUNALIGNED: ; %bb.0:
+; GFX12-NOUNALIGNED-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NOUNALIGNED-NEXT: s_wait_expcnt 0x0
+; GFX12-NOUNALIGNED-NEXT: s_wait_samplecnt 0x0
+; GFX12-NOUNALIGNED-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NOUNALIGNED-NEXT: s_wait_kmcnt 0x0
+; GFX12-NOUNALIGNED-NEXT: global_load_b96 v[0:2], v[0:1], off
+; GFX12-NOUNALIGNED-NEXT: s_wait_loadcnt 0x0
+; GFX12-NOUNALIGNED-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: v_load_constant_i96_align8:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_b96 v[0:2], v[0:1], off
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
;
; GFX9-LABEL: v_load_constant_i96_align8:
; GFX9: ; %bb.0:
@@ -438,16 +552,35 @@ define i96 @v_load_constant_i96_align8(ptr addrspace(4) %ptr) {
}
define <3 x i32> @v_load_constant_v3i32_align8(ptr addrspace(4) %ptr) {
-; GFX12-LABEL: v_load_constant_v3i32_align8:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b96 v[0:2], v[0:1], off
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-UNALIGNED-LABEL: v_load_constant_v3i32_align8:
+; GFX12-UNALIGNED: ; %bb.0:
+; GFX12-UNALIGNED-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-UNALIGNED-NEXT: s_wait_expcnt 0x0
+; GFX12-UNALIGNED-NEXT: s_wait_samplecnt 0x0
+; GFX12-UNALIGNED-NEXT: s_wait_bvhcnt 0x0
+; GFX12-UNALIGNED-NEXT: s_wait_kmcnt 0x0
+; GFX12-UNALIGNED-NEXT: global_load_b96 v[0:2], v[0:1], off
+; GFX12-UNALIGNED-NEXT: s_wait_loadcnt 0x0
+; GFX12-UNALIGNED-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-NOUNALIGNED-LABEL: v_load_constant_v3i32_align8:
+; GFX12-NOUNALIGNED: ; %bb.0:
+; GFX12-NOUNALIGNED-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NOUNALIGNED-NEXT: s_wait_expcnt 0x0
+; GFX12-NOUNALIGNED-NEXT: s_wait_samplecnt 0x0
+; GFX12-NOUNALIGNED-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NOUNALIGNED-NEXT: s_wait_kmcnt 0x0
+; GFX12-NOUNALIGNED-NEXT: global_load_b96 v[0:2], v[0:1], off
+; GFX12-NOUNALIGNED-NEXT: s_wait_loadcnt 0x0
+; GFX12-NOUNALIGNED-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: v_load_constant_v3i32_align8:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_b96 v[0:2], v[0:1], off
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
;
; GFX9-LABEL: v_load_constant_v3i32_align8:
; GFX9: ; %bb.0:
@@ -484,16 +617,35 @@ define <3 x i32> @v_load_constant_v3i32_align8(ptr addrspace(4) %ptr) {
}
define <6 x i16> @v_load_constant_v6i16_align8(ptr addrspace(4) %ptr) {
-; GFX12-LABEL: v_load_constant_v6i16_align8:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b96 v[0:2], v[0:1], off
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-UNALIGNED-LABEL: v_load_constant_v6i16_align8:
+; GFX12-UNALIGNED: ; %bb.0:
+; GFX12-UNALIGNED-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-UNALIGNED-NEXT: s_wait_expcnt 0x0
+; GFX12-UNALIGNED-NEXT: s_wait_samplecnt 0x0
+; GFX12-UNALIGNED-NEXT: s_wait_bvhcnt 0x0
+; GFX12-UNALIGNED-NEXT: s_wait_kmcnt 0x0
+; GFX12-UNALIGNED-NEXT: global_load_b96 v[0:2], v[0:1], off
+; GFX12-UNALIGNED-NEXT: s_wait_loadcnt 0x0
+; GFX12-UNALIGNED-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-NOUNALIGNED-LABEL: v_load_constant_v6i16_align8:
+; GFX12-NOUNALIGNED: ; %bb.0:
+; GFX12-NOUNALIGNED-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NOUNALIGNED-NEXT: s_wait_expcnt 0x0
+; GFX12-NOUNALIGNED-NEXT: s_wait_samplecnt 0x0
+; GFX12-NOUNALIGNED-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NOUNALIGNED-NEXT: s_wait_kmcnt 0x0
+; GFX12-NOUNALIGNED-NEXT: global_load_b96 v[0:2], v[0:1], off
+; GFX12-NOUNALIGNED-NEXT: s_wait_loadcnt 0x0
+; GFX12-NOUNALIGNED-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: v_load_constant_v6i16_align8:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_b96 v[0:2], v[0:1], off
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
;
; GFX9-LABEL: v_load_constant_v6i16_align8:
; GFX9: ; %bb.0:
@@ -539,28 +691,67 @@ define <6 x i16> @v_load_constant_v6i16_align8(ptr addrspace(4) %ptr) {
}
define <12 x i8> @v_load_constant_v12i8_align8(ptr addrspace(4) %ptr) {
-; GFX12-LABEL: v_load_constant_v12i8_align8:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b96 v[0:2], v[0:1], off
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: v_lshrrev_b32_e32 v13, 8, v0
-; GFX12-NEXT: v_lshrrev_b32_e32 v12, 16, v0
-; GFX12-NEXT: v_lshrrev_b32_e32 v3, 24, v0
-; GFX12-NEXT: v_lshrrev_b32_e32 v5, 8, v1
-; GFX12-NEXT: v_lshrrev_b32_e32 v6, 16, v1
-; GFX12-NEXT: v_lshrrev_b32_e32 v7, 24, v1
-; GFX12-NEXT: v_lshrrev_b32_e32 v9, 8, v2
-; GFX12-NEXT: v_lshrrev_b32_e32 v10, 16, v2
-; GFX12-NEXT: v_lshrrev_b32_e32 v11, 24, v2
-; GFX12-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v1, v13
-; GFX12-NEXT: v_mov_b32_e32 v8, v2
-; GFX12-NEXT: v_mov_b32_e32 v2, v12
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-UNALIGNED-LABEL: v_load_constant_v12i8_align8:
+; GFX12-UNALIGNED: ; %bb.0:
+; GFX12-UNALIGNED-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-UNALIGNED-NEXT: s_wait_expcnt 0x0
+; GFX12-UNALIGNED-NEXT: s_wait_samplecnt 0x0
+; GFX12-UNALIGNED-NEXT: s_wait_bvhcnt 0x0
+; GFX12-UNALIGNED-NEXT: s_wait_kmcnt 0x0
+; GFX12-UNALIGNED-NEXT: global_load_b96 v[0:2], v[0:1], off
+; GFX12-UNALIGNED-NEXT: s_wait_loadcnt 0x0
+; GFX12-UNALIGNED-NEXT: v_lshrrev_b32_e32 v13, 8, v0
+; GFX12-UNALIGNED-NEXT: v_lshrrev_b32_e32 v12, 16, v0
+; GFX12-UNALIGNED-NEXT: v_lshrrev_b32_e32 v3, 24, v0
+; GFX12-UNALIGNED-NEXT: v_lshrrev_b32_e32 v5, 8, v1
+; GFX12-UNALIGNED-NEXT: v_lshrrev_b32_e32 v6, 16, v1
+; GFX12-UNALIGNED-NEXT: v_lshrrev_b32_e32 v7, 24, v1
+; GFX12-UNALIGNED-NEXT: v_lshrrev_b32_e32 v9, 8, v2
+; GFX12-UNALIGNED-NEXT: v_lshrrev_b32_e32 v10, 16, v2
+; GFX12-UNALIGNED-NEXT: v_lshrrev_b32_e32 v11, 24, v2
+; GFX12-UNALIGNED-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v1, v13
+; GFX12-UNALIGNED-NEXT: v_mov_b32_e32 v8, v2
+; GFX12-UNALIGNED-NEXT: v_mov_b32_e32 v2, v12
+; GFX12-UNALIGNED-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-NOUNALIGNED-LABEL: v_load_constant_v12i8_align8:
+; GFX12-NOUNALIGNED: ; %bb.0:
+; GFX12-NOUNALIGNED-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NOUNALIGNED-NEXT: s_wait_expcnt 0x0
+; GFX12-NOUNALIGNED-NEXT: s_wait_samplecnt 0x0
+; GFX12-NOUNALIGNED-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NOUNALIGNED-NEXT: s_wait_kmcnt 0x0
+; GFX12-NOUNALIGNED-NEXT: global_load_b96 v[0:2], v[0:1], off
+; GFX12-NOUNALIGNED-NEXT: s_wait_loadcnt 0x0
+; GFX12-NOUNALIGNED-NEXT: v_lshrrev_b32_e32 v13, 8, v0
+; GFX12-NOUNALIGNED-NEXT: v_lshrrev_b32_e32 v12, 16, v0
+; GFX12-NOUNALIGNED-NEXT: v_lshrrev_b32_e32 v3, 24, v0
+; GFX12-NOUNALIGNED-NEXT: v_lshrrev_b32_e32 v5, 8, v1
+; GFX12-NOUNALIGNED-NEXT: v_lshrrev_b32_e32 v6, 16, v1
+; GFX12-NOUNALIGNED-NEXT: v_lshrrev_b32_e32 v7, 24, v1
+; GFX12-NOUNALIGNED-NEXT: v_lshrrev_b32_e32 v9, 8, v2
+; GFX12-NOUNALIGNED-NEXT: v_lshrrev_b32_e32 v10, 16, v2
+; GFX12-NOUNALIGNED-NEXT: v_lshrrev_b32_e32 v11, 24, v2
+; GFX12-NOUNALIGNED-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v1, v13
+; GFX12-NOUNALIGNED-NEXT: v_mov_b32_e32 v8, v2
+; GFX12-NOUNALIGNED-NEXT: v_mov_b32_e32 v2, v12
+; GFX12-NOUNALIGNED-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: v_load_constant_v12i8_align8:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_b96 v[0:2], v[0:1], off
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_dual_lshrrev_b32 v13, 8, v0 :: v_dual_lshrrev_b32 v12, 16, v0
+; GFX1250-NEXT: v_dual_lshrrev_b32 v3, 24, v0 :: v_dual_lshrrev_b32 v5, 8, v1
+; GFX1250-NEXT: v_dual_lshrrev_b32 v6, 16, v1 :: v_dual_lshrrev_b32 v7, 24, v1
+; GFX1250-NEXT: v_dual_lshrrev_b32 v9, 8, v2 :: v_dual_lshrrev_b32 v10, 16, v2
+; GFX1250-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_lshrrev_b32 v11, 24, v2
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v8, v2 :: v_dual_mov_b32 v1, v13
+; GFX1250-NEXT: v_mov_b32_e32 v2, v12
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
;
; GFX9-LABEL: v_load_constant_v12i8_align8:
; GFX9: ; %bb.0:
@@ -632,16 +823,35 @@ define <12 x i8> @v_load_constant_v12i8_align8(ptr addrspace(4) %ptr) {
}
define <3 x i32> @v_load_constant_v3i32_align16(ptr addrspace(4) %ptr) {
-; GFX12-LABEL: v_load_constant_v3i32_align16:
-; GFX12: ; %bb.0:
-; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX12-NEXT: s_wait_expcnt 0x0
-; GFX12-NEXT: s_wait_samplecnt 0x0
-; GFX12-NEXT: s_wait_bvhcnt 0x0
-; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: global_load_b96 v[0:2], v[0:1], off
-; GFX12-NEXT: s_wait_loadcnt 0x0
-; GFX12-NEXT: s_setpc_b64 s[30:31]
+; GFX12-UNALIGNED-LABEL: v_load_constant_v3i32_align16:
+; GFX12-UNALIGNED: ; %bb.0:
+; GFX12-UNALIGNED-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-UNALIGNED-NEXT: s_wait_expcnt 0x0
+; GFX12-UNALIGNED-NEXT: s_wait_samplecnt 0x0
+; GFX12-UNALIGNED-NEXT: s_wait_bvhcnt 0x0
+; GFX12-UNALIGNED-NEXT: s_wait_kmcnt 0x0
+; GFX12-UNALIGNED-NEXT: global_load_b96 v[0:2], v[0:1], off
+; GFX12-UNALIGNED-NEXT: s_wait_loadcnt 0x0
+; GFX12-UNALIGNED-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX12-NOUNALIGNED-LABEL: v_load_constant_v3i32_align16:
+; GFX12-NOUNALIGNED: ; %bb.0:
+; GFX12-NOUNALIGNED-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NOUNALIGNED-NEXT: s_wait_expcnt 0x0
+; GFX12-NOUNALIGNED-NEXT: s_wait_samplecnt 0x0
+; GFX12-NOUNALIGNED-NEXT: s_wait_bvhcnt 0x0
+; GFX12-NOUNALIGNED-NEXT: s_wait_kmcnt 0x0
+; GFX12-NOUNALIGNED-NEXT: global_load_b96 v[0:2], v[0:1], off
+; GFX12-NOUNALIGNED-NEXT: s_wait_loadcnt 0x0
+; GFX12-NOUNALIGNED-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX1250-LABEL: v_load_constant_v3i32_align16:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_b96 v[0:2], v[0:1], off
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
;
; GFX9-LABEL: v_load_constant_v3i32_align16:
; GFX9: ; %bb.0:
@@ -720,6 +930,53 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align1(ptr addrspace(4) inreg
; GFX12-NOUNALIGNED-NEXT: s_or_b32 s2, s2, s5
; GFX12-NOUNALIGNED-NEXT: ; return to shader part epilog
;
+; GFX1250-UNALIGNED-LABEL: s_load_constant_v3i32_align1:
+; GFX1250-UNALIGNED: ; %bb.0:
+; GFX1250-UNALIGNED-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-UNALIGNED-NEXT: global_load_b96 v[0:2], v0, s[0:1]
+; GFX1250-UNALIGNED-NEXT: s_wait_loadcnt 0x0
+; GFX1250-UNALIGNED-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1250-UNALIGNED-NEXT: v_readfirstlane_b32 s1, v1
+; GFX1250-UNALIGNED-NEXT: v_readfirstlane_b32 s2, v2
+; GFX1250-UNALIGNED-NEXT: ; return to shader part epilog
+;
+; GFX1250-NOUNALIGNED-LABEL: s_load_constant_v3i32_align1:
+; GFX1250-NOUNALIGNED: ; %bb.0:
+; GFX1250-NOUNALIGNED-NEXT: s_clause 0xa
+; GFX1250-NOUNALIGNED-NEXT: s_load_u8 s2, s[0:1], 0x1
+; GFX1250-NOUNALIGNED-NEXT: s_load_u8 s3, s[0:1], 0x3
+; GFX1250-NOUNALIGNED-NEXT: s_load_u8 s4, s[0:1], 0x2
+; GFX1250-NOUNALIGNED-NEXT: s_load_u8 s5, s[0:1], 0x5
+; GFX1250-NOUNALIGNED-NEXT: s_load_u8 s6, s[0:1], 0x7
+; GFX1250-NOUNALIGNED-NEXT: s_load_u8 s7, s[0:1], 0x6
+; GFX1250-NOUNALIGNED-NEXT: s_load_u8 s8, s[0:1], 0x9
+; GFX1250-NOUNALIGNED-NEXT: s_load_u8 s9, s[0:1], 0xb
+; GFX1250-NOUNALIGNED-NEXT: s_load_u8 s10, s[0:1], 0x0
+; GFX1250-NOUNALIGNED-NEXT: s_load_u8 s11, s[0:1], 0x4
+; GFX1250-NOUNALIGNED-NEXT: s_load_u8 s12, s[0:1], 0xa
+; GFX1250-NOUNALIGNED-NEXT: s_wait_xcnt 0x0
+; GFX1250-NOUNALIGNED-NEXT: s_load_u8 s1, s[0:1], 0x8
+; GFX1250-NOUNALIGNED-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NOUNALIGNED-NEXT: s_lshl_b32 s0, s2, 8
+; GFX1250-NOUNALIGNED-NEXT: s_lshl_b32 s2, s3, 24
+; GFX1250-NOUNALIGNED-NEXT: s_lshl_b32 s3, s4, 16
+; GFX1250-NOUNALIGNED-NEXT: s_lshl_b32 s4, s5, 8
+; GFX1250-NOUNALIGNED-NEXT: s_or_b32 s2, s2, s3
+; GFX1250-NOUNALIGNED-NEXT: s_lshl_b32 s5, s6, 24
+; GFX1250-NOUNALIGNED-NEXT: s_lshl_b32 s6, s7, 16
+; GFX1250-NOUNALIGNED-NEXT: s_lshl_b32 s7, s8, 8
+; GFX1250-NOUNALIGNED-NEXT: s_or_b32 s0, s0, s10
+; GFX1250-NOUNALIGNED-NEXT: s_lshl_b32 s8, s9, 24
+; GFX1250-NOUNALIGNED-NEXT: s_or_b32 s0, s2, s0
+; GFX1250-NOUNALIGNED-NEXT: s_lshl_b32 s2, s12, 16
+; GFX1250-NOUNALIGNED-NEXT: s_or_b32 s3, s4, s11
+; GFX1250-NOUNALIGNED-NEXT: s_or_b32 s4, s5, s6
+; GFX1250-NOUNALIGNED-NEXT: s_or_b32 s5, s7, s1
+; GFX1250-NOUNALIGNED-NEXT: s_or_b32 s2, s8, s2
+; GFX1250-NOUNALIGNED-NEXT: s_or_b32 s1, s4, s3
+; GFX1250-NOUNALIGNED-NEXT: s_or_b32 s2, s2, s5
+; GFX1250-NOUNALIGNED-NEXT: ; return to shader part epilog
+;
; GFX9-UNALIGNED-LABEL: s_load_constant_v3i32_align1:
; GFX9-UNALIGNED: ; %bb.0:
; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v0, 0
@@ -916,6 +1173,34 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align2(ptr addrspace(4) inreg
; GFX12-NOUNALIGNED-NEXT: s_or_b32 s2, s2, s7
; GFX12-NOUNALIGNED-NEXT: ; return to shader part epilog
;
+; GFX1250-UNALIGNED-LABEL: s_load_constant_v3i32_align2:
+; GFX1250-UNALIGNED: ; %bb.0:
+; GFX1250-UNALIGNED-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-UNALIGNED-NEXT: global_load_b96 v[0:2], v0, s[0:1]
+; GFX1250-UNALIGNED-NEXT: s_wait_loadcnt 0x0
+; GFX1250-UNALIGNED-NEXT: v_readfirstlane_b32 s0, v0
+; GFX1250-UNALIGNED-NEXT: v_readfirstlane_b32 s1, v1
+; GFX1250-UNALIGNED-NEXT: v_readfirstlane_b32 s2, v2
+; GFX1250-UNALIGNED-NEXT: ; return to shader part epilog
+;
+; GFX1250-NOUNALIGNED-LABEL: s_load_constant_v3i32_align2:
+; GFX1250-NOUNALIGNED: ; %bb.0:
+; GFX1250-NOUNALIGNED-NEXT: s_clause 0x5
+; GFX1250-NOUNALIGNED-NEXT: s_load_u16 s2, s[0:1], 0x2
+; GFX1250-NOUNALIGNED-NEXT: s_load_u16 s3, s[0:1], 0x6
+; GFX1250-NOUNALIGNED-NEXT: s_load_u16 s4, s[0:1], 0xa
+; GFX1250-NOUNALIGNED-NEXT: s_load_u16 s5, s[0:1], 0x0
+; GFX1250-NOUNALIGNED-NEXT: s_load_u16 s6, s[0:1], 0x4
+; GFX1250-NOUNALIGNED-NEXT: s_load_u16 s7, s[0:1], 0x8
+; GFX1250-NOUNALIGNED-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NOUNALIGNED-NEXT: s_lshl_b32 s0, s2, 16
+; GFX1250-NOUNALIGNED-NEXT: s_lshl_b32 s1, s3, 16
+; GFX1250-NOUNALIGNED-NEXT: s_lshl_b32 s2, s4, 16
+; GFX1250-NOUNALIGNED-NEXT: s_or_b32 s0, s0, s5
+; GFX1250-NOUNALIGNED-NEXT: s_or_b32 s1, s1, s6
+; GFX1250-NOUNALIGNED-NEXT: s_or_b32 s2, s2, s7
+; GFX1250-NOUNALIGNED-NEXT: ; return to shader part epilog
+;
; GFX9-UNALIGNED-LABEL: s_load_constant_v3i32_align2:
; GFX9-UNALIGNED: ; %bb.0:
; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v0, 0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-load.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-load.mir
index 1b64099d6bf51..e448c4cba0941 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-load.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-load.mir
@@ -1,6 +1,7 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -amdgpu-global-isel-new-legality -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -run-pass="amdgpu-regbankselect,amdgpu-regbanklegalize" %s -verify-machineinstrs -o - | FileCheck %s -check-prefixes=GCN,GFX7
# RUN: llc -amdgpu-global-isel-new-legality -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -run-pass="amdgpu-regbankselect,amdgpu-regbanklegalize" %s -verify-machineinstrs -o - | FileCheck %s -check-prefixes=GCN,GFX12
+# RUN: llc -amdgpu-global-isel-new-legality -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -run-pass="amdgpu-regbankselect,amdgpu-regbanklegalize" %s -o - | FileCheck %s -check-prefixes=GCN,GFX12
--- |
define amdgpu_kernel void @load_global_v8i32_non_uniform(ptr addrspace(1) %in) {
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-split-scalar-load-metadata.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-split-scalar-load-metadata.mir
index 997ac804f710d..b2ff0995ce578 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-split-scalar-load-metadata.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-split-scalar-load-metadata.mir
@@ -1,6 +1,7 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii -run-pass="amdgpu-regbankselect,amdgpu-regbanklegalize" %s -o - | FileCheck -check-prefix=GFX7 %s
# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -run-pass="amdgpu-regbankselect,amdgpu-regbanklegalize" %s -o - | FileCheck -check-prefix=GFX12 %s
+# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1250 -run-pass="amdgpu-regbankselect,amdgpu-regbanklegalize" %s -o - | FileCheck -check-prefix=GFX12 %s
--- |
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll
index d59f72ad7a1ac..5b2213592f495 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i1.ll
@@ -3,6 +3,7 @@
; RUN: llc -mtriple=amdgcn-- -mcpu=tonga < %s | FileCheck -check-prefix=GFX8 %s
; RUN: llc -mtriple=r600-- -mcpu=cypress < %s | FileCheck -check-prefix=EG %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefix=GFX1250 %s
define amdgpu_kernel void @constant_load_i1(ptr addrspace(1) %out, ptr addrspace(4) nocapture %in) #0 {
; GFX6-LABEL: constant_load_i1:
@@ -74,6 +75,18 @@ define amdgpu_kernel void @constant_load_i1(ptr addrspace(1) %out, ptr addrspace
; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX12-NEXT: global_store_b8 v0, v1, s[0:1]
; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: constant_load_i1:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_load_u8 s2, s[2:3], 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_and_b32 s2, s2, 1
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX1250-NEXT: global_store_b8 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
%load = load i1, ptr addrspace(4) %in
store i1 %load, ptr addrspace(1) %out
ret void
@@ -145,6 +158,16 @@ define amdgpu_kernel void @constant_load_v2i1(ptr addrspace(1) %out, ptr addrspa
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_store_b8 v0, v1, s[0:1]
; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: constant_load_v2i1:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_u8 v1, v0, s[2:3]
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b8 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
%load = load <2 x i1>, ptr addrspace(4) %in
store <2 x i1> %load, ptr addrspace(1) %out
ret void
@@ -215,6 +238,16 @@ define amdgpu_kernel void @constant_load_v3i1(ptr addrspace(1) %out, ptr addrspa
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_store_b8 v0, v1, s[0:1]
; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: constant_load_v3i1:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_u8 v1, v0, s[2:3]
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b8 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
%load = load <3 x i1>, ptr addrspace(4) %in
store <3 x i1> %load, ptr addrspace(1) %out
ret void
@@ -286,6 +319,16 @@ define amdgpu_kernel void @constant_load_v4i1(ptr addrspace(1) %out, ptr addrspa
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_store_b8 v0, v1, s[0:1]
; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: constant_load_v4i1:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_u8 v1, v0, s[2:3]
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b8 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
%load = load <4 x i1>, ptr addrspace(4) %in
store <4 x i1> %load, ptr addrspace(1) %out
ret void
@@ -357,6 +400,16 @@ define amdgpu_kernel void @constant_load_v8i1(ptr addrspace(1) %out, ptr addrspa
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_store_b8 v0, v1, s[0:1]
; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: constant_load_v8i1:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_u8 v1, v0, s[2:3]
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b8 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
%load = load <8 x i1>, ptr addrspace(4) %in
store <8 x i1> %load, ptr addrspace(1) %out
ret void
@@ -428,6 +481,16 @@ define amdgpu_kernel void @constant_load_v16i1(ptr addrspace(1) %out, ptr addrsp
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_store_b16 v0, v1, s[0:1]
; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: constant_load_v16i1:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: v_mov_b32_e32 v0, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_u16 v1, v0, s[2:3]
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: global_store_b16 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
%load = load <16 x i1>, ptr addrspace(4) %in
store <16 x i1> %load, ptr addrspace(1) %out
ret void
@@ -483,6 +546,16 @@ define amdgpu_kernel void @constant_load_v32i1(ptr addrspace(1) %out, ptr addrsp
; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: constant_load_v32i1:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
%load = load <32 x i1>, ptr addrspace(4) %in
store <32 x i1> %load, ptr addrspace(1) %out
ret void
@@ -541,6 +614,17 @@ define amdgpu_kernel void @constant_load_v64i1(ptr addrspace(1) %out, ptr addrsp
; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: constant_load_v64i1:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: v_mov_b32_e32 v2, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX1250-NEXT: s_endpgm
%load = load <64 x i1>, ptr addrspace(4) %in
store <64 x i1> %load, ptr addrspace(1) %out
ret void
@@ -602,6 +686,16 @@ define amdgpu_kernel void @constant_zextload_i1_to_i32(ptr addrspace(1) %out, pt
; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: constant_zextload_i1_to_i32:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_load_u8 s2, s[2:3], 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
%a = load i1, ptr addrspace(4) %in
%ext = zext i1 %a to i32
store i32 %ext, ptr addrspace(1) %out
@@ -669,6 +763,18 @@ define amdgpu_kernel void @constant_sextload_i1_to_i32(ptr addrspace(1) %out, pt
; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: constant_sextload_i1_to_i32:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_load_u8 s2, s[2:3], 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_bfe_i32 s2, s2, 0x10000
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
%a = load i1, ptr addrspace(4) %in
%ext = sext i1 %a to i32
store i32 %ext, ptr addrspace(1) %out
@@ -731,6 +837,16 @@ define amdgpu_kernel void @constant_zextload_v1i1_to_v1i32(ptr addrspace(1) %out
; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: constant_zextload_v1i1_to_v1i32:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_load_u8 s2, s[2:3], 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
%load = load <1 x i1>, ptr addrspace(4) %in
%ext = zext <1 x i1> %load to <1 x i32>
store <1 x i32> %ext, ptr addrspace(1) %out
@@ -798,6 +914,18 @@ define amdgpu_kernel void @constant_sextload_v1i1_to_v1i32(ptr addrspace(1) %out
; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: constant_sextload_v1i1_to_v1i32:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_load_u8 s2, s[2:3], 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_bfe_i32 s2, s2, 0x10000
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
%load = load <1 x i1>, ptr addrspace(4) %in
%ext = sext <1 x i1> %load to <1 x i32>
store <1 x i32> %ext, ptr addrspace(1) %out
@@ -871,6 +999,19 @@ define amdgpu_kernel void @constant_zextload_v2i1_to_v2i32(ptr addrspace(1) %out
; GFX12-NEXT: v_lshrrev_b32_e32 v1, 1, v1
; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: constant_zextload_v2i1_to_v2i32:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: v_mov_b32_e32 v2, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_u8 v0, v2, s[2:3]
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_and_b32_e32 v1, 0xffff, v0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_dual_lshrrev_b32 v1, 1, v1 :: v_dual_bitop2_b32 v0, 1, v0 bitop3:0x40
+; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX1250-NEXT: s_endpgm
%load = load <2 x i1>, ptr addrspace(4) %in
%ext = zext <2 x i1> %load to <2 x i32>
store <2 x i32> %ext, ptr addrspace(1) %out
@@ -944,6 +1085,19 @@ define amdgpu_kernel void @constant_sextload_v2i1_to_v2i32(ptr addrspace(1) %out
; GFX12-NEXT: v_mov_b32_e32 v0, s3
; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: constant_sextload_v2i1_to_v2i32:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_load_u8 s2, s[2:3], 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_bfe_i32 s3, s2, 0x10000
+; GFX1250-NEXT: s_bfe_i32 s2, s2, 0x10001
+; GFX1250-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v0, s3
+; GFX1250-NEXT: v_mov_b32_e32 v1, s2
+; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX1250-NEXT: s_endpgm
%load = load <2 x i1>, ptr addrspace(4) %in
%ext = sext <2 x i1> %load to <2 x i32>
store <2 x i32> %ext, ptr addrspace(1) %out
@@ -1027,6 +1181,21 @@ define amdgpu_kernel void @constant_zextload_v3i1_to_v3i32(ptr addrspace(1) %out
; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX12-NEXT: global_store_b96 v3, v[0:2], s[0:1]
; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: constant_zextload_v3i1_to_v3i32:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: v_mov_b32_e32 v3, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_u8 v1, v3, s[2:3]
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_and_b32_e32 v0, 0xffff, v1
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX1250-NEXT: v_dual_lshrrev_b32 v2, 2, v0 :: v_dual_bitop2_b32 v0, 1, v1 bitop3:0x40
+; GFX1250-NEXT: v_bfe_u32 v1, v1, 1, 1
+; GFX1250-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX1250-NEXT: global_store_b96 v3, v[0:2], s[0:1]
+; GFX1250-NEXT: s_endpgm
%load = load <3 x i1>, ptr addrspace(4) %in
%ext = zext <3 x i1> %load to <3 x i32>
store <3 x i32> %ext, ptr addrspace(1) %out
@@ -1109,6 +1278,20 @@ define amdgpu_kernel void @constant_sextload_v3i1_to_v3i32(ptr addrspace(1) %out
; GFX12-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_mov_b32 v2, s3
; GFX12-NEXT: global_store_b96 v3, v[0:2], s[0:1]
; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: constant_sextload_v3i1_to_v3i32:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_load_u8 s2, s[2:3], 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_bfe_i32 s3, s2, 0x10002
+; GFX1250-NEXT: s_bfe_i32 s4, s2, 0x10000
+; GFX1250-NEXT: s_bfe_i32 s2, s2, 0x10001
+; GFX1250-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s4
+; GFX1250-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_mov_b32 v2, s3
+; GFX1250-NEXT: global_store_b96 v3, v[0:2], s[0:1]
+; GFX1250-NEXT: s_endpgm
%load = load <3 x i1>, ptr addrspace(4) %in
%ext = sext <3 x i1> %load to <3 x i32>
store <3 x i32> %ext, ptr addrspace(1) %out
@@ -1192,6 +1375,22 @@ define amdgpu_kernel void @constant_zextload_v4i1_to_v4i32(ptr addrspace(1) %out
; GFX12-NEXT: v_lshrrev_b32_e32 v3, 3, v3
; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1]
; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: constant_zextload_v4i1_to_v4i32:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: v_mov_b32_e32 v4, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_u8 v1, v4, s[2:3]
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_and_b32_e32 v3, 0xffff, v1
+; GFX1250-NEXT: v_and_b32_e32 v0, 1, v1
+; GFX1250-NEXT: v_bfe_u32 v2, v1, 2, 1
+; GFX1250-NEXT: v_bfe_u32 v1, v1, 1, 1
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX1250-NEXT: v_lshrrev_b32_e32 v3, 3, v3
+; GFX1250-NEXT: global_store_b128 v4, v[0:3], s[0:1]
+; GFX1250-NEXT: s_endpgm
%load = load <4 x i1>, ptr addrspace(4) %in
%ext = zext <4 x i1> %load to <4 x i32>
store <4 x i32> %ext, ptr addrspace(1) %out
@@ -1278,6 +1477,22 @@ define amdgpu_kernel void @constant_sextload_v4i1_to_v4i32(ptr addrspace(1) %out
; GFX12-NEXT: v_mov_b32_e32 v2, s4
; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1]
; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: constant_sextload_v4i1_to_v4i32:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_load_u8 s2, s[2:3], 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_bfe_i32 s3, s2, 0x10003
+; GFX1250-NEXT: s_bfe_i32 s4, s2, 0x10002
+; GFX1250-NEXT: s_bfe_i32 s5, s2, 0x10000
+; GFX1250-NEXT: s_bfe_i32 s2, s2, 0x10001
+; GFX1250-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v0, s5
+; GFX1250-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_mov_b32 v2, s4
+; GFX1250-NEXT: v_mov_b32_e32 v3, s3
+; GFX1250-NEXT: global_store_b128 v4, v[0:3], s[0:1]
+; GFX1250-NEXT: s_endpgm
%load = load <4 x i1>, ptr addrspace(4) %in
%ext = sext <4 x i1> %load to <4 x i32>
store <4 x i32> %ext, ptr addrspace(1) %out
@@ -1403,6 +1618,31 @@ define amdgpu_kernel void @constant_zextload_v8i1_to_v8i32(ptr addrspace(1) %out
; GFX12-NEXT: global_store_b128 v8, v[0:3], s[0:1] offset:16
; GFX12-NEXT: global_store_b128 v8, v[4:7], s[0:1]
; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: constant_zextload_v8i1_to_v8i32:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: v_mov_b32_e32 v8, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_u8 v0, v8, s[2:3]
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1250-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX1250-NEXT: s_bfe_u32 s3, s2, 0x10003
+; GFX1250-NEXT: s_bfe_u32 s4, s2, 0x10001
+; GFX1250-NEXT: s_bfe_u32 s5, s2, 0x10005
+; GFX1250-NEXT: s_and_b32 s6, s2, 1
+; GFX1250-NEXT: s_bfe_u32 s7, s2, 0x10002
+; GFX1250-NEXT: s_bfe_u32 s2, s2, 0x10004
+; GFX1250-NEXT: v_lshrrev_b32_e32 v3, 7, v0
+; GFX1250-NEXT: v_bfe_u32 v2, v0, 6, 1
+; GFX1250-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s5
+; GFX1250-NEXT: v_dual_mov_b32 v4, s6 :: v_dual_mov_b32 v5, s4
+; GFX1250-NEXT: v_dual_mov_b32 v6, s7 :: v_dual_mov_b32 v7, s3
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b128 v8, v[0:3], s[0:1] offset:16
+; GFX1250-NEXT: global_store_b128 v8, v[4:7], s[0:1]
+; GFX1250-NEXT: s_endpgm
%load = load <8 x i1>, ptr addrspace(4) %in
%ext = zext <8 x i1> %load to <8 x i32>
store <8 x i32> %ext, ptr addrspace(1) %out
@@ -1524,6 +1764,30 @@ define amdgpu_kernel void @constant_sextload_v8i1_to_v8i32(ptr addrspace(1) %out
; GFX12-NEXT: global_store_b128 v8, v[0:3], s[0:1] offset:16
; GFX12-NEXT: global_store_b128 v8, v[4:7], s[0:1]
; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: constant_sextload_v8i1_to_v8i32:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_load_u8 s2, s[2:3], 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_bfe_i32 s3, s2, 0x10003
+; GFX1250-NEXT: s_bfe_i32 s4, s2, 0x10002
+; GFX1250-NEXT: s_bfe_i32 s5, s2, 0x10001
+; GFX1250-NEXT: s_bfe_i32 s6, s2, 0x10000
+; GFX1250-NEXT: s_bfe_i32 s7, s2, 0x10007
+; GFX1250-NEXT: s_bfe_i32 s8, s2, 0x10006
+; GFX1250-NEXT: s_bfe_i32 s9, s2, 0x10004
+; GFX1250-NEXT: s_bfe_i32 s2, s2, 0x10005
+; GFX1250-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v0, s9
+; GFX1250-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_mov_b32 v2, s8
+; GFX1250-NEXT: v_dual_mov_b32 v3, s7 :: v_dual_mov_b32 v4, s6
+; GFX1250-NEXT: v_dual_mov_b32 v5, s5 :: v_dual_mov_b32 v6, s4
+; GFX1250-NEXT: v_mov_b32_e32 v7, s3
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b128 v8, v[0:3], s[0:1] offset:16
+; GFX1250-NEXT: global_store_b128 v8, v[4:7], s[0:1]
+; GFX1250-NEXT: s_endpgm
%load = load <8 x i1>, ptr addrspace(4) %in
%ext = sext <8 x i1> %load to <8 x i32>
store <8 x i32> %ext, ptr addrspace(1) %out
@@ -1722,6 +1986,46 @@ define amdgpu_kernel void @constant_zextload_v16i1_to_v16i32(ptr addrspace(1) %o
; GFX12-NEXT: global_store_b128 v16, v[8:11], s[0:1] offset:16
; GFX12-NEXT: global_store_b128 v16, v[12:15], s[0:1]
; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: constant_zextload_v16i1_to_v16i32:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: v_mov_b32_e32 v16, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_u16 v0, v16, s[2:3]
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1250-NEXT: s_and_b32 s6, 0xffff, s2
+; GFX1250-NEXT: s_bfe_u32 s3, s2, 0x10003
+; GFX1250-NEXT: s_bfe_u32 s4, s2, 0x10001
+; GFX1250-NEXT: s_bfe_u32 s5, s2, 0x10007
+; GFX1250-NEXT: s_bfe_u32 s7, s2, 0x10009
+; GFX1250-NEXT: s_bfe_u32 s8, s2, 0x1000d
+; GFX1250-NEXT: s_and_b32 s9, s2, 1
+; GFX1250-NEXT: s_bfe_u32 s10, s2, 0x1000a
+; GFX1250-NEXT: s_bfe_u32 s2, s2, 0x1000c
+; GFX1250-NEXT: s_bfe_u32 s11, s6, 0x10005
+; GFX1250-NEXT: s_bfe_u32 s12, s6, 0x1000b
+; GFX1250-NEXT: s_lshr_b32 s13, s6, 15
+; GFX1250-NEXT: s_bfe_u32 s14, s6, 0x10002
+; GFX1250-NEXT: s_bfe_u32 s15, s6, 0x10006
+; GFX1250-NEXT: s_bfe_u32 s16, s6, 0x10004
+; GFX1250-NEXT: s_bfe_u32 s17, s6, 0x10008
+; GFX1250-NEXT: s_bfe_u32 s6, s6, 0x1000e
+; GFX1250-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s8
+; GFX1250-NEXT: v_dual_mov_b32 v5, s7 :: v_dual_mov_b32 v15, s3
+; GFX1250-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s13
+; GFX1250-NEXT: v_dual_mov_b32 v4, s17 :: v_dual_mov_b32 v6, s10
+; GFX1250-NEXT: v_dual_mov_b32 v11, s5 :: v_dual_mov_b32 v7, s12
+; GFX1250-NEXT: v_dual_mov_b32 v8, s16 :: v_dual_mov_b32 v9, s11
+; GFX1250-NEXT: v_dual_mov_b32 v10, s15 :: v_dual_mov_b32 v12, s9
+; GFX1250-NEXT: v_dual_mov_b32 v13, s4 :: v_dual_mov_b32 v14, s14
+; GFX1250-NEXT: s_clause 0x3
+; GFX1250-NEXT: global_store_b128 v16, v[0:3], s[0:1] offset:48
+; GFX1250-NEXT: global_store_b128 v16, v[4:7], s[0:1] offset:32
+; GFX1250-NEXT: global_store_b128 v16, v[8:11], s[0:1] offset:16
+; GFX1250-NEXT: global_store_b128 v16, v[12:15], s[0:1]
+; GFX1250-NEXT: s_endpgm
%load = load <16 x i1>, ptr addrspace(4) %in
%ext = zext <16 x i1> %load to <16 x i32>
store <16 x i32> %ext, ptr addrspace(1) %out
@@ -1915,6 +2219,44 @@ define amdgpu_kernel void @constant_sextload_v16i1_to_v16i32(ptr addrspace(1) %o
; GFX12-NEXT: global_store_b128 v16, v[8:11], s[0:1] offset:16
; GFX12-NEXT: global_store_b128 v16, v[12:15], s[0:1]
; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: constant_sextload_v16i1_to_v16i32:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_load_u16 s2, s[2:3], 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_bfe_i32 s3, s2, 0x10003
+; GFX1250-NEXT: s_bfe_i32 s4, s2, 0x10002
+; GFX1250-NEXT: s_bfe_i32 s5, s2, 0x10001
+; GFX1250-NEXT: s_bfe_i32 s6, s2, 0x10000
+; GFX1250-NEXT: s_bfe_i32 s7, s2, 0x10007
+; GFX1250-NEXT: s_bfe_i32 s8, s2, 0x10006
+; GFX1250-NEXT: s_bfe_i32 s9, s2, 0x10005
+; GFX1250-NEXT: s_bfe_i32 s10, s2, 0x10004
+; GFX1250-NEXT: s_bfe_i32 s11, s2, 0x1000b
+; GFX1250-NEXT: s_bfe_i32 s12, s2, 0x1000a
+; GFX1250-NEXT: s_bfe_i32 s13, s2, 0x10009
+; GFX1250-NEXT: s_bfe_i32 s14, s2, 0x10008
+; GFX1250-NEXT: s_bfe_i32 s15, s2, 0x1000f
+; GFX1250-NEXT: s_bfe_i32 s16, s2, 0x1000e
+; GFX1250-NEXT: s_bfe_i32 s17, s2, 0x1000c
+; GFX1250-NEXT: s_bfe_i32 s2, s2, 0x1000d
+; GFX1250-NEXT: v_dual_mov_b32 v16, 0 :: v_dual_mov_b32 v0, s17
+; GFX1250-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_mov_b32 v2, s16
+; GFX1250-NEXT: v_dual_mov_b32 v3, s15 :: v_dual_mov_b32 v4, s14
+; GFX1250-NEXT: v_dual_mov_b32 v5, s13 :: v_dual_mov_b32 v6, s12
+; GFX1250-NEXT: v_dual_mov_b32 v7, s11 :: v_dual_mov_b32 v8, s10
+; GFX1250-NEXT: v_dual_mov_b32 v9, s9 :: v_dual_mov_b32 v10, s8
+; GFX1250-NEXT: v_dual_mov_b32 v11, s7 :: v_dual_mov_b32 v12, s6
+; GFX1250-NEXT: v_dual_mov_b32 v13, s5 :: v_dual_mov_b32 v14, s4
+; GFX1250-NEXT: v_mov_b32_e32 v15, s3
+; GFX1250-NEXT: s_clause 0x3
+; GFX1250-NEXT: global_store_b128 v16, v[0:3], s[0:1] offset:48
+; GFX1250-NEXT: global_store_b128 v16, v[4:7], s[0:1] offset:32
+; GFX1250-NEXT: global_store_b128 v16, v[8:11], s[0:1] offset:16
+; GFX1250-NEXT: global_store_b128 v16, v[12:15], s[0:1]
+; GFX1250-NEXT: s_endpgm
%load = load <16 x i1>, ptr addrspace(4) %in
%ext = sext <16 x i1> %load to <16 x i32>
store <16 x i32> %ext, ptr addrspace(1) %out
@@ -2284,6 +2626,75 @@ define amdgpu_kernel void @constant_zextload_v32i1_to_v32i32(ptr addrspace(1) %o
; GFX12-NEXT: global_store_b128 v24, v[16:19], s[0:1] offset:16
; GFX12-NEXT: global_store_b128 v24, v[20:23], s[0:1]
; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: constant_zextload_v32i1_to_v32i32:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_bfe_u32 s3, s2, 0x10003
+; GFX1250-NEXT: s_bfe_u32 s4, s2, 0x10001
+; GFX1250-NEXT: s_bfe_u32 s5, s2, 0x10007
+; GFX1250-NEXT: s_bfe_u32 s6, s2, 0x10005
+; GFX1250-NEXT: s_bfe_u32 s7, s2, 0x1000b
+; GFX1250-NEXT: s_bfe_u32 s8, s2, 0x10009
+; GFX1250-NEXT: s_bfe_u32 s9, s2, 0x1000f
+; GFX1250-NEXT: s_bfe_u32 s10, s2, 0x1000d
+; GFX1250-NEXT: s_bfe_u32 s11, s2, 0x10013
+; GFX1250-NEXT: s_bfe_u32 s12, s2, 0x10011
+; GFX1250-NEXT: s_bfe_u32 s13, s2, 0x10017
+; GFX1250-NEXT: s_bfe_u32 s14, s2, 0x1001b
+; GFX1250-NEXT: s_bfe_u32 s15, s2, 0x10019
+; GFX1250-NEXT: s_lshr_b32 s16, s2, 31
+; GFX1250-NEXT: s_bfe_u32 s17, s2, 0x1001d
+; GFX1250-NEXT: s_and_b32 s18, s2, 1
+; GFX1250-NEXT: s_bfe_u32 s19, s2, 0x10002
+; GFX1250-NEXT: s_bfe_u32 s20, s2, 0x10006
+; GFX1250-NEXT: s_bfe_u32 s21, s2, 0x10004
+; GFX1250-NEXT: s_bfe_u32 s22, s2, 0x1000a
+; GFX1250-NEXT: s_bfe_u32 s23, s2, 0x10008
+; GFX1250-NEXT: s_bfe_u32 s24, s2, 0x1000e
+; GFX1250-NEXT: s_bfe_u32 s25, s2, 0x1000c
+; GFX1250-NEXT: s_bfe_u32 s26, s2, 0x10012
+; GFX1250-NEXT: s_bfe_u32 s27, s2, 0x10010
+; GFX1250-NEXT: s_bfe_u32 s28, s2, 0x10016
+; GFX1250-NEXT: s_bfe_u32 s29, s2, 0x10015
+; GFX1250-NEXT: s_bfe_u32 s30, s2, 0x10014
+; GFX1250-NEXT: s_bfe_u32 s31, s2, 0x1001a
+; GFX1250-NEXT: s_bfe_u32 s33, s2, 0x10018
+; GFX1250-NEXT: s_bfe_u32 s34, s2, 0x1001c
+; GFX1250-NEXT: s_bfe_u32 s2, s2, 0x1001e
+; GFX1250-NEXT: v_dual_mov_b32 v24, 0 :: v_dual_mov_b32 v0, s34
+; GFX1250-NEXT: v_dual_mov_b32 v1, s17 :: v_dual_mov_b32 v2, s2
+; GFX1250-NEXT: v_dual_mov_b32 v3, s16 :: v_dual_mov_b32 v4, s33
+; GFX1250-NEXT: v_dual_mov_b32 v5, s15 :: v_dual_mov_b32 v6, s31
+; GFX1250-NEXT: v_dual_mov_b32 v7, s14 :: v_dual_mov_b32 v8, s30
+; GFX1250-NEXT: v_dual_mov_b32 v9, s29 :: v_dual_mov_b32 v10, s28
+; GFX1250-NEXT: v_mov_b32_e32 v11, s13
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b128 v24, v[0:3], s[0:1] offset:112
+; GFX1250-NEXT: global_store_b128 v24, v[4:7], s[0:1] offset:96
+; GFX1250-NEXT: s_wait_xcnt 0x1
+; GFX1250-NEXT: v_dual_mov_b32 v0, s27 :: v_dual_mov_b32 v1, s12
+; GFX1250-NEXT: v_dual_mov_b32 v2, s26 :: v_dual_mov_b32 v3, s11
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v4, s25 :: v_dual_mov_b32 v5, s10
+; GFX1250-NEXT: v_dual_mov_b32 v6, s24 :: v_dual_mov_b32 v7, s9
+; GFX1250-NEXT: v_dual_mov_b32 v12, s23 :: v_dual_mov_b32 v13, s8
+; GFX1250-NEXT: v_dual_mov_b32 v14, s22 :: v_dual_mov_b32 v15, s7
+; GFX1250-NEXT: v_dual_mov_b32 v16, s21 :: v_dual_mov_b32 v17, s6
+; GFX1250-NEXT: v_dual_mov_b32 v18, s20 :: v_dual_mov_b32 v19, s5
+; GFX1250-NEXT: v_dual_mov_b32 v20, s18 :: v_dual_mov_b32 v21, s4
+; GFX1250-NEXT: v_dual_mov_b32 v22, s19 :: v_dual_mov_b32 v23, s3
+; GFX1250-NEXT: s_clause 0x5
+; GFX1250-NEXT: global_store_b128 v24, v[8:11], s[0:1] offset:80
+; GFX1250-NEXT: global_store_b128 v24, v[0:3], s[0:1] offset:64
+; GFX1250-NEXT: global_store_b128 v24, v[4:7], s[0:1] offset:48
+; GFX1250-NEXT: global_store_b128 v24, v[12:15], s[0:1] offset:32
+; GFX1250-NEXT: global_store_b128 v24, v[16:19], s[0:1] offset:16
+; GFX1250-NEXT: global_store_b128 v24, v[20:23], s[0:1]
+; GFX1250-NEXT: s_endpgm
%load = load <32 x i1>, ptr addrspace(4) %in
%ext = zext <32 x i1> %load to <32 x i32>
store <32 x i32> %ext, ptr addrspace(1) %out
@@ -2686,6 +3097,75 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i32(ptr addrspace(1) %o
; GFX12-NEXT: global_store_b128 v24, v[16:19], s[0:1] offset:16
; GFX12-NEXT: global_store_b128 v24, v[20:23], s[0:1]
; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: constant_sextload_v32i1_to_v32i32:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_bfe_i32 s3, s2, 0x10003
+; GFX1250-NEXT: s_bfe_i32 s4, s2, 0x10002
+; GFX1250-NEXT: s_bfe_i32 s5, s2, 0x10001
+; GFX1250-NEXT: s_bfe_i32 s6, s2, 0x10000
+; GFX1250-NEXT: s_bfe_i32 s7, s2, 0x10007
+; GFX1250-NEXT: s_bfe_i32 s8, s2, 0x10006
+; GFX1250-NEXT: s_bfe_i32 s9, s2, 0x10005
+; GFX1250-NEXT: s_bfe_i32 s10, s2, 0x10004
+; GFX1250-NEXT: s_bfe_i32 s11, s2, 0x1000b
+; GFX1250-NEXT: s_bfe_i32 s12, s2, 0x1000a
+; GFX1250-NEXT: s_bfe_i32 s13, s2, 0x10009
+; GFX1250-NEXT: s_bfe_i32 s14, s2, 0x10008
+; GFX1250-NEXT: s_bfe_i32 s15, s2, 0x1000f
+; GFX1250-NEXT: s_bfe_i32 s16, s2, 0x1000e
+; GFX1250-NEXT: s_bfe_i32 s17, s2, 0x1000d
+; GFX1250-NEXT: s_bfe_i32 s18, s2, 0x1000c
+; GFX1250-NEXT: s_bfe_i32 s19, s2, 0x10013
+; GFX1250-NEXT: s_bfe_i32 s20, s2, 0x10012
+; GFX1250-NEXT: s_bfe_i32 s21, s2, 0x10011
+; GFX1250-NEXT: s_bfe_i32 s22, s2, 0x10010
+; GFX1250-NEXT: s_bfe_i32 s23, s2, 0x10017
+; GFX1250-NEXT: s_bfe_i32 s24, s2, 0x10016
+; GFX1250-NEXT: s_bfe_i32 s25, s2, 0x10015
+; GFX1250-NEXT: s_bfe_i32 s26, s2, 0x10014
+; GFX1250-NEXT: s_bfe_i32 s27, s2, 0x1001b
+; GFX1250-NEXT: s_bfe_i32 s28, s2, 0x1001a
+; GFX1250-NEXT: s_bfe_i32 s29, s2, 0x10019
+; GFX1250-NEXT: s_bfe_i32 s30, s2, 0x10018
+; GFX1250-NEXT: s_ashr_i32 s31, s2, 31
+; GFX1250-NEXT: s_bfe_i32 s33, s2, 0x1001e
+; GFX1250-NEXT: s_bfe_i32 s34, s2, 0x1001c
+; GFX1250-NEXT: s_bfe_i32 s2, s2, 0x1001d
+; GFX1250-NEXT: v_dual_mov_b32 v24, 0 :: v_dual_mov_b32 v0, s34
+; GFX1250-NEXT: v_dual_mov_b32 v1, s2 :: v_dual_mov_b32 v2, s33
+; GFX1250-NEXT: v_dual_mov_b32 v3, s31 :: v_dual_mov_b32 v4, s30
+; GFX1250-NEXT: v_dual_mov_b32 v5, s29 :: v_dual_mov_b32 v6, s28
+; GFX1250-NEXT: v_dual_mov_b32 v7, s27 :: v_dual_mov_b32 v8, s26
+; GFX1250-NEXT: v_dual_mov_b32 v9, s25 :: v_dual_mov_b32 v10, s24
+; GFX1250-NEXT: v_mov_b32_e32 v11, s23
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b128 v24, v[0:3], s[0:1] offset:112
+; GFX1250-NEXT: global_store_b128 v24, v[4:7], s[0:1] offset:96
+; GFX1250-NEXT: s_wait_xcnt 0x1
+; GFX1250-NEXT: v_dual_mov_b32 v0, s22 :: v_dual_mov_b32 v1, s21
+; GFX1250-NEXT: v_dual_mov_b32 v2, s20 :: v_dual_mov_b32 v3, s19
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v4, s18 :: v_dual_mov_b32 v5, s17
+; GFX1250-NEXT: v_dual_mov_b32 v6, s16 :: v_dual_mov_b32 v7, s15
+; GFX1250-NEXT: v_dual_mov_b32 v12, s14 :: v_dual_mov_b32 v13, s13
+; GFX1250-NEXT: v_dual_mov_b32 v14, s12 :: v_dual_mov_b32 v15, s11
+; GFX1250-NEXT: v_dual_mov_b32 v16, s10 :: v_dual_mov_b32 v17, s9
+; GFX1250-NEXT: v_dual_mov_b32 v18, s8 :: v_dual_mov_b32 v19, s7
+; GFX1250-NEXT: v_dual_mov_b32 v20, s6 :: v_dual_mov_b32 v21, s5
+; GFX1250-NEXT: v_dual_mov_b32 v22, s4 :: v_dual_mov_b32 v23, s3
+; GFX1250-NEXT: s_clause 0x5
+; GFX1250-NEXT: global_store_b128 v24, v[8:11], s[0:1] offset:80
+; GFX1250-NEXT: global_store_b128 v24, v[0:3], s[0:1] offset:64
+; GFX1250-NEXT: global_store_b128 v24, v[4:7], s[0:1] offset:48
+; GFX1250-NEXT: global_store_b128 v24, v[12:15], s[0:1] offset:32
+; GFX1250-NEXT: global_store_b128 v24, v[16:19], s[0:1] offset:16
+; GFX1250-NEXT: global_store_b128 v24, v[20:23], s[0:1]
+; GFX1250-NEXT: s_endpgm
%load = load <32 x i1>, ptr addrspace(4) %in
%ext = sext <32 x i1> %load to <32 x i32>
store <32 x i32> %ext, ptr addrspace(1) %out
@@ -3387,6 +3867,141 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i32(ptr addrspace(1) %o
; GFX12-NEXT: global_store_b128 v24, v[16:19], s[0:1] offset:16
; GFX12-NEXT: global_store_b128 v24, v[20:23], s[0:1]
; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: constant_zextload_v64i1_to_v64i32:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_lshr_b32 s33, s3, 31
+; GFX1250-NEXT: s_bfe_u32 s34, s3, 0x1001d
+; GFX1250-NEXT: s_bfe_u32 s65, s3, 0x1001c
+; GFX1250-NEXT: s_bfe_u32 s66, s3, 0x1001e
+; GFX1250-NEXT: s_bfe_u32 s30, s3, 0x1001b
+; GFX1250-NEXT: s_bfe_u32 s31, s3, 0x10019
+; GFX1250-NEXT: s_bfe_u32 s63, s3, 0x1001a
+; GFX1250-NEXT: s_bfe_u32 s64, s3, 0x10018
+; GFX1250-NEXT: v_dual_mov_b32 v24, 0 :: v_dual_mov_b32 v0, s65
+; GFX1250-NEXT: s_bfe_u32 s29, s3, 0x10017
+; GFX1250-NEXT: s_bfe_u32 s60, s3, 0x10016
+; GFX1250-NEXT: s_bfe_u32 s61, s3, 0x10015
+; GFX1250-NEXT: s_bfe_u32 s62, s3, 0x10014
+; GFX1250-NEXT: v_dual_mov_b32 v1, s34 :: v_dual_mov_b32 v2, s66
+; GFX1250-NEXT: v_dual_mov_b32 v3, s33 :: v_dual_mov_b32 v4, s64
+; GFX1250-NEXT: s_bfe_u32 s27, s3, 0x10013
+; GFX1250-NEXT: s_bfe_u32 s28, s3, 0x10011
+; GFX1250-NEXT: s_bfe_u32 s58, s3, 0x10012
+; GFX1250-NEXT: s_bfe_u32 s59, s3, 0x10010
+; GFX1250-NEXT: v_dual_mov_b32 v5, s31 :: v_dual_mov_b32 v6, s63
+; GFX1250-NEXT: v_dual_mov_b32 v7, s30 :: v_dual_mov_b32 v8, s62
+; GFX1250-NEXT: v_dual_mov_b32 v9, s61 :: v_dual_mov_b32 v10, s60
+; GFX1250-NEXT: v_dual_mov_b32 v11, s29 :: v_dual_mov_b32 v12, s59
+; GFX1250-NEXT: s_bfe_u32 s19, s3, 0x10003
+; GFX1250-NEXT: s_bfe_u32 s20, s3, 0x10001
+; GFX1250-NEXT: s_bfe_u32 s21, s3, 0x10007
+; GFX1250-NEXT: s_bfe_u32 s22, s3, 0x10005
+; GFX1250-NEXT: s_bfe_u32 s23, s3, 0x1000b
+; GFX1250-NEXT: s_bfe_u32 s24, s3, 0x10009
+; GFX1250-NEXT: s_bfe_u32 s25, s3, 0x1000f
+; GFX1250-NEXT: s_bfe_u32 s26, s3, 0x1000d
+; GFX1250-NEXT: s_and_b32 s51, s3, 1
+; GFX1250-NEXT: s_bfe_u32 s52, s3, 0x10002
+; GFX1250-NEXT: s_bfe_u32 s53, s3, 0x10006
+; GFX1250-NEXT: s_bfe_u32 s54, s3, 0x10004
+; GFX1250-NEXT: s_bfe_u32 s55, s3, 0x1000a
+; GFX1250-NEXT: s_bfe_u32 s56, s3, 0x10008
+; GFX1250-NEXT: s_bfe_u32 s57, s3, 0x1000e
+; GFX1250-NEXT: v_dual_mov_b32 v13, s28 :: v_dual_mov_b32 v14, s58
+; GFX1250-NEXT: s_bfe_u32 s3, s3, 0x1000c
+; GFX1250-NEXT: v_mov_b32_e32 v15, s27
+; GFX1250-NEXT: s_clause 0x3
+; GFX1250-NEXT: global_store_b128 v24, v[0:3], s[0:1] offset:240
+; GFX1250-NEXT: global_store_b128 v24, v[4:7], s[0:1] offset:224
+; GFX1250-NEXT: global_store_b128 v24, v[8:11], s[0:1] offset:208
+; GFX1250-NEXT: global_store_b128 v24, v[12:15], s[0:1] offset:192
+; GFX1250-NEXT: s_wait_xcnt 0x3
+; GFX1250-NEXT: v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v1, s26
+; GFX1250-NEXT: v_dual_mov_b32 v2, s57 :: v_dual_mov_b32 v3, s25
+; GFX1250-NEXT: s_wait_xcnt 0x2
+; GFX1250-NEXT: v_dual_mov_b32 v4, s56 :: v_dual_mov_b32 v5, s24
+; GFX1250-NEXT: v_dual_mov_b32 v6, s55 :: v_dual_mov_b32 v7, s23
+; GFX1250-NEXT: s_wait_xcnt 0x1
+; GFX1250-NEXT: v_mov_b32_e32 v8, s54
+; GFX1250-NEXT: s_bfe_u32 s4, s2, 0x10003
+; GFX1250-NEXT: s_bfe_u32 s5, s2, 0x10001
+; GFX1250-NEXT: s_bfe_u32 s6, s2, 0x10007
+; GFX1250-NEXT: s_bfe_u32 s7, s2, 0x10005
+; GFX1250-NEXT: s_bfe_u32 s8, s2, 0x1000b
+; GFX1250-NEXT: s_bfe_u32 s9, s2, 0x10009
+; GFX1250-NEXT: s_bfe_u32 s10, s2, 0x1000f
+; GFX1250-NEXT: s_bfe_u32 s11, s2, 0x1000d
+; GFX1250-NEXT: s_bfe_u32 s12, s2, 0x10013
+; GFX1250-NEXT: s_bfe_u32 s13, s2, 0x10011
+; GFX1250-NEXT: s_bfe_u32 s14, s2, 0x10017
+; GFX1250-NEXT: s_bfe_u32 s15, s2, 0x1001b
+; GFX1250-NEXT: s_bfe_u32 s16, s2, 0x10019
+; GFX1250-NEXT: s_lshr_b32 s17, s2, 31
+; GFX1250-NEXT: s_bfe_u32 s18, s2, 0x1001d
+; GFX1250-NEXT: s_and_b32 s35, s2, 1
+; GFX1250-NEXT: s_bfe_u32 s36, s2, 0x10002
+; GFX1250-NEXT: s_bfe_u32 s37, s2, 0x10006
+; GFX1250-NEXT: s_bfe_u32 s38, s2, 0x10004
+; GFX1250-NEXT: s_bfe_u32 s39, s2, 0x1000a
+; GFX1250-NEXT: s_bfe_u32 s40, s2, 0x10008
+; GFX1250-NEXT: s_bfe_u32 s41, s2, 0x1000e
+; GFX1250-NEXT: s_bfe_u32 s42, s2, 0x1000c
+; GFX1250-NEXT: s_bfe_u32 s43, s2, 0x10012
+; GFX1250-NEXT: s_bfe_u32 s44, s2, 0x10010
+; GFX1250-NEXT: s_bfe_u32 s45, s2, 0x10016
+; GFX1250-NEXT: s_bfe_u32 s46, s2, 0x10015
+; GFX1250-NEXT: s_bfe_u32 s47, s2, 0x10014
+; GFX1250-NEXT: s_bfe_u32 s48, s2, 0x1001a
+; GFX1250-NEXT: s_bfe_u32 s49, s2, 0x10018
+; GFX1250-NEXT: s_bfe_u32 s50, s2, 0x1001e
+; GFX1250-NEXT: s_bfe_u32 s2, s2, 0x1001c
+; GFX1250-NEXT: v_dual_mov_b32 v9, s22 :: v_dual_mov_b32 v10, s53
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v11, s21 :: v_dual_mov_b32 v12, s51
+; GFX1250-NEXT: v_dual_mov_b32 v13, s20 :: v_dual_mov_b32 v14, s52
+; GFX1250-NEXT: v_dual_mov_b32 v15, s19 :: v_dual_mov_b32 v16, s2
+; GFX1250-NEXT: v_dual_mov_b32 v17, s18 :: v_dual_mov_b32 v18, s50
+; GFX1250-NEXT: v_dual_mov_b32 v19, s17 :: v_dual_mov_b32 v20, s49
+; GFX1250-NEXT: v_dual_mov_b32 v21, s16 :: v_dual_mov_b32 v22, s48
+; GFX1250-NEXT: v_mov_b32_e32 v23, s15
+; GFX1250-NEXT: s_clause 0x5
+; GFX1250-NEXT: global_store_b128 v24, v[0:3], s[0:1] offset:176
+; GFX1250-NEXT: global_store_b128 v24, v[4:7], s[0:1] offset:160
+; GFX1250-NEXT: global_store_b128 v24, v[8:11], s[0:1] offset:144
+; GFX1250-NEXT: global_store_b128 v24, v[12:15], s[0:1] offset:128
+; GFX1250-NEXT: global_store_b128 v24, v[16:19], s[0:1] offset:112
+; GFX1250-NEXT: global_store_b128 v24, v[20:23], s[0:1] offset:96
+; GFX1250-NEXT: s_wait_xcnt 0x5
+; GFX1250-NEXT: v_dual_mov_b32 v0, s47 :: v_dual_mov_b32 v1, s46
+; GFX1250-NEXT: v_dual_mov_b32 v2, s45 :: v_dual_mov_b32 v3, s14
+; GFX1250-NEXT: s_wait_xcnt 0x4
+; GFX1250-NEXT: v_dual_mov_b32 v4, s44 :: v_dual_mov_b32 v5, s13
+; GFX1250-NEXT: v_dual_mov_b32 v6, s43 :: v_dual_mov_b32 v7, s12
+; GFX1250-NEXT: s_wait_xcnt 0x3
+; GFX1250-NEXT: v_dual_mov_b32 v8, s42 :: v_dual_mov_b32 v9, s11
+; GFX1250-NEXT: v_dual_mov_b32 v10, s41 :: v_dual_mov_b32 v11, s10
+; GFX1250-NEXT: s_wait_xcnt 0x2
+; GFX1250-NEXT: v_dual_mov_b32 v12, s40 :: v_dual_mov_b32 v13, s9
+; GFX1250-NEXT: v_dual_mov_b32 v14, s39 :: v_dual_mov_b32 v15, s8
+; GFX1250-NEXT: s_wait_xcnt 0x1
+; GFX1250-NEXT: v_dual_mov_b32 v16, s38 :: v_dual_mov_b32 v17, s7
+; GFX1250-NEXT: v_dual_mov_b32 v18, s37 :: v_dual_mov_b32 v19, s6
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v20, s35 :: v_dual_mov_b32 v21, s5
+; GFX1250-NEXT: v_dual_mov_b32 v22, s36 :: v_dual_mov_b32 v23, s4
+; GFX1250-NEXT: s_clause 0x5
+; GFX1250-NEXT: global_store_b128 v24, v[0:3], s[0:1] offset:80
+; GFX1250-NEXT: global_store_b128 v24, v[4:7], s[0:1] offset:64
+; GFX1250-NEXT: global_store_b128 v24, v[8:11], s[0:1] offset:48
+; GFX1250-NEXT: global_store_b128 v24, v[12:15], s[0:1] offset:32
+; GFX1250-NEXT: global_store_b128 v24, v[16:19], s[0:1] offset:16
+; GFX1250-NEXT: global_store_b128 v24, v[20:23], s[0:1]
+; GFX1250-NEXT: s_endpgm
%load = load <64 x i1>, ptr addrspace(4) %in
%ext = zext <64 x i1> %load to <64 x i32>
store <64 x i32> %ext, ptr addrspace(1) %out
@@ -4148,6 +4763,141 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i32(ptr addrspace(1) %o
; GFX12-NEXT: global_store_b128 v24, v[16:19], s[0:1] offset:16
; GFX12-NEXT: global_store_b128 v24, v[20:23], s[0:1]
; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: constant_sextload_v64i1_to_v64i32:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_ashr_i32 s63, s3, 31
+; GFX1250-NEXT: s_bfe_i32 s64, s3, 0x1001e
+; GFX1250-NEXT: s_bfe_i32 s65, s3, 0x1001c
+; GFX1250-NEXT: s_bfe_i32 s66, s3, 0x1001d
+; GFX1250-NEXT: s_bfe_i32 s59, s3, 0x1001b
+; GFX1250-NEXT: s_bfe_i32 s60, s3, 0x1001a
+; GFX1250-NEXT: s_bfe_i32 s61, s3, 0x10019
+; GFX1250-NEXT: s_bfe_i32 s62, s3, 0x10018
+; GFX1250-NEXT: v_dual_mov_b32 v24, 0 :: v_dual_mov_b32 v0, s65
+; GFX1250-NEXT: s_bfe_i32 s55, s3, 0x10017
+; GFX1250-NEXT: s_bfe_i32 s56, s3, 0x10016
+; GFX1250-NEXT: s_bfe_i32 s57, s3, 0x10015
+; GFX1250-NEXT: s_bfe_i32 s58, s3, 0x10014
+; GFX1250-NEXT: v_dual_mov_b32 v1, s66 :: v_dual_mov_b32 v2, s64
+; GFX1250-NEXT: v_dual_mov_b32 v3, s63 :: v_dual_mov_b32 v4, s62
+; GFX1250-NEXT: s_bfe_i32 s51, s3, 0x10013
+; GFX1250-NEXT: s_bfe_i32 s52, s3, 0x10012
+; GFX1250-NEXT: s_bfe_i32 s53, s3, 0x10011
+; GFX1250-NEXT: s_bfe_i32 s54, s3, 0x10010
+; GFX1250-NEXT: v_dual_mov_b32 v5, s61 :: v_dual_mov_b32 v6, s60
+; GFX1250-NEXT: v_dual_mov_b32 v7, s59 :: v_dual_mov_b32 v8, s58
+; GFX1250-NEXT: v_dual_mov_b32 v9, s57 :: v_dual_mov_b32 v10, s56
+; GFX1250-NEXT: v_dual_mov_b32 v11, s55 :: v_dual_mov_b32 v12, s54
+; GFX1250-NEXT: s_bfe_i32 s36, s3, 0x10003
+; GFX1250-NEXT: s_bfe_i32 s37, s3, 0x10002
+; GFX1250-NEXT: s_bfe_i32 s38, s3, 0x10001
+; GFX1250-NEXT: s_bfe_i32 s39, s3, 0x10000
+; GFX1250-NEXT: s_bfe_i32 s40, s3, 0x10007
+; GFX1250-NEXT: s_bfe_i32 s41, s3, 0x10006
+; GFX1250-NEXT: s_bfe_i32 s42, s3, 0x10005
+; GFX1250-NEXT: s_bfe_i32 s43, s3, 0x10004
+; GFX1250-NEXT: s_bfe_i32 s44, s3, 0x1000b
+; GFX1250-NEXT: s_bfe_i32 s45, s3, 0x1000a
+; GFX1250-NEXT: s_bfe_i32 s46, s3, 0x10009
+; GFX1250-NEXT: s_bfe_i32 s47, s3, 0x10008
+; GFX1250-NEXT: s_bfe_i32 s48, s3, 0x1000f
+; GFX1250-NEXT: s_bfe_i32 s49, s3, 0x1000e
+; GFX1250-NEXT: s_bfe_i32 s50, s3, 0x1000d
+; GFX1250-NEXT: v_dual_mov_b32 v13, s53 :: v_dual_mov_b32 v14, s52
+; GFX1250-NEXT: s_bfe_i32 s3, s3, 0x1000c
+; GFX1250-NEXT: v_mov_b32_e32 v15, s51
+; GFX1250-NEXT: s_clause 0x3
+; GFX1250-NEXT: global_store_b128 v24, v[0:3], s[0:1] offset:240
+; GFX1250-NEXT: global_store_b128 v24, v[4:7], s[0:1] offset:224
+; GFX1250-NEXT: global_store_b128 v24, v[8:11], s[0:1] offset:208
+; GFX1250-NEXT: global_store_b128 v24, v[12:15], s[0:1] offset:192
+; GFX1250-NEXT: s_wait_xcnt 0x3
+; GFX1250-NEXT: v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v1, s50
+; GFX1250-NEXT: v_dual_mov_b32 v2, s49 :: v_dual_mov_b32 v3, s48
+; GFX1250-NEXT: s_wait_xcnt 0x2
+; GFX1250-NEXT: v_dual_mov_b32 v4, s47 :: v_dual_mov_b32 v5, s46
+; GFX1250-NEXT: v_dual_mov_b32 v6, s45 :: v_dual_mov_b32 v7, s44
+; GFX1250-NEXT: s_wait_xcnt 0x1
+; GFX1250-NEXT: v_mov_b32_e32 v8, s43
+; GFX1250-NEXT: s_bfe_i32 s4, s2, 0x10003
+; GFX1250-NEXT: s_bfe_i32 s5, s2, 0x10002
+; GFX1250-NEXT: s_bfe_i32 s6, s2, 0x10001
+; GFX1250-NEXT: s_bfe_i32 s7, s2, 0x10000
+; GFX1250-NEXT: s_bfe_i32 s8, s2, 0x10007
+; GFX1250-NEXT: s_bfe_i32 s9, s2, 0x10006
+; GFX1250-NEXT: s_bfe_i32 s10, s2, 0x10005
+; GFX1250-NEXT: s_bfe_i32 s11, s2, 0x10004
+; GFX1250-NEXT: s_bfe_i32 s12, s2, 0x1000b
+; GFX1250-NEXT: s_bfe_i32 s13, s2, 0x1000a
+; GFX1250-NEXT: s_bfe_i32 s14, s2, 0x10009
+; GFX1250-NEXT: s_bfe_i32 s15, s2, 0x10008
+; GFX1250-NEXT: s_bfe_i32 s16, s2, 0x1000f
+; GFX1250-NEXT: s_bfe_i32 s17, s2, 0x1000e
+; GFX1250-NEXT: s_bfe_i32 s18, s2, 0x1000d
+; GFX1250-NEXT: s_bfe_i32 s19, s2, 0x1000c
+; GFX1250-NEXT: s_bfe_i32 s20, s2, 0x10013
+; GFX1250-NEXT: s_bfe_i32 s21, s2, 0x10012
+; GFX1250-NEXT: s_bfe_i32 s22, s2, 0x10011
+; GFX1250-NEXT: s_bfe_i32 s23, s2, 0x10010
+; GFX1250-NEXT: s_bfe_i32 s24, s2, 0x10017
+; GFX1250-NEXT: s_bfe_i32 s25, s2, 0x10016
+; GFX1250-NEXT: s_bfe_i32 s26, s2, 0x10015
+; GFX1250-NEXT: s_bfe_i32 s27, s2, 0x10014
+; GFX1250-NEXT: s_bfe_i32 s28, s2, 0x1001b
+; GFX1250-NEXT: s_bfe_i32 s29, s2, 0x1001a
+; GFX1250-NEXT: s_bfe_i32 s30, s2, 0x10019
+; GFX1250-NEXT: s_bfe_i32 s31, s2, 0x10018
+; GFX1250-NEXT: s_ashr_i32 s33, s2, 31
+; GFX1250-NEXT: s_bfe_i32 s34, s2, 0x1001e
+; GFX1250-NEXT: s_bfe_i32 s35, s2, 0x1001d
+; GFX1250-NEXT: s_bfe_i32 s2, s2, 0x1001c
+; GFX1250-NEXT: v_dual_mov_b32 v9, s42 :: v_dual_mov_b32 v10, s41
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v11, s40 :: v_dual_mov_b32 v12, s39
+; GFX1250-NEXT: v_dual_mov_b32 v13, s38 :: v_dual_mov_b32 v14, s37
+; GFX1250-NEXT: v_dual_mov_b32 v15, s36 :: v_dual_mov_b32 v16, s2
+; GFX1250-NEXT: v_dual_mov_b32 v17, s35 :: v_dual_mov_b32 v18, s34
+; GFX1250-NEXT: v_dual_mov_b32 v19, s33 :: v_dual_mov_b32 v20, s31
+; GFX1250-NEXT: v_dual_mov_b32 v21, s30 :: v_dual_mov_b32 v22, s29
+; GFX1250-NEXT: v_mov_b32_e32 v23, s28
+; GFX1250-NEXT: s_clause 0x5
+; GFX1250-NEXT: global_store_b128 v24, v[0:3], s[0:1] offset:176
+; GFX1250-NEXT: global_store_b128 v24, v[4:7], s[0:1] offset:160
+; GFX1250-NEXT: global_store_b128 v24, v[8:11], s[0:1] offset:144
+; GFX1250-NEXT: global_store_b128 v24, v[12:15], s[0:1] offset:128
+; GFX1250-NEXT: global_store_b128 v24, v[16:19], s[0:1] offset:112
+; GFX1250-NEXT: global_store_b128 v24, v[20:23], s[0:1] offset:96
+; GFX1250-NEXT: s_wait_xcnt 0x5
+; GFX1250-NEXT: v_dual_mov_b32 v0, s27 :: v_dual_mov_b32 v1, s26
+; GFX1250-NEXT: v_dual_mov_b32 v2, s25 :: v_dual_mov_b32 v3, s24
+; GFX1250-NEXT: s_wait_xcnt 0x4
+; GFX1250-NEXT: v_dual_mov_b32 v4, s23 :: v_dual_mov_b32 v5, s22
+; GFX1250-NEXT: v_dual_mov_b32 v6, s21 :: v_dual_mov_b32 v7, s20
+; GFX1250-NEXT: s_wait_xcnt 0x3
+; GFX1250-NEXT: v_dual_mov_b32 v8, s19 :: v_dual_mov_b32 v9, s18
+; GFX1250-NEXT: v_dual_mov_b32 v10, s17 :: v_dual_mov_b32 v11, s16
+; GFX1250-NEXT: s_wait_xcnt 0x2
+; GFX1250-NEXT: v_dual_mov_b32 v12, s15 :: v_dual_mov_b32 v13, s14
+; GFX1250-NEXT: v_dual_mov_b32 v14, s13 :: v_dual_mov_b32 v15, s12
+; GFX1250-NEXT: s_wait_xcnt 0x1
+; GFX1250-NEXT: v_dual_mov_b32 v16, s11 :: v_dual_mov_b32 v17, s10
+; GFX1250-NEXT: v_dual_mov_b32 v18, s9 :: v_dual_mov_b32 v19, s8
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v20, s7 :: v_dual_mov_b32 v21, s6
+; GFX1250-NEXT: v_dual_mov_b32 v22, s5 :: v_dual_mov_b32 v23, s4
+; GFX1250-NEXT: s_clause 0x5
+; GFX1250-NEXT: global_store_b128 v24, v[0:3], s[0:1] offset:80
+; GFX1250-NEXT: global_store_b128 v24, v[4:7], s[0:1] offset:64
+; GFX1250-NEXT: global_store_b128 v24, v[8:11], s[0:1] offset:48
+; GFX1250-NEXT: global_store_b128 v24, v[12:15], s[0:1] offset:32
+; GFX1250-NEXT: global_store_b128 v24, v[16:19], s[0:1] offset:16
+; GFX1250-NEXT: global_store_b128 v24, v[20:23], s[0:1]
+; GFX1250-NEXT: s_endpgm
%load = load <64 x i1>, ptr addrspace(4) %in
%ext = sext <64 x i1> %load to <64 x i32>
store <64 x i32> %ext, ptr addrspace(1) %out
@@ -4217,6 +4967,18 @@ define amdgpu_kernel void @constant_zextload_i1_to_i64(ptr addrspace(1) %out, pt
; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
; GFX12-NEXT: global_store_b64 v1, v[0:1], s[0:1]
; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: constant_zextload_i1_to_i64:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_load_u8 s2, s[2:3], 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_and_b32 s2, s2, 1
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
+; GFX1250-NEXT: global_store_b64 v1, v[0:1], s[0:1]
+; GFX1250-NEXT: s_endpgm
%a = load i1, ptr addrspace(4) %in
%ext = zext i1 %a to i64
store i64 %ext, ptr addrspace(1) %out
@@ -4287,6 +5049,19 @@ define amdgpu_kernel void @constant_sextload_i1_to_i64(ptr addrspace(1) %out, pt
; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: constant_sextload_i1_to_i64:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: v_mov_b32_e32 v2, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_load_u8 s2, s[2:3], 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x10000
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX1250-NEXT: s_endpgm
%a = load i1, ptr addrspace(4) %in
%ext = sext i1 %a to i64
store i64 %ext, ptr addrspace(1) %out
@@ -4356,6 +5131,18 @@ define amdgpu_kernel void @constant_zextload_v1i1_to_v1i64(ptr addrspace(1) %out
; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
; GFX12-NEXT: global_store_b64 v1, v[0:1], s[0:1]
; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: constant_zextload_v1i1_to_v1i64:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_load_u8 s2, s[2:3], 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_and_b32 s2, s2, 1
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
+; GFX1250-NEXT: global_store_b64 v1, v[0:1], s[0:1]
+; GFX1250-NEXT: s_endpgm
%load = load <1 x i1>, ptr addrspace(4) %in
%ext = zext <1 x i1> %load to <1 x i64>
store <1 x i64> %ext, ptr addrspace(1) %out
@@ -4426,6 +5213,19 @@ define amdgpu_kernel void @constant_sextload_v1i1_to_v1i64(ptr addrspace(1) %out
; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: constant_sextload_v1i1_to_v1i64:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: v_mov_b32_e32 v2, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_load_u8 s2, s[2:3], 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x10000
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX1250-NEXT: s_endpgm
%load = load <1 x i1>, ptr addrspace(4) %in
%ext = sext <1 x i1> %load to <1 x i64>
store <1 x i64> %ext, ptr addrspace(1) %out
@@ -4508,6 +5308,23 @@ define amdgpu_kernel void @constant_zextload_v2i1_to_v2i64(ptr addrspace(1) %out
; GFX12-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1]
; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: constant_zextload_v2i1_to_v2i64:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: v_mov_b32_e32 v1, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_u8 v0, v1, s[2:3]
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_and_b32_e32 v2, 0xffff, v0
+; GFX1250-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_bitop2_b32 v0, 1, v0 bitop3:0x40
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-NEXT: v_lshrrev_b32_e32 v2, 1, v2
+; GFX1250-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1250-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1]
+; GFX1250-NEXT: s_endpgm
%load = load <2 x i1>, ptr addrspace(4) %in
%ext = zext <2 x i1> %load to <2 x i64>
store <2 x i64> %ext, ptr addrspace(1) %out
@@ -4592,6 +5409,21 @@ define amdgpu_kernel void @constant_sextload_v2i1_to_v2i64(ptr addrspace(1) %out
; GFX12-NEXT: v_ashrrev_i32_e32 v3, 31, v2
; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1]
; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: constant_sextload_v2i1_to_v2i64:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: v_mov_b32_e32 v4, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_u8 v0, v4, s[2:3]
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_lshrrev_b32_e32 v2, 1, v0
+; GFX1250-NEXT: v_bfe_i32 v0, v0, 0, 1
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_bfe_i32 v2, v2, 0, 1
+; GFX1250-NEXT: v_dual_ashrrev_i32 v1, 31, v0 :: v_dual_ashrrev_i32 v3, 31, v2
+; GFX1250-NEXT: global_store_b128 v4, v[0:3], s[0:1]
+; GFX1250-NEXT: s_endpgm
%load = load <2 x i1>, ptr addrspace(4) %in
%ext = sext <2 x i1> %load to <2 x i64>
store <2 x i64> %ext, ptr addrspace(1) %out
@@ -4695,6 +5527,27 @@ define amdgpu_kernel void @constant_zextload_v3i1_to_v3i64(ptr addrspace(1) %out
; GFX12-NEXT: global_store_b64 v5, v[4:5], s[0:1] offset:16
; GFX12-NEXT: global_store_b128 v5, v[0:3], s[0:1]
; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: constant_zextload_v3i1_to_v3i64:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: v_mov_b32_e32 v5, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_u8 v0, v5, s[2:3]
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_and_b32_e32 v1, 0xffff, v0
+; GFX1250-NEXT: v_bfe_u32 v2, v0, 1, 1
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX1250-NEXT: v_dual_lshrrev_b32 v4, 2, v1 :: v_dual_bitop2_b32 v0, 1, v0 bitop3:0x40
+; GFX1250-NEXT: v_dual_mov_b32 v1, v5 :: v_dual_mov_b32 v3, v5
+; GFX1250-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1250-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX1250-NEXT: v_and_b32_e32 v4, 0xffff, v4
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b64 v5, v[4:5], s[0:1] offset:16
+; GFX1250-NEXT: global_store_b128 v5, v[0:3], s[0:1]
+; GFX1250-NEXT: s_endpgm
%load = load <3 x i1>, ptr addrspace(4) %in
%ext = zext <3 x i1> %load to <3 x i64>
store <3 x i64> %ext, ptr addrspace(1) %out
@@ -4805,6 +5658,28 @@ define amdgpu_kernel void @constant_sextload_v3i1_to_v3i64(ptr addrspace(1) %out
; GFX12-NEXT: global_store_b64 v6, v[4:5], s[0:1] offset:16
; GFX12-NEXT: global_store_b128 v6, v[0:3], s[0:1]
; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: constant_sextload_v3i1_to_v3i64:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: v_mov_b32_e32 v5, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_u8 v0, v5, s[2:3]
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_dual_lshrrev_b32 v2, 2, v0 :: v_dual_lshrrev_b32 v4, 1, v0
+; GFX1250-NEXT: v_bfe_i32 v0, v0, 0, 1
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX1250-NEXT: v_bfe_i32 v6, v2, 0, 1
+; GFX1250-NEXT: v_ashrrev_i32_e32 v1, 31, v0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1250-NEXT: v_bfe_i32 v2, v4, 0, 1
+; GFX1250-NEXT: v_ashrrev_i32_e32 v7, 31, v6
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1250-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b64 v5, v[6:7], s[0:1] offset:16
+; GFX1250-NEXT: global_store_b128 v5, v[0:3], s[0:1]
+; GFX1250-NEXT: s_endpgm
%load = load <3 x i1>, ptr addrspace(4) %in
%ext = sext <3 x i1> %load to <3 x i64>
store <3 x i64> %ext, ptr addrspace(1) %out
@@ -4921,6 +5796,32 @@ define amdgpu_kernel void @constant_zextload_v4i1_to_v4i64(ptr addrspace(1) %out
; GFX12-NEXT: v_mov_b32_e32 v2, s3
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1]
; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: constant_zextload_v4i1_to_v4i64:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: v_mov_b32_e32 v1, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_u8 v0, v1, s[2:3]
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_readfirstlane_b32 s2, v0
+; GFX1250-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX1250-NEXT: s_bfe_u32 s3, s2, 0x10002
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1250-NEXT: v_lshrrev_b32_e32 v2, 3, v0
+; GFX1250-NEXT: s_and_b32 s3, 0xffff, s3
+; GFX1250-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v0, s3
+; GFX1250-NEXT: s_bfe_u32 s3, s2, 0x10001
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1250-NEXT: v_and_b32_e32 v2, 0xffff, v2
+; GFX1250-NEXT: s_and_b32 s2, s2, 1
+; GFX1250-NEXT: s_and_b32 s3, 0xffff, s3
+; GFX1250-NEXT: s_and_b32 s2, 0xffff, s2
+; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:16
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v2, s3
+; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1]
+; GFX1250-NEXT: s_endpgm
%load = load <4 x i1>, ptr addrspace(4) %in
%ext = zext <4 x i1> %load to <4 x i64>
store <4 x i64> %ext, ptr addrspace(1) %out
@@ -5044,6 +5945,29 @@ define amdgpu_kernel void @constant_sextload_v4i1_to_v4i64(ptr addrspace(1) %out
; GFX12-NEXT: global_store_b128 v8, v[4:7], s[0:1] offset:16
; GFX12-NEXT: global_store_b128 v8, v[0:3], s[0:1]
; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: constant_sextload_v4i1_to_v4i64:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: v_mov_b32_e32 v9, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_u8 v0, v9, s[2:3]
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_dual_lshrrev_b32 v2, 3, v0 :: v_dual_lshrrev_b32 v4, 2, v0
+; GFX1250-NEXT: v_lshrrev_b32_e32 v8, 1, v0
+; GFX1250-NEXT: v_bfe_i32 v0, v0, 0, 1
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX1250-NEXT: v_bfe_i32 v6, v2, 0, 1
+; GFX1250-NEXT: v_bfe_i32 v4, v4, 0, 1
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX1250-NEXT: v_bfe_i32 v2, v8, 0, 1
+; GFX1250-NEXT: v_dual_ashrrev_i32 v1, 31, v0 :: v_dual_ashrrev_i32 v7, 31, v6
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1250-NEXT: v_dual_ashrrev_i32 v5, 31, v4 :: v_dual_ashrrev_i32 v3, 31, v2
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b128 v9, v[4:7], s[0:1] offset:16
+; GFX1250-NEXT: global_store_b128 v9, v[0:3], s[0:1]
+; GFX1250-NEXT: s_endpgm
%load = load <4 x i1>, ptr addrspace(4) %in
%ext = sext <4 x i1> %load to <4 x i64>
store <4 x i64> %ext, ptr addrspace(1) %out
@@ -5208,6 +6132,33 @@ define amdgpu_kernel void @constant_zextload_v8i1_to_v8i64(ptr addrspace(1) %out
; GFX12-NEXT: global_store_b128 v1, v[8:11], s[0:1] offset:16
; GFX12-NEXT: global_store_b128 v1, v[12:15], s[0:1]
; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: constant_zextload_v8i1_to_v8i64:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: v_mov_b32_e32 v1, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_u8 v12, v1, s[2:3]
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_and_b32_e32 v0, 0xffff, v12
+; GFX1250-NEXT: v_bfe_u32 v6, v12, 5, 1
+; GFX1250-NEXT: v_bfe_u32 v4, v12, 4, 1
+; GFX1250-NEXT: v_bfe_u32 v10, v12, 3, 1
+; GFX1250-NEXT: v_bfe_u32 v8, v12, 2, 1
+; GFX1250-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_lshrrev_b32 v2, 7, v0
+; GFX1250-NEXT: v_mov_b32_e32 v5, v1
+; GFX1250-NEXT: v_bfe_u32 v0, v0, 6, 1
+; GFX1250-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_mov_b32 v9, v1
+; GFX1250-NEXT: v_dual_mov_b32 v11, v1 :: v_dual_mov_b32 v13, v1
+; GFX1250-NEXT: v_mov_b32_e32 v15, v1
+; GFX1250-NEXT: v_bfe_u32 v14, v12, 1, 1
+; GFX1250-NEXT: v_and_b32_e32 v12, 1, v12
+; GFX1250-NEXT: s_clause 0x3
+; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:48
+; GFX1250-NEXT: global_store_b128 v1, v[4:7], s[0:1] offset:32
+; GFX1250-NEXT: global_store_b128 v1, v[8:11], s[0:1] offset:16
+; GFX1250-NEXT: global_store_b128 v1, v[12:15], s[0:1]
+; GFX1250-NEXT: s_endpgm
%load = load <8 x i1>, ptr addrspace(4) %in
%ext = zext <8 x i1> %load to <8 x i64>
store <8 x i64> %ext, ptr addrspace(1) %out
@@ -5413,6 +6364,46 @@ define amdgpu_kernel void @constant_sextload_v8i1_to_v8i64(ptr addrspace(1) %out
; GFX12-NEXT: global_store_b128 v16, v[8:11], s[0:1] offset:16
; GFX12-NEXT: global_store_b128 v16, v[12:15], s[0:1]
; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: constant_sextload_v8i1_to_v8i64:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: v_mov_b32_e32 v16, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_u8 v0, v16, s[2:3]
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_readfirstlane_b32 s3, v0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_mov_b32_e32 v10, s3
+; GFX1250-NEXT: s_lshr_b32 s2, s3, 6
+; GFX1250-NEXT: s_lshr_b32 s4, s3, 7
+; GFX1250-NEXT: s_lshr_b32 s6, s3, 4
+; GFX1250-NEXT: s_lshr_b32 s8, s3, 5
+; GFX1250-NEXT: s_lshr_b32 s10, s3, 2
+; GFX1250-NEXT: s_lshr_b32 s12, s3, 3
+; GFX1250-NEXT: s_lshr_b32 s14, s3, 1
+; GFX1250-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x10000
+; GFX1250-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x10000
+; GFX1250-NEXT: v_bfe_i32 v12, v10, 0, 1
+; GFX1250-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x10000
+; GFX1250-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x10000
+; GFX1250-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x10000
+; GFX1250-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x10000
+; GFX1250-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX1250-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
+; GFX1250-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x10000
+; GFX1250-NEXT: v_dual_mov_b32 v4, s6 :: v_dual_mov_b32 v5, s7
+; GFX1250-NEXT: v_dual_mov_b32 v6, s8 :: v_dual_mov_b32 v7, s9
+; GFX1250-NEXT: v_dual_mov_b32 v8, s10 :: v_dual_mov_b32 v9, s11
+; GFX1250-NEXT: v_dual_mov_b32 v10, s12 :: v_dual_mov_b32 v11, s13
+; GFX1250-NEXT: v_dual_mov_b32 v14, s14 :: v_dual_ashrrev_i32 v13, 31, v12
+; GFX1250-NEXT: v_mov_b32_e32 v15, s15
+; GFX1250-NEXT: s_clause 0x3
+; GFX1250-NEXT: global_store_b128 v16, v[0:3], s[0:1] offset:48
+; GFX1250-NEXT: global_store_b128 v16, v[4:7], s[0:1] offset:32
+; GFX1250-NEXT: global_store_b128 v16, v[8:11], s[0:1] offset:16
+; GFX1250-NEXT: global_store_b128 v16, v[12:15], s[0:1]
+; GFX1250-NEXT: s_endpgm
%load = load <8 x i1>, ptr addrspace(4) %in
%ext = sext <8 x i1> %load to <8 x i64>
store <8 x i64> %ext, ptr addrspace(1) %out
@@ -5701,6 +6692,49 @@ define amdgpu_kernel void @constant_zextload_v16i1_to_v16i64(ptr addrspace(1) %o
; GFX12-NEXT: global_store_b128 v1, v[4:7], s[0:1] offset:64
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1]
; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: constant_zextload_v16i1_to_v16i64:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: v_mov_b32_e32 v1, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_u16 v12, v1, s[2:3]
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_and_b32_e32 v22, 0xffff, v12
+; GFX1250-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_bitop2_b32 v28, 1, v12 bitop3:0x40
+; GFX1250-NEXT: v_mov_b32_e32 v5, v1
+; GFX1250-NEXT: v_bfe_u32 v0, v12, 10, 1
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX1250-NEXT: v_bfe_u32 v2, v22, 11, 1
+; GFX1250-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_mov_b32 v9, v1
+; GFX1250-NEXT: v_bfe_u32 v6, v12, 9, 1
+; GFX1250-NEXT: v_bfe_u32 v4, v22, 8, 1
+; GFX1250-NEXT: v_dual_mov_b32 v11, v1 :: v_dual_mov_b32 v13, v1
+; GFX1250-NEXT: v_dual_mov_b32 v31, v1 :: v_dual_lshrrev_b32 v10, 15, v22
+; GFX1250-NEXT: v_bfe_u32 v8, v22, 14, 1
+; GFX1250-NEXT: v_dual_mov_b32 v15, v1 :: v_dual_mov_b32 v17, v1
+; GFX1250-NEXT: v_bfe_u32 v14, v12, 13, 1
+; GFX1250-NEXT: v_bfe_u32 v18, v12, 7, 1
+; GFX1250-NEXT: v_bfe_u32 v26, v12, 3, 1
+; GFX1250-NEXT: v_bfe_u32 v30, v12, 1, 1
+; GFX1250-NEXT: v_bfe_u32 v24, v12, 2, 1
+; GFX1250-NEXT: v_bfe_u32 v20, v12, 4, 1
+; GFX1250-NEXT: v_bfe_u32 v16, v12, 6, 1
+; GFX1250-NEXT: v_bfe_u32 v12, v12, 12, 1
+; GFX1250-NEXT: v_dual_mov_b32 v19, v1 :: v_dual_mov_b32 v21, v1
+; GFX1250-NEXT: v_dual_mov_b32 v23, v1 :: v_dual_mov_b32 v25, v1
+; GFX1250-NEXT: v_dual_mov_b32 v27, v1 :: v_dual_mov_b32 v29, v1
+; GFX1250-NEXT: v_bfe_u32 v22, v22, 5, 1
+; GFX1250-NEXT: s_clause 0x7
+; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:80
+; GFX1250-NEXT: global_store_b128 v1, v[4:7], s[0:1] offset:64
+; GFX1250-NEXT: global_store_b128 v1, v[8:11], s[0:1] offset:112
+; GFX1250-NEXT: global_store_b128 v1, v[12:15], s[0:1] offset:96
+; GFX1250-NEXT: global_store_b128 v1, v[16:19], s[0:1] offset:48
+; GFX1250-NEXT: global_store_b128 v1, v[20:23], s[0:1] offset:32
+; GFX1250-NEXT: global_store_b128 v1, v[24:27], s[0:1] offset:16
+; GFX1250-NEXT: global_store_b128 v1, v[28:31], s[0:1]
+; GFX1250-NEXT: s_endpgm
%load = load <16 x i1>, ptr addrspace(4) %in
%ext = zext <16 x i1> %load to <16 x i64>
store <16 x i64> %ext, ptr addrspace(1) %out
@@ -6063,6 +7097,75 @@ define amdgpu_kernel void @constant_sextload_v16i1_to_v16i64(ptr addrspace(1) %o
; GFX12-NEXT: global_store_b128 v32, v[24:27], s[0:1] offset:16
; GFX12-NEXT: global_store_b128 v32, v[28:31], s[0:1]
; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: constant_sextload_v16i1_to_v16i64:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: v_mov_b32_e32 v32, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_u16 v0, v32, s[2:3]
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_readfirstlane_b32 s3, v0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_mov_b32_e32 v28, s3
+; GFX1250-NEXT: s_lshr_b32 s2, s3, 14
+; GFX1250-NEXT: s_lshr_b32 s4, s3, 15
+; GFX1250-NEXT: s_lshr_b32 s10, s3, 10
+; GFX1250-NEXT: s_lshr_b32 s12, s3, 11
+; GFX1250-NEXT: s_lshr_b32 s6, s3, 12
+; GFX1250-NEXT: s_lshr_b32 s8, s3, 13
+; GFX1250-NEXT: s_lshr_b32 s14, s3, 8
+; GFX1250-NEXT: s_lshr_b32 s16, s3, 9
+; GFX1250-NEXT: s_lshr_b32 s18, s3, 6
+; GFX1250-NEXT: s_lshr_b32 s20, s3, 7
+; GFX1250-NEXT: s_lshr_b32 s22, s3, 4
+; GFX1250-NEXT: s_lshr_b32 s24, s3, 5
+; GFX1250-NEXT: s_lshr_b32 s26, s3, 2
+; GFX1250-NEXT: s_lshr_b32 s28, s3, 3
+; GFX1250-NEXT: s_lshr_b32 s30, s3, 1
+; GFX1250-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x10000
+; GFX1250-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x10000
+; GFX1250-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x10000
+; GFX1250-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x10000
+; GFX1250-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x10000
+; GFX1250-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x10000
+; GFX1250-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x10000
+; GFX1250-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x10000
+; GFX1250-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x10000
+; GFX1250-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x10000
+; GFX1250-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX1250-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
+; GFX1250-NEXT: v_dual_mov_b32 v8, s10 :: v_dual_mov_b32 v9, s11
+; GFX1250-NEXT: v_dual_mov_b32 v10, s12 :: v_dual_mov_b32 v11, s13
+; GFX1250-NEXT: v_bfe_i32 v28, v28, 0, 1
+; GFX1250-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x10000
+; GFX1250-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x10000
+; GFX1250-NEXT: v_dual_mov_b32 v4, s6 :: v_dual_mov_b32 v5, s7
+; GFX1250-NEXT: v_dual_mov_b32 v6, s8 :: v_dual_mov_b32 v7, s9
+; GFX1250-NEXT: v_dual_mov_b32 v12, s14 :: v_dual_mov_b32 v13, s15
+; GFX1250-NEXT: v_dual_mov_b32 v14, s16 :: v_dual_mov_b32 v15, s17
+; GFX1250-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x10000
+; GFX1250-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x10000
+; GFX1250-NEXT: v_dual_mov_b32 v16, s18 :: v_dual_mov_b32 v17, s19
+; GFX1250-NEXT: v_dual_mov_b32 v18, s20 :: v_dual_mov_b32 v19, s21
+; GFX1250-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x10000
+; GFX1250-NEXT: v_dual_mov_b32 v20, s22 :: v_dual_mov_b32 v21, s23
+; GFX1250-NEXT: v_dual_mov_b32 v22, s24 :: v_dual_mov_b32 v23, s25
+; GFX1250-NEXT: v_dual_mov_b32 v24, s26 :: v_dual_mov_b32 v25, s27
+; GFX1250-NEXT: v_dual_mov_b32 v26, s28 :: v_dual_mov_b32 v27, s29
+; GFX1250-NEXT: v_dual_mov_b32 v30, s30 :: v_dual_mov_b32 v31, s31
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b128 v32, v[0:3], s[0:1] offset:112
+; GFX1250-NEXT: global_store_b128 v32, v[4:7], s[0:1] offset:96
+; GFX1250-NEXT: v_ashrrev_i32_e32 v29, 31, v28
+; GFX1250-NEXT: s_clause 0x5
+; GFX1250-NEXT: global_store_b128 v32, v[8:11], s[0:1] offset:80
+; GFX1250-NEXT: global_store_b128 v32, v[12:15], s[0:1] offset:64
+; GFX1250-NEXT: global_store_b128 v32, v[16:19], s[0:1] offset:48
+; GFX1250-NEXT: global_store_b128 v32, v[20:23], s[0:1] offset:32
+; GFX1250-NEXT: global_store_b128 v32, v[24:27], s[0:1] offset:16
+; GFX1250-NEXT: global_store_b128 v32, v[28:31], s[0:1]
+; GFX1250-NEXT: s_endpgm
%load = load <16 x i1>, ptr addrspace(4) %in
%ext = sext <16 x i1> %load to <16 x i64>
store <16 x i64> %ext, ptr addrspace(1) %out
@@ -6592,6 +7695,95 @@ define amdgpu_kernel void @constant_zextload_v32i1_to_v32i64(ptr addrspace(1) %o
; GFX12-NEXT: v_mov_b32_e32 v2, s3
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1]
; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: constant_zextload_v32i1_to_v32i64:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_bfe_u32 s3, s2, 0x1001e
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s3
+; GFX1250-NEXT: s_lshr_b32 s4, s2, 31
+; GFX1250-NEXT: s_bfe_u32 s3, s2, 0x1001d
+; GFX1250-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, v1
+; GFX1250-NEXT: s_bfe_u32 s4, s2, 0x1001c
+; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:240
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v2, s3
+; GFX1250-NEXT: s_bfe_u32 s3, s2, 0x1001b
+; GFX1250-NEXT: s_bfe_u32 s4, s2, 0x1001a
+; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:224
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v2, s3
+; GFX1250-NEXT: s_bfe_u32 s3, s2, 0x10019
+; GFX1250-NEXT: s_bfe_u32 s4, s2, 0x10018
+; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:208
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v2, s3
+; GFX1250-NEXT: s_bfe_u32 s3, s2, 0x10017
+; GFX1250-NEXT: s_bfe_u32 s4, s2, 0x10016
+; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:192
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v2, s3
+; GFX1250-NEXT: s_bfe_u32 s3, s2, 0x10014
+; GFX1250-NEXT: s_bfe_u32 s4, s2, 0x10015
+; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:176
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v2, s4
+; GFX1250-NEXT: s_bfe_u32 s3, s2, 0x10013
+; GFX1250-NEXT: s_bfe_u32 s4, s2, 0x10012
+; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:160
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v2, s3
+; GFX1250-NEXT: s_bfe_u32 s3, s2, 0x10011
+; GFX1250-NEXT: s_bfe_u32 s4, s2, 0x10010
+; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:144
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v2, s3
+; GFX1250-NEXT: s_bfe_u32 s3, s2, 0x1000f
+; GFX1250-NEXT: s_bfe_u32 s4, s2, 0x1000e
+; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:128
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v2, s3
+; GFX1250-NEXT: s_bfe_u32 s3, s2, 0x1000d
+; GFX1250-NEXT: s_bfe_u32 s4, s2, 0x1000c
+; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:112
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v2, s3
+; GFX1250-NEXT: s_bfe_u32 s3, s2, 0x1000b
+; GFX1250-NEXT: s_bfe_u32 s4, s2, 0x1000a
+; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:96
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v2, s3
+; GFX1250-NEXT: s_bfe_u32 s3, s2, 0x10009
+; GFX1250-NEXT: s_bfe_u32 s4, s2, 0x10008
+; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:80
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v2, s3
+; GFX1250-NEXT: s_bfe_u32 s3, s2, 0x10007
+; GFX1250-NEXT: s_bfe_u32 s4, s2, 0x10006
+; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:64
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v2, s3
+; GFX1250-NEXT: s_bfe_u32 s3, s2, 0x10005
+; GFX1250-NEXT: s_bfe_u32 s4, s2, 0x10004
+; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:48
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v2, s3
+; GFX1250-NEXT: s_bfe_u32 s3, s2, 0x10003
+; GFX1250-NEXT: s_bfe_u32 s4, s2, 0x10002
+; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:32
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v2, s3
+; GFX1250-NEXT: s_bfe_u32 s3, s2, 0x10001
+; GFX1250-NEXT: s_and_b32 s2, s2, 1
+; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:16
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v2, s3
+; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1]
+; GFX1250-NEXT: s_endpgm
%load = load <32 x i1>, ptr addrspace(4) %in
%ext = zext <32 x i1> %load to <32 x i64>
store <32 x i64> %ext, ptr addrspace(1) %out
@@ -7300,6 +8492,141 @@ define amdgpu_kernel void @constant_sextload_v32i1_to_v32i64(ptr addrspace(1) %o
; GFX12-NEXT: global_store_b128 v24, v[16:19], s[0:1] offset:16
; GFX12-NEXT: global_store_b128 v24, v[20:23], s[0:1]
; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: constant_sextload_v32i1_to_v32i64:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_lshr_b32 s34, s2, 30
+; GFX1250-NEXT: s_lshr_b32 s36, s2, 31
+; GFX1250-NEXT: s_lshr_b32 s38, s2, 28
+; GFX1250-NEXT: s_lshr_b32 s40, s2, 29
+; GFX1250-NEXT: s_lshr_b32 s42, s2, 26
+; GFX1250-NEXT: s_lshr_b32 s44, s2, 27
+; GFX1250-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x10000
+; GFX1250-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x10000
+; GFX1250-NEXT: s_lshr_b32 s46, s2, 24
+; GFX1250-NEXT: s_lshr_b32 s48, s2, 25
+; GFX1250-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x10000
+; GFX1250-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x10000
+; GFX1250-NEXT: v_dual_mov_b32 v24, 0 :: v_dual_mov_b32 v0, s34
+; GFX1250-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x10000
+; GFX1250-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x10000
+; GFX1250-NEXT: v_dual_mov_b32 v1, s35 :: v_dual_mov_b32 v2, s36
+; GFX1250-NEXT: v_dual_mov_b32 v3, s37 :: v_dual_mov_b32 v4, s38
+; GFX1250-NEXT: s_lshr_b32 s26, s2, 22
+; GFX1250-NEXT: s_lshr_b32 s50, s2, 23
+; GFX1250-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x10000
+; GFX1250-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x10000
+; GFX1250-NEXT: v_dual_mov_b32 v5, s39 :: v_dual_mov_b32 v6, s40
+; GFX1250-NEXT: v_dual_mov_b32 v7, s41 :: v_dual_mov_b32 v8, s42
+; GFX1250-NEXT: s_lshr_b32 s52, s2, 20
+; GFX1250-NEXT: s_lshr_b32 s54, s2, 21
+; GFX1250-NEXT: v_dual_mov_b32 v9, s43 :: v_dual_mov_b32 v10, s44
+; GFX1250-NEXT: v_dual_mov_b32 v11, s45 :: v_dual_mov_b32 v12, s46
+; GFX1250-NEXT: s_lshr_b32 s56, s2, 18
+; GFX1250-NEXT: s_lshr_b32 s58, s2, 19
+; GFX1250-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x10000
+; GFX1250-NEXT: v_dual_mov_b32 v13, s47 :: v_dual_mov_b32 v14, s48
+; GFX1250-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x10000
+; GFX1250-NEXT: v_mov_b32_e32 v15, s49
+; GFX1250-NEXT: s_lshr_b32 s60, s2, 16
+; GFX1250-NEXT: s_lshr_b32 s62, s2, 17
+; GFX1250-NEXT: s_bfe_i64 s[54:55], s[54:55], 0x10000
+; GFX1250-NEXT: s_bfe_i64 s[52:53], s[52:53], 0x10000
+; GFX1250-NEXT: s_lshr_b32 s64, s2, 14
+; GFX1250-NEXT: s_lshr_b32 s66, s2, 15
+; GFX1250-NEXT: s_bfe_i64 s[58:59], s[58:59], 0x10000
+; GFX1250-NEXT: s_bfe_i64 s[56:57], s[56:57], 0x10000
+; GFX1250-NEXT: s_clause 0x3
+; GFX1250-NEXT: global_store_b128 v24, v[0:3], s[0:1] offset:240
+; GFX1250-NEXT: global_store_b128 v24, v[4:7], s[0:1] offset:224
+; GFX1250-NEXT: global_store_b128 v24, v[8:11], s[0:1] offset:208
+; GFX1250-NEXT: global_store_b128 v24, v[12:15], s[0:1] offset:192
+; GFX1250-NEXT: s_wait_xcnt 0x3
+; GFX1250-NEXT: v_dual_mov_b32 v0, s26 :: v_dual_mov_b32 v1, s27
+; GFX1250-NEXT: v_dual_mov_b32 v2, s50 :: v_dual_mov_b32 v3, s51
+; GFX1250-NEXT: s_wait_xcnt 0x2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s52
+; GFX1250-NEXT: s_lshr_b32 s30, s2, 12
+; GFX1250-NEXT: s_lshr_b32 s28, s2, 13
+; GFX1250-NEXT: s_lshr_b32 s24, s2, 10
+; GFX1250-NEXT: s_lshr_b32 s22, s2, 11
+; GFX1250-NEXT: s_bfe_i64 s[62:63], s[62:63], 0x10000
+; GFX1250-NEXT: s_bfe_i64 s[60:61], s[60:61], 0x10000
+; GFX1250-NEXT: v_dual_mov_b32 v5, s53 :: v_dual_mov_b32 v6, s54
+; GFX1250-NEXT: s_wait_xcnt 0x1
+; GFX1250-NEXT: v_dual_mov_b32 v7, s55 :: v_dual_mov_b32 v8, s56
+; GFX1250-NEXT: s_lshr_b32 s20, s2, 8
+; GFX1250-NEXT: s_lshr_b32 s18, s2, 9
+; GFX1250-NEXT: s_bfe_i64 s[66:67], s[66:67], 0x10000
+; GFX1250-NEXT: s_bfe_i64 s[64:65], s[64:65], 0x10000
+; GFX1250-NEXT: v_dual_mov_b32 v9, s57 :: v_dual_mov_b32 v10, s58
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v11, s59 :: v_dual_mov_b32 v12, s60
+; GFX1250-NEXT: s_lshr_b32 s16, s2, 6
+; GFX1250-NEXT: s_lshr_b32 s14, s2, 7
+; GFX1250-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x10000
+; GFX1250-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x10000
+; GFX1250-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x10000
+; GFX1250-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x10000
+; GFX1250-NEXT: v_dual_mov_b32 v13, s61 :: v_dual_mov_b32 v14, s62
+; GFX1250-NEXT: v_dual_mov_b32 v15, s63 :: v_dual_mov_b32 v16, s64
+; GFX1250-NEXT: s_lshr_b32 s12, s2, 4
+; GFX1250-NEXT: s_lshr_b32 s10, s2, 5
+; GFX1250-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x10000
+; GFX1250-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x10000
+; GFX1250-NEXT: v_dual_mov_b32 v17, s65 :: v_dual_mov_b32 v18, s66
+; GFX1250-NEXT: v_dual_mov_b32 v19, s67 :: v_dual_mov_b32 v20, s30
+; GFX1250-NEXT: s_lshr_b32 s8, s2, 2
+; GFX1250-NEXT: s_lshr_b32 s6, s2, 3
+; GFX1250-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x10000
+; GFX1250-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x10000
+; GFX1250-NEXT: v_dual_mov_b32 v21, s31 :: v_dual_mov_b32 v22, s28
+; GFX1250-NEXT: v_mov_b32_e32 v23, s29
+; GFX1250-NEXT: s_clause 0x5
+; GFX1250-NEXT: global_store_b128 v24, v[0:3], s[0:1] offset:176
+; GFX1250-NEXT: global_store_b128 v24, v[4:7], s[0:1] offset:160
+; GFX1250-NEXT: global_store_b128 v24, v[8:11], s[0:1] offset:144
+; GFX1250-NEXT: global_store_b128 v24, v[12:15], s[0:1] offset:128
+; GFX1250-NEXT: global_store_b128 v24, v[16:19], s[0:1] offset:112
+; GFX1250-NEXT: global_store_b128 v24, v[20:23], s[0:1] offset:96
+; GFX1250-NEXT: s_wait_xcnt 0x5
+; GFX1250-NEXT: v_dual_mov_b32 v0, s24 :: v_dual_mov_b32 v1, s25
+; GFX1250-NEXT: v_dual_mov_b32 v2, s22 :: v_dual_mov_b32 v3, s23
+; GFX1250-NEXT: s_wait_xcnt 0x4
+; GFX1250-NEXT: v_mov_b32_e32 v4, s20
+; GFX1250-NEXT: s_lshr_b32 s68, s2, 1
+; GFX1250-NEXT: s_bfe_i64 s[10:11], s[10:11], 0x10000
+; GFX1250-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x10000
+; GFX1250-NEXT: v_dual_mov_b32 v5, s21 :: v_dual_mov_b32 v6, s18
+; GFX1250-NEXT: s_wait_xcnt 0x3
+; GFX1250-NEXT: v_dual_mov_b32 v7, s19 :: v_dual_mov_b32 v8, s16
+; GFX1250-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x10000
+; GFX1250-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x10000
+; GFX1250-NEXT: v_dual_mov_b32 v9, s17 :: v_dual_mov_b32 v10, s14
+; GFX1250-NEXT: s_wait_xcnt 0x2
+; GFX1250-NEXT: v_dual_mov_b32 v11, s15 :: v_dual_mov_b32 v12, s12
+; GFX1250-NEXT: s_bfe_i64 s[4:5], s[2:3], 0x10000
+; GFX1250-NEXT: s_bfe_i64 s[2:3], s[68:69], 0x10000
+; GFX1250-NEXT: v_dual_mov_b32 v13, s13 :: v_dual_mov_b32 v14, s10
+; GFX1250-NEXT: s_wait_xcnt 0x1
+; GFX1250-NEXT: v_dual_mov_b32 v15, s11 :: v_dual_mov_b32 v16, s8
+; GFX1250-NEXT: v_dual_mov_b32 v17, s9 :: v_dual_mov_b32 v18, s6
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v19, s7 :: v_dual_mov_b32 v20, s4
+; GFX1250-NEXT: v_dual_mov_b32 v21, s5 :: v_dual_mov_b32 v22, s2
+; GFX1250-NEXT: v_mov_b32_e32 v23, s3
+; GFX1250-NEXT: s_clause 0x5
+; GFX1250-NEXT: global_store_b128 v24, v[0:3], s[0:1] offset:80
+; GFX1250-NEXT: global_store_b128 v24, v[4:7], s[0:1] offset:64
+; GFX1250-NEXT: global_store_b128 v24, v[8:11], s[0:1] offset:48
+; GFX1250-NEXT: global_store_b128 v24, v[12:15], s[0:1] offset:32
+; GFX1250-NEXT: global_store_b128 v24, v[16:19], s[0:1] offset:16
+; GFX1250-NEXT: global_store_b128 v24, v[20:23], s[0:1]
+; GFX1250-NEXT: s_endpgm
%load = load <32 x i1>, ptr addrspace(4) %in
%ext = sext <32 x i1> %load to <32 x i64>
store <32 x i64> %ext, ptr addrspace(1) %out
@@ -8327,6 +9654,179 @@ define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX12-NEXT: v_mov_b32_e32 v2, s3
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1]
; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: constant_zextload_v64i1_to_v64i64:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_bfe_u32 s4, s3, 0x10014
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX1250-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s4
+; GFX1250-NEXT: s_bfe_u32 s5, s3, 0x10015
+; GFX1250-NEXT: s_lshr_b32 s4, s3, 31
+; GFX1250-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mov_b32 v3, v1
+; GFX1250-NEXT: s_bfe_u32 s5, s3, 0x1001e
+; GFX1250-NEXT: s_bfe_u32 s6, s2, 0x10004
+; GFX1250-NEXT: s_and_b32 s7, s2, 1
+; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:416
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v2, s4
+; GFX1250-NEXT: s_bfe_u32 s4, s3, 0x1001d
+; GFX1250-NEXT: s_bfe_u32 s5, s3, 0x1001c
+; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:496
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v2, s4
+; GFX1250-NEXT: s_bfe_u32 s4, s3, 0x1001b
+; GFX1250-NEXT: s_bfe_u32 s5, s3, 0x1001a
+; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:480
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v2, s4
+; GFX1250-NEXT: s_bfe_u32 s4, s3, 0x10019
+; GFX1250-NEXT: s_bfe_u32 s5, s3, 0x10018
+; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:464
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v2, s4
+; GFX1250-NEXT: s_bfe_u32 s4, s3, 0x10017
+; GFX1250-NEXT: s_bfe_u32 s5, s3, 0x10016
+; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:448
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v2, s4
+; GFX1250-NEXT: s_bfe_u32 s4, s3, 0x10013
+; GFX1250-NEXT: s_bfe_u32 s5, s3, 0x10012
+; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:432
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v2, s4
+; GFX1250-NEXT: s_bfe_u32 s4, s3, 0x10011
+; GFX1250-NEXT: s_bfe_u32 s5, s3, 0x10010
+; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:400
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v2, s4
+; GFX1250-NEXT: s_bfe_u32 s4, s3, 0x1000f
+; GFX1250-NEXT: s_bfe_u32 s5, s3, 0x1000e
+; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:384
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v2, s4
+; GFX1250-NEXT: s_bfe_u32 s4, s3, 0x1000d
+; GFX1250-NEXT: s_bfe_u32 s5, s3, 0x1000c
+; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:368
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v2, s4
+; GFX1250-NEXT: s_bfe_u32 s4, s3, 0x1000b
+; GFX1250-NEXT: s_bfe_u32 s5, s3, 0x1000a
+; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:352
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v2, s4
+; GFX1250-NEXT: s_bfe_u32 s4, s3, 0x10009
+; GFX1250-NEXT: s_bfe_u32 s5, s3, 0x10008
+; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:336
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v2, s4
+; GFX1250-NEXT: s_bfe_u32 s4, s3, 0x10007
+; GFX1250-NEXT: s_bfe_u32 s5, s3, 0x10006
+; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:320
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v2, s4
+; GFX1250-NEXT: s_bfe_u32 s4, s3, 0x10005
+; GFX1250-NEXT: s_bfe_u32 s5, s3, 0x10004
+; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:304
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v2, s4
+; GFX1250-NEXT: s_bfe_u32 s4, s3, 0x10003
+; GFX1250-NEXT: s_bfe_u32 s5, s3, 0x10002
+; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:288
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v2, s4
+; GFX1250-NEXT: s_lshr_b32 s4, s2, 31
+; GFX1250-NEXT: s_bfe_u32 s5, s2, 0x1001e
+; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:272
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v2, s4
+; GFX1250-NEXT: s_bfe_u32 s4, s2, 0x1001d
+; GFX1250-NEXT: s_bfe_u32 s5, s2, 0x1001c
+; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:240
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v2, s4
+; GFX1250-NEXT: s_bfe_u32 s4, s2, 0x1001b
+; GFX1250-NEXT: s_bfe_u32 s5, s2, 0x1001a
+; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:224
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v2, s4
+; GFX1250-NEXT: s_bfe_u32 s4, s2, 0x10019
+; GFX1250-NEXT: s_bfe_u32 s5, s2, 0x10018
+; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:208
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v2, s4
+; GFX1250-NEXT: s_bfe_u32 s4, s2, 0x10017
+; GFX1250-NEXT: s_bfe_u32 s5, s2, 0x10016
+; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:192
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v2, s4
+; GFX1250-NEXT: s_bfe_u32 s4, s2, 0x10014
+; GFX1250-NEXT: s_bfe_u32 s5, s2, 0x10015
+; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:176
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v2, s5
+; GFX1250-NEXT: s_bfe_u32 s4, s2, 0x10013
+; GFX1250-NEXT: s_bfe_u32 s5, s2, 0x10012
+; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:160
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v2, s4
+; GFX1250-NEXT: s_bfe_u32 s4, s2, 0x10011
+; GFX1250-NEXT: s_bfe_u32 s5, s2, 0x10010
+; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:144
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v2, s4
+; GFX1250-NEXT: s_bfe_u32 s4, s2, 0x1000f
+; GFX1250-NEXT: s_bfe_u32 s5, s2, 0x1000e
+; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:128
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v2, s4
+; GFX1250-NEXT: s_bfe_u32 s4, s2, 0x1000d
+; GFX1250-NEXT: s_bfe_u32 s5, s2, 0x1000c
+; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:112
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v2, s4
+; GFX1250-NEXT: s_bfe_u32 s4, s2, 0x1000b
+; GFX1250-NEXT: s_bfe_u32 s5, s2, 0x1000a
+; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:96
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v2, s4
+; GFX1250-NEXT: s_bfe_u32 s4, s2, 0x10009
+; GFX1250-NEXT: s_bfe_u32 s5, s2, 0x10008
+; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:80
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v2, s4
+; GFX1250-NEXT: s_bfe_u32 s4, s2, 0x10007
+; GFX1250-NEXT: s_bfe_u32 s5, s2, 0x10006
+; GFX1250-NEXT: v_mov_b32_e32 v7, v1
+; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:64
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v2, s4
+; GFX1250-NEXT: s_mov_b32 s4, s3
+; GFX1250-NEXT: s_bfe_u32 s5, s3, 0x10001
+; GFX1250-NEXT: s_bfe_u32 s3, s2, 0x10005
+; GFX1250-NEXT: v_mov_b32_e32 v6, s5
+; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:48
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v2, s3
+; GFX1250-NEXT: s_bfe_u32 s3, s2, 0x10003
+; GFX1250-NEXT: s_bfe_u32 s6, s2, 0x10001
+; GFX1250-NEXT: s_bfe_u32 s2, s2, 0x10002
+; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:32
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v2, s3
+; GFX1250-NEXT: s_and_b64 s[2:3], s[4:5], 1
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s3
+; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:16
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, s7 :: v_dual_mov_b32 v2, s6
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b128 v1, v[4:7], s[0:1] offset:256
+; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1]
+; GFX1250-NEXT: s_endpgm
%load = load <64 x i1>, ptr addrspace(4) %in
%ext = zext <64 x i1> %load to <64 x i64>
store <64 x i64> %ext, ptr addrspace(1) %out
@@ -9703,6 +11203,271 @@ define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(ptr addrspace(1) %o
; GFX12-NEXT: global_store_b128 v0, v[17:20], s[0:1] offset:16
; GFX12-NEXT: global_store_b128 v0, v[21:24], s[0:1]
; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: constant_sextload_v64i1_to_v64i64:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_load_b64 s[10:11], s[2:3], 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_lshr_b32 s96, s11, 30
+; GFX1250-NEXT: s_lshr_b32 s98, s11, 31
+; GFX1250-NEXT: s_lshr_b32 s92, s11, 28
+; GFX1250-NEXT: s_lshr_b32 s94, s11, 29
+; GFX1250-NEXT: s_lshr_b32 s78, s11, 26
+; GFX1250-NEXT: s_lshr_b32 s88, s11, 27
+; GFX1250-NEXT: s_bfe_i64 s[96:97], s[96:97], 0x10000
+; GFX1250-NEXT: s_bfe_i64 s[100:101], s[98:99], 0x10000
+; GFX1250-NEXT: s_lshr_b32 s66, s11, 24
+; GFX1250-NEXT: s_lshr_b32 s74, s11, 25
+; GFX1250-NEXT: s_bfe_i64 s[92:93], s[92:93], 0x10000
+; GFX1250-NEXT: s_bfe_i64 s[94:95], s[94:95], 0x10000
+; GFX1250-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v0, s96
+; GFX1250-NEXT: s_lshr_b32 s56, s11, 22
+; GFX1250-NEXT: s_lshr_b32 s62, s11, 23
+; GFX1250-NEXT: v_dual_mov_b32 v1, s97 :: v_dual_mov_b32 v2, s100
+; GFX1250-NEXT: v_dual_mov_b32 v3, s101 :: v_dual_mov_b32 v4, s92
+; GFX1250-NEXT: s_bfe_i64 s[78:79], s[78:79], 0x10000
+; GFX1250-NEXT: s_bfe_i64 s[88:89], s[88:89], 0x10000
+; GFX1250-NEXT: s_lshr_b32 s44, s11, 20
+; GFX1250-NEXT: s_lshr_b32 s52, s11, 21
+; GFX1250-NEXT: s_lshr_b32 s30, s11, 18
+; GFX1250-NEXT: s_lshr_b32 s40, s11, 19
+; GFX1250-NEXT: s_lshr_b32 s18, s11, 16
+; GFX1250-NEXT: s_lshr_b32 s26, s11, 17
+; GFX1250-NEXT: s_lshr_b32 s2, s11, 14
+; GFX1250-NEXT: s_lshr_b32 s4, s11, 15
+; GFX1250-NEXT: v_dual_mov_b32 v5, s93 :: v_dual_mov_b32 v6, s94
+; GFX1250-NEXT: v_dual_mov_b32 v7, s95 :: v_dual_mov_b32 v10, s78
+; GFX1250-NEXT: s_bfe_i64 s[66:67], s[66:67], 0x10000
+; GFX1250-NEXT: s_bfe_i64 s[74:75], s[74:75], 0x10000
+; GFX1250-NEXT: s_lshr_b32 s6, s11, 12
+; GFX1250-NEXT: s_lshr_b32 s8, s11, 13
+; GFX1250-NEXT: v_dual_mov_b32 v11, s79 :: v_dual_mov_b32 v12, s88
+; GFX1250-NEXT: v_dual_mov_b32 v13, s89 :: v_dual_mov_b32 v14, s66
+; GFX1250-NEXT: s_bfe_i64 s[56:57], s[56:57], 0x10000
+; GFX1250-NEXT: s_bfe_i64 s[62:63], s[62:63], 0x10000
+; GFX1250-NEXT: s_lshr_b32 s12, s11, 10
+; GFX1250-NEXT: s_lshr_b32 s14, s11, 11
+; GFX1250-NEXT: v_dual_mov_b32 v15, s67 :: v_dual_mov_b32 v16, s74
+; GFX1250-NEXT: v_dual_mov_b32 v17, s75 :: v_dual_mov_b32 v18, s56
+; GFX1250-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x10000
+; GFX1250-NEXT: s_bfe_i64 s[52:53], s[52:53], 0x10000
+; GFX1250-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x10000
+; GFX1250-NEXT: s_bfe_i64 s[40:41], s[40:41], 0x10000
+; GFX1250-NEXT: s_bfe_i64 s[18:19], s[18:19], 0x10000
+; GFX1250-NEXT: s_bfe_i64 s[26:27], s[26:27], 0x10000
+; GFX1250-NEXT: s_bfe_i64 s[4:5], s[4:5], 0x10000
+; GFX1250-NEXT: s_bfe_i64 s[2:3], s[2:3], 0x10000
+; GFX1250-NEXT: s_lshr_b32 s16, s11, 8
+; GFX1250-NEXT: s_lshr_b32 s20, s11, 9
+; GFX1250-NEXT: v_dual_mov_b32 v19, s57 :: v_dual_mov_b32 v20, s62
+; GFX1250-NEXT: v_dual_mov_b32 v21, s63 :: v_dual_mov_b32 v22, s44
+; GFX1250-NEXT: s_bfe_i64 s[8:9], s[8:9], 0x10000
+; GFX1250-NEXT: s_bfe_i64 s[6:7], s[6:7], 0x10000
+; GFX1250-NEXT: s_lshr_b32 s22, s11, 6
+; GFX1250-NEXT: s_lshr_b32 s24, s11, 7
+; GFX1250-NEXT: v_dual_mov_b32 v23, s45 :: v_dual_mov_b32 v24, s52
+; GFX1250-NEXT: v_dual_mov_b32 v25, s53 :: v_dual_mov_b32 v26, s30
+; GFX1250-NEXT: v_dual_mov_b32 v27, s31 :: v_dual_mov_b32 v28, s40
+; GFX1250-NEXT: v_dual_mov_b32 v29, s41 :: v_dual_mov_b32 v30, s18
+; GFX1250-NEXT: v_dual_mov_b32 v31, s19 :: v_dual_mov_b32 v32, s26
+; GFX1250-NEXT: v_mov_b32_e32 v33, s27
+; GFX1250-NEXT: s_bfe_i64 s[14:15], s[14:15], 0x10000
+; GFX1250-NEXT: s_bfe_i64 s[12:13], s[12:13], 0x10000
+; GFX1250-NEXT: s_clause 0x7
+; GFX1250-NEXT: global_store_b128 v8, v[0:3], s[0:1] offset:496
+; GFX1250-NEXT: global_store_b128 v8, v[4:7], s[0:1] offset:480
+; GFX1250-NEXT: global_store_b128 v8, v[10:13], s[0:1] offset:464
+; GFX1250-NEXT: global_store_b128 v8, v[14:17], s[0:1] offset:448
+; GFX1250-NEXT: global_store_b128 v8, v[18:21], s[0:1] offset:432
+; GFX1250-NEXT: global_store_b128 v8, v[22:25], s[0:1] offset:416
+; GFX1250-NEXT: global_store_b128 v8, v[26:29], s[0:1] offset:400
+; GFX1250-NEXT: global_store_b128 v8, v[30:33], s[0:1] offset:384
+; GFX1250-NEXT: s_wait_xcnt 0x7
+; GFX1250-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
+; GFX1250-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
+; GFX1250-NEXT: s_wait_xcnt 0x6
+; GFX1250-NEXT: v_mov_b32_e32 v4, s6
+; GFX1250-NEXT: s_lshr_b32 s28, s11, 4
+; GFX1250-NEXT: s_lshr_b32 s34, s11, 5
+; GFX1250-NEXT: s_lshr_b32 s36, s11, 2
+; GFX1250-NEXT: s_lshr_b32 s38, s11, 3
+; GFX1250-NEXT: s_bfe_i64 s[20:21], s[20:21], 0x10000
+; GFX1250-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x10000
+; GFX1250-NEXT: v_dual_mov_b32 v5, s7 :: v_dual_mov_b32 v6, s8
+; GFX1250-NEXT: s_wait_xcnt 0x5
+; GFX1250-NEXT: v_dual_mov_b32 v7, s9 :: v_dual_mov_b32 v10, s12
+; GFX1250-NEXT: s_lshr_b32 s42, s11, 1
+; GFX1250-NEXT: s_mov_b32 s46, s11
+; GFX1250-NEXT: s_bfe_i64 s[24:25], s[24:25], 0x10000
+; GFX1250-NEXT: s_bfe_i64 s[22:23], s[22:23], 0x10000
+; GFX1250-NEXT: v_dual_mov_b32 v11, s13 :: v_dual_mov_b32 v12, s14
+; GFX1250-NEXT: s_wait_xcnt 0x4
+; GFX1250-NEXT: v_dual_mov_b32 v13, s15 :: v_dual_mov_b32 v14, s16
+; GFX1250-NEXT: s_lshr_b32 s48, s10, 30
+; GFX1250-NEXT: s_lshr_b32 s50, s10, 31
+; GFX1250-NEXT: s_bfe_i64 s[38:39], s[38:39], 0x10000
+; GFX1250-NEXT: s_bfe_i64 s[36:37], s[36:37], 0x10000
+; GFX1250-NEXT: s_bfe_i64 s[34:35], s[34:35], 0x10000
+; GFX1250-NEXT: s_bfe_i64 s[28:29], s[28:29], 0x10000
+; GFX1250-NEXT: v_dual_mov_b32 v15, s17 :: v_dual_mov_b32 v16, s20
+; GFX1250-NEXT: s_wait_xcnt 0x3
+; GFX1250-NEXT: v_dual_mov_b32 v17, s21 :: v_dual_mov_b32 v18, s22
+; GFX1250-NEXT: s_lshr_b32 s54, s10, 28
+; GFX1250-NEXT: s_lshr_b32 s58, s10, 29
+; GFX1250-NEXT: s_bfe_i64 s[46:47], s[46:47], 0x10000
+; GFX1250-NEXT: s_bfe_i64 s[42:43], s[42:43], 0x10000
+; GFX1250-NEXT: v_dual_mov_b32 v19, s23 :: v_dual_mov_b32 v20, s24
+; GFX1250-NEXT: s_wait_xcnt 0x2
+; GFX1250-NEXT: v_dual_mov_b32 v21, s25 :: v_dual_mov_b32 v22, s28
+; GFX1250-NEXT: s_lshr_b32 s60, s10, 26
+; GFX1250-NEXT: s_lshr_b32 s64, s10, 27
+; GFX1250-NEXT: s_bfe_i64 s[50:51], s[50:51], 0x10000
+; GFX1250-NEXT: s_bfe_i64 s[48:49], s[48:49], 0x10000
+; GFX1250-NEXT: v_dual_mov_b32 v23, s29 :: v_dual_mov_b32 v24, s34
+; GFX1250-NEXT: v_mov_b32_e32 v25, s35
+; GFX1250-NEXT: s_clause 0x5
+; GFX1250-NEXT: global_store_b128 v8, v[0:3], s[0:1] offset:368
+; GFX1250-NEXT: global_store_b128 v8, v[4:7], s[0:1] offset:352
+; GFX1250-NEXT: global_store_b128 v8, v[10:13], s[0:1] offset:336
+; GFX1250-NEXT: global_store_b128 v8, v[14:17], s[0:1] offset:320
+; GFX1250-NEXT: global_store_b128 v8, v[18:21], s[0:1] offset:304
+; GFX1250-NEXT: global_store_b128 v8, v[22:25], s[0:1] offset:288
+; GFX1250-NEXT: s_wait_xcnt 0x5
+; GFX1250-NEXT: v_dual_mov_b32 v0, s36 :: v_dual_mov_b32 v1, s37
+; GFX1250-NEXT: v_dual_mov_b32 v2, s38 :: v_dual_mov_b32 v3, s39
+; GFX1250-NEXT: s_wait_xcnt 0x4
+; GFX1250-NEXT: v_mov_b32_e32 v4, s46
+; GFX1250-NEXT: s_lshr_b32 s68, s10, 24
+; GFX1250-NEXT: s_lshr_b32 s70, s10, 25
+; GFX1250-NEXT: s_lshr_b32 s72, s10, 22
+; GFX1250-NEXT: s_lshr_b32 s76, s10, 23
+; GFX1250-NEXT: s_bfe_i64 s[58:59], s[58:59], 0x10000
+; GFX1250-NEXT: s_bfe_i64 s[54:55], s[54:55], 0x10000
+; GFX1250-NEXT: v_dual_mov_b32 v5, s47 :: v_dual_mov_b32 v6, s42
+; GFX1250-NEXT: s_wait_xcnt 0x3
+; GFX1250-NEXT: v_dual_mov_b32 v7, s43 :: v_dual_mov_b32 v10, s48
+; GFX1250-NEXT: s_lshr_b32 s80, s10, 20
+; GFX1250-NEXT: s_lshr_b32 s82, s10, 21
+; GFX1250-NEXT: s_bfe_i64 s[64:65], s[64:65], 0x10000
+; GFX1250-NEXT: s_bfe_i64 s[60:61], s[60:61], 0x10000
+; GFX1250-NEXT: v_dual_mov_b32 v11, s49 :: v_dual_mov_b32 v12, s50
+; GFX1250-NEXT: s_wait_xcnt 0x2
+; GFX1250-NEXT: v_dual_mov_b32 v13, s51 :: v_dual_mov_b32 v14, s54
+; GFX1250-NEXT: s_lshr_b32 s84, s10, 18
+; GFX1250-NEXT: s_lshr_b32 s86, s10, 19
+; GFX1250-NEXT: s_bfe_i64 s[76:77], s[76:77], 0x10000
+; GFX1250-NEXT: s_bfe_i64 s[72:73], s[72:73], 0x10000
+; GFX1250-NEXT: s_bfe_i64 s[70:71], s[70:71], 0x10000
+; GFX1250-NEXT: s_bfe_i64 s[68:69], s[68:69], 0x10000
+; GFX1250-NEXT: v_dual_mov_b32 v15, s55 :: v_dual_mov_b32 v16, s58
+; GFX1250-NEXT: s_wait_xcnt 0x1
+; GFX1250-NEXT: v_dual_mov_b32 v17, s59 :: v_dual_mov_b32 v18, s60
+; GFX1250-NEXT: s_lshr_b32 s90, s10, 16
+; GFX1250-NEXT: s_lshr_b32 s98, s10, 17
+; GFX1250-NEXT: s_bfe_i64 s[82:83], s[82:83], 0x10000
+; GFX1250-NEXT: s_bfe_i64 s[80:81], s[80:81], 0x10000
+; GFX1250-NEXT: v_dual_mov_b32 v19, s61 :: v_dual_mov_b32 v20, s64
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v21, s65 :: v_dual_mov_b32 v22, s68
+; GFX1250-NEXT: s_lshr_b32 s96, s10, 14
+; GFX1250-NEXT: s_lshr_b32 s100, s10, 15
+; GFX1250-NEXT: s_lshr_b32 s94, s10, 13
+; GFX1250-NEXT: s_lshr_b32 s88, s10, 11
+; GFX1250-NEXT: s_lshr_b32 s74, s10, 9
+; GFX1250-NEXT: s_lshr_b32 s62, s10, 7
+; GFX1250-NEXT: s_lshr_b32 s52, s10, 5
+; GFX1250-NEXT: s_lshr_b32 s40, s10, 3
+; GFX1250-NEXT: s_lshr_b32 s26, s10, 1
+; GFX1250-NEXT: s_bfe_i64 s[86:87], s[86:87], 0x10000
+; GFX1250-NEXT: s_bfe_i64 s[84:85], s[84:85], 0x10000
+; GFX1250-NEXT: v_dual_mov_b32 v23, s69 :: v_dual_mov_b32 v24, s70
+; GFX1250-NEXT: v_mov_b32_e32 v25, s71
+; GFX1250-NEXT: s_clause 0x5
+; GFX1250-NEXT: global_store_b128 v8, v[0:3], s[0:1] offset:272
+; GFX1250-NEXT: global_store_b128 v8, v[4:7], s[0:1] offset:256
+; GFX1250-NEXT: global_store_b128 v8, v[10:13], s[0:1] offset:240
+; GFX1250-NEXT: global_store_b128 v8, v[14:17], s[0:1] offset:224
+; GFX1250-NEXT: global_store_b128 v8, v[18:21], s[0:1] offset:208
+; GFX1250-NEXT: global_store_b128 v8, v[22:25], s[0:1] offset:192
+; GFX1250-NEXT: s_wait_xcnt 0x5
+; GFX1250-NEXT: v_dual_mov_b32 v0, s72 :: v_dual_mov_b32 v1, s73
+; GFX1250-NEXT: v_dual_mov_b32 v2, s76 :: v_dual_mov_b32 v3, s77
+; GFX1250-NEXT: s_wait_xcnt 0x4
+; GFX1250-NEXT: v_mov_b32_e32 v4, s80
+; GFX1250-NEXT: s_lshr_b32 s92, s10, 12
+; GFX1250-NEXT: s_lshr_b32 s78, s10, 10
+; GFX1250-NEXT: s_bfe_i64 s[98:99], s[98:99], 0x10000
+; GFX1250-NEXT: s_bfe_i64 s[90:91], s[90:91], 0x10000
+; GFX1250-NEXT: v_dual_mov_b32 v5, s81 :: v_dual_mov_b32 v6, s82
+; GFX1250-NEXT: s_wait_xcnt 0x3
+; GFX1250-NEXT: v_dual_mov_b32 v7, s83 :: v_dual_mov_b32 v10, s84
+; GFX1250-NEXT: s_lshr_b32 s66, s10, 8
+; GFX1250-NEXT: s_lshr_b32 s56, s10, 6
+; GFX1250-NEXT: s_lshr_b32 s44, s10, 4
+; GFX1250-NEXT: s_lshr_b32 s30, s10, 2
+; GFX1250-NEXT: s_bfe_i64 s[18:19], s[10:11], 0x10000
+; GFX1250-NEXT: s_bfe_i64 s[10:11], s[26:27], 0x10000
+; GFX1250-NEXT: s_bfe_i64 s[26:27], s[40:41], 0x10000
+; GFX1250-NEXT: s_bfe_i64 s[40:41], s[52:53], 0x10000
+; GFX1250-NEXT: s_bfe_i64 s[52:53], s[62:63], 0x10000
+; GFX1250-NEXT: s_bfe_i64 s[62:63], s[74:75], 0x10000
+; GFX1250-NEXT: s_bfe_i64 s[74:75], s[88:89], 0x10000
+; GFX1250-NEXT: s_bfe_i64 s[88:89], s[94:95], 0x10000
+; GFX1250-NEXT: s_bfe_i64 s[94:95], s[100:101], 0x10000
+; GFX1250-NEXT: s_bfe_i64 s[96:97], s[96:97], 0x10000
+; GFX1250-NEXT: v_dual_mov_b32 v11, s85 :: v_dual_mov_b32 v12, s86
+; GFX1250-NEXT: s_wait_xcnt 0x2
+; GFX1250-NEXT: v_dual_mov_b32 v13, s87 :: v_dual_mov_b32 v14, s90
+; GFX1250-NEXT: s_bfe_i64 s[78:79], s[78:79], 0x10000
+; GFX1250-NEXT: s_bfe_i64 s[92:93], s[92:93], 0x10000
+; GFX1250-NEXT: v_dual_mov_b32 v15, s91 :: v_dual_mov_b32 v16, s98
+; GFX1250-NEXT: s_wait_xcnt 0x1
+; GFX1250-NEXT: v_dual_mov_b32 v17, s99 :: v_dual_mov_b32 v18, s96
+; GFX1250-NEXT: s_bfe_i64 s[66:67], s[66:67], 0x10000
+; GFX1250-NEXT: v_dual_mov_b32 v19, s97 :: v_dual_mov_b32 v20, s94
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v21, s95 :: v_dual_mov_b32 v22, s92
+; GFX1250-NEXT: s_bfe_i64 s[56:57], s[56:57], 0x10000
+; GFX1250-NEXT: v_dual_mov_b32 v23, s93 :: v_dual_mov_b32 v24, s88
+; GFX1250-NEXT: v_mov_b32_e32 v25, s89
+; GFX1250-NEXT: s_clause 0x5
+; GFX1250-NEXT: global_store_b128 v8, v[0:3], s[0:1] offset:176
+; GFX1250-NEXT: global_store_b128 v8, v[4:7], s[0:1] offset:160
+; GFX1250-NEXT: global_store_b128 v8, v[10:13], s[0:1] offset:144
+; GFX1250-NEXT: global_store_b128 v8, v[14:17], s[0:1] offset:128
+; GFX1250-NEXT: global_store_b128 v8, v[18:21], s[0:1] offset:112
+; GFX1250-NEXT: global_store_b128 v8, v[22:25], s[0:1] offset:96
+; GFX1250-NEXT: s_wait_xcnt 0x5
+; GFX1250-NEXT: v_dual_mov_b32 v0, s78 :: v_dual_mov_b32 v1, s79
+; GFX1250-NEXT: v_dual_mov_b32 v2, s74 :: v_dual_mov_b32 v3, s75
+; GFX1250-NEXT: s_wait_xcnt 0x4
+; GFX1250-NEXT: v_mov_b32_e32 v4, s66
+; GFX1250-NEXT: s_bfe_i64 s[44:45], s[44:45], 0x10000
+; GFX1250-NEXT: v_dual_mov_b32 v5, s67 :: v_dual_mov_b32 v6, s62
+; GFX1250-NEXT: s_wait_xcnt 0x3
+; GFX1250-NEXT: v_dual_mov_b32 v7, s63 :: v_dual_mov_b32 v10, s56
+; GFX1250-NEXT: s_bfe_i64 s[30:31], s[30:31], 0x10000
+; GFX1250-NEXT: v_dual_mov_b32 v11, s57 :: v_dual_mov_b32 v12, s52
+; GFX1250-NEXT: s_wait_xcnt 0x2
+; GFX1250-NEXT: v_dual_mov_b32 v13, s53 :: v_dual_mov_b32 v14, s44
+; GFX1250-NEXT: v_dual_mov_b32 v15, s45 :: v_dual_mov_b32 v16, s40
+; GFX1250-NEXT: s_wait_xcnt 0x1
+; GFX1250-NEXT: v_dual_mov_b32 v17, s41 :: v_dual_mov_b32 v18, s30
+; GFX1250-NEXT: v_dual_mov_b32 v19, s31 :: v_dual_mov_b32 v20, s26
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v21, s27 :: v_dual_mov_b32 v22, s18
+; GFX1250-NEXT: v_dual_mov_b32 v23, s19 :: v_dual_mov_b32 v24, s10
+; GFX1250-NEXT: v_mov_b32_e32 v25, s11
+; GFX1250-NEXT: s_clause 0x5
+; GFX1250-NEXT: global_store_b128 v8, v[0:3], s[0:1] offset:80
+; GFX1250-NEXT: global_store_b128 v8, v[4:7], s[0:1] offset:64
+; GFX1250-NEXT: global_store_b128 v8, v[10:13], s[0:1] offset:48
+; GFX1250-NEXT: global_store_b128 v8, v[14:17], s[0:1] offset:32
+; GFX1250-NEXT: global_store_b128 v8, v[18:21], s[0:1] offset:16
+; GFX1250-NEXT: global_store_b128 v8, v[22:25], s[0:1]
+; GFX1250-NEXT: s_endpgm
%load = load <64 x i1>, ptr addrspace(4) %in
%ext = sext <64 x i1> %load to <64 x i64>
store <64 x i64> %ext, ptr addrspace(1) %out
diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll
index 8862cbe6391ea..6f7ee70812264 100644
--- a/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/load-constant-i32.ll
@@ -6,6 +6,7 @@
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefix=GFX9-HSA %s
; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx908 < %s | FileCheck -check-prefix=GFX9-HSA %s
; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefix=GFX1250 %s
define amdgpu_kernel void @constant_load_i32(ptr addrspace(1) %out, ptr addrspace(4) %in) #0 {
; GFX6-NOHSA-LABEL: constant_load_i32:
@@ -83,6 +84,16 @@ define amdgpu_kernel void @constant_load_i32(ptr addrspace(1) %out, ptr addrspac
; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
; GFX12-NEXT: global_store_b32 v0, v1, s[0:1]
; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: constant_load_i32:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; GFX1250-NEXT: global_store_b32 v0, v1, s[0:1]
+; GFX1250-NEXT: s_endpgm
entry:
%ld = load i32, ptr addrspace(4) %in
store i32 %ld, ptr addrspace(1) %out
@@ -170,6 +181,17 @@ define amdgpu_kernel void @constant_load_v2i32(ptr addrspace(1) %out, ptr addrsp
; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3
; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: constant_load_v2i32:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: v_mov_b32_e32 v2, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX1250-NEXT: s_endpgm
entry:
%ld = load <2 x i32>, ptr addrspace(4) %in
store <2 x i32> %ld, ptr addrspace(1) %out
@@ -268,6 +290,17 @@ define amdgpu_kernel void @constant_load_v3i32(ptr addrspace(1) %out, ptr addrsp
; GFX12-NEXT: v_dual_mov_b32 v1, s5 :: v_dual_mov_b32 v2, s6
; GFX12-NEXT: global_store_b96 v3, v[0:2], s[0:1]
; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: constant_load_v3i32:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_load_b96 s[4:6], s[2:3], 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s4
+; GFX1250-NEXT: v_dual_mov_b32 v1, s5 :: v_dual_mov_b32 v2, s6
+; GFX1250-NEXT: global_store_b96 v3, v[0:2], s[0:1]
+; GFX1250-NEXT: s_endpgm
entry:
%ld = load <3 x i32>, ptr addrspace(4) %in
store <3 x i32> %ld, ptr addrspace(1) %out
@@ -364,6 +397,18 @@ define amdgpu_kernel void @constant_load_v4i32(ptr addrspace(1) %out, ptr addrsp
; GFX12-NEXT: v_dual_mov_b32 v1, s5 :: v_dual_mov_b32 v2, s6
; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1]
; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: constant_load_v4i32:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: v_mov_b32_e32 v4, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
+; GFX1250-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GFX1250-NEXT: global_store_b128 v4, v[0:3], s[0:1]
+; GFX1250-NEXT: s_endpgm
entry:
%ld = load <4 x i32>, ptr addrspace(4) %in
store <4 x i32> %ld, ptr addrspace(1) %out
@@ -497,6 +542,22 @@ define amdgpu_kernel void @constant_load_v8i32(ptr addrspace(1) %out, ptr addrsp
; GFX12-NEXT: global_store_b128 v8, v[0:3], s[8:9] offset:16
; GFX12-NEXT: global_store_b128 v8, v[4:7], s[8:9]
; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: constant_load_v8i32:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b128 s[8:11], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_load_b256 s[0:7], s[10:11], 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v0, s4
+; GFX1250-NEXT: v_dual_mov_b32 v1, s5 :: v_dual_mov_b32 v2, s6
+; GFX1250-NEXT: v_dual_mov_b32 v3, s7 :: v_dual_mov_b32 v4, s0
+; GFX1250-NEXT: v_dual_mov_b32 v5, s1 :: v_dual_mov_b32 v6, s2
+; GFX1250-NEXT: v_mov_b32_e32 v7, s3
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b128 v8, v[0:3], s[8:9] offset:16
+; GFX1250-NEXT: global_store_b128 v8, v[4:7], s[8:9]
+; GFX1250-NEXT: s_endpgm
entry:
%ld = load <8 x i32>, ptr addrspace(4) %in
store <8 x i32> %ld, ptr addrspace(1) %out
@@ -660,6 +721,25 @@ define amdgpu_kernel void @constant_load_v9i32(ptr addrspace(1) %out, ptr addrsp
; GFX12-NEXT: global_store_b128 v8, v[0:3], s[8:9] offset:16
; GFX12-NEXT: global_store_b128 v8, v[4:7], s[8:9]
; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: constant_load_v9i32:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b128 s[8:11], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: s_load_b32 s12, s[10:11], 0x20
+; GFX1250-NEXT: s_load_b256 s[0:7], s[10:11], 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v9, s12
+; GFX1250-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX1250-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
+; GFX1250-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1
+; GFX1250-NEXT: v_dual_mov_b32 v6, s2 :: v_dual_mov_b32 v7, s3
+; GFX1250-NEXT: s_clause 0x2
+; GFX1250-NEXT: global_store_b32 v8, v9, s[8:9] offset:32
+; GFX1250-NEXT: global_store_b128 v8, v[0:3], s[8:9] offset:16
+; GFX1250-NEXT: global_store_b128 v8, v[4:7], s[8:9]
+; GFX1250-NEXT: s_endpgm
entry:
%ld = load <9 x i32>, ptr addrspace(4) %in
store <9 x i32> %ld, ptr addrspace(1) %out
@@ -829,6 +909,26 @@ define amdgpu_kernel void @constant_load_v10i32(ptr addrspace(1) %out, ptr addrs
; GFX12-NEXT: global_store_b128 v10, v[0:3], s[8:9] offset:16
; GFX12-NEXT: global_store_b128 v10, v[4:7], s[8:9]
; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: constant_load_v10i32:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b128 s[8:11], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: s_load_b64 s[12:13], s[10:11], 0x20
+; GFX1250-NEXT: s_load_b256 s[0:7], s[10:11], 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v10, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b64_e32 v[8:9], s[12:13]
+; GFX1250-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5
+; GFX1250-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7
+; GFX1250-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1
+; GFX1250-NEXT: v_dual_mov_b32 v6, s2 :: v_dual_mov_b32 v7, s3
+; GFX1250-NEXT: s_clause 0x2
+; GFX1250-NEXT: global_store_b64 v10, v[8:9], s[8:9] offset:32
+; GFX1250-NEXT: global_store_b128 v10, v[0:3], s[8:9] offset:16
+; GFX1250-NEXT: global_store_b128 v10, v[4:7], s[8:9]
+; GFX1250-NEXT: s_endpgm
entry:
%ld = load <10 x i32>, ptr addrspace(4) %in
store <10 x i32> %ld, ptr addrspace(1) %out
@@ -1009,6 +1109,26 @@ define amdgpu_kernel void @constant_load_v11i32(ptr addrspace(1) %out, ptr addrs
; GFX12-NEXT: global_store_b128 v11, v[4:7], s[8:9]
; GFX12-NEXT: global_store_b96 v11, v[8:10], s[8:9] offset:32
; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: constant_load_v11i32:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b128 s[8:11], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: s_load_b256 s[0:7], s[10:11], 0x0
+; GFX1250-NEXT: s_load_b96 s[12:14], s[10:11], 0x20
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v11, 0 :: v_dual_mov_b32 v0, s4
+; GFX1250-NEXT: v_dual_mov_b32 v1, s5 :: v_dual_mov_b32 v2, s6
+; GFX1250-NEXT: v_dual_mov_b32 v3, s7 :: v_dual_mov_b32 v4, s0
+; GFX1250-NEXT: v_dual_mov_b32 v5, s1 :: v_dual_mov_b32 v6, s2
+; GFX1250-NEXT: v_dual_mov_b32 v7, s3 :: v_dual_mov_b32 v8, s12
+; GFX1250-NEXT: v_dual_mov_b32 v9, s13 :: v_dual_mov_b32 v10, s14
+; GFX1250-NEXT: s_clause 0x2
+; GFX1250-NEXT: global_store_b128 v11, v[0:3], s[8:9] offset:16
+; GFX1250-NEXT: global_store_b128 v11, v[4:7], s[8:9]
+; GFX1250-NEXT: global_store_b96 v11, v[8:10], s[8:9] offset:32
+; GFX1250-NEXT: s_endpgm
entry:
%ld = load <11 x i32>, ptr addrspace(4) %in
store <11 x i32> %ld, ptr addrspace(1) %out
@@ -1187,6 +1307,27 @@ define amdgpu_kernel void @constant_load_v12i32(ptr addrspace(1) %out, ptr addrs
; GFX12-NEXT: global_store_b128 v12, v[4:7], s[8:9] offset:16
; GFX12-NEXT: global_store_b128 v12, v[8:11], s[8:9]
; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: constant_load_v12i32:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b128 s[8:11], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: s_load_b128 s[12:15], s[10:11], 0x20
+; GFX1250-NEXT: s_load_b256 s[0:7], s[10:11], 0x0
+; GFX1250-NEXT: v_mov_b32_e32 v12, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_mov_b64_e32 v[0:1], s[12:13]
+; GFX1250-NEXT: v_mov_b64_e32 v[2:3], s[14:15]
+; GFX1250-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5
+; GFX1250-NEXT: v_dual_mov_b32 v6, s6 :: v_dual_mov_b32 v7, s7
+; GFX1250-NEXT: v_dual_mov_b32 v8, s0 :: v_dual_mov_b32 v9, s1
+; GFX1250-NEXT: v_dual_mov_b32 v10, s2 :: v_dual_mov_b32 v11, s3
+; GFX1250-NEXT: s_clause 0x2
+; GFX1250-NEXT: global_store_b128 v12, v[0:3], s[8:9] offset:32
+; GFX1250-NEXT: global_store_b128 v12, v[4:7], s[8:9] offset:16
+; GFX1250-NEXT: global_store_b128 v12, v[8:11], s[8:9]
+; GFX1250-NEXT: s_endpgm
entry:
%ld = load <12 x i32>, ptr addrspace(4) %in
store <12 x i32> %ld, ptr addrspace(1) %out
@@ -1396,6 +1537,28 @@ define amdgpu_kernel void @constant_load_v16i32(ptr addrspace(1) %out, ptr addrs
; GFX12-NEXT: global_store_b128 v16, v[8:11], s[16:17] offset:16
; GFX12-NEXT: global_store_b128 v16, v[12:15], s[16:17]
; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: constant_load_v16i32:
+; GFX1250: ; %bb.0: ; %entry
+; GFX1250-NEXT: s_load_b128 s[16:19], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_load_b512 s[0:15], s[18:19], 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v16, 0 :: v_dual_mov_b32 v0, s12
+; GFX1250-NEXT: v_dual_mov_b32 v1, s13 :: v_dual_mov_b32 v2, s14
+; GFX1250-NEXT: v_dual_mov_b32 v3, s15 :: v_dual_mov_b32 v4, s8
+; GFX1250-NEXT: v_dual_mov_b32 v5, s9 :: v_dual_mov_b32 v6, s10
+; GFX1250-NEXT: v_dual_mov_b32 v7, s11 :: v_dual_mov_b32 v8, s4
+; GFX1250-NEXT: v_dual_mov_b32 v9, s5 :: v_dual_mov_b32 v10, s6
+; GFX1250-NEXT: v_dual_mov_b32 v11, s7 :: v_dual_mov_b32 v12, s0
+; GFX1250-NEXT: v_dual_mov_b32 v13, s1 :: v_dual_mov_b32 v14, s2
+; GFX1250-NEXT: v_mov_b32_e32 v15, s3
+; GFX1250-NEXT: s_clause 0x3
+; GFX1250-NEXT: global_store_b128 v16, v[0:3], s[16:17] offset:48
+; GFX1250-NEXT: global_store_b128 v16, v[4:7], s[16:17] offset:32
+; GFX1250-NEXT: global_store_b128 v16, v[8:11], s[16:17] offset:16
+; GFX1250-NEXT: global_store_b128 v16, v[12:15], s[16:17]
+; GFX1250-NEXT: s_endpgm
entry:
%ld = load <16 x i32>, ptr addrspace(4) %in
store <16 x i32> %ld, ptr addrspace(1) %out
@@ -1482,6 +1645,16 @@ define amdgpu_kernel void @constant_zextload_i32_to_i64(ptr addrspace(1) %out, p
; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
; GFX12-NEXT: global_store_b64 v1, v[0:1], s[0:1]
; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: constant_zextload_i32_to_i64:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
+; GFX1250-NEXT: global_store_b64 v1, v[0:1], s[0:1]
+; GFX1250-NEXT: s_endpgm
%ld = load i32, ptr addrspace(4) %in
%ext = zext i32 %ld to i64
store i64 %ext, ptr addrspace(1) %out
@@ -1576,6 +1749,19 @@ define amdgpu_kernel void @constant_sextload_i32_to_i64(ptr addrspace(1) %out, p
; GFX12-NEXT: v_mov_b32_e32 v0, s2
; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: constant_sextload_i32_to_i64:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v0, s2
+; GFX1250-NEXT: s_ashr_i32 s3, s2, 31
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: v_mov_b32_e32 v1, s3
+; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX1250-NEXT: s_endpgm
%ld = load i32, ptr addrspace(4) %in
%ext = sext i32 %ld to i64
store i64 %ext, ptr addrspace(1) %out
@@ -1662,6 +1848,16 @@ define amdgpu_kernel void @constant_zextload_v1i32_to_v1i64(ptr addrspace(1) %ou
; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
; GFX12-NEXT: global_store_b64 v1, v[0:1], s[0:1]
; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: constant_zextload_v1i32_to_v1i64:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
+; GFX1250-NEXT: global_store_b64 v1, v[0:1], s[0:1]
+; GFX1250-NEXT: s_endpgm
%ld = load <1 x i32>, ptr addrspace(4) %in
%ext = zext <1 x i32> %ld to <1 x i64>
store <1 x i64> %ext, ptr addrspace(1) %out
@@ -1756,6 +1952,19 @@ define amdgpu_kernel void @constant_sextload_v1i32_to_v1i64(ptr addrspace(1) %ou
; GFX12-NEXT: v_mov_b32_e32 v0, s2
; GFX12-NEXT: global_store_b64 v2, v[0:1], s[0:1]
; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: constant_sextload_v1i32_to_v1i64:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_load_b32 s2, s[2:3], 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v0, s2
+; GFX1250-NEXT: s_ashr_i32 s3, s2, 31
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: v_mov_b32_e32 v1, s3
+; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1]
+; GFX1250-NEXT: s_endpgm
%ld = load <1 x i32>, ptr addrspace(4) %in
%ext = sext <1 x i32> %ld to <1 x i64>
store <1 x i64> %ext, ptr addrspace(1) %out
@@ -1855,6 +2064,18 @@ define amdgpu_kernel void @constant_zextload_v2i32_to_v2i64(ptr addrspace(1) %ou
; GFX12-NEXT: v_dual_mov_b32 v2, s3 :: v_dual_mov_b32 v3, v1
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1]
; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: constant_zextload_v2i32_to_v2i64:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_dual_mov_b32 v2, s3 :: v_dual_mov_b32 v3, v1
+; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1]
+; GFX1250-NEXT: s_endpgm
%ld = load <2 x i32>, ptr addrspace(4) %in
%ext = zext <2 x i32> %ld to <2 x i64>
store <2 x i64> %ext, ptr addrspace(1) %out
@@ -1968,6 +2189,21 @@ define amdgpu_kernel void @constant_sextload_v2i32_to_v2i64(ptr addrspace(1) %ou
; GFX12-NEXT: v_mov_b32_e32 v2, s3
; GFX12-NEXT: global_store_b128 v4, v[0:3], s[0:1]
; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: constant_sextload_v2i32_to_v2i64:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_load_b64 s[2:3], s[2:3], 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v0, s2
+; GFX1250-NEXT: s_ashr_i32 s4, s3, 31
+; GFX1250-NEXT: s_ashr_i32 s5, s2, 31
+; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-NEXT: v_dual_mov_b32 v2, s3 :: v_dual_mov_b32 v1, s5
+; GFX1250-NEXT: v_mov_b32_e32 v3, s4
+; GFX1250-NEXT: global_store_b128 v4, v[0:3], s[0:1]
+; GFX1250-NEXT: s_endpgm
%ld = load <2 x i32>, ptr addrspace(4) %in
%ext = sext <2 x i32> %ld to <2 x i64>
store <2 x i64> %ext, ptr addrspace(1) %out
@@ -2099,6 +2335,21 @@ define amdgpu_kernel void @constant_zextload_v4i32_to_v4i64(ptr addrspace(1) %ou
; GFX12-NEXT: v_mov_b32_e32 v2, s5
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[0:1]
; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: constant_zextload_v4i32_to_v4i64:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, s7
+; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1] offset:16
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v2, s5
+; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[0:1]
+; GFX1250-NEXT: s_endpgm
%ld = load <4 x i32>, ptr addrspace(4) %in
%ext = zext <4 x i32> %ld to <4 x i64>
store <4 x i64> %ext, ptr addrspace(1) %out
@@ -2261,6 +2512,26 @@ define amdgpu_kernel void @constant_sextload_v4i32_to_v4i64(ptr addrspace(1) %ou
; GFX12-NEXT: global_store_b128 v8, v[0:3], s[0:1] offset:16
; GFX12-NEXT: global_store_b128 v8, v[4:7], s[0:1]
; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: constant_sextload_v4i32_to_v4i64:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_load_b128 s[4:7], s[2:3], 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v8, 0 :: v_dual_mov_b32 v0, s6
+; GFX1250-NEXT: s_ashr_i32 s8, s7, 31
+; GFX1250-NEXT: s_ashr_i32 s9, s6, 31
+; GFX1250-NEXT: s_ashr_i32 s2, s5, 31
+; GFX1250-NEXT: s_ashr_i32 s3, s4, 31
+; GFX1250-NEXT: v_dual_mov_b32 v2, s7 :: v_dual_mov_b32 v4, s4
+; GFX1250-NEXT: v_dual_mov_b32 v6, s5 :: v_dual_mov_b32 v1, s9
+; GFX1250-NEXT: v_dual_mov_b32 v3, s8 :: v_dual_mov_b32 v5, s3
+; GFX1250-NEXT: v_mov_b32_e32 v7, s2
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b128 v8, v[0:3], s[0:1] offset:16
+; GFX1250-NEXT: global_store_b128 v8, v[4:7], s[0:1]
+; GFX1250-NEXT: s_endpgm
%ld = load <4 x i32>, ptr addrspace(4) %in
%ext = sext <4 x i32> %ld to <4 x i64>
store <4 x i64> %ext, ptr addrspace(1) %out
@@ -2461,6 +2732,27 @@ define amdgpu_kernel void @constant_zextload_v8i32_to_v8i64(ptr addrspace(1) %ou
; GFX12-NEXT: v_mov_b32_e32 v2, s1
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[8:9]
; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: constant_zextload_v8i32_to_v8i64:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b128 s[8:11], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_load_b256 s[0:7], s[10:11], 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, s7
+; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:48
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v2, s5
+; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:32
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v2, s3
+; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[8:9] offset:16
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v2, s1
+; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[8:9]
+; GFX1250-NEXT: s_endpgm
%ld = load <8 x i32>, ptr addrspace(4) %in
%ext = zext <8 x i32> %ld to <8 x i64>
store <8 x i64> %ext, ptr addrspace(1) %out
@@ -2730,6 +3022,36 @@ define amdgpu_kernel void @constant_sextload_v8i32_to_v8i64(ptr addrspace(1) %ou
; GFX12-NEXT: global_store_b128 v16, v[8:11], s[0:1] offset:16
; GFX12-NEXT: global_store_b128 v16, v[12:15], s[0:1]
; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: constant_sextload_v8i32_to_v8i64:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_load_b256 s[4:11], s[2:3], 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v16, 0 :: v_dual_mov_b32 v0, s10
+; GFX1250-NEXT: s_ashr_i32 s16, s11, 31
+; GFX1250-NEXT: s_ashr_i32 s17, s10, 31
+; GFX1250-NEXT: s_ashr_i32 s14, s9, 31
+; GFX1250-NEXT: s_ashr_i32 s15, s8, 31
+; GFX1250-NEXT: s_ashr_i32 s12, s7, 31
+; GFX1250-NEXT: s_ashr_i32 s13, s6, 31
+; GFX1250-NEXT: v_dual_mov_b32 v2, s11 :: v_dual_mov_b32 v4, s8
+; GFX1250-NEXT: v_dual_mov_b32 v14, s5 :: v_dual_mov_b32 v1, s17
+; GFX1250-NEXT: v_dual_mov_b32 v3, s16 :: v_dual_mov_b32 v5, s15
+; GFX1250-NEXT: s_ashr_i32 s2, s5, 31
+; GFX1250-NEXT: s_ashr_i32 s3, s4, 31
+; GFX1250-NEXT: v_dual_mov_b32 v6, s9 :: v_dual_mov_b32 v8, s6
+; GFX1250-NEXT: v_dual_mov_b32 v7, s14 :: v_dual_mov_b32 v9, s13
+; GFX1250-NEXT: v_dual_mov_b32 v10, s7 :: v_dual_mov_b32 v12, s4
+; GFX1250-NEXT: v_dual_mov_b32 v11, s12 :: v_dual_mov_b32 v13, s3
+; GFX1250-NEXT: v_mov_b32_e32 v15, s2
+; GFX1250-NEXT: s_clause 0x3
+; GFX1250-NEXT: global_store_b128 v16, v[0:3], s[0:1] offset:48
+; GFX1250-NEXT: global_store_b128 v16, v[4:7], s[0:1] offset:32
+; GFX1250-NEXT: global_store_b128 v16, v[8:11], s[0:1] offset:16
+; GFX1250-NEXT: global_store_b128 v16, v[12:15], s[0:1]
+; GFX1250-NEXT: s_endpgm
%ld = load <8 x i32>, ptr addrspace(4) %in
%ext = sext <8 x i32> %ld to <8 x i64>
store <8 x i64> %ext, ptr addrspace(1) %out
@@ -3207,6 +3529,58 @@ define amdgpu_kernel void @constant_sextload_v16i32_to_v16i64(ptr addrspace(1) %
; GFX12-NEXT: global_store_b128 v28, v[24:27], s[16:17] offset:16
; GFX12-NEXT: global_store_b128 v28, v[0:3], s[16:17]
; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: constant_sextload_v16i32_to_v16i64:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b128 s[16:19], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_load_b512 s[0:15], s[18:19], 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v28, 0 :: v_dual_mov_b32 v0, s14
+; GFX1250-NEXT: s_ashr_i32 s28, s11, 31
+; GFX1250-NEXT: s_ashr_i32 s29, s10, 31
+; GFX1250-NEXT: s_ashr_i32 s30, s13, 31
+; GFX1250-NEXT: s_ashr_i32 s33, s15, 31
+; GFX1250-NEXT: s_ashr_i32 s34, s14, 31
+; GFX1250-NEXT: s_ashr_i32 s26, s9, 31
+; GFX1250-NEXT: s_ashr_i32 s27, s8, 31
+; GFX1250-NEXT: s_ashr_i32 s31, s12, 31
+; GFX1250-NEXT: s_ashr_i32 s24, s7, 31
+; GFX1250-NEXT: s_ashr_i32 s25, s6, 31
+; GFX1250-NEXT: v_dual_mov_b32 v2, s15 :: v_dual_mov_b32 v4, s12
+; GFX1250-NEXT: v_dual_mov_b32 v6, s13 :: v_dual_mov_b32 v8, s10
+; GFX1250-NEXT: v_dual_mov_b32 v10, s11 :: v_dual_mov_b32 v12, s8
+; GFX1250-NEXT: v_dual_mov_b32 v26, s3 :: v_dual_mov_b32 v1, s34
+; GFX1250-NEXT: v_dual_mov_b32 v3, s33 :: v_dual_mov_b32 v5, s31
+; GFX1250-NEXT: v_dual_mov_b32 v7, s30 :: v_dual_mov_b32 v9, s29
+; GFX1250-NEXT: v_dual_mov_b32 v11, s28 :: v_dual_mov_b32 v13, s27
+; GFX1250-NEXT: s_ashr_i32 s22, s5, 31
+; GFX1250-NEXT: s_ashr_i32 s23, s4, 31
+; GFX1250-NEXT: v_dual_mov_b32 v14, s9 :: v_dual_mov_b32 v16, s6
+; GFX1250-NEXT: v_dual_mov_b32 v15, s26 :: v_dual_mov_b32 v17, s25
+; GFX1250-NEXT: s_ashr_i32 s20, s3, 31
+; GFX1250-NEXT: s_ashr_i32 s21, s2, 31
+; GFX1250-NEXT: v_dual_mov_b32 v18, s7 :: v_dual_mov_b32 v20, s4
+; GFX1250-NEXT: v_dual_mov_b32 v19, s24 :: v_dual_mov_b32 v21, s23
+; GFX1250-NEXT: s_ashr_i32 s18, s1, 31
+; GFX1250-NEXT: s_ashr_i32 s19, s0, 31
+; GFX1250-NEXT: v_dual_mov_b32 v22, s5 :: v_dual_mov_b32 v24, s2
+; GFX1250-NEXT: v_dual_mov_b32 v23, s22 :: v_dual_mov_b32 v25, s21
+; GFX1250-NEXT: v_mov_b32_e32 v27, s20
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: global_store_b128 v28, v[0:3], s[16:17] offset:112
+; GFX1250-NEXT: global_store_b128 v28, v[4:7], s[16:17] offset:96
+; GFX1250-NEXT: s_wait_xcnt 0x1
+; GFX1250-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s19
+; GFX1250-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v3, s18
+; GFX1250-NEXT: s_clause 0x5
+; GFX1250-NEXT: global_store_b128 v28, v[8:11], s[16:17] offset:80
+; GFX1250-NEXT: global_store_b128 v28, v[12:15], s[16:17] offset:64
+; GFX1250-NEXT: global_store_b128 v28, v[16:19], s[16:17] offset:48
+; GFX1250-NEXT: global_store_b128 v28, v[20:23], s[16:17] offset:32
+; GFX1250-NEXT: global_store_b128 v28, v[24:27], s[16:17] offset:16
+; GFX1250-NEXT: global_store_b128 v28, v[0:3], s[16:17]
+; GFX1250-NEXT: s_endpgm
%ld = load <16 x i32>, ptr addrspace(4) %in
%ext = sext <16 x i32> %ld to <16 x i64>
store <16 x i64> %ext, ptr addrspace(1) %out
@@ -3551,6 +3925,39 @@ define amdgpu_kernel void @constant_zextload_v16i32_to_v16i64(ptr addrspace(1) %
; GFX12-NEXT: v_mov_b32_e32 v2, s1
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[16:17]
; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: constant_zextload_v16i32_to_v16i64:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b128 s[16:19], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_load_b512 s[0:15], s[18:19], 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s14
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, s15
+; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[16:17] offset:112
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v2, s13
+; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[16:17] offset:96
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, s10 :: v_dual_mov_b32 v2, s11
+; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[16:17] offset:80
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v2, s9
+; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[16:17] offset:64
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v2, s7
+; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[16:17] offset:48
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v2, s5
+; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[16:17] offset:32
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v2, s3
+; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[16:17] offset:16
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v2, s1
+; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[16:17]
+; GFX1250-NEXT: s_endpgm
%ld = load <16 x i32>, ptr addrspace(4) %in
%ext = zext <16 x i32> %ld to <16 x i64>
store <16 x i64> %ext, ptr addrspace(1) %out
@@ -4460,6 +4867,113 @@ define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(ptr addrspace(1) %
; GFX12-NEXT: global_store_b128 v24, v[16:19], s[36:37] offset:16
; GFX12-NEXT: global_store_b128 v24, v[20:23], s[36:37]
; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: constant_sextload_v32i32_to_v32i64:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b128 s[36:39], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: s_load_b512 s[0:15], s[38:39], 0x0
+; GFX1250-NEXT: s_load_b512 s[16:31], s[38:39], 0x40
+; GFX1250-NEXT: v_mov_b32_e32 v24, 0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_ashr_i32 s49, s15, 31
+; GFX1250-NEXT: s_ashr_i32 s64, s31, 31
+; GFX1250-NEXT: s_ashr_i32 s65, s30, 31
+; GFX1250-NEXT: s_ashr_i32 s62, s29, 31
+; GFX1250-NEXT: s_ashr_i32 s63, s28, 31
+; GFX1250-NEXT: s_ashr_i32 s60, s27, 31
+; GFX1250-NEXT: s_ashr_i32 s61, s26, 31
+; GFX1250-NEXT: v_dual_mov_b32 v0, s30 :: v_dual_mov_b32 v2, s31
+; GFX1250-NEXT: v_dual_mov_b32 v4, s28 :: v_dual_mov_b32 v1, s65
+; GFX1250-NEXT: v_mov_b32_e32 v3, s64
+; GFX1250-NEXT: s_ashr_i32 s58, s25, 31
+; GFX1250-NEXT: s_ashr_i32 s59, s24, 31
+; GFX1250-NEXT: v_dual_mov_b32 v6, s29 :: v_dual_mov_b32 v8, s26
+; GFX1250-NEXT: v_dual_mov_b32 v5, s63 :: v_dual_mov_b32 v7, s62
+; GFX1250-NEXT: v_dual_mov_b32 v9, s61 :: v_dual_mov_b32 v10, s27
+; GFX1250-NEXT: v_dual_mov_b32 v11, s60 :: v_dual_mov_b32 v12, s24
+; GFX1250-NEXT: s_ashr_i32 s57, s23, 31
+; GFX1250-NEXT: v_dual_mov_b32 v13, s59 :: v_dual_mov_b32 v14, s25
+; GFX1250-NEXT: v_mov_b32_e32 v15, s58
+; GFX1250-NEXT: s_ashr_i32 s24, s22, 31
+; GFX1250-NEXT: s_ashr_i32 s55, s21, 31
+; GFX1250-NEXT: s_ashr_i32 s56, s20, 31
+; GFX1250-NEXT: s_ashr_i32 s53, s19, 31
+; GFX1250-NEXT: s_ashr_i32 s54, s18, 31
+; GFX1250-NEXT: s_clause 0x3
+; GFX1250-NEXT: global_store_b128 v24, v[0:3], s[36:37] offset:240
+; GFX1250-NEXT: global_store_b128 v24, v[4:7], s[36:37] offset:224
+; GFX1250-NEXT: global_store_b128 v24, v[8:11], s[36:37] offset:208
+; GFX1250-NEXT: global_store_b128 v24, v[12:15], s[36:37] offset:192
+; GFX1250-NEXT: s_wait_xcnt 0x3
+; GFX1250-NEXT: v_dual_mov_b32 v0, s22 :: v_dual_mov_b32 v1, s24
+; GFX1250-NEXT: v_dual_mov_b32 v2, s23 :: v_dual_mov_b32 v3, s57
+; GFX1250-NEXT: s_wait_xcnt 0x2
+; GFX1250-NEXT: v_mov_b32_e32 v4, s20
+; GFX1250-NEXT: s_ashr_i32 s51, s17, 31
+; GFX1250-NEXT: s_ashr_i32 s52, s16, 31
+; GFX1250-NEXT: v_dual_mov_b32 v5, s56 :: v_dual_mov_b32 v6, s21
+; GFX1250-NEXT: s_wait_xcnt 0x1
+; GFX1250-NEXT: v_dual_mov_b32 v7, s55 :: v_dual_mov_b32 v8, s18
+; GFX1250-NEXT: s_ashr_i32 s50, s14, 31
+; GFX1250-NEXT: v_dual_mov_b32 v9, s54 :: v_dual_mov_b32 v10, s19
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v11, s53 :: v_dual_mov_b32 v12, s16
+; GFX1250-NEXT: s_ashr_i32 s45, s11, 31
+; GFX1250-NEXT: s_ashr_i32 s46, s10, 31
+; GFX1250-NEXT: s_ashr_i32 s47, s13, 31
+; GFX1250-NEXT: s_ashr_i32 s48, s12, 31
+; GFX1250-NEXT: v_dual_mov_b32 v13, s52 :: v_dual_mov_b32 v14, s17
+; GFX1250-NEXT: v_dual_mov_b32 v15, s51 :: v_dual_mov_b32 v16, s14
+; GFX1250-NEXT: s_ashr_i32 s43, s9, 31
+; GFX1250-NEXT: s_ashr_i32 s44, s8, 31
+; GFX1250-NEXT: v_dual_mov_b32 v17, s50 :: v_dual_mov_b32 v18, s15
+; GFX1250-NEXT: v_dual_mov_b32 v19, s49 :: v_dual_mov_b32 v20, s12
+; GFX1250-NEXT: s_ashr_i32 s41, s7, 31
+; GFX1250-NEXT: s_ashr_i32 s42, s6, 31
+; GFX1250-NEXT: v_dual_mov_b32 v21, s48 :: v_dual_mov_b32 v22, s13
+; GFX1250-NEXT: v_mov_b32_e32 v23, s47
+; GFX1250-NEXT: s_clause 0x5
+; GFX1250-NEXT: global_store_b128 v24, v[0:3], s[36:37] offset:176
+; GFX1250-NEXT: global_store_b128 v24, v[4:7], s[36:37] offset:160
+; GFX1250-NEXT: global_store_b128 v24, v[8:11], s[36:37] offset:144
+; GFX1250-NEXT: global_store_b128 v24, v[12:15], s[36:37] offset:128
+; GFX1250-NEXT: global_store_b128 v24, v[16:19], s[36:37] offset:112
+; GFX1250-NEXT: global_store_b128 v24, v[20:23], s[36:37] offset:96
+; GFX1250-NEXT: s_wait_xcnt 0x5
+; GFX1250-NEXT: v_dual_mov_b32 v0, s10 :: v_dual_mov_b32 v1, s46
+; GFX1250-NEXT: v_dual_mov_b32 v2, s11 :: v_dual_mov_b32 v3, s45
+; GFX1250-NEXT: s_wait_xcnt 0x4
+; GFX1250-NEXT: v_mov_b32_e32 v4, s8
+; GFX1250-NEXT: s_ashr_i32 s39, s5, 31
+; GFX1250-NEXT: s_ashr_i32 s40, s4, 31
+; GFX1250-NEXT: v_dual_mov_b32 v5, s44 :: v_dual_mov_b32 v6, s9
+; GFX1250-NEXT: s_wait_xcnt 0x3
+; GFX1250-NEXT: v_dual_mov_b32 v7, s43 :: v_dual_mov_b32 v8, s6
+; GFX1250-NEXT: s_ashr_i32 s35, s3, 31
+; GFX1250-NEXT: s_ashr_i32 s38, s2, 31
+; GFX1250-NEXT: v_dual_mov_b32 v9, s42 :: v_dual_mov_b32 v10, s7
+; GFX1250-NEXT: s_wait_xcnt 0x2
+; GFX1250-NEXT: v_dual_mov_b32 v11, s41 :: v_dual_mov_b32 v12, s4
+; GFX1250-NEXT: s_ashr_i32 s33, s1, 31
+; GFX1250-NEXT: s_ashr_i32 s34, s0, 31
+; GFX1250-NEXT: v_dual_mov_b32 v13, s40 :: v_dual_mov_b32 v14, s5
+; GFX1250-NEXT: s_wait_xcnt 0x1
+; GFX1250-NEXT: v_dual_mov_b32 v15, s39 :: v_dual_mov_b32 v16, s2
+; GFX1250-NEXT: v_dual_mov_b32 v17, s38 :: v_dual_mov_b32 v18, s3
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v19, s35 :: v_dual_mov_b32 v20, s0
+; GFX1250-NEXT: v_dual_mov_b32 v21, s34 :: v_dual_mov_b32 v22, s1
+; GFX1250-NEXT: v_mov_b32_e32 v23, s33
+; GFX1250-NEXT: s_clause 0x5
+; GFX1250-NEXT: global_store_b128 v24, v[0:3], s[36:37] offset:80
+; GFX1250-NEXT: global_store_b128 v24, v[4:7], s[36:37] offset:64
+; GFX1250-NEXT: global_store_b128 v24, v[8:11], s[36:37] offset:48
+; GFX1250-NEXT: global_store_b128 v24, v[12:15], s[36:37] offset:32
+; GFX1250-NEXT: global_store_b128 v24, v[16:19], s[36:37] offset:16
+; GFX1250-NEXT: global_store_b128 v24, v[20:23], s[36:37]
+; GFX1250-NEXT: s_endpgm
%ld = load <32 x i32>, ptr addrspace(4) %in
%ext = sext <32 x i32> %ld to <32 x i64>
store <32 x i64> %ext, ptr addrspace(1) %out
@@ -5100,6 +5614,65 @@ define amdgpu_kernel void @constant_zextload_v32i32_to_v32i64(ptr addrspace(1) %
; GFX12-NEXT: v_mov_b32_e32 v2, s1
; GFX12-NEXT: global_store_b128 v1, v[0:3], s[36:37]
; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: constant_zextload_v32i32_to_v32i64:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b128 s[36:39], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_load_b512 s[16:31], s[38:39], 0x40
+; GFX1250-NEXT: v_mov_b32_e32 v1, 0
+; GFX1250-NEXT: s_load_b512 s[0:15], s[38:39], 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-NEXT: v_dual_mov_b32 v0, s30 :: v_dual_mov_b32 v3, v1
+; GFX1250-NEXT: v_mov_b32_e32 v2, s31
+; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[36:37] offset:240
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, s28 :: v_dual_mov_b32 v2, s29
+; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[36:37] offset:224
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, s26 :: v_dual_mov_b32 v2, s27
+; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[36:37] offset:208
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, s24 :: v_dual_mov_b32 v2, s25
+; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[36:37] offset:192
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, s22 :: v_dual_mov_b32 v2, s23
+; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[36:37] offset:176
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, s20 :: v_dual_mov_b32 v2, s21
+; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[36:37] offset:160
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, s18 :: v_dual_mov_b32 v2, s19
+; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[36:37] offset:144
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, s16 :: v_dual_mov_b32 v2, s17
+; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[36:37] offset:128
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, s14 :: v_dual_mov_b32 v2, s15
+; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[36:37] offset:112
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, s12 :: v_dual_mov_b32 v2, s13
+; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[36:37] offset:96
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, s10 :: v_dual_mov_b32 v2, s11
+; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[36:37] offset:80
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v2, s9
+; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[36:37] offset:64
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v2, s7
+; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[36:37] offset:48
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v2, s5
+; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[36:37] offset:32
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v2, s3
+; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[36:37] offset:16
+; GFX1250-NEXT: s_wait_xcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v2, s1
+; GFX1250-NEXT: global_store_b128 v1, v[0:3], s[36:37]
+; GFX1250-NEXT: s_endpgm
%ld = load <32 x i32>, ptr addrspace(4) %in
%ext = zext <32 x i32> %ld to <32 x i64>
store <32 x i64> %ext, ptr addrspace(1) %out
@@ -5472,6 +6045,42 @@ define amdgpu_kernel void @constant_load_v32i32(ptr addrspace(1) %out, ptr addrs
; GFX12-NEXT: global_store_b128 v32, v[24:27], s[36:37] offset:16
; GFX12-NEXT: global_store_b128 v32, v[28:31], s[36:37]
; GFX12-NEXT: s_endpgm
+;
+; GFX1250-LABEL: constant_load_v32i32:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b128 s[36:39], s[4:5], 0x24
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: s_clause 0x1
+; GFX1250-NEXT: s_load_b512 s[16:31], s[38:39], 0x40
+; GFX1250-NEXT: s_load_b512 s[0:15], s[38:39], 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_dual_mov_b32 v32, 0 :: v_dual_mov_b32 v0, s28
+; GFX1250-NEXT: v_dual_mov_b32 v1, s29 :: v_dual_mov_b32 v2, s30
+; GFX1250-NEXT: v_dual_mov_b32 v3, s31 :: v_dual_mov_b32 v4, s24
+; GFX1250-NEXT: v_dual_mov_b32 v5, s25 :: v_dual_mov_b32 v6, s26
+; GFX1250-NEXT: v_dual_mov_b32 v7, s27 :: v_dual_mov_b32 v8, s20
+; GFX1250-NEXT: v_dual_mov_b32 v9, s21 :: v_dual_mov_b32 v10, s22
+; GFX1250-NEXT: v_dual_mov_b32 v11, s23 :: v_dual_mov_b32 v12, s16
+; GFX1250-NEXT: v_dual_mov_b32 v13, s17 :: v_dual_mov_b32 v14, s18
+; GFX1250-NEXT: v_dual_mov_b32 v15, s19 :: v_dual_mov_b32 v16, s12
+; GFX1250-NEXT: v_dual_mov_b32 v17, s13 :: v_dual_mov_b32 v18, s14
+; GFX1250-NEXT: v_dual_mov_b32 v19, s15 :: v_dual_mov_b32 v20, s8
+; GFX1250-NEXT: v_dual_mov_b32 v21, s9 :: v_dual_mov_b32 v22, s10
+; GFX1250-NEXT: v_dual_mov_b32 v23, s11 :: v_dual_mov_b32 v24, s4
+; GFX1250-NEXT: v_dual_mov_b32 v25, s5 :: v_dual_mov_b32 v26, s6
+; GFX1250-NEXT: v_dual_mov_b32 v27, s7 :: v_dual_mov_b32 v28, s0
+; GFX1250-NEXT: v_dual_mov_b32 v29, s1 :: v_dual_mov_b32 v30, s2
+; GFX1250-NEXT: v_mov_b32_e32 v31, s3
+; GFX1250-NEXT: s_clause 0x7
+; GFX1250-NEXT: global_store_b128 v32, v[0:3], s[36:37] offset:112
+; GFX1250-NEXT: global_store_b128 v32, v[4:7], s[36:37] offset:96
+; GFX1250-NEXT: global_store_b128 v32, v[8:11], s[36:37] offset:80
+; GFX1250-NEXT: global_store_b128 v32, v[12:15], s[36:37] offset:64
+; GFX1250-NEXT: global_store_b128 v32, v[16:19], s[36:37] offset:48
+; GFX1250-NEXT: global_store_b128 v32, v[20:23], s[36:37] offset:32
+; GFX1250-NEXT: global_store_b128 v32, v[24:27], s[36:37] offset:16
+; GFX1250-NEXT: global_store_b128 v32, v[28:31], s[36:37]
+; GFX1250-NEXT: s_endpgm
%ld = load <32 x i32>, ptr addrspace(4) %in
store <32 x i32> %ld, ptr addrspace(1) %out
ret void
More information about the llvm-commits
mailing list