[llvm] AMDGPU: Add packed fp32 instructions for gfx1250 (PR #150253)
via llvm-commits
llvm-commits at lists.llvm.org
Wed Jul 23 09:09:33 PDT 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-amdgpu
Author: Changpeng Fang (changpeng)
<details>
<summary>Changes</summary>
---
Patch is 104.15 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/150253.diff
4 Files Affected:
- (modified) llvm/lib/Target/AMDGPU/VOP3PInstructions.td (+4)
- (modified) llvm/test/CodeGen/AMDGPU/packed-fp32.ll (+1216)
- (modified) llvm/test/MC/AMDGPU/gfx1250_asm_vop3p.s (+244)
- (modified) llvm/test/MC/Disassembler/AMDGPU/gfx1250_dasm_vop3p.txt (+141)
``````````diff
diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
index a259e5cf4bd59..ed8e547419ceb 100644
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -2210,6 +2210,10 @@ multiclass VOP3P_Real_with_name_gfx12<bits<8> op,
defm V_PK_MIN_NUM_F16 : VOP3P_Real_with_name_gfx12<0x1b, "V_PK_MIN_F16", "v_pk_min_num_f16">;
defm V_PK_MAX_NUM_F16 : VOP3P_Real_with_name_gfx12<0x1c, "V_PK_MAX_F16", "v_pk_max_num_f16">;
+defm V_PK_FMA_F32 : VOP3P_Real_gfx12<0x1f>;
+defm V_PK_MUL_F32 : VOP3P_Real_gfx12<0x28>;
+defm V_PK_ADD_F32 : VOP3P_Real_gfx12<0x29>;
+
defm V_PK_ADD_MAX_I16 : VOP3P_Real_gfx1250<0x14>;
defm V_PK_ADD_MAX_U16 : VOP3P_Real_gfx1250<0x15>;
defm V_PK_ADD_MIN_I16 : VOP3P_Real_gfx1250<0x2d>;
diff --git a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll
index 0e1e5e4c4987c..16639948c8b12 100644
--- a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll
+++ b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll
@@ -4,6 +4,8 @@
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=PACKED,PACKED-GISEL,GFX90A-GISEL %s
; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=PACKED,PACKED-SDAG,GFX942-SDAG %s
; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=PACKED,PACKED-GISEL,GFX942-GISEL %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -enable-var-scope -check-prefixes=GFX1250,GFX1250-SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -enable-var-scope -check-prefixes=GFX1250,GFX1250-GISEL %s
define amdgpu_kernel void @fadd_v2_vv(ptr addrspace(1) %a) {
; GFX900-LABEL: fadd_v2_vv:
@@ -29,6 +31,17 @@ define amdgpu_kernel void @fadd_v2_vv(ptr addrspace(1) %a) {
; PACKED-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[0:1]
; PACKED-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; PACKED-NEXT: s_endpgm
+;
+; GFX1250-LABEL: fadd_v2_vv:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-NEXT: v_and_b32_e32 v2, 0x3ff, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_b64 v[0:1], v2, s[0:1] scale_offset
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_pk_add_f32 v[0:1], v[0:1], v[0:1]
+; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset
+; GFX1250-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
%load = load <2 x float>, ptr addrspace(1) %gep, align 8
@@ -61,6 +74,17 @@ define amdgpu_kernel void @fadd_v2_vs(ptr addrspace(1) %a, <2 x float> %x) {
; PACKED-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[2:3]
; PACKED-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; PACKED-NEXT: s_endpgm
+;
+; GFX1250-LABEL: fadd_v2_vs:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX1250-NEXT: v_and_b32_e32 v2, 0x3ff, v0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_b64 v[0:1], v2, s[0:1] scale_offset
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[2:3]
+; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset
+; GFX1250-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
%load = load <2 x float>, ptr addrspace(1) %gep, align 8
@@ -112,6 +136,34 @@ define amdgpu_kernel void @fadd_v4_vs(ptr addrspace(1) %a, <4 x float> %x) {
; PACKED-GISEL-NEXT: v_pk_add_f32 v[2:3], v[2:3], s[2:3]
; PACKED-GISEL-NEXT: global_store_dwordx4 v4, v[0:3], s[6:7]
; PACKED-GISEL-NEXT: s_endpgm
+;
+; GFX1250-SDAG-LABEL: fadd_v4_vs:
+; GFX1250-SDAG: ; %bb.0:
+; GFX1250-SDAG-NEXT: s_clause 0x1
+; GFX1250-SDAG-NEXT: s_load_b64 s[6:7], s[4:5], 0x24
+; GFX1250-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x34
+; GFX1250-SDAG-NEXT: v_and_b32_e32 v4, 0x3ff, v0
+; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX1250-SDAG-NEXT: global_load_b128 v[0:3], v4, s[6:7] scale_offset
+; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX1250-SDAG-NEXT: v_pk_add_f32 v[2:3], v[2:3], s[2:3]
+; GFX1250-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[0:1]
+; GFX1250-SDAG-NEXT: global_store_b128 v4, v[0:3], s[6:7] scale_offset
+; GFX1250-SDAG-NEXT: s_endpgm
+;
+; GFX1250-GISEL-LABEL: fadd_v4_vs:
+; GFX1250-GISEL: ; %bb.0:
+; GFX1250-GISEL-NEXT: s_clause 0x1
+; GFX1250-GISEL-NEXT: s_load_b64 s[6:7], s[4:5], 0x24
+; GFX1250-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x34
+; GFX1250-GISEL-NEXT: v_and_b32_e32 v4, 0x3ff, v0
+; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX1250-GISEL-NEXT: global_load_b128 v[0:3], v4, s[6:7] scale_offset
+; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX1250-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[0:1]
+; GFX1250-GISEL-NEXT: v_pk_add_f32 v[2:3], v[2:3], s[2:3]
+; GFX1250-GISEL-NEXT: global_store_b128 v4, v[0:3], s[6:7] scale_offset
+; GFX1250-GISEL-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <4 x float>, ptr addrspace(1) %a, i32 %id
%load = load <4 x float>, ptr addrspace(1) %gep, align 16
@@ -277,6 +329,115 @@ define amdgpu_kernel void @fadd_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
; PACKED-GISEL-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96
; PACKED-GISEL-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112
; PACKED-GISEL-NEXT: s_endpgm
+;
+; GFX1250-SDAG-LABEL: fadd_v32_vs:
+; GFX1250-SDAG: ; %bb.0:
+; GFX1250-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1250-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-SDAG-NEXT: v_lshlrev_b32_e32 v32, 7, v0
+; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX1250-SDAG-NEXT: s_clause 0x7
+; GFX1250-SDAG-NEXT: global_load_b128 v[0:3], v32, s[0:1] offset:16
+; GFX1250-SDAG-NEXT: global_load_b128 v[4:7], v32, s[0:1]
+; GFX1250-SDAG-NEXT: global_load_b128 v[8:11], v32, s[0:1] offset:48
+; GFX1250-SDAG-NEXT: global_load_b128 v[20:23], v32, s[0:1] offset:32
+; GFX1250-SDAG-NEXT: global_load_b128 v[12:15], v32, s[0:1] offset:80
+; GFX1250-SDAG-NEXT: global_load_b128 v[16:19], v32, s[0:1] offset:64
+; GFX1250-SDAG-NEXT: global_load_b128 v[24:27], v32, s[0:1] offset:112
+; GFX1250-SDAG-NEXT: global_load_b128 v[28:31], v32, s[0:1] offset:96
+; GFX1250-SDAG-NEXT: s_clause 0x1
+; GFX1250-SDAG-NEXT: s_load_b512 s[8:23], s[4:5], 0xa4
+; GFX1250-SDAG-NEXT: s_load_b512 s[36:51], s[4:5], 0xe4
+; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x7
+; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX1250-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[12:13]
+; GFX1250-SDAG-NEXT: v_pk_add_f32 v[2:3], v[2:3], s[14:15]
+; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x6
+; GFX1250-SDAG-NEXT: v_pk_add_f32 v[6:7], v[6:7], s[10:11]
+; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x4
+; GFX1250-SDAG-NEXT: v_pk_add_f32 v[20:21], v[20:21], s[16:17]
+; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x3
+; GFX1250-SDAG-NEXT: v_pk_add_f32 v[12:13], v[12:13], s[40:41]
+; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x2
+; GFX1250-SDAG-NEXT: v_pk_add_f32 v[18:19], v[18:19], s[38:39]
+; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x1
+; GFX1250-SDAG-NEXT: v_pk_add_f32 v[24:25], v[24:25], s[48:49]
+; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX1250-SDAG-NEXT: v_pk_add_f32 v[28:29], v[28:29], s[44:45]
+; GFX1250-SDAG-NEXT: v_pk_add_f32 v[30:31], v[30:31], s[46:47]
+; GFX1250-SDAG-NEXT: v_pk_add_f32 v[26:27], v[26:27], s[50:51]
+; GFX1250-SDAG-NEXT: v_pk_add_f32 v[16:17], v[16:17], s[36:37]
+; GFX1250-SDAG-NEXT: v_pk_add_f32 v[14:15], v[14:15], s[42:43]
+; GFX1250-SDAG-NEXT: v_pk_add_f32 v[22:23], v[22:23], s[18:19]
+; GFX1250-SDAG-NEXT: v_pk_add_f32 v[8:9], v[8:9], s[20:21]
+; GFX1250-SDAG-NEXT: v_pk_add_f32 v[10:11], v[10:11], s[22:23]
+; GFX1250-SDAG-NEXT: v_pk_add_f32 v[4:5], v[4:5], s[8:9]
+; GFX1250-SDAG-NEXT: s_clause 0x7
+; GFX1250-SDAG-NEXT: global_store_b128 v32, v[28:31], s[0:1] offset:96
+; GFX1250-SDAG-NEXT: global_store_b128 v32, v[24:27], s[0:1] offset:112
+; GFX1250-SDAG-NEXT: global_store_b128 v32, v[16:19], s[0:1] offset:64
+; GFX1250-SDAG-NEXT: global_store_b128 v32, v[12:15], s[0:1] offset:80
+; GFX1250-SDAG-NEXT: global_store_b128 v32, v[20:23], s[0:1] offset:32
+; GFX1250-SDAG-NEXT: global_store_b128 v32, v[8:11], s[0:1] offset:48
+; GFX1250-SDAG-NEXT: global_store_b128 v32, v[4:7], s[0:1]
+; GFX1250-SDAG-NEXT: global_store_b128 v32, v[0:3], s[0:1] offset:16
+; GFX1250-SDAG-NEXT: s_endpgm
+;
+; GFX1250-GISEL-LABEL: fadd_v32_vs:
+; GFX1250-GISEL: ; %bb.0:
+; GFX1250-GISEL-NEXT: s_load_b64 s[34:35], s[4:5], 0x24
+; GFX1250-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_lshlrev_b32_e32 v32, 7, v0
+; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX1250-GISEL-NEXT: s_clause 0x7
+; GFX1250-GISEL-NEXT: global_load_b128 v[0:3], v32, s[34:35]
+; GFX1250-GISEL-NEXT: global_load_b128 v[4:7], v32, s[34:35] offset:16
+; GFX1250-GISEL-NEXT: global_load_b128 v[8:11], v32, s[34:35] offset:32
+; GFX1250-GISEL-NEXT: global_load_b128 v[12:15], v32, s[34:35] offset:48
+; GFX1250-GISEL-NEXT: global_load_b128 v[16:19], v32, s[34:35] offset:64
+; GFX1250-GISEL-NEXT: global_load_b128 v[20:23], v32, s[34:35] offset:80
+; GFX1250-GISEL-NEXT: global_load_b128 v[24:27], v32, s[34:35] offset:96
+; GFX1250-GISEL-NEXT: global_load_b128 v[28:31], v32, s[34:35] offset:112
+; GFX1250-GISEL-NEXT: s_load_b512 s[16:31], s[4:5], 0xa4
+; GFX1250-GISEL-NEXT: s_wait_xcnt 0x0
+; GFX1250-GISEL-NEXT: s_load_b512 s[0:15], s[4:5], 0xe4
+; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x7
+; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX1250-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[16:17]
+; GFX1250-GISEL-NEXT: v_pk_add_f32 v[2:3], v[2:3], s[18:19]
+; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x6
+; GFX1250-GISEL-NEXT: v_pk_add_f32 v[4:5], v[4:5], s[20:21]
+; GFX1250-GISEL-NEXT: v_pk_add_f32 v[6:7], v[6:7], s[22:23]
+; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x5
+; GFX1250-GISEL-NEXT: v_pk_add_f32 v[8:9], v[8:9], s[24:25]
+; GFX1250-GISEL-NEXT: v_pk_add_f32 v[10:11], v[10:11], s[26:27]
+; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x4
+; GFX1250-GISEL-NEXT: v_pk_add_f32 v[12:13], v[12:13], s[28:29]
+; GFX1250-GISEL-NEXT: v_pk_add_f32 v[14:15], v[14:15], s[30:31]
+; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x3
+; GFX1250-GISEL-NEXT: v_pk_add_f32 v[16:17], v[16:17], s[0:1]
+; GFX1250-GISEL-NEXT: v_pk_add_f32 v[18:19], v[18:19], s[2:3]
+; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x2
+; GFX1250-GISEL-NEXT: v_pk_add_f32 v[20:21], v[20:21], s[4:5]
+; GFX1250-GISEL-NEXT: v_pk_add_f32 v[22:23], v[22:23], s[6:7]
+; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x1
+; GFX1250-GISEL-NEXT: v_pk_add_f32 v[24:25], v[24:25], s[8:9]
+; GFX1250-GISEL-NEXT: v_pk_add_f32 v[26:27], v[26:27], s[10:11]
+; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX1250-GISEL-NEXT: v_pk_add_f32 v[28:29], v[28:29], s[12:13]
+; GFX1250-GISEL-NEXT: v_pk_add_f32 v[30:31], v[30:31], s[14:15]
+; GFX1250-GISEL-NEXT: s_clause 0x7
+; GFX1250-GISEL-NEXT: global_store_b128 v32, v[0:3], s[34:35]
+; GFX1250-GISEL-NEXT: global_store_b128 v32, v[4:7], s[34:35] offset:16
+; GFX1250-GISEL-NEXT: global_store_b128 v32, v[8:11], s[34:35] offset:32
+; GFX1250-GISEL-NEXT: global_store_b128 v32, v[12:15], s[34:35] offset:48
+; GFX1250-GISEL-NEXT: global_store_b128 v32, v[16:19], s[34:35] offset:64
+; GFX1250-GISEL-NEXT: global_store_b128 v32, v[20:23], s[34:35] offset:80
+; GFX1250-GISEL-NEXT: global_store_b128 v32, v[24:27], s[34:35] offset:96
+; GFX1250-GISEL-NEXT: global_store_b128 v32, v[28:31], s[34:35] offset:112
+; GFX1250-GISEL-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <32 x float>, ptr addrspace(1) %a, i32 %id
%load = load <32 x float>, ptr addrspace(1) %gep, align 128
@@ -325,6 +486,32 @@ define amdgpu_kernel void @fadd_v2_v_imm(ptr addrspace(1) %a) {
; PACKED-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[2:3]
; PACKED-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; PACKED-GISEL-NEXT: s_endpgm
+;
+; GFX1250-SDAG-LABEL: fadd_v2_v_imm:
+; GFX1250-SDAG: ; %bb.0:
+; GFX1250-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-SDAG-NEXT: v_and_b32_e32 v2, 0x3ff, v0
+; GFX1250-SDAG-NEXT: s_mov_b32 s2, 0x42c80000
+; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX1250-SDAG-NEXT: global_load_b64 v[0:1], v2, s[0:1] scale_offset
+; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX1250-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[2:3] op_sel_hi:[1,0]
+; GFX1250-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset
+; GFX1250-SDAG-NEXT: s_endpgm
+;
+; GFX1250-GISEL-LABEL: fadd_v2_v_imm:
+; GFX1250-GISEL: ; %bb.0:
+; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-GISEL-NEXT: v_and_b32_e32 v2, 0x3ff, v0
+; GFX1250-GISEL-NEXT: s_mov_b32 s2, 0x42c80000
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-GISEL-NEXT: s_mov_b32 s3, s2
+; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX1250-GISEL-NEXT: global_load_b64 v[0:1], v2, s[0:1] scale_offset
+; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX1250-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[2:3]
+; GFX1250-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset
+; GFX1250-GISEL-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
%load = load <2 x float>, ptr addrspace(1) %gep, align 8
@@ -370,6 +557,30 @@ define amdgpu_kernel void @fadd_v2_v_v_splat(ptr addrspace(1) %a) {
; PACKED-GISEL-NEXT: v_pk_add_f32 v[0:1], v[2:3], v[0:1]
; PACKED-GISEL-NEXT: global_store_dwordx2 v4, v[0:1], s[0:1]
; PACKED-GISEL-NEXT: s_endpgm
+;
+; GFX1250-SDAG-LABEL: fadd_v2_v_v_splat:
+; GFX1250-SDAG: ; %bb.0:
+; GFX1250-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX1250-SDAG-NEXT: global_load_b64 v[2:3], v0, s[0:1] scale_offset
+; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX1250-SDAG-NEXT: v_pk_add_f32 v[2:3], v[2:3], v[0:1] op_sel_hi:[1,0]
+; GFX1250-SDAG-NEXT: global_store_b64 v0, v[2:3], s[0:1] scale_offset
+; GFX1250-SDAG-NEXT: s_endpgm
+;
+; GFX1250-GISEL-LABEL: fadd_v2_v_v_splat:
+; GFX1250-GISEL: ; %bb.0:
+; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1250-GISEL-NEXT: v_mov_b32_e32 v1, v0
+; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX1250-GISEL-NEXT: global_load_b64 v[2:3], v0, s[0:1] scale_offset
+; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX1250-GISEL-NEXT: v_pk_add_f32 v[2:3], v[2:3], v[0:1]
+; GFX1250-GISEL-NEXT: global_store_b64 v0, v[2:3], s[0:1] scale_offset
+; GFX1250-GISEL-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
%load = load <2 x float>, ptr addrspace(1) %gep, align 8
@@ -419,6 +630,31 @@ define amdgpu_kernel void @fadd_v2_v_lit_splat(ptr addrspace(1) %a) {
; PACKED-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[2:3]
; PACKED-GISEL-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; PACKED-GISEL-NEXT: s_endpgm
+;
+; GFX1250-SDAG-LABEL: fadd_v2_v_lit_splat:
+; GFX1250-SDAG: ; %bb.0:
+; GFX1250-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-SDAG-NEXT: v_and_b32_e32 v2, 0x3ff, v0
+; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX1250-SDAG-NEXT: global_load_b64 v[0:1], v2, s[0:1] scale_offset
+; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX1250-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], 1.0 op_sel_hi:[1,0]
+; GFX1250-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset
+; GFX1250-SDAG-NEXT: s_endpgm
+;
+; GFX1250-GISEL-LABEL: fadd_v2_v_lit_splat:
+; GFX1250-GISEL: ; %bb.0:
+; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-GISEL-NEXT: v_and_b32_e32 v2, 0x3ff, v0
+; GFX1250-GISEL-NEXT: s_mov_b32 s2, 1.0
+; GFX1250-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1250-GISEL-NEXT: s_mov_b32 s3, s2
+; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX1250-GISEL-NEXT: global_load_b64 v[0:1], v2, s[0:1] scale_offset
+; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX1250-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[2:3]
+; GFX1250-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset
+; GFX1250-GISEL-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
%load = load <2 x float>, ptr addrspace(1) %gep, align 8
@@ -452,6 +688,29 @@ define amdgpu_kernel void @fadd_v2_v_lit_hi0(ptr addrspace(1) %a) {
; PACKED-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[2:3]
; PACKED-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; PACKED-NEXT: s_endpgm
+;
+; GFX1250-SDAG-LABEL: fadd_v2_v_lit_hi0:
+; GFX1250-SDAG: ; %bb.0:
+; GFX1250-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-SDAG-NEXT: v_and_b32_e32 v2, 0x3ff, v0
+; GFX1250-SDAG-NEXT: s_wait_kmcnt 0x0
+; GFX1250-SDAG-NEXT: global_load_b64 v[0:1], v2, s[0:1] scale_offset
+; GFX1250-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX1250-SDAG-NEXT: v_pk_add_f32 v[0:1], v[0:1], 1.0
+; GFX1250-SDAG-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset
+; GFX1250-SDAG-NEXT: s_endpgm
+;
+; GFX1250-GISEL-LABEL: fadd_v2_v_lit_hi0:
+; GFX1250-GISEL: ; %bb.0:
+; GFX1250-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-GISEL-NEXT: v_and_b32_e32 v2, 0x3ff, v0
+; GFX1250-GISEL-NEXT: s_mov_b64 s[2:3], 0x3f800000
+; GFX1250-GISEL-NEXT: s_wait_kmcnt 0x0
+; GFX1250-GISEL-NEXT: global_load_b64 v[0:1], v2, s[0:1] scale_offset
+; GFX1250-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX1250-GISEL-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[2:3]
+; GFX1250-GISEL-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset
+; GFX1250-GISEL-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
%load = load <2 x float>, ptr addrspace(1) %gep, align 8
@@ -486,6 +745,18 @@ define amdgpu_kernel void @fadd_v2_v_lit_lo0(ptr addrspace(1) %a) {
; PACKED-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[2:3]
; PACKED-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; PACKED-NEXT: s_endpgm
+;
+; GFX1250-LABEL: fadd_v2_v_lit_lo0:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-NEXT: v_and_b32_e32 v2, 0x3ff, v0
+; GFX1250-NEXT: s_mov_b64 s[2:3], lit64(0x3f80000000000000)
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_b64 v[0:1], v2, s[0:1] scale_offset
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[2:3]
+; GFX1250-NEXT: global_store_b64 v2, v[0:1], s[0:1] scale_offset
+; GFX1250-NEXT: s_endpgm
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
%load = load <2 x float>, ptr addrspace(1) %gep, align 8
@@ -520,6 +791,18 @@ define amdgpu_kernel void @fadd_v2_v_unfoldable_lit(ptr addrspace(1) %a) {
; PACKED-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[2:3]
; PACKED-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1]
; PACKED-NEXT: s_endpgm
+;
+; GFX1250-LABEL: fadd_v2_v_unfoldable_lit:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
+; GFX1250-NEXT: v_and_b32_e32 v2, 0x3ff, v0
+; GFX1250-NEXT: s_mov_b64 s[2:3], lit64(0x400000003f800000)
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: global_load_b64 v[0:1], v2, s[0:1] scale_offset
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_pk_add_f32 v[0:1], v[0:1], s[2:3]
+; GFX1250-NEXT: global_store_b64 v2, ...
[truncated]
``````````
</details>
https://github.com/llvm/llvm-project/pull/150253
More information about the llvm-commits
mailing list