[llvm] [NFC][AMDGPU] Auto generate check lines for `llvm/test/CodeGen/AMDGPU/packed-fp32.ll` (PR #131629)

via llvm-commits llvm-commits at lists.llvm.org
Mon Mar 17 08:36:19 PDT 2025


llvmbot wrote:


<!--LLVM PR SUMMARY COMMENT-->

@llvm/pr-subscribers-backend-amdgpu

Author: Shilei Tian (shiltian)

<details>
<summary>Changes</summary>



---

Patch is 114.50 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/131629.diff


1 File Affected:

- (modified) llvm/test/CodeGen/AMDGPU/packed-fp32.ll (+1810-193) 


``````````diff
diff --git a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll
index 2004e1eb061bf..28a995e74f7ab 100644
--- a/llvm/test/CodeGen/AMDGPU/packed-fp32.ll
+++ b/llvm/test/CodeGen/AMDGPU/packed-fp32.ll
@@ -1,13 +1,34 @@
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX900 %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,PACKED,PACKED-SDAG %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,PACKED,PACKED-GISEL %s
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,PACKED,PACKED-SDAG %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,PACKED,PACKED-GISEL %s
-
-; GCN-LABEL: {{^}}fadd_v2_vv:
-; GFX900-COUNT-2: v_add_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-; PACKED:         v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}]
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX900 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=PACKED,PACKED-SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=PACKED,PACKED-GISEL %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=PACKED,PACKED-SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=PACKED,PACKED-GISEL %s
+
 define amdgpu_kernel void @fadd_v2_vv(ptr addrspace(1) %a) {
+; GFX900-LABEL: fadd_v2_vv:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX900-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
+; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX900-NEXT:    global_load_dwordx2 v[0:1], v2, s[0:1]
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    v_add_f32_e32 v1, v1, v1
+; GFX900-NEXT:    v_add_f32_e32 v0, v0, v0
+; GFX900-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX900-NEXT:    s_endpgm
+;
+; PACKED-LABEL: fadd_v2_vv:
+; PACKED:       ; %bb.0:
+; PACKED-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; PACKED-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; PACKED-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
+; PACKED-NEXT:    s_waitcnt lgkmcnt(0)
+; PACKED-NEXT:    global_load_dwordx2 v[0:1], v2, s[0:1]
+; PACKED-NEXT:    s_waitcnt vmcnt(0)
+; PACKED-NEXT:    v_pk_add_f32 v[0:1], v[0:1], v[0:1]
+; PACKED-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; PACKED-NEXT:    s_endpgm
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
   %load = load <2 x float>, ptr addrspace(1) %gep, align 8
@@ -16,10 +37,30 @@ define amdgpu_kernel void @fadd_v2_vv(ptr addrspace(1) %a) {
   ret void
 }
 
-; GCN-LABEL: {{^}}fadd_v2_vs:
-; GFX900-COUNT-2: v_add_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
-; PACKED:         v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}]{{$}}
 define amdgpu_kernel void @fadd_v2_vs(ptr addrspace(1) %a, <2 x float> %x) {
+; GFX900-LABEL: fadd_v2_vs:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; GFX900-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
+; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX900-NEXT:    global_load_dwordx2 v[0:1], v2, s[0:1]
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    v_add_f32_e32 v1, s3, v1
+; GFX900-NEXT:    v_add_f32_e32 v0, s2, v0
+; GFX900-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX900-NEXT:    s_endpgm
+;
+; PACKED-LABEL: fadd_v2_vs:
+; PACKED:       ; %bb.0:
+; PACKED-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x24
+; PACKED-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; PACKED-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
+; PACKED-NEXT:    s_waitcnt lgkmcnt(0)
+; PACKED-NEXT:    global_load_dwordx2 v[0:1], v2, s[0:1]
+; PACKED-NEXT:    s_waitcnt vmcnt(0)
+; PACKED-NEXT:    v_pk_add_f32 v[0:1], v[0:1], s[2:3]
+; PACKED-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; PACKED-NEXT:    s_endpgm
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
   %load = load <2 x float>, ptr addrspace(1) %gep, align 8
@@ -28,10 +69,49 @@ define amdgpu_kernel void @fadd_v2_vs(ptr addrspace(1) %a, <2 x float> %x) {
   ret void
 }
 
-; GCN-LABEL: {{^}}fadd_v4_vs:
-; GFX900-COUNT-4: v_add_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
-; PACKED-COUNT-2: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}]{{$}}
 define amdgpu_kernel void @fadd_v4_vs(ptr addrspace(1) %a, <4 x float> %x) {
+; GFX900-LABEL: fadd_v4_vs:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
+; GFX900-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x34
+; GFX900-NEXT:    v_lshlrev_b32_e32 v4, 4, v0
+; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX900-NEXT:    global_load_dwordx4 v[0:3], v4, s[6:7]
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    v_add_f32_e32 v3, s3, v3
+; GFX900-NEXT:    v_add_f32_e32 v2, s2, v2
+; GFX900-NEXT:    v_add_f32_e32 v1, s1, v1
+; GFX900-NEXT:    v_add_f32_e32 v0, s0, v0
+; GFX900-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7]
+; GFX900-NEXT:    s_endpgm
+;
+; PACKED-SDAG-LABEL: fadd_v4_vs:
+; PACKED-SDAG:       ; %bb.0:
+; PACKED-SDAG-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
+; PACKED-SDAG-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x34
+; PACKED-SDAG-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; PACKED-SDAG-NEXT:    v_lshlrev_b32_e32 v4, 4, v0
+; PACKED-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; PACKED-SDAG-NEXT:    global_load_dwordx4 v[0:3], v4, s[6:7]
+; PACKED-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; PACKED-SDAG-NEXT:    v_pk_add_f32 v[2:3], v[2:3], s[2:3]
+; PACKED-SDAG-NEXT:    v_pk_add_f32 v[0:1], v[0:1], s[0:1]
+; PACKED-SDAG-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7]
+; PACKED-SDAG-NEXT:    s_endpgm
+;
+; PACKED-GISEL-LABEL: fadd_v4_vs:
+; PACKED-GISEL:       ; %bb.0:
+; PACKED-GISEL-NEXT:    s_load_dwordx2 s[6:7], s[4:5], 0x24
+; PACKED-GISEL-NEXT:    s_load_dwordx4 s[0:3], s[4:5], 0x34
+; PACKED-GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; PACKED-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 4, v0
+; PACKED-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; PACKED-GISEL-NEXT:    global_load_dwordx4 v[0:3], v4, s[6:7]
+; PACKED-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; PACKED-GISEL-NEXT:    v_pk_add_f32 v[0:1], v[0:1], s[0:1]
+; PACKED-GISEL-NEXT:    v_pk_add_f32 v[2:3], v[2:3], s[2:3]
+; PACKED-GISEL-NEXT:    global_store_dwordx4 v4, v[0:3], s[6:7]
+; PACKED-GISEL-NEXT:    s_endpgm
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr inbounds <4 x float>, ptr addrspace(1) %a, i32 %id
   %load = load <4 x float>, ptr addrspace(1) %gep, align 16
@@ -40,10 +120,163 @@ define amdgpu_kernel void @fadd_v4_vs(ptr addrspace(1) %a, <4 x float> %x) {
   ret void
 }
 
-; GCN-LABEL: {{^}}fadd_v32_vs:
-; GFX900-COUNT-32: v_add_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
-; PACKED-COUNT-16: v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[{{[0-9:]+}}]{{$}}
 define amdgpu_kernel void @fadd_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
+; GFX900-LABEL: fadd_v32_vs:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX900-NEXT:    v_lshlrev_b32_e32 v0, 7, v0
+; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX900-NEXT:    global_load_dwordx4 v[1:4], v0, s[0:1] offset:16
+; GFX900-NEXT:    global_load_dwordx4 v[5:8], v0, s[0:1]
+; GFX900-NEXT:    global_load_dwordx4 v[9:12], v0, s[0:1] offset:48
+; GFX900-NEXT:    global_load_dwordx4 v[13:16], v0, s[0:1] offset:32
+; GFX900-NEXT:    global_load_dwordx4 v[17:20], v0, s[0:1] offset:80
+; GFX900-NEXT:    global_load_dwordx4 v[21:24], v0, s[0:1] offset:64
+; GFX900-NEXT:    s_load_dwordx16 s[36:51], s[4:5], 0xa4
+; GFX900-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0xe4
+; GFX900-NEXT:    global_load_dwordx4 v[25:28], v0, s[0:1] offset:112
+; GFX900-NEXT:    global_load_dwordx4 v[29:32], v0, s[0:1] offset:96
+; GFX900-NEXT:    s_waitcnt vmcnt(7) lgkmcnt(0)
+; GFX900-NEXT:    v_add_f32_e32 v4, s43, v4
+; GFX900-NEXT:    v_add_f32_e32 v3, s42, v3
+; GFX900-NEXT:    v_add_f32_e32 v2, s41, v2
+; GFX900-NEXT:    v_add_f32_e32 v1, s40, v1
+; GFX900-NEXT:    s_waitcnt vmcnt(6)
+; GFX900-NEXT:    v_add_f32_e32 v8, s39, v8
+; GFX900-NEXT:    v_add_f32_e32 v7, s38, v7
+; GFX900-NEXT:    v_add_f32_e32 v6, s37, v6
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    v_add_f32_e32 v32, s19, v32
+; GFX900-NEXT:    v_add_f32_e32 v31, s18, v31
+; GFX900-NEXT:    v_add_f32_e32 v30, s17, v30
+; GFX900-NEXT:    v_add_f32_e32 v29, s16, v29
+; GFX900-NEXT:    v_add_f32_e32 v5, s36, v5
+; GFX900-NEXT:    v_add_f32_e32 v12, s51, v12
+; GFX900-NEXT:    v_add_f32_e32 v11, s50, v11
+; GFX900-NEXT:    v_add_f32_e32 v10, s49, v10
+; GFX900-NEXT:    v_add_f32_e32 v9, s48, v9
+; GFX900-NEXT:    v_add_f32_e32 v16, s47, v16
+; GFX900-NEXT:    v_add_f32_e32 v15, s46, v15
+; GFX900-NEXT:    v_add_f32_e32 v14, s45, v14
+; GFX900-NEXT:    v_add_f32_e32 v13, s44, v13
+; GFX900-NEXT:    v_add_f32_e32 v20, s15, v20
+; GFX900-NEXT:    v_add_f32_e32 v19, s14, v19
+; GFX900-NEXT:    v_add_f32_e32 v18, s13, v18
+; GFX900-NEXT:    v_add_f32_e32 v17, s12, v17
+; GFX900-NEXT:    v_add_f32_e32 v24, s11, v24
+; GFX900-NEXT:    v_add_f32_e32 v23, s10, v23
+; GFX900-NEXT:    v_add_f32_e32 v22, s9, v22
+; GFX900-NEXT:    v_add_f32_e32 v21, s8, v21
+; GFX900-NEXT:    v_add_f32_e32 v28, s23, v28
+; GFX900-NEXT:    v_add_f32_e32 v27, s22, v27
+; GFX900-NEXT:    v_add_f32_e32 v26, s21, v26
+; GFX900-NEXT:    v_add_f32_e32 v25, s20, v25
+; GFX900-NEXT:    global_store_dwordx4 v0, v[29:32], s[0:1] offset:96
+; GFX900-NEXT:    global_store_dwordx4 v0, v[25:28], s[0:1] offset:112
+; GFX900-NEXT:    global_store_dwordx4 v0, v[21:24], s[0:1] offset:64
+; GFX900-NEXT:    global_store_dwordx4 v0, v[17:20], s[0:1] offset:80
+; GFX900-NEXT:    global_store_dwordx4 v0, v[13:16], s[0:1] offset:32
+; GFX900-NEXT:    global_store_dwordx4 v0, v[9:12], s[0:1] offset:48
+; GFX900-NEXT:    global_store_dwordx4 v0, v[5:8], s[0:1]
+; GFX900-NEXT:    global_store_dwordx4 v0, v[1:4], s[0:1] offset:16
+; GFX900-NEXT:    s_endpgm
+;
+; PACKED-SDAG-LABEL: fadd_v32_vs:
+; PACKED-SDAG:       ; %bb.0:
+; PACKED-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; PACKED-SDAG-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; PACKED-SDAG-NEXT:    v_lshlrev_b32_e32 v32, 7, v0
+; PACKED-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; PACKED-SDAG-NEXT:    global_load_dwordx4 v[0:3], v32, s[0:1] offset:16
+; PACKED-SDAG-NEXT:    global_load_dwordx4 v[4:7], v32, s[0:1]
+; PACKED-SDAG-NEXT:    global_load_dwordx4 v[8:11], v32, s[0:1] offset:48
+; PACKED-SDAG-NEXT:    global_load_dwordx4 v[16:19], v32, s[0:1] offset:32
+; PACKED-SDAG-NEXT:    global_load_dwordx4 v[20:23], v32, s[0:1] offset:80
+; PACKED-SDAG-NEXT:    global_load_dwordx4 v[12:15], v32, s[0:1] offset:64
+; PACKED-SDAG-NEXT:    global_load_dwordx4 v[24:27], v32, s[0:1] offset:112
+; PACKED-SDAG-NEXT:    global_load_dwordx4 v[28:31], v32, s[0:1] offset:96
+; PACKED-SDAG-NEXT:    s_load_dwordx16 s[36:51], s[4:5], 0xa4
+; PACKED-SDAG-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0xe4
+; PACKED-SDAG-NEXT:    s_waitcnt vmcnt(7) lgkmcnt(0)
+; PACKED-SDAG-NEXT:    v_pk_add_f32 v[0:1], v[0:1], s[40:41]
+; PACKED-SDAG-NEXT:    v_pk_add_f32 v[2:3], v[2:3], s[42:43]
+; PACKED-SDAG-NEXT:    s_waitcnt vmcnt(6)
+; PACKED-SDAG-NEXT:    v_pk_add_f32 v[6:7], v[6:7], s[38:39]
+; PACKED-SDAG-NEXT:    s_waitcnt vmcnt(5)
+; PACKED-SDAG-NEXT:    v_pk_add_f32 v[8:9], v[8:9], s[48:49]
+; PACKED-SDAG-NEXT:    v_pk_add_f32 v[10:11], v[10:11], s[50:51]
+; PACKED-SDAG-NEXT:    s_waitcnt vmcnt(4)
+; PACKED-SDAG-NEXT:    v_pk_add_f32 v[16:17], v[16:17], s[44:45]
+; PACKED-SDAG-NEXT:    v_pk_add_f32 v[18:19], v[18:19], s[46:47]
+; PACKED-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; PACKED-SDAG-NEXT:    v_pk_add_f32 v[28:29], v[28:29], s[16:17]
+; PACKED-SDAG-NEXT:    v_pk_add_f32 v[30:31], v[30:31], s[18:19]
+; PACKED-SDAG-NEXT:    v_pk_add_f32 v[20:21], v[20:21], s[12:13]
+; PACKED-SDAG-NEXT:    v_pk_add_f32 v[22:23], v[22:23], s[14:15]
+; PACKED-SDAG-NEXT:    v_pk_add_f32 v[14:15], v[14:15], s[10:11]
+; PACKED-SDAG-NEXT:    v_pk_add_f32 v[24:25], v[24:25], s[20:21]
+; PACKED-SDAG-NEXT:    v_pk_add_f32 v[26:27], v[26:27], s[22:23]
+; PACKED-SDAG-NEXT:    v_pk_add_f32 v[4:5], v[4:5], s[36:37]
+; PACKED-SDAG-NEXT:    v_pk_add_f32 v[12:13], v[12:13], s[8:9]
+; PACKED-SDAG-NEXT:    global_store_dwordx4 v32, v[28:31], s[0:1] offset:96
+; PACKED-SDAG-NEXT:    global_store_dwordx4 v32, v[24:27], s[0:1] offset:112
+; PACKED-SDAG-NEXT:    global_store_dwordx4 v32, v[12:15], s[0:1] offset:64
+; PACKED-SDAG-NEXT:    global_store_dwordx4 v32, v[20:23], s[0:1] offset:80
+; PACKED-SDAG-NEXT:    global_store_dwordx4 v32, v[16:19], s[0:1] offset:32
+; PACKED-SDAG-NEXT:    global_store_dwordx4 v32, v[8:11], s[0:1] offset:48
+; PACKED-SDAG-NEXT:    global_store_dwordx4 v32, v[4:7], s[0:1]
+; PACKED-SDAG-NEXT:    global_store_dwordx4 v32, v[0:3], s[0:1] offset:16
+; PACKED-SDAG-NEXT:    s_endpgm
+;
+; PACKED-GISEL-LABEL: fadd_v32_vs:
+; PACKED-GISEL:       ; %bb.0:
+; PACKED-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; PACKED-GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; PACKED-GISEL-NEXT:    v_lshlrev_b32_e32 v32, 7, v0
+; PACKED-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; PACKED-GISEL-NEXT:    global_load_dwordx4 v[0:3], v32, s[0:1]
+; PACKED-GISEL-NEXT:    global_load_dwordx4 v[4:7], v32, s[0:1] offset:16
+; PACKED-GISEL-NEXT:    global_load_dwordx4 v[8:11], v32, s[0:1] offset:32
+; PACKED-GISEL-NEXT:    global_load_dwordx4 v[12:15], v32, s[0:1] offset:48
+; PACKED-GISEL-NEXT:    global_load_dwordx4 v[16:19], v32, s[0:1] offset:64
+; PACKED-GISEL-NEXT:    global_load_dwordx4 v[20:23], v32, s[0:1] offset:80
+; PACKED-GISEL-NEXT:    global_load_dwordx4 v[24:27], v32, s[0:1] offset:96
+; PACKED-GISEL-NEXT:    global_load_dwordx4 v[28:31], v32, s[0:1] offset:112
+; PACKED-GISEL-NEXT:    s_load_dwordx16 s[36:51], s[4:5], 0xa4
+; PACKED-GISEL-NEXT:    s_load_dwordx16 s[8:23], s[4:5], 0xe4
+; PACKED-GISEL-NEXT:    s_waitcnt vmcnt(7) lgkmcnt(0)
+; PACKED-GISEL-NEXT:    v_pk_add_f32 v[0:1], v[0:1], s[36:37]
+; PACKED-GISEL-NEXT:    v_pk_add_f32 v[2:3], v[2:3], s[38:39]
+; PACKED-GISEL-NEXT:    s_waitcnt vmcnt(6)
+; PACKED-GISEL-NEXT:    v_pk_add_f32 v[4:5], v[4:5], s[40:41]
+; PACKED-GISEL-NEXT:    v_pk_add_f32 v[6:7], v[6:7], s[42:43]
+; PACKED-GISEL-NEXT:    s_waitcnt vmcnt(5)
+; PACKED-GISEL-NEXT:    v_pk_add_f32 v[8:9], v[8:9], s[44:45]
+; PACKED-GISEL-NEXT:    v_pk_add_f32 v[10:11], v[10:11], s[46:47]
+; PACKED-GISEL-NEXT:    s_waitcnt vmcnt(4)
+; PACKED-GISEL-NEXT:    v_pk_add_f32 v[12:13], v[12:13], s[48:49]
+; PACKED-GISEL-NEXT:    v_pk_add_f32 v[14:15], v[14:15], s[50:51]
+; PACKED-GISEL-NEXT:    s_waitcnt vmcnt(3)
+; PACKED-GISEL-NEXT:    v_pk_add_f32 v[16:17], v[16:17], s[8:9]
+; PACKED-GISEL-NEXT:    v_pk_add_f32 v[18:19], v[18:19], s[10:11]
+; PACKED-GISEL-NEXT:    s_waitcnt vmcnt(2)
+; PACKED-GISEL-NEXT:    v_pk_add_f32 v[20:21], v[20:21], s[12:13]
+; PACKED-GISEL-NEXT:    v_pk_add_f32 v[22:23], v[22:23], s[14:15]
+; PACKED-GISEL-NEXT:    s_waitcnt vmcnt(1)
+; PACKED-GISEL-NEXT:    v_pk_add_f32 v[24:25], v[24:25], s[16:17]
+; PACKED-GISEL-NEXT:    v_pk_add_f32 v[26:27], v[26:27], s[18:19]
+; PACKED-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; PACKED-GISEL-NEXT:    v_pk_add_f32 v[28:29], v[28:29], s[20:21]
+; PACKED-GISEL-NEXT:    v_pk_add_f32 v[30:31], v[30:31], s[22:23]
+; PACKED-GISEL-NEXT:    global_store_dwordx4 v32, v[0:3], s[0:1]
+; PACKED-GISEL-NEXT:    global_store_dwordx4 v32, v[4:7], s[0:1] offset:16
+; PACKED-GISEL-NEXT:    global_store_dwordx4 v32, v[8:11], s[0:1] offset:32
+; PACKED-GISEL-NEXT:    global_store_dwordx4 v32, v[12:15], s[0:1] offset:48
+; PACKED-GISEL-NEXT:    global_store_dwordx4 v32, v[16:19], s[0:1] offset:64
+; PACKED-GISEL-NEXT:    global_store_dwordx4 v32, v[20:23], s[0:1] offset:80
+; PACKED-GISEL-NEXT:    global_store_dwordx4 v32, v[24:27], s[0:1] offset:96
+; PACKED-GISEL-NEXT:    global_store_dwordx4 v32, v[28:31], s[0:1] offset:112
+; PACKED-GISEL-NEXT:    s_endpgm
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr inbounds <32 x float>, ptr addrspace(1) %a, i32 %id
   %load = load <32 x float>, ptr addrspace(1) %gep, align 128
@@ -53,13 +286,45 @@ define amdgpu_kernel void @fadd_v32_vs(ptr addrspace(1) %a, <32 x float> %x) {
 }
 
 ; FIXME: GISel does not use op_sel for splat constants.
-
-; GCN-LABEL: {{^}}fadd_v2_v_imm:
-; PACKED:         s_mov_b32 s[[K:[0-9]+]], 0x42c80000
-; GFX900-COUNT-2: v_add_f32_e32 v{{[0-9]+}}, 0x42c80000, v{{[0-9]+}}
-; PACKED-SDAG:    v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[[[K]]:{{[0-9:]+}}] op_sel_hi:[1,0]{{$}}
-; PACKED-GISEL:   v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], s[[[K]]:{{[0-9:]+}}]{{$}}
 define amdgpu_kernel void @fadd_v2_v_imm(ptr addrspace(1) %a) {
+; GFX900-LABEL: fadd_v2_v_imm:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX900-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
+; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX900-NEXT:    global_load_dwordx2 v[0:1], v2, s[0:1]
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    v_add_f32_e32 v1, 0x42c80000, v1
+; GFX900-NEXT:    v_add_f32_e32 v0, 0x42c80000, v0
+; GFX900-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; GFX900-NEXT:    s_endpgm
+;
+; PACKED-SDAG-LABEL: fadd_v2_v_imm:
+; PACKED-SDAG:       ; %bb.0:
+; PACKED-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; PACKED-SDAG-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; PACKED-SDAG-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
+; PACKED-SDAG-NEXT:    s_mov_b32 s2, 0x42c80000
+; PACKED-SDAG-NEXT:    s_waitcnt lgkmcnt(0)
+; PACKED-SDAG-NEXT:    global_load_dwordx2 v[0:1], v2, s[0:1]
+; PACKED-SDAG-NEXT:    s_waitcnt vmcnt(0)
+; PACKED-SDAG-NEXT:    v_pk_add_f32 v[0:1], v[0:1], s[2:3] op_sel_hi:[1,0]
+; PACKED-SDAG-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; PACKED-SDAG-NEXT:    s_endpgm
+;
+; PACKED-GISEL-LABEL: fadd_v2_v_imm:
+; PACKED-GISEL:       ; %bb.0:
+; PACKED-GISEL-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; PACKED-GISEL-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; PACKED-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 3, v0
+; PACKED-GISEL-NEXT:    s_mov_b32 s2, 0x42c80000
+; PACKED-GISEL-NEXT:    s_mov_b32 s3, s2
+; PACKED-GISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; PACKED-GISEL-NEXT:    global_load_dwordx2 v[0:1], v2, s[0:1]
+; PACKED-GISEL-NEXT:    s_waitcnt vmcnt(0)
+; PACKED-GISEL-NEXT:    v_pk_add_f32 v[0:1], v[0:1], s[2:3]
+; PACKED-GISEL-NEXT:    global_store_dwordx2 v2, v[0:1], s[0:1]
+; PACKED-GISEL-NEXT:    s_endpgm
   %id = tail call i32 @llvm.amdgcn.workitem.id.x()
   %gep = getelementptr inbounds <2 x float>, ptr addrspace(1) %a, i32 %id
   %load = load <2 x float>, ptr addrspace(1) %gep, align 8
@@ -68,11 +333,43 @@ define amdgpu_kernel void @fadd_v2_v_imm(ptr addrspace(1) %a) {
   ret void
 }
 
-; GCN-LABEL: {{^}}fadd_v2_v_v_splat:
-; GFX900-COUNT-2: v_add_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v0
-; PACKED-SDAG:    v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[0:1] op_sel_hi:[1,0]{{$}}
-; PACKED-GISEL:   v_pk_add_f32 v[{{[0-9:]+}}], v[{{[0-9:]+}}], v[0:1]{{$}}
 define amdgpu_kernel void @fadd_v2_v_v_splat(ptr addrspace(1) %a) {
+; GFX900-LABEL: fadd_v2_v_v_splat:
+; GFX900:       ; %bb.0:
+; GFX900-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX900-NEXT:    v_lshlrev_b32_e32 v3, 3, v0
+; GFX900-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX900-NEXT:    global_load_dwordx2 v[1:2], v3, s[0:1]
+; GFX900-NEXT:    s_waitcnt vmcnt(0)
+; GFX900-NEXT:    v_add_f32_e32 v2, v2, v0
+; GFX900-NEXT:    v_add_f32_e32 v1, v1, v0
+; GFX900-NEXT:    global_store_dwordx2 v3, v[1:2], s[0:1]
+; GFX900-NEXT:    s_endpgm
+;
+; PACKED-SDAG-LABEL: fadd_v2_v_v_splat:
+; PACKED-SDAG:       ; %bb.0:
+; PACKED-SDAG-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; PACKED-SDAG-NEXT:    v_and_b32_e32 v0, 0x3ff, v0
+; PACKED-SDAG-NEXT:    v_lshlrev_b32_e32 v4, 3, v0
...
[truncated]

``````````

</details>


https://github.com/llvm/llvm-project/pull/131629


More information about the llvm-commits mailing list