[llvm] [AMDGPU] Autogen checks for mfma-loop.ll (PR #133004)

via llvm-commits llvm-commits at lists.llvm.org
Tue Mar 25 14:51:47 PDT 2025


llvmbot wrote:


<!--LLVM PR SUMMARY COMMENT-->

@llvm/pr-subscribers-backend-amdgpu

Author: Jeffrey Byrnes (jrbyrnes)

<details>
<summary>Changes</summary>

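Autogenerate the FileCheck assertions in mfma-loop.ll with utils/update_llc_test_checks.py. This is needed for a RegisterCoalescing patch.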

---

Patch is 120.17 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/133004.diff


1 File Affected:

- (modified) llvm/test/CodeGen/AMDGPU/mfma-loop.ll (+2307-320) 


``````````diff
diff --git a/llvm/test/CodeGen/AMDGPU/mfma-loop.ll b/llvm/test/CodeGen/AMDGPU/mfma-loop.ll
index d2835f06d8e0e..d0042bb692402 100644
--- a/llvm/test/CodeGen/AMDGPU/mfma-loop.ll
+++ b/llvm/test/CodeGen/AMDGPU/mfma-loop.ll
@@ -1,28 +1,224 @@
-; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX908,GFX908_A %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX90A,GFX908_A,GFX942_A %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX942,GFX942_A %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX908 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX90A %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX942 %s
 
-; GCN-LABEL: {{^}}test_mfma_loop_zeroinit:
-
-; GCN-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
 
 ; Check that we do not copy agprs to vgprs and back inside the loop.
-
-; GCN: [[LOOP:.LBB[0-9_]+]]:
-; GCN-NOT:  v_accvgpr
-; GFX908_A: v_mfma_f32_32x32x1f32
-; GFX942:   v_mfma_f32_32x32x1_2b_f32
-; GCN-NOT:  v_accvgpr
-; GCN:      s_cbranch_scc1 [[LOOP]]
-
 ; Final result should be read only once after the loop.
 
-; GFX908-COUNT-32: v_accvgpr_read_b32
-; GFX90A-NOT:      v_accvgpr_read_b32
-; GFX908-COUNT-8:  global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}]
-; GFX90A-COUNT-8:  global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
-
 define amdgpu_kernel void @test_mfma_loop_zeroinit(ptr addrspace(1) %arg) #0 {
+; GFX908-LABEL: test_mfma_loop_zeroinit:
+; GFX908:       ; %bb.0: ; %entry
+; GFX908-NEXT:    v_accvgpr_write_b32 a31, 0
+; GFX908-NEXT:    v_accvgpr_write_b32 a30, 0
+; GFX908-NEXT:    v_accvgpr_write_b32 a29, 0
+; GFX908-NEXT:    v_accvgpr_write_b32 a28, 0
+; GFX908-NEXT:    v_accvgpr_write_b32 a27, 0
+; GFX908-NEXT:    v_accvgpr_write_b32 a26, 0
+; GFX908-NEXT:    v_accvgpr_write_b32 a25, 0
+; GFX908-NEXT:    v_accvgpr_write_b32 a24, 0
+; GFX908-NEXT:    v_accvgpr_write_b32 a23, 0
+; GFX908-NEXT:    v_accvgpr_write_b32 a22, 0
+; GFX908-NEXT:    v_accvgpr_write_b32 a21, 0
+; GFX908-NEXT:    v_accvgpr_write_b32 a20, 0
+; GFX908-NEXT:    v_accvgpr_write_b32 a19, 0
+; GFX908-NEXT:    v_accvgpr_write_b32 a18, 0
+; GFX908-NEXT:    v_accvgpr_write_b32 a17, 0
+; GFX908-NEXT:    v_accvgpr_write_b32 a16, 0
+; GFX908-NEXT:    v_accvgpr_write_b32 a15, 0
+; GFX908-NEXT:    v_accvgpr_write_b32 a14, 0
+; GFX908-NEXT:    v_accvgpr_write_b32 a13, 0
+; GFX908-NEXT:    v_accvgpr_write_b32 a12, 0
+; GFX908-NEXT:    v_accvgpr_write_b32 a11, 0
+; GFX908-NEXT:    v_accvgpr_write_b32 a10, 0
+; GFX908-NEXT:    v_accvgpr_write_b32 a9, 0
+; GFX908-NEXT:    v_accvgpr_write_b32 a8, 0
+; GFX908-NEXT:    v_accvgpr_write_b32 a7, 0
+; GFX908-NEXT:    v_accvgpr_write_b32 a6, 0
+; GFX908-NEXT:    v_accvgpr_write_b32 a5, 0
+; GFX908-NEXT:    v_accvgpr_write_b32 a4, 0
+; GFX908-NEXT:    v_accvgpr_write_b32 a3, 0
+; GFX908-NEXT:    v_accvgpr_write_b32 a2, 0
+; GFX908-NEXT:    v_accvgpr_write_b32 a1, 0
+; GFX908-NEXT:    v_accvgpr_write_b32 a0, 0
+; GFX908-NEXT:    s_mov_b32 s0, 16
+; GFX908-NEXT:    v_mov_b32_e32 v0, 2.0
+; GFX908-NEXT:    v_mov_b32_e32 v1, 1.0
+; GFX908-NEXT:  .LBB0_1: ; %for.cond.preheader
+; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT:    s_nop 1
+; GFX908-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31]
+; GFX908-NEXT:    s_add_i32 s0, s0, -1
+; GFX908-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX908-NEXT:    s_cbranch_scc1 .LBB0_1
+; GFX908-NEXT:  ; %bb.2: ; %exit
+; GFX908-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX908-NEXT:    s_nop 7
+; GFX908-NEXT:    s_nop 5
+; GFX908-NEXT:    v_accvgpr_read_b32 v0, a0
+; GFX908-NEXT:    v_accvgpr_read_b32 v28, a28
+; GFX908-NEXT:    v_accvgpr_read_b32 v29, a29
+; GFX908-NEXT:    v_accvgpr_read_b32 v30, a30
+; GFX908-NEXT:    v_accvgpr_read_b32 v31, a31
+; GFX908-NEXT:    v_mov_b32_e32 v32, 0
+; GFX908-NEXT:    v_accvgpr_read_b32 v1, a1
+; GFX908-NEXT:    v_accvgpr_read_b32 v2, a2
+; GFX908-NEXT:    v_accvgpr_read_b32 v3, a3
+; GFX908-NEXT:    v_accvgpr_read_b32 v4, a4
+; GFX908-NEXT:    v_accvgpr_read_b32 v5, a5
+; GFX908-NEXT:    v_accvgpr_read_b32 v6, a6
+; GFX908-NEXT:    v_accvgpr_read_b32 v7, a7
+; GFX908-NEXT:    v_accvgpr_read_b32 v8, a8
+; GFX908-NEXT:    v_accvgpr_read_b32 v9, a9
+; GFX908-NEXT:    v_accvgpr_read_b32 v10, a10
+; GFX908-NEXT:    v_accvgpr_read_b32 v11, a11
+; GFX908-NEXT:    v_accvgpr_read_b32 v12, a12
+; GFX908-NEXT:    v_accvgpr_read_b32 v13, a13
+; GFX908-NEXT:    v_accvgpr_read_b32 v14, a14
+; GFX908-NEXT:    v_accvgpr_read_b32 v15, a15
+; GFX908-NEXT:    v_accvgpr_read_b32 v16, a16
+; GFX908-NEXT:    v_accvgpr_read_b32 v17, a17
+; GFX908-NEXT:    v_accvgpr_read_b32 v18, a18
+; GFX908-NEXT:    v_accvgpr_read_b32 v19, a19
+; GFX908-NEXT:    v_accvgpr_read_b32 v20, a20
+; GFX908-NEXT:    v_accvgpr_read_b32 v21, a21
+; GFX908-NEXT:    v_accvgpr_read_b32 v22, a22
+; GFX908-NEXT:    v_accvgpr_read_b32 v23, a23
+; GFX908-NEXT:    v_accvgpr_read_b32 v24, a24
+; GFX908-NEXT:    v_accvgpr_read_b32 v25, a25
+; GFX908-NEXT:    v_accvgpr_read_b32 v26, a26
+; GFX908-NEXT:    v_accvgpr_read_b32 v27, a27
+; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX908-NEXT:    global_store_dwordx4 v32, v[28:31], s[0:1] offset:112
+; GFX908-NEXT:    global_store_dwordx4 v32, v[24:27], s[0:1] offset:96
+; GFX908-NEXT:    global_store_dwordx4 v32, v[20:23], s[0:1] offset:80
+; GFX908-NEXT:    global_store_dwordx4 v32, v[16:19], s[0:1] offset:64
+; GFX908-NEXT:    global_store_dwordx4 v32, v[12:15], s[0:1] offset:48
+; GFX908-NEXT:    global_store_dwordx4 v32, v[8:11], s[0:1] offset:32
+; GFX908-NEXT:    global_store_dwordx4 v32, v[4:7], s[0:1] offset:16
+; GFX908-NEXT:    global_store_dwordx4 v32, v[0:3], s[0:1]
+; GFX908-NEXT:    s_endpgm
+;
+; GFX90A-LABEL: test_mfma_loop_zeroinit:
+; GFX90A:       ; %bb.0: ; %entry
+; GFX90A-NEXT:    v_accvgpr_write_b32 a31, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a30, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a29, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a28, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a27, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a26, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a25, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a24, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a23, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a22, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a21, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a20, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a19, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a18, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a17, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a16, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a15, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a14, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a13, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a12, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a11, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a10, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a9, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a8, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a7, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a6, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a5, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a4, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a3, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a2, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a1, 0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a0, 0
+; GFX90A-NEXT:    s_mov_b32 s0, 16
+; GFX90A-NEXT:    v_mov_b32_e32 v0, 2.0
+; GFX90A-NEXT:    v_mov_b32_e32 v1, 1.0
+; GFX90A-NEXT:  .LBB0_1: ; %for.cond.preheader
+; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT:    s_nop 1
+; GFX90A-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31]
+; GFX90A-NEXT:    s_add_i32 s0, s0, -1
+; GFX90A-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX90A-NEXT:    s_cbranch_scc1 .LBB0_1
+; GFX90A-NEXT:  ; %bb.2: ; %exit
+; GFX90A-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT:    s_nop 7
+; GFX90A-NEXT:    s_nop 4
+; GFX90A-NEXT:    global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
+; GFX90A-NEXT:    global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
+; GFX90A-NEXT:    global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
+; GFX90A-NEXT:    global_store_dwordx4 v0, a[16:19], s[0:1] offset:64
+; GFX90A-NEXT:    global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
+; GFX90A-NEXT:    global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
+; GFX90A-NEXT:    global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
+; GFX90A-NEXT:    global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX90A-NEXT:    s_endpgm
+;
+; GFX942-LABEL: test_mfma_loop_zeroinit:
+; GFX942:       ; %bb.0: ; %entry
+; GFX942-NEXT:    v_accvgpr_write_b32 a31, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a30, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a29, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a28, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a27, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a26, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a25, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a24, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a23, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a22, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a21, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a20, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a19, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a18, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a17, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a16, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a15, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a14, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a13, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a12, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a11, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a10, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a9, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a8, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a7, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a6, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a5, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a4, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a3, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a2, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a1, 0
+; GFX942-NEXT:    v_accvgpr_write_b32 a0, 0
+; GFX942-NEXT:    s_mov_b32 s0, 16
+; GFX942-NEXT:    v_mov_b32_e32 v0, 2.0
+; GFX942-NEXT:    v_mov_b32_e32 v1, 1.0
+; GFX942-NEXT:  .LBB0_1: ; %for.cond.preheader
+; GFX942-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX942-NEXT:    s_nop 1
+; GFX942-NEXT:    v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v0, a[0:31]
+; GFX942-NEXT:    s_add_i32 s0, s0, -1
+; GFX942-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX942-NEXT:    s_cbranch_scc1 .LBB0_1
+; GFX942-NEXT:  ; %bb.2: ; %exit
+; GFX942-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0
+; GFX942-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX942-NEXT:    s_nop 7
+; GFX942-NEXT:    s_nop 3
+; GFX942-NEXT:    global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
+; GFX942-NEXT:    global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
+; GFX942-NEXT:    global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
+; GFX942-NEXT:    global_store_dwordx4 v0, a[16:19], s[0:1] offset:64
+; GFX942-NEXT:    global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
+; GFX942-NEXT:    global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
+; GFX942-NEXT:    global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
+; GFX942-NEXT:    global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX942-NEXT:    s_endpgm
 entry:
   br label %for.cond.preheader
 
@@ -39,28 +235,226 @@ exit:
   ret void
 }
 
-; GCN-LABEL: {{^}}test_mfma_loop_unfoldable_splat:
 
 ; Check that we do not use 32 temp vgprs, but rotate 3 vgprs only.
 ; 3 vgprs are needed to avoid wait states between writes.
 ; Check that we do not use 32 temp sgprs as well.
 
-; GCN:          v_mov_b32_e32 [[TMP:v[0-9]+]], 0x42f60000
-; GCN-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
-
-; GCN: [[LOOP:.LBB[0-9_]+]]:
-; GCN-NOT:  v_accvgpr
-; GFX908_A: v_mfma_f32_32x32x1f32
-; GFX942:   v_mfma_f32_32x32x1_2b_f32
-; GCN-NOT:  v_accvgpr
-; GCN:      s_cbranch_scc1 [[LOOP]]
-
-; GFX908-COUNT-32: v_accvgpr_read_b32
-; GFX90A-NOT:      v_accvgpr_read_b32
-; GFX908-COUNT-8:  global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}]
-; GFX90A-COUNT-8:  global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
-
 define amdgpu_kernel void @test_mfma_loop_unfoldable_splat(ptr addrspace(1) %arg) #0 {
+; GFX908-LABEL: test_mfma_loop_unfoldable_splat:
+; GFX908:       ; %bb.0: ; %entry
+; GFX908-NEXT:    v_mov_b32_e32 v0, 0x42f60000
+; GFX908-NEXT:    s_mov_b32 s0, 16
+; GFX908-NEXT:    v_mov_b32_e32 v1, 1.0
+; GFX908-NEXT:    v_accvgpr_write_b32 a31, v0
+; GFX908-NEXT:    v_accvgpr_write_b32 a30, v0
+; GFX908-NEXT:    v_accvgpr_write_b32 a29, v0
+; GFX908-NEXT:    v_accvgpr_write_b32 a28, v0
+; GFX908-NEXT:    v_accvgpr_write_b32 a27, v0
+; GFX908-NEXT:    v_accvgpr_write_b32 a26, v0
+; GFX908-NEXT:    v_accvgpr_write_b32 a25, v0
+; GFX908-NEXT:    v_accvgpr_write_b32 a24, v0
+; GFX908-NEXT:    v_accvgpr_write_b32 a23, v0
+; GFX908-NEXT:    v_accvgpr_write_b32 a22, v0
+; GFX908-NEXT:    v_accvgpr_write_b32 a21, v0
+; GFX908-NEXT:    v_accvgpr_write_b32 a20, v0
+; GFX908-NEXT:    v_accvgpr_write_b32 a19, v0
+; GFX908-NEXT:    v_accvgpr_write_b32 a18, v0
+; GFX908-NEXT:    v_accvgpr_write_b32 a17, v0
+; GFX908-NEXT:    v_accvgpr_write_b32 a16, v0
+; GFX908-NEXT:    v_accvgpr_write_b32 a15, v0
+; GFX908-NEXT:    v_accvgpr_write_b32 a14, v0
+; GFX908-NEXT:    v_accvgpr_write_b32 a13, v0
+; GFX908-NEXT:    v_accvgpr_write_b32 a12, v0
+; GFX908-NEXT:    v_accvgpr_write_b32 a11, v0
+; GFX908-NEXT:    v_accvgpr_write_b32 a10, v0
+; GFX908-NEXT:    v_accvgpr_write_b32 a9, v0
+; GFX908-NEXT:    v_accvgpr_write_b32 a8, v0
+; GFX908-NEXT:    v_accvgpr_write_b32 a7, v0
+; GFX908-NEXT:    v_accvgpr_write_b32 a6, v0
+; GFX908-NEXT:    v_accvgpr_write_b32 a5, v0
+; GFX908-NEXT:    v_accvgpr_write_b32 a4, v0
+; GFX908-NEXT:    v_accvgpr_write_b32 a3, v0
+; GFX908-NEXT:    v_accvgpr_write_b32 a2, v0
+; GFX908-NEXT:    v_accvgpr_write_b32 a1, v0
+; GFX908-NEXT:    v_accvgpr_write_b32 a0, v0
+; GFX908-NEXT:    v_mov_b32_e32 v0, 2.0
+; GFX908-NEXT:  .LBB1_1: ; %for.cond.preheader
+; GFX908-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT:    s_nop 1
+; GFX908-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31]
+; GFX908-NEXT:    s_add_i32 s0, s0, -1
+; GFX908-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX908-NEXT:    s_cbranch_scc1 .LBB1_1
+; GFX908-NEXT:  ; %bb.2: ; %exit
+; GFX908-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX908-NEXT:    s_nop 7
+; GFX908-NEXT:    s_nop 5
+; GFX908-NEXT:    v_accvgpr_read_b32 v0, a0
+; GFX908-NEXT:    v_accvgpr_read_b32 v28, a28
+; GFX908-NEXT:    v_accvgpr_read_b32 v29, a29
+; GFX908-NEXT:    v_accvgpr_read_b32 v30, a30
+; GFX908-NEXT:    v_accvgpr_read_b32 v31, a31
+; GFX908-NEXT:    v_mov_b32_e32 v32, 0
+; GFX908-NEXT:    v_accvgpr_read_b32 v1, a1
+; GFX908-NEXT:    v_accvgpr_read_b32 v2, a2
+; GFX908-NEXT:    v_accvgpr_read_b32 v3, a3
+; GFX908-NEXT:    v_accvgpr_read_b32 v4, a4
+; GFX908-NEXT:    v_accvgpr_read_b32 v5, a5
+; GFX908-NEXT:    v_accvgpr_read_b32 v6, a6
+; GFX908-NEXT:    v_accvgpr_read_b32 v7, a7
+; GFX908-NEXT:    v_accvgpr_read_b32 v8, a8
+; GFX908-NEXT:    v_accvgpr_read_b32 v9, a9
+; GFX908-NEXT:    v_accvgpr_read_b32 v10, a10
+; GFX908-NEXT:    v_accvgpr_read_b32 v11, a11
+; GFX908-NEXT:    v_accvgpr_read_b32 v12, a12
+; GFX908-NEXT:    v_accvgpr_read_b32 v13, a13
+; GFX908-NEXT:    v_accvgpr_read_b32 v14, a14
+; GFX908-NEXT:    v_accvgpr_read_b32 v15, a15
+; GFX908-NEXT:    v_accvgpr_read_b32 v16, a16
+; GFX908-NEXT:    v_accvgpr_read_b32 v17, a17
+; GFX908-NEXT:    v_accvgpr_read_b32 v18, a18
+; GFX908-NEXT:    v_accvgpr_read_b32 v19, a19
+; GFX908-NEXT:    v_accvgpr_read_b32 v20, a20
+; GFX908-NEXT:    v_accvgpr_read_b32 v21, a21
+; GFX908-NEXT:    v_accvgpr_read_b32 v22, a22
+; GFX908-NEXT:    v_accvgpr_read_b32 v23, a23
+; GFX908-NEXT:    v_accvgpr_read_b32 v24, a24
+; GFX908-NEXT:    v_accvgpr_read_b32 v25, a25
+; GFX908-NEXT:    v_accvgpr_read_b32 v26, a26
+; GFX908-NEXT:    v_accvgpr_read_b32 v27, a27
+; GFX908-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX908-NEXT:    global_store_dwordx4 v32, v[28:31], s[0:1] offset:112
+; GFX908-NEXT:    global_store_dwordx4 v32, v[24:27], s[0:1] offset:96
+; GFX908-NEXT:    global_store_dwordx4 v32, v[20:23], s[0:1] offset:80
+; GFX908-NEXT:    global_store_dwordx4 v32, v[16:19], s[0:1] offset:64
+; GFX908-NEXT:    global_store_dwordx4 v32, v[12:15], s[0:1] offset:48
+; GFX908-NEXT:    global_store_dwordx4 v32, v[8:11], s[0:1] offset:32
+; GFX908-NEXT:    global_store_dwordx4 v32, v[4:7], s[0:1] offset:16
+; GFX908-NEXT:    global_store_dwordx4 v32, v[0:3], s[0:1]
+; GFX908-NEXT:    s_endpgm
+;
+; GFX90A-LABEL: test_mfma_loop_unfoldable_splat:
+; GFX90A:       ; %bb.0: ; %entry
+; GFX90A-NEXT:    v_mov_b32_e32 v0, 0x42f60000
+; GFX90A-NEXT:    v_accvgpr_write_b32 a31, v0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a30, v0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a29, v0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a28, v0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a27, v0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a26, v0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a25, v0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a24, v0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a23, v0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a22, v0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a21, v0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a20, v0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a19, v0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a18, v0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a17, v0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a16, v0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a15, v0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a14, v0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a13, v0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a12, v0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a11, v0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a10, v0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a9, v0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a8, v0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a7, v0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a6, v0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a5, v0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a4, v0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a3, v0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a2, v0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a1, v0
+; GFX90A-NEXT:    v_accvgpr_write_b32 a0, v0
+; GFX90A-NEXT:    s_mov_b32 s0, 16
+; GFX90A-NEXT:    v_mov_b32_e32 v0, 2.0
+; GFX90A-NEXT:    v_mov_b32_e32 v1, 1.0
+; GFX90A-NEXT:  .LBB1_1: ; %for.cond.preheader
+; GFX90A-NEXT:    ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT:    s_nop 1
+; GFX90A-NEXT:    v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31]
+; GFX90A-NEXT:    s_add_i32 s0, s0, -1
+; GFX90A-NEXT:    s_cmp_lg_u32 s0, 0
+; GFX90A-NEXT:    s_cbranch_scc1 .LBB1_1
+; GFX90A-NEXT:  ; %bb.2: ; %exit
+; GFX90A-NEXT:    s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX90A-NEXT:    v_mov_b32_e32 v0, 0
+; GFX90A-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT:    s_nop 7
+; GFX90A-NEXT:    s_nop 4
+; GFX90A-NEXT:    global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
+; GFX90A-NEXT:    global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
+; GFX90A-NEXT:    global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
+; GFX90A-NEXT:    global_store_dwordx4 v0, a[16:19], s[0:1] offset:64
+; GFX90A-NEXT:    global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
+; GFX90A-NEXT:    global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
+; GFX90A-NEXT:    global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
+; GFX90A-NEXT:    global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX90A-NEXT:    s_endpgm
+;
+; GFX942-LABEL: test_mfma_loop_unfoldable_splat:
+; GFX942:       ; %bb.0: ; %entry
+; GFX942-NEXT:    v_mov_b32_e32 v0, 0x42f60000
+; GFX942-NEXT:    v_accvgpr_write_b32 a31, v0
+; GFX942-NEXT:    v_accvgpr_write_b32 a30, v0
+; GFX942-NEXT:    v_accvgpr_write_b32 a29, v0
+; GFX942-NEXT:    v_accvgpr_write_b32 a28, v0
+; GFX942-NEXT:    v_accvgpr_write_b32 a27, v0
+; GFX942-NEXT:    v_accvgpr_write_b32 a26, v0
+; GFX942-NEXT:    v_accvgpr_write_b32 a25, v0
+; GFX942-NEXT:    v_accvgpr_write_b32 a24, v0
+; GFX942-NEXT:    v_accvgpr_write_b32 a23, v0
+; GFX942-NEXT:    v_accvgpr_write_b32 a22, v0
+; GFX942-NEXT:    v_...
[truncated]

``````````

</details>


https://github.com/llvm/llvm-project/pull/133004


More information about the llvm-commits mailing list