[llvm] [AMDGPU] Autogen checks for mfma-loop.ll (PR #133004)
Jeffrey Byrnes via llvm-commits
llvm-commits at lists.llvm.org
Tue Mar 25 14:51:14 PDT 2025
https://github.com/jrbyrnes created https://github.com/llvm/llvm-project/pull/133004
Needed for a RegisterCoalescing patch
From 30de950056aacfde20055931f8f9ec304536fa0e Mon Sep 17 00:00:00 2001
From: Jeffrey Byrnes <Jeffrey.Byrnes at amd.com>
Date: Tue, 25 Mar 2025 14:48:09 -0700
Subject: [PATCH] [AMDGPU] Autogen checks for mfma-loop.ll
Change-Id: I978311320f2f0268d9760cecfcb925e1d12328c9
---
llvm/test/CodeGen/AMDGPU/mfma-loop.ll | 2627 ++++++++++++++++++++++---
1 file changed, 2307 insertions(+), 320 deletions(-)
diff --git a/llvm/test/CodeGen/AMDGPU/mfma-loop.ll b/llvm/test/CodeGen/AMDGPU/mfma-loop.ll
index d2835f06d8e0e..d0042bb692402 100644
--- a/llvm/test/CodeGen/AMDGPU/mfma-loop.ll
+++ b/llvm/test/CodeGen/AMDGPU/mfma-loop.ll
@@ -1,28 +1,224 @@
-; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX908,GFX908_A %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX90A,GFX908_A,GFX942_A %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX942,GFX942_A %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=amdgcn -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX908 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX90A %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx942 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX942 %s
-; GCN-LABEL: {{^}}test_mfma_loop_zeroinit:
-
-; GCN-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
; Check that we do not copy agprs to vgprs and back inside the loop.
-
-; GCN: [[LOOP:.LBB[0-9_]+]]:
-; GCN-NOT: v_accvgpr
-; GFX908_A: v_mfma_f32_32x32x1f32
-; GFX942: v_mfma_f32_32x32x1_2b_f32
-; GCN-NOT: v_accvgpr
-; GCN: s_cbranch_scc1 [[LOOP]]
-
; Final result should be read only once after the loop.
-; GFX908-COUNT-32: v_accvgpr_read_b32
-; GFX90A-NOT: v_accvgpr_read_b32
-; GFX908-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}]
-; GFX90A-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
-
define amdgpu_kernel void @test_mfma_loop_zeroinit(ptr addrspace(1) %arg) #0 {
+; GFX908-LABEL: test_mfma_loop_zeroinit:
+; GFX908: ; %bb.0: ; %entry
+; GFX908-NEXT: v_accvgpr_write_b32 a31, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a30, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a29, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a28, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a27, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a26, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a25, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a24, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a23, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a22, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a21, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a20, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a19, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a18, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a17, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a16, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a15, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a14, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a13, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a12, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a11, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a10, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a9, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a8, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a7, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a6, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a5, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a4, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a3, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a2, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a1, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a0, 0
+; GFX908-NEXT: s_mov_b32 s0, 16
+; GFX908-NEXT: v_mov_b32_e32 v0, 2.0
+; GFX908-NEXT: v_mov_b32_e32 v1, 1.0
+; GFX908-NEXT: .LBB0_1: ; %for.cond.preheader
+; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31]
+; GFX908-NEXT: s_add_i32 s0, s0, -1
+; GFX908-NEXT: s_cmp_lg_u32 s0, 0
+; GFX908-NEXT: s_cbranch_scc1 .LBB0_1
+; GFX908-NEXT: ; %bb.2: ; %exit
+; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX908-NEXT: s_nop 7
+; GFX908-NEXT: s_nop 5
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a0
+; GFX908-NEXT: v_accvgpr_read_b32 v28, a28
+; GFX908-NEXT: v_accvgpr_read_b32 v29, a29
+; GFX908-NEXT: v_accvgpr_read_b32 v30, a30
+; GFX908-NEXT: v_accvgpr_read_b32 v31, a31
+; GFX908-NEXT: v_mov_b32_e32 v32, 0
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a1
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a2
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a3
+; GFX908-NEXT: v_accvgpr_read_b32 v4, a4
+; GFX908-NEXT: v_accvgpr_read_b32 v5, a5
+; GFX908-NEXT: v_accvgpr_read_b32 v6, a6
+; GFX908-NEXT: v_accvgpr_read_b32 v7, a7
+; GFX908-NEXT: v_accvgpr_read_b32 v8, a8
+; GFX908-NEXT: v_accvgpr_read_b32 v9, a9
+; GFX908-NEXT: v_accvgpr_read_b32 v10, a10
+; GFX908-NEXT: v_accvgpr_read_b32 v11, a11
+; GFX908-NEXT: v_accvgpr_read_b32 v12, a12
+; GFX908-NEXT: v_accvgpr_read_b32 v13, a13
+; GFX908-NEXT: v_accvgpr_read_b32 v14, a14
+; GFX908-NEXT: v_accvgpr_read_b32 v15, a15
+; GFX908-NEXT: v_accvgpr_read_b32 v16, a16
+; GFX908-NEXT: v_accvgpr_read_b32 v17, a17
+; GFX908-NEXT: v_accvgpr_read_b32 v18, a18
+; GFX908-NEXT: v_accvgpr_read_b32 v19, a19
+; GFX908-NEXT: v_accvgpr_read_b32 v20, a20
+; GFX908-NEXT: v_accvgpr_read_b32 v21, a21
+; GFX908-NEXT: v_accvgpr_read_b32 v22, a22
+; GFX908-NEXT: v_accvgpr_read_b32 v23, a23
+; GFX908-NEXT: v_accvgpr_read_b32 v24, a24
+; GFX908-NEXT: v_accvgpr_read_b32 v25, a25
+; GFX908-NEXT: v_accvgpr_read_b32 v26, a26
+; GFX908-NEXT: v_accvgpr_read_b32 v27, a27
+; GFX908-NEXT: s_waitcnt lgkmcnt(0)
+; GFX908-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112
+; GFX908-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96
+; GFX908-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80
+; GFX908-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64
+; GFX908-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48
+; GFX908-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32
+; GFX908-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16
+; GFX908-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1]
+; GFX908-NEXT: s_endpgm
+;
+; GFX90A-LABEL: test_mfma_loop_zeroinit:
+; GFX90A: ; %bb.0: ; %entry
+; GFX90A-NEXT: v_accvgpr_write_b32 a31, 0
+; GFX90A-NEXT: v_accvgpr_write_b32 a30, 0
+; GFX90A-NEXT: v_accvgpr_write_b32 a29, 0
+; GFX90A-NEXT: v_accvgpr_write_b32 a28, 0
+; GFX90A-NEXT: v_accvgpr_write_b32 a27, 0
+; GFX90A-NEXT: v_accvgpr_write_b32 a26, 0
+; GFX90A-NEXT: v_accvgpr_write_b32 a25, 0
+; GFX90A-NEXT: v_accvgpr_write_b32 a24, 0
+; GFX90A-NEXT: v_accvgpr_write_b32 a23, 0
+; GFX90A-NEXT: v_accvgpr_write_b32 a22, 0
+; GFX90A-NEXT: v_accvgpr_write_b32 a21, 0
+; GFX90A-NEXT: v_accvgpr_write_b32 a20, 0
+; GFX90A-NEXT: v_accvgpr_write_b32 a19, 0
+; GFX90A-NEXT: v_accvgpr_write_b32 a18, 0
+; GFX90A-NEXT: v_accvgpr_write_b32 a17, 0
+; GFX90A-NEXT: v_accvgpr_write_b32 a16, 0
+; GFX90A-NEXT: v_accvgpr_write_b32 a15, 0
+; GFX90A-NEXT: v_accvgpr_write_b32 a14, 0
+; GFX90A-NEXT: v_accvgpr_write_b32 a13, 0
+; GFX90A-NEXT: v_accvgpr_write_b32 a12, 0
+; GFX90A-NEXT: v_accvgpr_write_b32 a11, 0
+; GFX90A-NEXT: v_accvgpr_write_b32 a10, 0
+; GFX90A-NEXT: v_accvgpr_write_b32 a9, 0
+; GFX90A-NEXT: v_accvgpr_write_b32 a8, 0
+; GFX90A-NEXT: v_accvgpr_write_b32 a7, 0
+; GFX90A-NEXT: v_accvgpr_write_b32 a6, 0
+; GFX90A-NEXT: v_accvgpr_write_b32 a5, 0
+; GFX90A-NEXT: v_accvgpr_write_b32 a4, 0
+; GFX90A-NEXT: v_accvgpr_write_b32 a3, 0
+; GFX90A-NEXT: v_accvgpr_write_b32 a2, 0
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, 0
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, 0
+; GFX90A-NEXT: s_mov_b32 s0, 16
+; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0
+; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0
+; GFX90A-NEXT: .LBB0_1: ; %for.cond.preheader
+; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT: s_nop 1
+; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31]
+; GFX90A-NEXT: s_add_i32 s0, s0, -1
+; GFX90A-NEXT: s_cmp_lg_u32 s0, 0
+; GFX90A-NEXT: s_cbranch_scc1 .LBB0_1
+; GFX90A-NEXT: ; %bb.2: ; %exit
+; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: s_nop 7
+; GFX90A-NEXT: s_nop 4
+; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
+; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
+; GFX90A-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
+; GFX90A-NEXT: global_store_dwordx4 v0, a[16:19], s[0:1] offset:64
+; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
+; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
+; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX90A-NEXT: s_endpgm
+;
+; GFX942-LABEL: test_mfma_loop_zeroinit:
+; GFX942: ; %bb.0: ; %entry
+; GFX942-NEXT: v_accvgpr_write_b32 a31, 0
+; GFX942-NEXT: v_accvgpr_write_b32 a30, 0
+; GFX942-NEXT: v_accvgpr_write_b32 a29, 0
+; GFX942-NEXT: v_accvgpr_write_b32 a28, 0
+; GFX942-NEXT: v_accvgpr_write_b32 a27, 0
+; GFX942-NEXT: v_accvgpr_write_b32 a26, 0
+; GFX942-NEXT: v_accvgpr_write_b32 a25, 0
+; GFX942-NEXT: v_accvgpr_write_b32 a24, 0
+; GFX942-NEXT: v_accvgpr_write_b32 a23, 0
+; GFX942-NEXT: v_accvgpr_write_b32 a22, 0
+; GFX942-NEXT: v_accvgpr_write_b32 a21, 0
+; GFX942-NEXT: v_accvgpr_write_b32 a20, 0
+; GFX942-NEXT: v_accvgpr_write_b32 a19, 0
+; GFX942-NEXT: v_accvgpr_write_b32 a18, 0
+; GFX942-NEXT: v_accvgpr_write_b32 a17, 0
+; GFX942-NEXT: v_accvgpr_write_b32 a16, 0
+; GFX942-NEXT: v_accvgpr_write_b32 a15, 0
+; GFX942-NEXT: v_accvgpr_write_b32 a14, 0
+; GFX942-NEXT: v_accvgpr_write_b32 a13, 0
+; GFX942-NEXT: v_accvgpr_write_b32 a12, 0
+; GFX942-NEXT: v_accvgpr_write_b32 a11, 0
+; GFX942-NEXT: v_accvgpr_write_b32 a10, 0
+; GFX942-NEXT: v_accvgpr_write_b32 a9, 0
+; GFX942-NEXT: v_accvgpr_write_b32 a8, 0
+; GFX942-NEXT: v_accvgpr_write_b32 a7, 0
+; GFX942-NEXT: v_accvgpr_write_b32 a6, 0
+; GFX942-NEXT: v_accvgpr_write_b32 a5, 0
+; GFX942-NEXT: v_accvgpr_write_b32 a4, 0
+; GFX942-NEXT: v_accvgpr_write_b32 a3, 0
+; GFX942-NEXT: v_accvgpr_write_b32 a2, 0
+; GFX942-NEXT: v_accvgpr_write_b32 a1, 0
+; GFX942-NEXT: v_accvgpr_write_b32 a0, 0
+; GFX942-NEXT: s_mov_b32 s0, 16
+; GFX942-NEXT: v_mov_b32_e32 v0, 2.0
+; GFX942-NEXT: v_mov_b32_e32 v1, 1.0
+; GFX942-NEXT: .LBB0_1: ; %for.cond.preheader
+; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v0, a[0:31]
+; GFX942-NEXT: s_add_i32 s0, s0, -1
+; GFX942-NEXT: s_cmp_lg_u32 s0, 0
+; GFX942-NEXT: s_cbranch_scc1 .LBB0_1
+; GFX942-NEXT: ; %bb.2: ; %exit
+; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: s_nop 7
+; GFX942-NEXT: s_nop 3
+; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
+; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
+; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
+; GFX942-NEXT: global_store_dwordx4 v0, a[16:19], s[0:1] offset:64
+; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
+; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
+; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX942-NEXT: s_endpgm
entry:
br label %for.cond.preheader
@@ -39,28 +235,226 @@ exit:
ret void
}
-; GCN-LABEL: {{^}}test_mfma_loop_unfoldable_splat:
; Check that we do not use 32 temp vgprs, but rotate 3 vgprs only.
; 3 vgprs are needed to avoid wait states between writes.
; Check that we do not use 32 temp sgprs as well.
-; GCN: v_mov_b32_e32 [[TMP:v[0-9]+]], 0x42f60000
-; GCN-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
-
-; GCN: [[LOOP:.LBB[0-9_]+]]:
-; GCN-NOT: v_accvgpr
-; GFX908_A: v_mfma_f32_32x32x1f32
-; GFX942: v_mfma_f32_32x32x1_2b_f32
-; GCN-NOT: v_accvgpr
-; GCN: s_cbranch_scc1 [[LOOP]]
-
-; GFX908-COUNT-32: v_accvgpr_read_b32
-; GFX90A-NOT: v_accvgpr_read_b32
-; GFX908-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}]
-; GFX90A-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
-
define amdgpu_kernel void @test_mfma_loop_unfoldable_splat(ptr addrspace(1) %arg) #0 {
+; GFX908-LABEL: test_mfma_loop_unfoldable_splat:
+; GFX908: ; %bb.0: ; %entry
+; GFX908-NEXT: v_mov_b32_e32 v0, 0x42f60000
+; GFX908-NEXT: s_mov_b32 s0, 16
+; GFX908-NEXT: v_mov_b32_e32 v1, 1.0
+; GFX908-NEXT: v_accvgpr_write_b32 a31, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a30, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a29, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a28, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a27, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a26, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a25, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a24, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a23, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a22, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a21, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a20, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a19, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a18, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a17, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a16, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a15, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a14, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a13, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a12, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a11, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a10, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a9, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a8, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a7, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a6, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a5, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a4, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a3, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a2, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a1, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX908-NEXT: v_mov_b32_e32 v0, 2.0
+; GFX908-NEXT: .LBB1_1: ; %for.cond.preheader
+; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31]
+; GFX908-NEXT: s_add_i32 s0, s0, -1
+; GFX908-NEXT: s_cmp_lg_u32 s0, 0
+; GFX908-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX908-NEXT: ; %bb.2: ; %exit
+; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX908-NEXT: s_nop 7
+; GFX908-NEXT: s_nop 5
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a0
+; GFX908-NEXT: v_accvgpr_read_b32 v28, a28
+; GFX908-NEXT: v_accvgpr_read_b32 v29, a29
+; GFX908-NEXT: v_accvgpr_read_b32 v30, a30
+; GFX908-NEXT: v_accvgpr_read_b32 v31, a31
+; GFX908-NEXT: v_mov_b32_e32 v32, 0
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a1
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a2
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a3
+; GFX908-NEXT: v_accvgpr_read_b32 v4, a4
+; GFX908-NEXT: v_accvgpr_read_b32 v5, a5
+; GFX908-NEXT: v_accvgpr_read_b32 v6, a6
+; GFX908-NEXT: v_accvgpr_read_b32 v7, a7
+; GFX908-NEXT: v_accvgpr_read_b32 v8, a8
+; GFX908-NEXT: v_accvgpr_read_b32 v9, a9
+; GFX908-NEXT: v_accvgpr_read_b32 v10, a10
+; GFX908-NEXT: v_accvgpr_read_b32 v11, a11
+; GFX908-NEXT: v_accvgpr_read_b32 v12, a12
+; GFX908-NEXT: v_accvgpr_read_b32 v13, a13
+; GFX908-NEXT: v_accvgpr_read_b32 v14, a14
+; GFX908-NEXT: v_accvgpr_read_b32 v15, a15
+; GFX908-NEXT: v_accvgpr_read_b32 v16, a16
+; GFX908-NEXT: v_accvgpr_read_b32 v17, a17
+; GFX908-NEXT: v_accvgpr_read_b32 v18, a18
+; GFX908-NEXT: v_accvgpr_read_b32 v19, a19
+; GFX908-NEXT: v_accvgpr_read_b32 v20, a20
+; GFX908-NEXT: v_accvgpr_read_b32 v21, a21
+; GFX908-NEXT: v_accvgpr_read_b32 v22, a22
+; GFX908-NEXT: v_accvgpr_read_b32 v23, a23
+; GFX908-NEXT: v_accvgpr_read_b32 v24, a24
+; GFX908-NEXT: v_accvgpr_read_b32 v25, a25
+; GFX908-NEXT: v_accvgpr_read_b32 v26, a26
+; GFX908-NEXT: v_accvgpr_read_b32 v27, a27
+; GFX908-NEXT: s_waitcnt lgkmcnt(0)
+; GFX908-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112
+; GFX908-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96
+; GFX908-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80
+; GFX908-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64
+; GFX908-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48
+; GFX908-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32
+; GFX908-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16
+; GFX908-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1]
+; GFX908-NEXT: s_endpgm
+;
+; GFX90A-LABEL: test_mfma_loop_unfoldable_splat:
+; GFX90A: ; %bb.0: ; %entry
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0x42f60000
+; GFX90A-NEXT: v_accvgpr_write_b32 a31, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a30, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a29, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a28, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a27, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a26, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a25, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a24, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a23, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a22, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a21, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a20, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a19, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a18, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a17, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a16, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a15, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a14, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a13, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a12, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a11, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a10, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a9, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a8, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a7, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a6, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a5, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a4, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a3, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a2, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX90A-NEXT: s_mov_b32 s0, 16
+; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0
+; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0
+; GFX90A-NEXT: .LBB1_1: ; %for.cond.preheader
+; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT: s_nop 1
+; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31]
+; GFX90A-NEXT: s_add_i32 s0, s0, -1
+; GFX90A-NEXT: s_cmp_lg_u32 s0, 0
+; GFX90A-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX90A-NEXT: ; %bb.2: ; %exit
+; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: s_nop 7
+; GFX90A-NEXT: s_nop 4
+; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
+; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
+; GFX90A-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
+; GFX90A-NEXT: global_store_dwordx4 v0, a[16:19], s[0:1] offset:64
+; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
+; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
+; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX90A-NEXT: s_endpgm
+;
+; GFX942-LABEL: test_mfma_loop_unfoldable_splat:
+; GFX942: ; %bb.0: ; %entry
+; GFX942-NEXT: v_mov_b32_e32 v0, 0x42f60000
+; GFX942-NEXT: v_accvgpr_write_b32 a31, v0
+; GFX942-NEXT: v_accvgpr_write_b32 a30, v0
+; GFX942-NEXT: v_accvgpr_write_b32 a29, v0
+; GFX942-NEXT: v_accvgpr_write_b32 a28, v0
+; GFX942-NEXT: v_accvgpr_write_b32 a27, v0
+; GFX942-NEXT: v_accvgpr_write_b32 a26, v0
+; GFX942-NEXT: v_accvgpr_write_b32 a25, v0
+; GFX942-NEXT: v_accvgpr_write_b32 a24, v0
+; GFX942-NEXT: v_accvgpr_write_b32 a23, v0
+; GFX942-NEXT: v_accvgpr_write_b32 a22, v0
+; GFX942-NEXT: v_accvgpr_write_b32 a21, v0
+; GFX942-NEXT: v_accvgpr_write_b32 a20, v0
+; GFX942-NEXT: v_accvgpr_write_b32 a19, v0
+; GFX942-NEXT: v_accvgpr_write_b32 a18, v0
+; GFX942-NEXT: v_accvgpr_write_b32 a17, v0
+; GFX942-NEXT: v_accvgpr_write_b32 a16, v0
+; GFX942-NEXT: v_accvgpr_write_b32 a15, v0
+; GFX942-NEXT: v_accvgpr_write_b32 a14, v0
+; GFX942-NEXT: v_accvgpr_write_b32 a13, v0
+; GFX942-NEXT: v_accvgpr_write_b32 a12, v0
+; GFX942-NEXT: v_accvgpr_write_b32 a11, v0
+; GFX942-NEXT: v_accvgpr_write_b32 a10, v0
+; GFX942-NEXT: v_accvgpr_write_b32 a9, v0
+; GFX942-NEXT: v_accvgpr_write_b32 a8, v0
+; GFX942-NEXT: v_accvgpr_write_b32 a7, v0
+; GFX942-NEXT: v_accvgpr_write_b32 a6, v0
+; GFX942-NEXT: v_accvgpr_write_b32 a5, v0
+; GFX942-NEXT: v_accvgpr_write_b32 a4, v0
+; GFX942-NEXT: v_accvgpr_write_b32 a3, v0
+; GFX942-NEXT: v_accvgpr_write_b32 a2, v0
+; GFX942-NEXT: v_accvgpr_write_b32 a1, v0
+; GFX942-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX942-NEXT: s_mov_b32 s0, 16
+; GFX942-NEXT: v_mov_b32_e32 v0, 2.0
+; GFX942-NEXT: v_mov_b32_e32 v1, 1.0
+; GFX942-NEXT: .LBB1_1: ; %for.cond.preheader
+; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v0, a[0:31]
+; GFX942-NEXT: s_add_i32 s0, s0, -1
+; GFX942-NEXT: s_cmp_lg_u32 s0, 0
+; GFX942-NEXT: s_cbranch_scc1 .LBB1_1
+; GFX942-NEXT: ; %bb.2: ; %exit
+; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: s_nop 7
+; GFX942-NEXT: s_nop 3
+; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
+; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
+; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
+; GFX942-NEXT: global_store_dwordx4 v0, a[16:19], s[0:1] offset:64
+; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
+; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
+; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX942-NEXT: s_endpgm
entry:
br label %for.cond.preheader
@@ -77,23 +471,218 @@ exit:
ret void
}
-; GCN-LABEL: {{^}}test_mfma_loop_non_splat:
-; GCN: v_accvgpr_write_b32 a{{[0-9]+}}, 1.0{{$}}
-; GCN-COUNT-31: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
-
-; GCN: [[LOOP:.LBB[0-9_]+]]:
-; GCN-NOT: v_accvgpr
-; GFX908_A: v_mfma_f32_32x32x1f32
-; GFX942: v_mfma_f32_32x32x1_2b_f32
-; GCN-NOT: v_accvgpr
-; GCN: s_cbranch_scc1 [[LOOP]]
-
-; GFX908-COUNT-32: v_accvgpr_read_b32
-; GFX90A-NOT: v_accvgpr_read_b32
-; GFX908-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}]
-; GFX90A-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
-
define amdgpu_kernel void @test_mfma_loop_non_splat(ptr addrspace(1) %arg) #0 {
+; GFX908-LABEL: test_mfma_loop_non_splat:
+; GFX908: ; %bb.0: ; %entry
+; GFX908-NEXT: v_accvgpr_write_b32 a1, 1.0
+; GFX908-NEXT: v_accvgpr_write_b32 a31, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a30, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a29, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a28, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a27, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a26, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a25, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a24, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a23, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a22, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a21, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a20, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a19, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a18, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a17, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a16, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a15, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a14, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a13, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a12, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a11, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a10, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a9, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a8, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a7, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a6, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a5, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a4, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a3, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a2, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a0, 0
+; GFX908-NEXT: s_mov_b32 s0, 16
+; GFX908-NEXT: v_mov_b32_e32 v0, 1.0
+; GFX908-NEXT: v_mov_b32_e32 v1, 2.0
+; GFX908-NEXT: .LBB2_1: ; %for.cond.preheader
+; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
+; GFX908-NEXT: s_add_i32 s0, s0, -1
+; GFX908-NEXT: s_cmp_lg_u32 s0, 0
+; GFX908-NEXT: s_cbranch_scc1 .LBB2_1
+; GFX908-NEXT: ; %bb.2: ; %exit
+; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX908-NEXT: s_nop 7
+; GFX908-NEXT: s_nop 5
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a0
+; GFX908-NEXT: v_accvgpr_read_b32 v28, a28
+; GFX908-NEXT: v_accvgpr_read_b32 v29, a29
+; GFX908-NEXT: v_accvgpr_read_b32 v30, a30
+; GFX908-NEXT: v_accvgpr_read_b32 v31, a31
+; GFX908-NEXT: v_mov_b32_e32 v32, 0
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a1
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a2
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a3
+; GFX908-NEXT: v_accvgpr_read_b32 v4, a4
+; GFX908-NEXT: v_accvgpr_read_b32 v5, a5
+; GFX908-NEXT: v_accvgpr_read_b32 v6, a6
+; GFX908-NEXT: v_accvgpr_read_b32 v7, a7
+; GFX908-NEXT: v_accvgpr_read_b32 v8, a8
+; GFX908-NEXT: v_accvgpr_read_b32 v9, a9
+; GFX908-NEXT: v_accvgpr_read_b32 v10, a10
+; GFX908-NEXT: v_accvgpr_read_b32 v11, a11
+; GFX908-NEXT: v_accvgpr_read_b32 v12, a12
+; GFX908-NEXT: v_accvgpr_read_b32 v13, a13
+; GFX908-NEXT: v_accvgpr_read_b32 v14, a14
+; GFX908-NEXT: v_accvgpr_read_b32 v15, a15
+; GFX908-NEXT: v_accvgpr_read_b32 v16, a16
+; GFX908-NEXT: v_accvgpr_read_b32 v17, a17
+; GFX908-NEXT: v_accvgpr_read_b32 v18, a18
+; GFX908-NEXT: v_accvgpr_read_b32 v19, a19
+; GFX908-NEXT: v_accvgpr_read_b32 v20, a20
+; GFX908-NEXT: v_accvgpr_read_b32 v21, a21
+; GFX908-NEXT: v_accvgpr_read_b32 v22, a22
+; GFX908-NEXT: v_accvgpr_read_b32 v23, a23
+; GFX908-NEXT: v_accvgpr_read_b32 v24, a24
+; GFX908-NEXT: v_accvgpr_read_b32 v25, a25
+; GFX908-NEXT: v_accvgpr_read_b32 v26, a26
+; GFX908-NEXT: v_accvgpr_read_b32 v27, a27
+; GFX908-NEXT: s_waitcnt lgkmcnt(0)
+; GFX908-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112
+; GFX908-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96
+; GFX908-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80
+; GFX908-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64
+; GFX908-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48
+; GFX908-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32
+; GFX908-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16
+; GFX908-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1]
+; GFX908-NEXT: s_endpgm
+;
+; GFX90A-LABEL: test_mfma_loop_non_splat:
+; GFX90A: ; %bb.0: ; %entry
+; GFX90A-NEXT: v_mov_b32_e32 v0, 1.0
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, 1.0
+; GFX90A-NEXT: v_accvgpr_write_b32 a31, 0
+; GFX90A-NEXT: v_accvgpr_write_b32 a30, 0
+; GFX90A-NEXT: v_accvgpr_write_b32 a29, 0
+; GFX90A-NEXT: v_accvgpr_write_b32 a28, 0
+; GFX90A-NEXT: v_accvgpr_write_b32 a27, 0
+; GFX90A-NEXT: v_accvgpr_write_b32 a26, 0
+; GFX90A-NEXT: v_accvgpr_write_b32 a25, 0
+; GFX90A-NEXT: v_accvgpr_write_b32 a24, 0
+; GFX90A-NEXT: v_accvgpr_write_b32 a23, 0
+; GFX90A-NEXT: v_accvgpr_write_b32 a22, 0
+; GFX90A-NEXT: v_accvgpr_write_b32 a21, 0
+; GFX90A-NEXT: v_accvgpr_write_b32 a20, 0
+; GFX90A-NEXT: v_accvgpr_write_b32 a19, 0
+; GFX90A-NEXT: v_accvgpr_write_b32 a18, 0
+; GFX90A-NEXT: v_accvgpr_write_b32 a17, 0
+; GFX90A-NEXT: v_accvgpr_write_b32 a16, 0
+; GFX90A-NEXT: v_accvgpr_write_b32 a15, 0
+; GFX90A-NEXT: v_accvgpr_write_b32 a14, 0
+; GFX90A-NEXT: v_accvgpr_write_b32 a13, 0
+; GFX90A-NEXT: v_accvgpr_write_b32 a12, 0
+; GFX90A-NEXT: v_accvgpr_write_b32 a11, 0
+; GFX90A-NEXT: v_accvgpr_write_b32 a10, 0
+; GFX90A-NEXT: v_accvgpr_write_b32 a9, 0
+; GFX90A-NEXT: v_accvgpr_write_b32 a8, 0
+; GFX90A-NEXT: v_accvgpr_write_b32 a7, 0
+; GFX90A-NEXT: v_accvgpr_write_b32 a6, 0
+; GFX90A-NEXT: v_accvgpr_write_b32 a5, 0
+; GFX90A-NEXT: v_accvgpr_write_b32 a4, 0
+; GFX90A-NEXT: v_accvgpr_write_b32 a3, 0
+; GFX90A-NEXT: v_accvgpr_write_b32 a2, 0
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, 0
+; GFX90A-NEXT: s_mov_b32 s0, 16
+; GFX90A-NEXT: v_mov_b32_e32 v1, 2.0
+; GFX90A-NEXT: .LBB2_1: ; %for.cond.preheader
+; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT: s_nop 1
+; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
+; GFX90A-NEXT: s_add_i32 s0, s0, -1
+; GFX90A-NEXT: s_cmp_lg_u32 s0, 0
+; GFX90A-NEXT: s_cbranch_scc1 .LBB2_1
+; GFX90A-NEXT: ; %bb.2: ; %exit
+; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: s_nop 7
+; GFX90A-NEXT: s_nop 4
+; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
+; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
+; GFX90A-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
+; GFX90A-NEXT: global_store_dwordx4 v0, a[16:19], s[0:1] offset:64
+; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
+; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
+; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX90A-NEXT: s_endpgm
+;
+; GFX942-LABEL: test_mfma_loop_non_splat:
+; GFX942: ; %bb.0: ; %entry
+; GFX942-NEXT: v_mov_b32_e32 v0, 1.0
+; GFX942-NEXT: v_accvgpr_write_b32 a1, 1.0
+; GFX942-NEXT: v_accvgpr_write_b32 a31, 0
+; GFX942-NEXT: v_accvgpr_write_b32 a30, 0
+; GFX942-NEXT: v_accvgpr_write_b32 a29, 0
+; GFX942-NEXT: v_accvgpr_write_b32 a28, 0
+; GFX942-NEXT: v_accvgpr_write_b32 a27, 0
+; GFX942-NEXT: v_accvgpr_write_b32 a26, 0
+; GFX942-NEXT: v_accvgpr_write_b32 a25, 0
+; GFX942-NEXT: v_accvgpr_write_b32 a24, 0
+; GFX942-NEXT: v_accvgpr_write_b32 a23, 0
+; GFX942-NEXT: v_accvgpr_write_b32 a22, 0
+; GFX942-NEXT: v_accvgpr_write_b32 a21, 0
+; GFX942-NEXT: v_accvgpr_write_b32 a20, 0
+; GFX942-NEXT: v_accvgpr_write_b32 a19, 0
+; GFX942-NEXT: v_accvgpr_write_b32 a18, 0
+; GFX942-NEXT: v_accvgpr_write_b32 a17, 0
+; GFX942-NEXT: v_accvgpr_write_b32 a16, 0
+; GFX942-NEXT: v_accvgpr_write_b32 a15, 0
+; GFX942-NEXT: v_accvgpr_write_b32 a14, 0
+; GFX942-NEXT: v_accvgpr_write_b32 a13, 0
+; GFX942-NEXT: v_accvgpr_write_b32 a12, 0
+; GFX942-NEXT: v_accvgpr_write_b32 a11, 0
+; GFX942-NEXT: v_accvgpr_write_b32 a10, 0
+; GFX942-NEXT: v_accvgpr_write_b32 a9, 0
+; GFX942-NEXT: v_accvgpr_write_b32 a8, 0
+; GFX942-NEXT: v_accvgpr_write_b32 a7, 0
+; GFX942-NEXT: v_accvgpr_write_b32 a6, 0
+; GFX942-NEXT: v_accvgpr_write_b32 a5, 0
+; GFX942-NEXT: v_accvgpr_write_b32 a4, 0
+; GFX942-NEXT: v_accvgpr_write_b32 a3, 0
+; GFX942-NEXT: v_accvgpr_write_b32 a2, 0
+; GFX942-NEXT: v_accvgpr_write_b32 a0, 0
+; GFX942-NEXT: s_mov_b32 s0, 16
+; GFX942-NEXT: v_mov_b32_e32 v1, 2.0
+; GFX942-NEXT: .LBB2_1: ; %for.cond.preheader
+; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v0, v1, a[0:31]
+; GFX942-NEXT: s_add_i32 s0, s0, -1
+; GFX942-NEXT: s_cmp_lg_u32 s0, 0
+; GFX942-NEXT: s_cbranch_scc1 .LBB2_1
+; GFX942-NEXT: ; %bb.2: ; %exit
+; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: s_nop 7
+; GFX942-NEXT: s_nop 3
+; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
+; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
+; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
+; GFX942-NEXT: global_store_dwordx4 v0, a[16:19], s[0:1] offset:64
+; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
+; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
+; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX942-NEXT: s_endpgm
entry:
br label %for.cond.preheader
@@ -110,124 +699,350 @@ exit:
ret void
}
-; GCN-LABEL: {{^}}test_mfma_loop_unfoldable_seq:
-
; Check that we do not use 32 temp vgprs, but rotate 3 vgprs only.
; 3 vgprs are needed to avoid wait states between writes.
-; GFX-908: v_accvgpr_write_b32 {{[0-9]+}}, v0
-; GFX-908: v_mov_b32_e32 v0, 0x42f80000
-; GFX-908: s_nop 1
-; GFX-908: v_accvgpr_write_b32 {{[0-9]+}}, v0
-; GFX-908: v_mov_b32_e32 v0, 0x42fa0000
-; GFX-908: s_nop 1
-; GFX-908: v_accvgpr_write_b32 {{[0-9]+}}, v0
-; GFX-908: v_mov_b32_e32 v0, 0x42fc0000
-; GFX-908: s_nop 1
-; GFX-908: v_accvgpr_write_b32 {{[0-9]+}}, v0
-; GFX-908: v_mov_b32_e32 v0, 0x42fe0000
-; GFX-908: s_nop 1
-; GFX-908: v_accvgpr_write_b32 {{[0-9]+}}, v0
-; GFX-908: v_mov_b32_e32 v0, 0x43000000
-; GFX-908: s_nop 1
-; GFX-908: v_accvgpr_write_b32 {{[0-9]+}}, v0
-; GFX-908: v_mov_b32_e32 v0, 0x43010000
-; GFX-908: s_nop 1
-; GFX-908: v_accvgpr_write_b32 {{[0-9]+}}, v0
-; GFX-908: v_mov_b32_e32 v0, 0x43020000
-; GFX-908: s_nop 1
-; GFX-908: v_accvgpr_write_b32 {{[0-9]+}}, v0
-; GFX-908: v_mov_b32_e32 v0, 0x43030000
-; GFX-908: s_nop 1
-; GFX-908: v_accvgpr_write_b32 {{[0-9]+}}, v0
-; GFX-908: v_mov_b32_e32 v0, 0x43040000
-; GFX-908: s_nop 1
-; GFX-908: v_accvgpr_write_b32 {{[0-9]+}}, v0
-; GFX-908: v_mov_b32_e32 v0, 0x43050000
-; GFX-908: s_nop 1
-; GFX-908: v_accvgpr_write_b32 {{[0-9]+}}, v0
-; GFX-908: v_mov_b32_e32 v0, 0x43060000
-; GFX-908: s_nop 1
-; GFX-908: v_accvgpr_write_b32 {{[0-9]+}}, v0
-; GFX-908: v_mov_b32_e32 v0, 0x43070000
-; GFX-908: s_nop 1
-; GFX-908: v_accvgpr_write_b32 {{[0-9]+}}, v0
-; GFX-908: v_mov_b32_e32 v0, 0x43080000
-; GFX-908: s_nop 1
-; GFX-908: v_accvgpr_write_b32 {{[0-9]+}}, v0
-; GFX-908: v_mov_b32_e32 v0, 0x43090000
-; GFX-908: s_nop 1
-; GFX-908: v_accvgpr_write_b32 {{[0-9]+}}, v0
-; GFX-908: v_mov_b32_e32 v0, 0x430a0000
-; GFX-908: s_nop 1
-; GFX-908: v_accvgpr_write_b32 {{[0-9]+}}, v0
-; GFX-908: v_mov_b32_e32 v0, 0x430b0000
-; GFX-908: s_nop 1
-; GFX-908: v_accvgpr_write_b32 {{[0-9]+}}, v0
-; GFX-908: v_mov_b32_e32 v0, 0x430c0000
-; GFX-908: s_nop 1
-; GFX-908: v_accvgpr_write_b32 {{[0-9]+}}, v0
-; GFX-908: v_mov_b32_e32 v0, 0x430d0000
-; GFX-908: s_nop 1
-; GFX-908: v_accvgpr_write_b32 {{[0-9]+}}, v0
-; GFX-908: v_mov_b32_e32 v0, 0x430e0000
-; GFX-908: s_nop 1
-; GFX-908: v_accvgpr_write_b32 {{[0-9]+}}, v0
-; GFX-908: v_mov_b32_e32 v0, 0x430f0000
-; GFX-908: s_nop 1
-; GFX-908: v_accvgpr_write_b32 {{[0-9]+}}, v0
-; GFX-908: v_mov_b32_e32 v0, 0x43100000
-; GFX-908: s_nop 1
-; GFX-908: v_accvgpr_write_b32 {{[0-9]+}}, v0
-; GFX-908: v_mov_b32_e32 v0, 0x43110000
-; GFX-908: s_nop 1
-; GFX-908: v_accvgpr_write_b32 {{[0-9]+}}, v0
-; GFX-908: v_mov_b32_e32 v0, 0x43120000
-; GFX-908: s_nop 1
-; GFX-908: v_accvgpr_write_b32 {{[0-9]+}}, v0
-; GFX-908: v_mov_b32_e32 v0, 0x43130000
-; GFX-908: s_nop 1
-; GFX-908: v_accvgpr_write_b32 {{[0-9]+}}, v0
-; GFX-908: v_mov_b32_e32 v0, 0x43140000
-; GFX-908: s_nop 1
-; GFX-908: v_accvgpr_write_b32 {{[0-9]+}}, v0
-; GFX-908: v_mov_b32_e32 v0, 0x43150000
-; GFX-908: s_nop 1
-; GFX-908: v_accvgpr_write_b32 {{[0-9]+}}, v0
-; GFX-908: v_mov_b32_e32 v0, 0x43160000
-; GFX-908: s_nop 1
-; GFX-908: v_accvgpr_write_b32 {{[0-9]+}}, v0
-; GFX-908: v_mov_b32_e32 v0, 0x43170000
-; GFX-908: s_nop 1
-; GFX-908: v_accvgpr_write_b32 {{[0-9]+}}, v0
-; GFX-908: v_mov_b32_e32 v0, 0x43180000
-; GFX-908: s_nop 1
-; GFX-908: v_accvgpr_write_b32 {{[0-9]+}}, v0
-; GFX-908: v_mov_b32_e32 v0, 0x43190000
-; GFX-908: s_nop 1
-; GFX-908: v_accvgpr_write_b32 {{[0-9]+}}, v0
-; GFX-908: v_mov_b32_e32 v0, 0x431a0000
-; GFX-908: s_nop 1
-; GFX-908: v_accvgpr_write_b32 {{[0-9]+}}, v0
-
; FIXME: Constant is now in VGPR instead of SGPR.
-; GFX942_A: v_mov_b32_e32 v{{[0-9]+}}, 0x4{{[0-9a-f]+}}
-; GFX942_A-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, v{{[0-9]+}}
-
-; GCN: [[LOOP:.LBB[0-9_]+]]:
-; GCN-NOT: v_accvgpr
-; GFX908_A: v_mfma_f32_32x32x1f32
-; GFX942: v_mfma_f32_32x32x1_2b_f32
-; GCN-NOT: v_accvgpr
-; GCN: s_cbranch_scc1 [[LOOP]]
-
-; GFX908-COUNT-32: v_accvgpr_read_b32
-; GFX90A-NOT: v_accvgpr_read_b32
-; GFX908-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}]
-; GFX90A-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
-
define amdgpu_kernel void @test_mfma_loop_unfoldable_seq(ptr addrspace(1) %arg) #0 {
+; GFX908-LABEL: test_mfma_loop_unfoldable_seq:
+; GFX908: ; %bb.0: ; %entry
+; GFX908-NEXT: v_mov_b32_e32 v0, 0x431a0000
+; GFX908-NEXT: s_mov_b32 s0, 16
+; GFX908-NEXT: v_mov_b32_e32 v1, 1.0
+; GFX908-NEXT: v_accvgpr_write_b32 a31, v0
+; GFX908-NEXT: v_mov_b32_e32 v0, 0x43190000
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: v_accvgpr_write_b32 a30, v0
+; GFX908-NEXT: v_mov_b32_e32 v0, 0x43180000
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: v_accvgpr_write_b32 a29, v0
+; GFX908-NEXT: v_mov_b32_e32 v0, 0x43170000
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: v_accvgpr_write_b32 a28, v0
+; GFX908-NEXT: v_mov_b32_e32 v0, 0x43160000
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: v_accvgpr_write_b32 a27, v0
+; GFX908-NEXT: v_mov_b32_e32 v0, 0x43150000
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: v_accvgpr_write_b32 a26, v0
+; GFX908-NEXT: v_mov_b32_e32 v0, 0x43140000
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: v_accvgpr_write_b32 a25, v0
+; GFX908-NEXT: v_mov_b32_e32 v0, 0x43130000
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: v_accvgpr_write_b32 a24, v0
+; GFX908-NEXT: v_mov_b32_e32 v0, 0x43120000
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: v_accvgpr_write_b32 a23, v0
+; GFX908-NEXT: v_mov_b32_e32 v0, 0x43110000
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: v_accvgpr_write_b32 a22, v0
+; GFX908-NEXT: v_mov_b32_e32 v0, 0x43100000
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: v_accvgpr_write_b32 a21, v0
+; GFX908-NEXT: v_mov_b32_e32 v0, 0x430f0000
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: v_accvgpr_write_b32 a20, v0
+; GFX908-NEXT: v_mov_b32_e32 v0, 0x430e0000
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: v_accvgpr_write_b32 a19, v0
+; GFX908-NEXT: v_mov_b32_e32 v0, 0x430d0000
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: v_accvgpr_write_b32 a18, v0
+; GFX908-NEXT: v_mov_b32_e32 v0, 0x430c0000
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: v_accvgpr_write_b32 a17, v0
+; GFX908-NEXT: v_mov_b32_e32 v0, 0x430b0000
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: v_accvgpr_write_b32 a16, v0
+; GFX908-NEXT: v_mov_b32_e32 v0, 0x430a0000
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: v_accvgpr_write_b32 a15, v0
+; GFX908-NEXT: v_mov_b32_e32 v0, 0x43090000
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: v_accvgpr_write_b32 a14, v0
+; GFX908-NEXT: v_mov_b32_e32 v0, 0x43080000
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: v_accvgpr_write_b32 a13, v0
+; GFX908-NEXT: v_mov_b32_e32 v0, 0x43070000
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: v_accvgpr_write_b32 a12, v0
+; GFX908-NEXT: v_mov_b32_e32 v0, 0x43060000
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: v_accvgpr_write_b32 a11, v0
+; GFX908-NEXT: v_mov_b32_e32 v0, 0x43050000
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: v_accvgpr_write_b32 a10, v0
+; GFX908-NEXT: v_mov_b32_e32 v0, 0x43040000
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: v_accvgpr_write_b32 a9, v0
+; GFX908-NEXT: v_mov_b32_e32 v0, 0x43030000
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: v_accvgpr_write_b32 a8, v0
+; GFX908-NEXT: v_mov_b32_e32 v0, 0x43020000
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: v_accvgpr_write_b32 a7, v0
+; GFX908-NEXT: v_mov_b32_e32 v0, 0x43010000
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: v_accvgpr_write_b32 a6, v0
+; GFX908-NEXT: v_mov_b32_e32 v0, 0x43000000
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: v_accvgpr_write_b32 a5, v0
+; GFX908-NEXT: v_mov_b32_e32 v0, 0x42fe0000
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: v_accvgpr_write_b32 a4, v0
+; GFX908-NEXT: v_mov_b32_e32 v0, 0x42fc0000
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: v_accvgpr_write_b32 a3, v0
+; GFX908-NEXT: v_mov_b32_e32 v0, 0x42fa0000
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: v_accvgpr_write_b32 a2, v0
+; GFX908-NEXT: v_mov_b32_e32 v0, 0x42f80000
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: v_accvgpr_write_b32 a1, v0
+; GFX908-NEXT: v_mov_b32_e32 v0, 0x42f60000
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX908-NEXT: v_mov_b32_e32 v0, 2.0
+; GFX908-NEXT: .LBB3_1: ; %for.cond.preheader
+; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31]
+; GFX908-NEXT: s_add_i32 s0, s0, -1
+; GFX908-NEXT: s_cmp_lg_u32 s0, 0
+; GFX908-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX908-NEXT: ; %bb.2: ; %exit
+; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX908-NEXT: s_nop 7
+; GFX908-NEXT: s_nop 5
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a0
+; GFX908-NEXT: v_accvgpr_read_b32 v28, a28
+; GFX908-NEXT: v_accvgpr_read_b32 v29, a29
+; GFX908-NEXT: v_accvgpr_read_b32 v30, a30
+; GFX908-NEXT: v_accvgpr_read_b32 v31, a31
+; GFX908-NEXT: v_mov_b32_e32 v32, 0
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a1
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a2
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a3
+; GFX908-NEXT: v_accvgpr_read_b32 v4, a4
+; GFX908-NEXT: v_accvgpr_read_b32 v5, a5
+; GFX908-NEXT: v_accvgpr_read_b32 v6, a6
+; GFX908-NEXT: v_accvgpr_read_b32 v7, a7
+; GFX908-NEXT: v_accvgpr_read_b32 v8, a8
+; GFX908-NEXT: v_accvgpr_read_b32 v9, a9
+; GFX908-NEXT: v_accvgpr_read_b32 v10, a10
+; GFX908-NEXT: v_accvgpr_read_b32 v11, a11
+; GFX908-NEXT: v_accvgpr_read_b32 v12, a12
+; GFX908-NEXT: v_accvgpr_read_b32 v13, a13
+; GFX908-NEXT: v_accvgpr_read_b32 v14, a14
+; GFX908-NEXT: v_accvgpr_read_b32 v15, a15
+; GFX908-NEXT: v_accvgpr_read_b32 v16, a16
+; GFX908-NEXT: v_accvgpr_read_b32 v17, a17
+; GFX908-NEXT: v_accvgpr_read_b32 v18, a18
+; GFX908-NEXT: v_accvgpr_read_b32 v19, a19
+; GFX908-NEXT: v_accvgpr_read_b32 v20, a20
+; GFX908-NEXT: v_accvgpr_read_b32 v21, a21
+; GFX908-NEXT: v_accvgpr_read_b32 v22, a22
+; GFX908-NEXT: v_accvgpr_read_b32 v23, a23
+; GFX908-NEXT: v_accvgpr_read_b32 v24, a24
+; GFX908-NEXT: v_accvgpr_read_b32 v25, a25
+; GFX908-NEXT: v_accvgpr_read_b32 v26, a26
+; GFX908-NEXT: v_accvgpr_read_b32 v27, a27
+; GFX908-NEXT: s_waitcnt lgkmcnt(0)
+; GFX908-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112
+; GFX908-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96
+; GFX908-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80
+; GFX908-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64
+; GFX908-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48
+; GFX908-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32
+; GFX908-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16
+; GFX908-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1]
+; GFX908-NEXT: s_endpgm
+;
+; GFX90A-LABEL: test_mfma_loop_unfoldable_seq:
+; GFX90A: ; %bb.0: ; %entry
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0x431a0000
+; GFX90A-NEXT: v_accvgpr_write_b32 a31, v0
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43190000
+; GFX90A-NEXT: v_accvgpr_write_b32 a30, v0
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43180000
+; GFX90A-NEXT: v_accvgpr_write_b32 a29, v0
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43170000
+; GFX90A-NEXT: v_accvgpr_write_b32 a28, v0
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43160000
+; GFX90A-NEXT: v_accvgpr_write_b32 a27, v0
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43150000
+; GFX90A-NEXT: v_accvgpr_write_b32 a26, v0
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43140000
+; GFX90A-NEXT: v_accvgpr_write_b32 a25, v0
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43130000
+; GFX90A-NEXT: v_accvgpr_write_b32 a24, v0
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43120000
+; GFX90A-NEXT: v_accvgpr_write_b32 a23, v0
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43110000
+; GFX90A-NEXT: v_accvgpr_write_b32 a22, v0
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43100000
+; GFX90A-NEXT: v_accvgpr_write_b32 a21, v0
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0x430f0000
+; GFX90A-NEXT: v_accvgpr_write_b32 a20, v0
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0x430e0000
+; GFX90A-NEXT: v_accvgpr_write_b32 a19, v0
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0x430d0000
+; GFX90A-NEXT: v_accvgpr_write_b32 a18, v0
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0x430c0000
+; GFX90A-NEXT: v_accvgpr_write_b32 a17, v0
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0x430b0000
+; GFX90A-NEXT: v_accvgpr_write_b32 a16, v0
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0x430a0000
+; GFX90A-NEXT: v_accvgpr_write_b32 a15, v0
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43090000
+; GFX90A-NEXT: v_accvgpr_write_b32 a14, v0
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43080000
+; GFX90A-NEXT: v_accvgpr_write_b32 a13, v0
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43070000
+; GFX90A-NEXT: v_accvgpr_write_b32 a12, v0
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43060000
+; GFX90A-NEXT: v_accvgpr_write_b32 a11, v0
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43050000
+; GFX90A-NEXT: v_accvgpr_write_b32 a10, v0
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43040000
+; GFX90A-NEXT: v_accvgpr_write_b32 a9, v0
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43030000
+; GFX90A-NEXT: v_accvgpr_write_b32 a8, v0
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43020000
+; GFX90A-NEXT: v_accvgpr_write_b32 a7, v0
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43010000
+; GFX90A-NEXT: v_accvgpr_write_b32 a6, v0
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0x43000000
+; GFX90A-NEXT: v_accvgpr_write_b32 a5, v0
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0x42fe0000
+; GFX90A-NEXT: v_accvgpr_write_b32 a4, v0
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0x42fc0000
+; GFX90A-NEXT: v_accvgpr_write_b32 a3, v0
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0x42fa0000
+; GFX90A-NEXT: v_accvgpr_write_b32 a2, v0
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0x42f80000
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v0
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0x42f60000
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX90A-NEXT: s_mov_b32 s0, 16
+; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0
+; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0
+; GFX90A-NEXT: .LBB3_1: ; %for.cond.preheader
+; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT: s_nop 1
+; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31]
+; GFX90A-NEXT: s_add_i32 s0, s0, -1
+; GFX90A-NEXT: s_cmp_lg_u32 s0, 0
+; GFX90A-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX90A-NEXT: ; %bb.2: ; %exit
+; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: s_nop 7
+; GFX90A-NEXT: s_nop 4
+; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
+; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
+; GFX90A-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
+; GFX90A-NEXT: global_store_dwordx4 v0, a[16:19], s[0:1] offset:64
+; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
+; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
+; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX90A-NEXT: s_endpgm
+;
+; GFX942-LABEL: test_mfma_loop_unfoldable_seq:
+; GFX942: ; %bb.0: ; %entry
+; GFX942-NEXT: v_mov_b32_e32 v0, 0x431a0000
+; GFX942-NEXT: v_accvgpr_write_b32 a31, v0
+; GFX942-NEXT: v_mov_b32_e32 v0, 0x43190000
+; GFX942-NEXT: v_accvgpr_write_b32 a30, v0
+; GFX942-NEXT: v_mov_b32_e32 v0, 0x43180000
+; GFX942-NEXT: v_accvgpr_write_b32 a29, v0
+; GFX942-NEXT: v_mov_b32_e32 v0, 0x43170000
+; GFX942-NEXT: v_accvgpr_write_b32 a28, v0
+; GFX942-NEXT: v_mov_b32_e32 v0, 0x43160000
+; GFX942-NEXT: v_accvgpr_write_b32 a27, v0
+; GFX942-NEXT: v_mov_b32_e32 v0, 0x43150000
+; GFX942-NEXT: v_accvgpr_write_b32 a26, v0
+; GFX942-NEXT: v_mov_b32_e32 v0, 0x43140000
+; GFX942-NEXT: v_accvgpr_write_b32 a25, v0
+; GFX942-NEXT: v_mov_b32_e32 v0, 0x43130000
+; GFX942-NEXT: v_accvgpr_write_b32 a24, v0
+; GFX942-NEXT: v_mov_b32_e32 v0, 0x43120000
+; GFX942-NEXT: v_accvgpr_write_b32 a23, v0
+; GFX942-NEXT: v_mov_b32_e32 v0, 0x43110000
+; GFX942-NEXT: v_accvgpr_write_b32 a22, v0
+; GFX942-NEXT: v_mov_b32_e32 v0, 0x43100000
+; GFX942-NEXT: v_accvgpr_write_b32 a21, v0
+; GFX942-NEXT: v_mov_b32_e32 v0, 0x430f0000
+; GFX942-NEXT: v_accvgpr_write_b32 a20, v0
+; GFX942-NEXT: v_mov_b32_e32 v0, 0x430e0000
+; GFX942-NEXT: v_accvgpr_write_b32 a19, v0
+; GFX942-NEXT: v_mov_b32_e32 v0, 0x430d0000
+; GFX942-NEXT: v_accvgpr_write_b32 a18, v0
+; GFX942-NEXT: v_mov_b32_e32 v0, 0x430c0000
+; GFX942-NEXT: v_accvgpr_write_b32 a17, v0
+; GFX942-NEXT: v_mov_b32_e32 v0, 0x430b0000
+; GFX942-NEXT: v_accvgpr_write_b32 a16, v0
+; GFX942-NEXT: v_mov_b32_e32 v0, 0x430a0000
+; GFX942-NEXT: v_accvgpr_write_b32 a15, v0
+; GFX942-NEXT: v_mov_b32_e32 v0, 0x43090000
+; GFX942-NEXT: v_accvgpr_write_b32 a14, v0
+; GFX942-NEXT: v_mov_b32_e32 v0, 0x43080000
+; GFX942-NEXT: v_accvgpr_write_b32 a13, v0
+; GFX942-NEXT: v_mov_b32_e32 v0, 0x43070000
+; GFX942-NEXT: v_accvgpr_write_b32 a12, v0
+; GFX942-NEXT: v_mov_b32_e32 v0, 0x43060000
+; GFX942-NEXT: v_accvgpr_write_b32 a11, v0
+; GFX942-NEXT: v_mov_b32_e32 v0, 0x43050000
+; GFX942-NEXT: v_accvgpr_write_b32 a10, v0
+; GFX942-NEXT: v_mov_b32_e32 v0, 0x43040000
+; GFX942-NEXT: v_accvgpr_write_b32 a9, v0
+; GFX942-NEXT: v_mov_b32_e32 v0, 0x43030000
+; GFX942-NEXT: v_accvgpr_write_b32 a8, v0
+; GFX942-NEXT: v_mov_b32_e32 v0, 0x43020000
+; GFX942-NEXT: v_accvgpr_write_b32 a7, v0
+; GFX942-NEXT: v_mov_b32_e32 v0, 0x43010000
+; GFX942-NEXT: v_accvgpr_write_b32 a6, v0
+; GFX942-NEXT: v_mov_b32_e32 v0, 0x43000000
+; GFX942-NEXT: v_accvgpr_write_b32 a5, v0
+; GFX942-NEXT: v_mov_b32_e32 v0, 0x42fe0000
+; GFX942-NEXT: v_accvgpr_write_b32 a4, v0
+; GFX942-NEXT: v_mov_b32_e32 v0, 0x42fc0000
+; GFX942-NEXT: v_accvgpr_write_b32 a3, v0
+; GFX942-NEXT: v_mov_b32_e32 v0, 0x42fa0000
+; GFX942-NEXT: v_accvgpr_write_b32 a2, v0
+; GFX942-NEXT: v_mov_b32_e32 v0, 0x42f80000
+; GFX942-NEXT: v_accvgpr_write_b32 a1, v0
+; GFX942-NEXT: v_mov_b32_e32 v0, 0x42f60000
+; GFX942-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX942-NEXT: s_mov_b32 s0, 16
+; GFX942-NEXT: v_mov_b32_e32 v0, 2.0
+; GFX942-NEXT: v_mov_b32_e32 v1, 1.0
+; GFX942-NEXT: .LBB3_1: ; %for.cond.preheader
+; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v0, a[0:31]
+; GFX942-NEXT: s_add_i32 s0, s0, -1
+; GFX942-NEXT: s_cmp_lg_u32 s0, 0
+; GFX942-NEXT: s_cbranch_scc1 .LBB3_1
+; GFX942-NEXT: ; %bb.2: ; %exit
+; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: s_nop 7
+; GFX942-NEXT: s_nop 3
+; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
+; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
+; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
+; GFX942-NEXT: global_store_dwordx4 v0, a[16:19], s[0:1] offset:64
+; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
+; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
+; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX942-NEXT: s_endpgm
entry:
br label %for.cond.preheader
@@ -244,23 +1059,220 @@ exit:
ret void
}
-; GCN-LABEL: {{^}}test_mfma_loop_vgpr_init:
-
-; GCN-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, v0{{$}}
-
-; GCN: [[LOOP:.LBB[0-9_]+]]:
-; GCN-NOT: v_accvgpr
-; GFX908_A: v_mfma_f32_32x32x1f32
-; GFX942: v_mfma_f32_32x32x1_2b_f32
-; GCN-NOT: v_accvgpr
-; GCN: s_cbranch_scc1 [[LOOP]]
-
-; GFX908-COUNT-32: v_accvgpr_read_b32
-; GFX90A-NOT: v_accvgpr_read_b32
-; GFX908-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}]
-; GFX90A-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
-
define amdgpu_kernel void @test_mfma_loop_vgpr_init(ptr addrspace(1) %arg) #0 {
+; GFX908-LABEL: test_mfma_loop_vgpr_init:
+; GFX908: ; %bb.0: ; %entry
+; GFX908-NEXT: v_accvgpr_write_b32 a31, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a30, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a29, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a28, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a27, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a26, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a25, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a24, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a23, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a22, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a21, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a20, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a19, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a18, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a17, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a16, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a15, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a14, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a13, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a12, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a11, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a10, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a9, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a8, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a7, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a6, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a5, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a4, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a3, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a2, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a1, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX908-NEXT: s_mov_b32 s0, 16
+; GFX908-NEXT: v_mov_b32_e32 v0, 2.0
+; GFX908-NEXT: v_mov_b32_e32 v1, 1.0
+; GFX908-NEXT: .LBB4_1: ; %for.cond.preheader
+; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31]
+; GFX908-NEXT: s_add_i32 s0, s0, -1
+; GFX908-NEXT: s_cmp_lg_u32 s0, 0
+; GFX908-NEXT: s_cbranch_scc1 .LBB4_1
+; GFX908-NEXT: ; %bb.2: ; %exit
+; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX908-NEXT: s_nop 7
+; GFX908-NEXT: s_nop 5
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a0
+; GFX908-NEXT: v_accvgpr_read_b32 v28, a28
+; GFX908-NEXT: v_accvgpr_read_b32 v29, a29
+; GFX908-NEXT: v_accvgpr_read_b32 v30, a30
+; GFX908-NEXT: v_accvgpr_read_b32 v31, a31
+; GFX908-NEXT: v_mov_b32_e32 v32, 0
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a1
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a2
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a3
+; GFX908-NEXT: v_accvgpr_read_b32 v4, a4
+; GFX908-NEXT: v_accvgpr_read_b32 v5, a5
+; GFX908-NEXT: v_accvgpr_read_b32 v6, a6
+; GFX908-NEXT: v_accvgpr_read_b32 v7, a7
+; GFX908-NEXT: v_accvgpr_read_b32 v8, a8
+; GFX908-NEXT: v_accvgpr_read_b32 v9, a9
+; GFX908-NEXT: v_accvgpr_read_b32 v10, a10
+; GFX908-NEXT: v_accvgpr_read_b32 v11, a11
+; GFX908-NEXT: v_accvgpr_read_b32 v12, a12
+; GFX908-NEXT: v_accvgpr_read_b32 v13, a13
+; GFX908-NEXT: v_accvgpr_read_b32 v14, a14
+; GFX908-NEXT: v_accvgpr_read_b32 v15, a15
+; GFX908-NEXT: v_accvgpr_read_b32 v16, a16
+; GFX908-NEXT: v_accvgpr_read_b32 v17, a17
+; GFX908-NEXT: v_accvgpr_read_b32 v18, a18
+; GFX908-NEXT: v_accvgpr_read_b32 v19, a19
+; GFX908-NEXT: v_accvgpr_read_b32 v20, a20
+; GFX908-NEXT: v_accvgpr_read_b32 v21, a21
+; GFX908-NEXT: v_accvgpr_read_b32 v22, a22
+; GFX908-NEXT: v_accvgpr_read_b32 v23, a23
+; GFX908-NEXT: v_accvgpr_read_b32 v24, a24
+; GFX908-NEXT: v_accvgpr_read_b32 v25, a25
+; GFX908-NEXT: v_accvgpr_read_b32 v26, a26
+; GFX908-NEXT: v_accvgpr_read_b32 v27, a27
+; GFX908-NEXT: s_waitcnt lgkmcnt(0)
+; GFX908-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112
+; GFX908-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96
+; GFX908-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80
+; GFX908-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64
+; GFX908-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48
+; GFX908-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32
+; GFX908-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16
+; GFX908-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1]
+; GFX908-NEXT: s_endpgm
+;
+; GFX90A-LABEL: test_mfma_loop_vgpr_init:
+; GFX90A: ; %bb.0: ; %entry
+; GFX90A-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a31, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a30, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a29, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a28, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a27, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a26, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a25, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a24, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a23, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a22, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a21, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a20, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a19, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a18, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a17, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a16, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a15, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a14, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a13, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a12, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a11, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a10, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a9, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a8, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a7, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a6, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a5, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a4, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a3, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a2, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX90A-NEXT: s_mov_b32 s0, 16
+; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0
+; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0
+; GFX90A-NEXT: .LBB4_1: ; %for.cond.preheader
+; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT: s_nop 1
+; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31]
+; GFX90A-NEXT: s_add_i32 s0, s0, -1
+; GFX90A-NEXT: s_cmp_lg_u32 s0, 0
+; GFX90A-NEXT: s_cbranch_scc1 .LBB4_1
+; GFX90A-NEXT: ; %bb.2: ; %exit
+; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: s_nop 7
+; GFX90A-NEXT: s_nop 4
+; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
+; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
+; GFX90A-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
+; GFX90A-NEXT: global_store_dwordx4 v0, a[16:19], s[0:1] offset:64
+; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
+; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
+; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX90A-NEXT: s_endpgm
+;
+; GFX942-LABEL: test_mfma_loop_vgpr_init:
+; GFX942: ; %bb.0: ; %entry
+; GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX942-NEXT: v_accvgpr_write_b32 a31, v0
+; GFX942-NEXT: v_accvgpr_write_b32 a30, v0
+; GFX942-NEXT: v_accvgpr_write_b32 a29, v0
+; GFX942-NEXT: v_accvgpr_write_b32 a28, v0
+; GFX942-NEXT: v_accvgpr_write_b32 a27, v0
+; GFX942-NEXT: v_accvgpr_write_b32 a26, v0
+; GFX942-NEXT: v_accvgpr_write_b32 a25, v0
+; GFX942-NEXT: v_accvgpr_write_b32 a24, v0
+; GFX942-NEXT: v_accvgpr_write_b32 a23, v0
+; GFX942-NEXT: v_accvgpr_write_b32 a22, v0
+; GFX942-NEXT: v_accvgpr_write_b32 a21, v0
+; GFX942-NEXT: v_accvgpr_write_b32 a20, v0
+; GFX942-NEXT: v_accvgpr_write_b32 a19, v0
+; GFX942-NEXT: v_accvgpr_write_b32 a18, v0
+; GFX942-NEXT: v_accvgpr_write_b32 a17, v0
+; GFX942-NEXT: v_accvgpr_write_b32 a16, v0
+; GFX942-NEXT: v_accvgpr_write_b32 a15, v0
+; GFX942-NEXT: v_accvgpr_write_b32 a14, v0
+; GFX942-NEXT: v_accvgpr_write_b32 a13, v0
+; GFX942-NEXT: v_accvgpr_write_b32 a12, v0
+; GFX942-NEXT: v_accvgpr_write_b32 a11, v0
+; GFX942-NEXT: v_accvgpr_write_b32 a10, v0
+; GFX942-NEXT: v_accvgpr_write_b32 a9, v0
+; GFX942-NEXT: v_accvgpr_write_b32 a8, v0
+; GFX942-NEXT: v_accvgpr_write_b32 a7, v0
+; GFX942-NEXT: v_accvgpr_write_b32 a6, v0
+; GFX942-NEXT: v_accvgpr_write_b32 a5, v0
+; GFX942-NEXT: v_accvgpr_write_b32 a4, v0
+; GFX942-NEXT: v_accvgpr_write_b32 a3, v0
+; GFX942-NEXT: v_accvgpr_write_b32 a2, v0
+; GFX942-NEXT: v_accvgpr_write_b32 a1, v0
+; GFX942-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX942-NEXT: s_mov_b32 s0, 16
+; GFX942-NEXT: v_mov_b32_e32 v0, 2.0
+; GFX942-NEXT: v_mov_b32_e32 v1, 1.0
+; GFX942-NEXT: .LBB4_1: ; %for.cond.preheader
+; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v0, a[0:31]
+; GFX942-NEXT: s_add_i32 s0, s0, -1
+; GFX942-NEXT: s_cmp_lg_u32 s0, 0
+; GFX942-NEXT: s_cbranch_scc1 .LBB4_1
+; GFX942-NEXT: ; %bb.2: ; %exit
+; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: s_nop 7
+; GFX942-NEXT: s_nop 3
+; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
+; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
+; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
+; GFX942-NEXT: global_store_dwordx4 v0, a[16:19], s[0:1] offset:64
+; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
+; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
+; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX942-NEXT: s_endpgm
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%init = bitcast i32 %tid to float
@@ -312,25 +1324,226 @@ exit:
ret void
}
-; GCN-LABEL: {{^}}test_mfma_loop_sgpr_init:
-
-; GFX908: v_mov_b32_e32 [[TMP:v[0-9]+]], s{{[0-9]+}}
-; GFX908-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
-; GFX942_A-COUNT-32: v_accvgpr_write_b32 [[LEAD:a[0-9]+]], s{{[0-9]+}}
-
-; GCN: [[LOOP:.LBB[0-9_]+]]:
-; GCN-NOT: v_accvgpr
-; GFX908_A: v_mfma_f32_32x32x1f32
-; GFX942: v_mfma_f32_32x32x1_2b_f32
-; GCN-NOT: v_accvgpr
-; GCN: s_cbranch_scc1 [[LOOP]]
-
-; GFX908-COUNT-32: v_accvgpr_read_b32
-; GFX90A-NOT: v_accvgpr_read_b32
-; GFX908-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}]
-; GFX90A-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
-
define amdgpu_kernel void @test_mfma_loop_sgpr_init(ptr addrspace(1) %arg, float %init) #0 {
+; GFX908-LABEL: test_mfma_loop_sgpr_init:
+; GFX908: ; %bb.0: ; %entry
+; GFX908-NEXT: s_load_dword s0, s[4:5], 0x2c
+; GFX908-NEXT: v_mov_b32_e32 v1, 1.0
+; GFX908-NEXT: s_waitcnt lgkmcnt(0)
+; GFX908-NEXT: v_mov_b32_e32 v0, s0
+; GFX908-NEXT: s_mov_b32 s0, 16
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_accvgpr_write_b32 a31, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a30, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a29, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a28, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a27, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a26, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a25, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a24, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a23, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a22, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a21, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a20, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a19, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a18, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a17, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a16, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a15, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a14, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a13, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a12, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a11, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a10, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a9, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a8, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a7, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a6, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a5, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a4, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a3, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a2, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a1, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX908-NEXT: v_mov_b32_e32 v0, 2.0
+; GFX908-NEXT: .LBB5_1: ; %for.cond.preheader
+; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31]
+; GFX908-NEXT: s_add_i32 s0, s0, -1
+; GFX908-NEXT: s_cmp_lg_u32 s0, 0
+; GFX908-NEXT: s_cbranch_scc1 .LBB5_1
+; GFX908-NEXT: ; %bb.2: ; %exit
+; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX908-NEXT: s_nop 7
+; GFX908-NEXT: s_nop 5
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a0
+; GFX908-NEXT: v_accvgpr_read_b32 v28, a28
+; GFX908-NEXT: v_accvgpr_read_b32 v29, a29
+; GFX908-NEXT: v_accvgpr_read_b32 v30, a30
+; GFX908-NEXT: v_accvgpr_read_b32 v31, a31
+; GFX908-NEXT: v_mov_b32_e32 v32, 0
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a1
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a2
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a3
+; GFX908-NEXT: v_accvgpr_read_b32 v4, a4
+; GFX908-NEXT: v_accvgpr_read_b32 v5, a5
+; GFX908-NEXT: v_accvgpr_read_b32 v6, a6
+; GFX908-NEXT: v_accvgpr_read_b32 v7, a7
+; GFX908-NEXT: v_accvgpr_read_b32 v8, a8
+; GFX908-NEXT: v_accvgpr_read_b32 v9, a9
+; GFX908-NEXT: v_accvgpr_read_b32 v10, a10
+; GFX908-NEXT: v_accvgpr_read_b32 v11, a11
+; GFX908-NEXT: v_accvgpr_read_b32 v12, a12
+; GFX908-NEXT: v_accvgpr_read_b32 v13, a13
+; GFX908-NEXT: v_accvgpr_read_b32 v14, a14
+; GFX908-NEXT: v_accvgpr_read_b32 v15, a15
+; GFX908-NEXT: v_accvgpr_read_b32 v16, a16
+; GFX908-NEXT: v_accvgpr_read_b32 v17, a17
+; GFX908-NEXT: v_accvgpr_read_b32 v18, a18
+; GFX908-NEXT: v_accvgpr_read_b32 v19, a19
+; GFX908-NEXT: v_accvgpr_read_b32 v20, a20
+; GFX908-NEXT: v_accvgpr_read_b32 v21, a21
+; GFX908-NEXT: v_accvgpr_read_b32 v22, a22
+; GFX908-NEXT: v_accvgpr_read_b32 v23, a23
+; GFX908-NEXT: v_accvgpr_read_b32 v24, a24
+; GFX908-NEXT: v_accvgpr_read_b32 v25, a25
+; GFX908-NEXT: v_accvgpr_read_b32 v26, a26
+; GFX908-NEXT: v_accvgpr_read_b32 v27, a27
+; GFX908-NEXT: s_waitcnt lgkmcnt(0)
+; GFX908-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112
+; GFX908-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96
+; GFX908-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80
+; GFX908-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64
+; GFX908-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48
+; GFX908-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32
+; GFX908-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16
+; GFX908-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1]
+; GFX908-NEXT: s_endpgm
+;
+; GFX90A-LABEL: test_mfma_loop_sgpr_init:
+; GFX90A: ; %bb.0: ; %entry
+; GFX90A-NEXT: s_load_dword s1, s[4:5], 0x2c
+; GFX90A-NEXT: s_mov_b32 s0, 16
+; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0
+; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: v_accvgpr_write_b32 a31, s1
+; GFX90A-NEXT: v_accvgpr_write_b32 a30, s1
+; GFX90A-NEXT: v_accvgpr_write_b32 a29, s1
+; GFX90A-NEXT: v_accvgpr_write_b32 a28, s1
+; GFX90A-NEXT: v_accvgpr_write_b32 a27, s1
+; GFX90A-NEXT: v_accvgpr_write_b32 a26, s1
+; GFX90A-NEXT: v_accvgpr_write_b32 a25, s1
+; GFX90A-NEXT: v_accvgpr_write_b32 a24, s1
+; GFX90A-NEXT: v_accvgpr_write_b32 a23, s1
+; GFX90A-NEXT: v_accvgpr_write_b32 a22, s1
+; GFX90A-NEXT: v_accvgpr_write_b32 a21, s1
+; GFX90A-NEXT: v_accvgpr_write_b32 a20, s1
+; GFX90A-NEXT: v_accvgpr_write_b32 a19, s1
+; GFX90A-NEXT: v_accvgpr_write_b32 a18, s1
+; GFX90A-NEXT: v_accvgpr_write_b32 a17, s1
+; GFX90A-NEXT: v_accvgpr_write_b32 a16, s1
+; GFX90A-NEXT: v_accvgpr_write_b32 a15, s1
+; GFX90A-NEXT: v_accvgpr_write_b32 a14, s1
+; GFX90A-NEXT: v_accvgpr_write_b32 a13, s1
+; GFX90A-NEXT: v_accvgpr_write_b32 a12, s1
+; GFX90A-NEXT: v_accvgpr_write_b32 a11, s1
+; GFX90A-NEXT: v_accvgpr_write_b32 a10, s1
+; GFX90A-NEXT: v_accvgpr_write_b32 a9, s1
+; GFX90A-NEXT: v_accvgpr_write_b32 a8, s1
+; GFX90A-NEXT: v_accvgpr_write_b32 a7, s1
+; GFX90A-NEXT: v_accvgpr_write_b32 a6, s1
+; GFX90A-NEXT: v_accvgpr_write_b32 a5, s1
+; GFX90A-NEXT: v_accvgpr_write_b32 a4, s1
+; GFX90A-NEXT: v_accvgpr_write_b32 a3, s1
+; GFX90A-NEXT: v_accvgpr_write_b32 a2, s1
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, s1
+; GFX90A-NEXT: .LBB5_1: ; %for.cond.preheader
+; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT: s_nop 1
+; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31]
+; GFX90A-NEXT: s_add_i32 s0, s0, -1
+; GFX90A-NEXT: s_cmp_lg_u32 s0, 0
+; GFX90A-NEXT: s_cbranch_scc1 .LBB5_1
+; GFX90A-NEXT: ; %bb.2: ; %exit
+; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: s_nop 7
+; GFX90A-NEXT: s_nop 4
+; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
+; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
+; GFX90A-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
+; GFX90A-NEXT: global_store_dwordx4 v0, a[16:19], s[0:1] offset:64
+; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
+; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
+; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX90A-NEXT: s_endpgm
+;
+; GFX942-LABEL: test_mfma_loop_sgpr_init:
+; GFX942: ; %bb.0: ; %entry
+; GFX942-NEXT: s_load_dword s1, s[4:5], 0x2c
+; GFX942-NEXT: s_mov_b32 s0, 16
+; GFX942-NEXT: v_mov_b32_e32 v0, 2.0
+; GFX942-NEXT: v_mov_b32_e32 v1, 1.0
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: v_accvgpr_write_b32 a31, s1
+; GFX942-NEXT: v_accvgpr_write_b32 a30, s1
+; GFX942-NEXT: v_accvgpr_write_b32 a29, s1
+; GFX942-NEXT: v_accvgpr_write_b32 a28, s1
+; GFX942-NEXT: v_accvgpr_write_b32 a27, s1
+; GFX942-NEXT: v_accvgpr_write_b32 a26, s1
+; GFX942-NEXT: v_accvgpr_write_b32 a25, s1
+; GFX942-NEXT: v_accvgpr_write_b32 a24, s1
+; GFX942-NEXT: v_accvgpr_write_b32 a23, s1
+; GFX942-NEXT: v_accvgpr_write_b32 a22, s1
+; GFX942-NEXT: v_accvgpr_write_b32 a21, s1
+; GFX942-NEXT: v_accvgpr_write_b32 a20, s1
+; GFX942-NEXT: v_accvgpr_write_b32 a19, s1
+; GFX942-NEXT: v_accvgpr_write_b32 a18, s1
+; GFX942-NEXT: v_accvgpr_write_b32 a17, s1
+; GFX942-NEXT: v_accvgpr_write_b32 a16, s1
+; GFX942-NEXT: v_accvgpr_write_b32 a15, s1
+; GFX942-NEXT: v_accvgpr_write_b32 a14, s1
+; GFX942-NEXT: v_accvgpr_write_b32 a13, s1
+; GFX942-NEXT: v_accvgpr_write_b32 a12, s1
+; GFX942-NEXT: v_accvgpr_write_b32 a11, s1
+; GFX942-NEXT: v_accvgpr_write_b32 a10, s1
+; GFX942-NEXT: v_accvgpr_write_b32 a9, s1
+; GFX942-NEXT: v_accvgpr_write_b32 a8, s1
+; GFX942-NEXT: v_accvgpr_write_b32 a7, s1
+; GFX942-NEXT: v_accvgpr_write_b32 a6, s1
+; GFX942-NEXT: v_accvgpr_write_b32 a5, s1
+; GFX942-NEXT: v_accvgpr_write_b32 a4, s1
+; GFX942-NEXT: v_accvgpr_write_b32 a3, s1
+; GFX942-NEXT: v_accvgpr_write_b32 a2, s1
+; GFX942-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX942-NEXT: v_accvgpr_write_b32 a0, s1
+; GFX942-NEXT: .LBB5_1: ; %for.cond.preheader
+; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v0, a[0:31]
+; GFX942-NEXT: s_add_i32 s0, s0, -1
+; GFX942-NEXT: s_cmp_lg_u32 s0, 0
+; GFX942-NEXT: s_cbranch_scc1 .LBB5_1
+; GFX942-NEXT: ; %bb.2: ; %exit
+; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: s_nop 7
+; GFX942-NEXT: s_nop 3
+; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
+; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
+; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
+; GFX942-NEXT: global_store_dwordx4 v0, a[16:19], s[0:1] offset:64
+; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
+; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
+; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX942-NEXT: s_endpgm
entry:
%tmp0 = insertelement <32 x float> poison, float %init, i32 0
%tmp1 = insertelement <32 x float> %tmp0, float %init, i32 1
@@ -380,61 +1593,227 @@ exit:
ret void
}
-; GCN-LABEL: {{^}}test_mfma_loop_mixed_init:
-
-; GCN-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, v0
-; GFX908-DAG: v_mov_b32_e32 [[TMP:v[0-9]+]], s{{[0-9]+}}
-; GFX942_A-DAG: s_load_dword [[TMP:s[0-9]+]],
-
-; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
-; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
-; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
-; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
-; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
-; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
-; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
-; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
-; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
-; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
-; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
-; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
-; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
-; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
-; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
-; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
-; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
-; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
-; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
-; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
-; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
-; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
-; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
-; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
-; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
-; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
-; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
-; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
-; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
-; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
-; GFX908-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
-
-; GFX90A-DAG: v_accvgpr_write_b32 [[LEAD:a[0-9]+]], 0
-; GFX90A-COUNT-28: v_accvgpr_write_b32 a{{[0-9]+}}, 0
-; GFX90A-DAG: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP]]
-
-; GCN: [[LOOP:.LBB[0-9_]+]]:
-; GCN-NOT: v_accvgpr
-; GFX908_A: v_mfma_f32_32x32x1f32
-; GFX942: v_mfma_f32_32x32x1_2b_f32
-; GCN-NOT: v_accvgpr
-; GCN: s_cbranch_scc1 [[LOOP]]
-
-; GFX908-COUNT-32: v_accvgpr_read_b32
-; GFX90A-NOT: v_accvgpr_read_b32
-; GFX908-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}]
-; GFX90A-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
-
define amdgpu_kernel void @test_mfma_loop_mixed_init(ptr addrspace(1) %arg, float %x) #0 {
+; GFX908-LABEL: test_mfma_loop_mixed_init:
+; GFX908: ; %bb.0: ; %entry
+; GFX908-NEXT: s_load_dword s1, s[4:5], 0x2c
+; GFX908-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX908-NEXT: v_accvgpr_write_b32 a31, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a30, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a29, 0
+; GFX908-NEXT: s_waitcnt lgkmcnt(0)
+; GFX908-NEXT: v_mov_b32_e32 v0, s1
+; GFX908-NEXT: v_accvgpr_write_b32 a28, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a27, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a26, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a25, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a24, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a23, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a22, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a21, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a20, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a19, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a18, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a17, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a16, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a15, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a14, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a13, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a12, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a11, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a10, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a9, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a8, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a7, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a6, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a5, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a4, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a3, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a2, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a1, v0
+; GFX908-NEXT: s_mov_b32 s0, 16
+; GFX908-NEXT: v_mov_b32_e32 v0, 2.0
+; GFX908-NEXT: v_mov_b32_e32 v1, 1.0
+; GFX908-NEXT: .LBB6_1: ; %for.cond.preheader
+; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31]
+; GFX908-NEXT: s_add_i32 s0, s0, -1
+; GFX908-NEXT: s_cmp_lg_u32 s0, 0
+; GFX908-NEXT: s_cbranch_scc1 .LBB6_1
+; GFX908-NEXT: ; %bb.2: ; %exit
+; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX908-NEXT: s_nop 7
+; GFX908-NEXT: s_nop 5
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a0
+; GFX908-NEXT: v_accvgpr_read_b32 v28, a28
+; GFX908-NEXT: v_accvgpr_read_b32 v29, a29
+; GFX908-NEXT: v_accvgpr_read_b32 v30, a30
+; GFX908-NEXT: v_accvgpr_read_b32 v31, a31
+; GFX908-NEXT: v_mov_b32_e32 v32, 0
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a1
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a2
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a3
+; GFX908-NEXT: v_accvgpr_read_b32 v4, a4
+; GFX908-NEXT: v_accvgpr_read_b32 v5, a5
+; GFX908-NEXT: v_accvgpr_read_b32 v6, a6
+; GFX908-NEXT: v_accvgpr_read_b32 v7, a7
+; GFX908-NEXT: v_accvgpr_read_b32 v8, a8
+; GFX908-NEXT: v_accvgpr_read_b32 v9, a9
+; GFX908-NEXT: v_accvgpr_read_b32 v10, a10
+; GFX908-NEXT: v_accvgpr_read_b32 v11, a11
+; GFX908-NEXT: v_accvgpr_read_b32 v12, a12
+; GFX908-NEXT: v_accvgpr_read_b32 v13, a13
+; GFX908-NEXT: v_accvgpr_read_b32 v14, a14
+; GFX908-NEXT: v_accvgpr_read_b32 v15, a15
+; GFX908-NEXT: v_accvgpr_read_b32 v16, a16
+; GFX908-NEXT: v_accvgpr_read_b32 v17, a17
+; GFX908-NEXT: v_accvgpr_read_b32 v18, a18
+; GFX908-NEXT: v_accvgpr_read_b32 v19, a19
+; GFX908-NEXT: v_accvgpr_read_b32 v20, a20
+; GFX908-NEXT: v_accvgpr_read_b32 v21, a21
+; GFX908-NEXT: v_accvgpr_read_b32 v22, a22
+; GFX908-NEXT: v_accvgpr_read_b32 v23, a23
+; GFX908-NEXT: v_accvgpr_read_b32 v24, a24
+; GFX908-NEXT: v_accvgpr_read_b32 v25, a25
+; GFX908-NEXT: v_accvgpr_read_b32 v26, a26
+; GFX908-NEXT: v_accvgpr_read_b32 v27, a27
+; GFX908-NEXT: s_waitcnt lgkmcnt(0)
+; GFX908-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112
+; GFX908-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96
+; GFX908-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80
+; GFX908-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64
+; GFX908-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48
+; GFX908-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32
+; GFX908-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16
+; GFX908-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1]
+; GFX908-NEXT: s_endpgm
+;
+; GFX90A-LABEL: test_mfma_loop_mixed_init:
+; GFX90A: ; %bb.0: ; %entry
+; GFX90A-NEXT: s_load_dword s1, s[4:5], 0x2c
+; GFX90A-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX90A-NEXT: v_accvgpr_write_b32 a31, 0
+; GFX90A-NEXT: v_accvgpr_write_b32 a30, 0
+; GFX90A-NEXT: v_accvgpr_write_b32 a29, 0
+; GFX90A-NEXT: v_accvgpr_write_b32 a28, 0
+; GFX90A-NEXT: v_accvgpr_write_b32 a27, 0
+; GFX90A-NEXT: v_accvgpr_write_b32 a26, 0
+; GFX90A-NEXT: v_accvgpr_write_b32 a25, 0
+; GFX90A-NEXT: v_accvgpr_write_b32 a24, 0
+; GFX90A-NEXT: v_accvgpr_write_b32 a23, 0
+; GFX90A-NEXT: v_accvgpr_write_b32 a22, 0
+; GFX90A-NEXT: v_accvgpr_write_b32 a21, 0
+; GFX90A-NEXT: v_accvgpr_write_b32 a20, 0
+; GFX90A-NEXT: v_accvgpr_write_b32 a19, 0
+; GFX90A-NEXT: v_accvgpr_write_b32 a18, 0
+; GFX90A-NEXT: v_accvgpr_write_b32 a17, 0
+; GFX90A-NEXT: v_accvgpr_write_b32 a16, 0
+; GFX90A-NEXT: v_accvgpr_write_b32 a15, 0
+; GFX90A-NEXT: v_accvgpr_write_b32 a14, 0
+; GFX90A-NEXT: v_accvgpr_write_b32 a13, 0
+; GFX90A-NEXT: v_accvgpr_write_b32 a12, 0
+; GFX90A-NEXT: v_accvgpr_write_b32 a11, 0
+; GFX90A-NEXT: v_accvgpr_write_b32 a10, 0
+; GFX90A-NEXT: v_accvgpr_write_b32 a9, 0
+; GFX90A-NEXT: v_accvgpr_write_b32 a8, 0
+; GFX90A-NEXT: v_accvgpr_write_b32 a7, 0
+; GFX90A-NEXT: v_accvgpr_write_b32 a6, 0
+; GFX90A-NEXT: v_accvgpr_write_b32 a5, 0
+; GFX90A-NEXT: v_accvgpr_write_b32 a4, 0
+; GFX90A-NEXT: v_accvgpr_write_b32 a3, 0
+; GFX90A-NEXT: v_accvgpr_write_b32 a2, 0
+; GFX90A-NEXT: s_mov_b32 s0, 16
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0
+; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0
+; GFX90A-NEXT: .LBB6_1: ; %for.cond.preheader
+; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT: s_nop 1
+; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31]
+; GFX90A-NEXT: s_add_i32 s0, s0, -1
+; GFX90A-NEXT: s_cmp_lg_u32 s0, 0
+; GFX90A-NEXT: s_cbranch_scc1 .LBB6_1
+; GFX90A-NEXT: ; %bb.2: ; %exit
+; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: s_nop 7
+; GFX90A-NEXT: s_nop 4
+; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
+; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
+; GFX90A-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
+; GFX90A-NEXT: global_store_dwordx4 v0, a[16:19], s[0:1] offset:64
+; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
+; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
+; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX90A-NEXT: s_endpgm
+;
+; GFX942-LABEL: test_mfma_loop_mixed_init:
+; GFX942: ; %bb.0: ; %entry
+; GFX942-NEXT: s_load_dword s1, s[4:5], 0x2c
+; GFX942-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX942-NEXT: v_accvgpr_write_b32 a0, v0
+; GFX942-NEXT: v_accvgpr_write_b32 a31, 0
+; GFX942-NEXT: v_accvgpr_write_b32 a30, 0
+; GFX942-NEXT: v_accvgpr_write_b32 a29, 0
+; GFX942-NEXT: v_accvgpr_write_b32 a28, 0
+; GFX942-NEXT: v_accvgpr_write_b32 a27, 0
+; GFX942-NEXT: v_accvgpr_write_b32 a26, 0
+; GFX942-NEXT: v_accvgpr_write_b32 a25, 0
+; GFX942-NEXT: v_accvgpr_write_b32 a24, 0
+; GFX942-NEXT: v_accvgpr_write_b32 a23, 0
+; GFX942-NEXT: v_accvgpr_write_b32 a22, 0
+; GFX942-NEXT: v_accvgpr_write_b32 a21, 0
+; GFX942-NEXT: v_accvgpr_write_b32 a20, 0
+; GFX942-NEXT: v_accvgpr_write_b32 a19, 0
+; GFX942-NEXT: v_accvgpr_write_b32 a18, 0
+; GFX942-NEXT: v_accvgpr_write_b32 a17, 0
+; GFX942-NEXT: v_accvgpr_write_b32 a16, 0
+; GFX942-NEXT: v_accvgpr_write_b32 a15, 0
+; GFX942-NEXT: v_accvgpr_write_b32 a14, 0
+; GFX942-NEXT: v_accvgpr_write_b32 a13, 0
+; GFX942-NEXT: v_accvgpr_write_b32 a12, 0
+; GFX942-NEXT: v_accvgpr_write_b32 a11, 0
+; GFX942-NEXT: v_accvgpr_write_b32 a10, 0
+; GFX942-NEXT: v_accvgpr_write_b32 a9, 0
+; GFX942-NEXT: v_accvgpr_write_b32 a8, 0
+; GFX942-NEXT: v_accvgpr_write_b32 a7, 0
+; GFX942-NEXT: v_accvgpr_write_b32 a6, 0
+; GFX942-NEXT: v_accvgpr_write_b32 a5, 0
+; GFX942-NEXT: v_accvgpr_write_b32 a4, 0
+; GFX942-NEXT: v_accvgpr_write_b32 a3, 0
+; GFX942-NEXT: v_accvgpr_write_b32 a2, 0
+; GFX942-NEXT: s_mov_b32 s0, 16
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: v_accvgpr_write_b32 a1, s1
+; GFX942-NEXT: v_mov_b32_e32 v0, 2.0
+; GFX942-NEXT: v_mov_b32_e32 v1, 1.0
+; GFX942-NEXT: .LBB6_1: ; %for.cond.preheader
+; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v0, a[0:31]
+; GFX942-NEXT: s_add_i32 s0, s0, -1
+; GFX942-NEXT: s_cmp_lg_u32 s0, 0
+; GFX942-NEXT: s_cbranch_scc1 .LBB6_1
+; GFX942-NEXT: ; %bb.2: ; %exit
+; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: s_nop 7
+; GFX942-NEXT: s_nop 3
+; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
+; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
+; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
+; GFX942-NEXT: global_store_dwordx4 v0, a[16:19], s[0:1] offset:64
+; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
+; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
+; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX942-NEXT: s_endpgm
entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x()
%init = bitcast i32 %tid to float
@@ -456,29 +1835,157 @@ exit:
ret void
}
-; GCN-LABEL: {{^}}test_mfma_loop_mfma_forward_init:
-
-; GFX908-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, 0
-; GFX908: v_mfma_f32_32x32x1f32 a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9]+}}, a[{{[0-9:]+}}]
-; GFX90A-NOT: v_accvgpr
-; GFX90A: v_mfma_f32_32x32x1f32 a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9]+}}, 0{{$}}
-; GFX90A-NOT: v_accvgpr
-; GFX942: v_mfma_f32_32x32x1_2b_f32 a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9]+}}, 0{{$}}
-; GCN-NOT: v_accvgpr
-
-; GCN: [[LOOP:.LBB[0-9_]+]]:
-; GCN-NOT: v_accvgpr
-; GFX908_A: v_mfma_f32_32x32x1f32
-; GFX942: v_mfma_f32_32x32x1_2b_f32
-; GCN-NOT: v_accvgpr
-; GCN: s_cbranch_scc1 [[LOOP]]
-
-; GFX908-COUNT-32: v_accvgpr_read_b32
-; GFX90A-NOT: v_accvgpr_read_b32
-; GFX908-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}]
-; GFX90A-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
-
define amdgpu_kernel void @test_mfma_loop_mfma_forward_init(ptr addrspace(1) %arg) #0 {
+; GFX908-LABEL: test_mfma_loop_mfma_forward_init:
+; GFX908: ; %bb.0: ; %entry
+; GFX908-NEXT: v_mov_b32_e32 v0, 1.0
+; GFX908-NEXT: v_accvgpr_write_b32 a0, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a1, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a2, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a3, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a4, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a5, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a6, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a7, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a8, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a9, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a10, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a11, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a12, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a13, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a14, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a15, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a16, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a17, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a18, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a19, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a20, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a21, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a22, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a23, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a24, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a25, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a26, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a27, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a28, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a29, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a30, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a31, 0
+; GFX908-NEXT: v_mov_b32_e32 v1, 2.0
+; GFX908-NEXT: s_mov_b32 s0, 16
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
+; GFX908-NEXT: .LBB7_1: ; %for.cond.preheader
+; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
+; GFX908-NEXT: s_add_i32 s0, s0, -1
+; GFX908-NEXT: s_cmp_lg_u32 s0, 0
+; GFX908-NEXT: s_cbranch_scc1 .LBB7_1
+; GFX908-NEXT: ; %bb.2: ; %exit
+; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX908-NEXT: s_nop 7
+; GFX908-NEXT: s_nop 5
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a0
+; GFX908-NEXT: v_accvgpr_read_b32 v28, a28
+; GFX908-NEXT: v_accvgpr_read_b32 v29, a29
+; GFX908-NEXT: v_accvgpr_read_b32 v30, a30
+; GFX908-NEXT: v_accvgpr_read_b32 v31, a31
+; GFX908-NEXT: v_mov_b32_e32 v32, 0
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a1
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a2
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a3
+; GFX908-NEXT: v_accvgpr_read_b32 v4, a4
+; GFX908-NEXT: v_accvgpr_read_b32 v5, a5
+; GFX908-NEXT: v_accvgpr_read_b32 v6, a6
+; GFX908-NEXT: v_accvgpr_read_b32 v7, a7
+; GFX908-NEXT: v_accvgpr_read_b32 v8, a8
+; GFX908-NEXT: v_accvgpr_read_b32 v9, a9
+; GFX908-NEXT: v_accvgpr_read_b32 v10, a10
+; GFX908-NEXT: v_accvgpr_read_b32 v11, a11
+; GFX908-NEXT: v_accvgpr_read_b32 v12, a12
+; GFX908-NEXT: v_accvgpr_read_b32 v13, a13
+; GFX908-NEXT: v_accvgpr_read_b32 v14, a14
+; GFX908-NEXT: v_accvgpr_read_b32 v15, a15
+; GFX908-NEXT: v_accvgpr_read_b32 v16, a16
+; GFX908-NEXT: v_accvgpr_read_b32 v17, a17
+; GFX908-NEXT: v_accvgpr_read_b32 v18, a18
+; GFX908-NEXT: v_accvgpr_read_b32 v19, a19
+; GFX908-NEXT: v_accvgpr_read_b32 v20, a20
+; GFX908-NEXT: v_accvgpr_read_b32 v21, a21
+; GFX908-NEXT: v_accvgpr_read_b32 v22, a22
+; GFX908-NEXT: v_accvgpr_read_b32 v23, a23
+; GFX908-NEXT: v_accvgpr_read_b32 v24, a24
+; GFX908-NEXT: v_accvgpr_read_b32 v25, a25
+; GFX908-NEXT: v_accvgpr_read_b32 v26, a26
+; GFX908-NEXT: v_accvgpr_read_b32 v27, a27
+; GFX908-NEXT: s_waitcnt lgkmcnt(0)
+; GFX908-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112
+; GFX908-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96
+; GFX908-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80
+; GFX908-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64
+; GFX908-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48
+; GFX908-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32
+; GFX908-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16
+; GFX908-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1]
+; GFX908-NEXT: s_endpgm
+;
+; GFX90A-LABEL: test_mfma_loop_mfma_forward_init:
+; GFX90A: ; %bb.0: ; %entry
+; GFX90A-NEXT: v_mov_b32_e32 v0, 1.0
+; GFX90A-NEXT: v_mov_b32_e32 v1, 2.0
+; GFX90A-NEXT: s_mov_b32 s0, 16
+; GFX90A-NEXT: s_nop 0
+; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, 0
+; GFX90A-NEXT: .LBB7_1: ; %for.cond.preheader
+; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
+; GFX90A-NEXT: s_add_i32 s0, s0, -1
+; GFX90A-NEXT: s_cmp_lg_u32 s0, 0
+; GFX90A-NEXT: s_cbranch_scc1 .LBB7_1
+; GFX90A-NEXT: ; %bb.2: ; %exit
+; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: s_nop 7
+; GFX90A-NEXT: s_nop 4
+; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
+; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
+; GFX90A-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
+; GFX90A-NEXT: global_store_dwordx4 v0, a[16:19], s[0:1] offset:64
+; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
+; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
+; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX90A-NEXT: s_endpgm
+;
+; GFX942-LABEL: test_mfma_loop_mfma_forward_init:
+; GFX942: ; %bb.0: ; %entry
+; GFX942-NEXT: v_mov_b32_e32 v0, 1.0
+; GFX942-NEXT: v_mov_b32_e32 v1, 2.0
+; GFX942-NEXT: s_mov_b32 s0, 16
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v0, v1, 0
+; GFX942-NEXT: .LBB7_1: ; %for.cond.preheader
+; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v0, v1, a[0:31]
+; GFX942-NEXT: s_add_i32 s0, s0, -1
+; GFX942-NEXT: s_cmp_lg_u32 s0, 0
+; GFX942-NEXT: s_cbranch_scc1 .LBB7_1
+; GFX942-NEXT: ; %bb.2: ; %exit
+; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: s_nop 7
+; GFX942-NEXT: s_nop 3
+; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
+; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
+; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
+; GFX942-NEXT: global_store_dwordx4 v0, a[16:19], s[0:1] offset:64
+; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
+; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
+; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX942-NEXT: s_endpgm
entry:
%mai.0 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> zeroinitializer, i32 0, i32 0, i32 0)
@@ -497,33 +2004,295 @@ exit:
ret void
}
-; GCN-LABEL: {{^}}test_mfma_loop_agpr_init:
-
-; GFX908-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, 0
-; GFX908: v_mfma_f32_32x32x1f32 a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9]+}}, a[{{[0-9:]+}}]
-; GFX90A-NOT: v_accvgpr
-; GFX90A: v_mfma_f32_32x32x1f32 a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9]+}}, 0{{$}}
-; GFX90A-NOT: v_accvgpr
-; GFX942: v_mfma_f32_32x32x1_2b_f32 a[{{[0-9:]+}}], v{{[0-9]+}}, v{{[0-9]+}}, 0{{$}}
-
; Check that we are using only one tmp VGPR.
-; GFX908: v_accvgpr_read_b32 [[TMP:v[0-9]+]], a{{[0-9]+}}
-; GFX942_A-COUNT-31: v_accvgpr_mov_b32 a{{[0-9]+}}, a{{[0-9]+}}
-
-; GCN: [[LOOP:.LBB[0-9_]+]]:
-; GCN-NOT: v_accvgpr
-; GFX908_A: v_mfma_f32_32x32x1f32
-; GFX942: v_mfma_f32_32x32x1_2b_f32
-; GCN-NOT: v_accvgpr
-; GCN: s_cbranch_scc1 [[LOOP]]
-
-; GFX908-COUNT-32: v_accvgpr_read_b32
-; GFX90A-NOT: v_accvgpr_read_b32
-; GFX908-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}]
-; GFX90A-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
-
define amdgpu_kernel void @test_mfma_loop_agpr_init(ptr addrspace(1) %arg) #0 {
+; GFX908-LABEL: test_mfma_loop_agpr_init:
+; GFX908: ; %bb.0: ; %entry
+; GFX908-NEXT: v_mov_b32_e32 v0, 1.0
+; GFX908-NEXT: v_accvgpr_write_b32 a0, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a1, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a2, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a3, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a4, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a5, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a6, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a7, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a8, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a9, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a10, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a11, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a12, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a13, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a14, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a15, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a16, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a17, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a18, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a19, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a20, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a21, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a22, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a23, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a24, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a25, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a26, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a27, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a28, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a29, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a30, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a31, 0
+; GFX908-NEXT: v_mov_b32_e32 v1, 2.0
+; GFX908-NEXT: s_mov_b32 s0, 16
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
+; GFX908-NEXT: s_nop 7
+; GFX908-NEXT: s_nop 7
+; GFX908-NEXT: s_nop 1
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a0
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a0
+; GFX908-NEXT: v_accvgpr_read_b32 v33, a0
+; GFX908-NEXT: v_accvgpr_write_b32 a1, v2
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a0
+; GFX908-NEXT: v_accvgpr_write_b32 a2, v3
+; GFX908-NEXT: v_accvgpr_write_b32 a3, v33
+; GFX908-NEXT: v_accvgpr_write_b32 a4, v2
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a0
+; GFX908-NEXT: v_accvgpr_read_b32 v33, a0
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a0
+; GFX908-NEXT: v_accvgpr_write_b32 a5, v3
+; GFX908-NEXT: v_accvgpr_write_b32 a6, v33
+; GFX908-NEXT: v_accvgpr_write_b32 a7, v2
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a0
+; GFX908-NEXT: v_accvgpr_read_b32 v33, a0
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a0
+; GFX908-NEXT: v_accvgpr_write_b32 a8, v3
+; GFX908-NEXT: v_accvgpr_write_b32 a9, v33
+; GFX908-NEXT: v_accvgpr_write_b32 a10, v2
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a0
+; GFX908-NEXT: v_accvgpr_read_b32 v33, a0
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a0
+; GFX908-NEXT: v_accvgpr_write_b32 a11, v3
+; GFX908-NEXT: v_accvgpr_write_b32 a12, v33
+; GFX908-NEXT: v_accvgpr_write_b32 a13, v2
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a0
+; GFX908-NEXT: v_accvgpr_read_b32 v33, a0
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a0
+; GFX908-NEXT: v_accvgpr_write_b32 a14, v3
+; GFX908-NEXT: v_accvgpr_write_b32 a15, v33
+; GFX908-NEXT: v_accvgpr_write_b32 a16, v2
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a0
+; GFX908-NEXT: v_accvgpr_read_b32 v33, a0
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a0
+; GFX908-NEXT: v_accvgpr_write_b32 a17, v3
+; GFX908-NEXT: v_accvgpr_write_b32 a18, v33
+; GFX908-NEXT: v_accvgpr_write_b32 a19, v2
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a0
+; GFX908-NEXT: v_accvgpr_read_b32 v33, a0
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a0
+; GFX908-NEXT: v_accvgpr_write_b32 a20, v3
+; GFX908-NEXT: v_accvgpr_write_b32 a21, v33
+; GFX908-NEXT: v_accvgpr_write_b32 a22, v2
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a0
+; GFX908-NEXT: v_accvgpr_read_b32 v33, a0
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a0
+; GFX908-NEXT: v_accvgpr_write_b32 a23, v3
+; GFX908-NEXT: v_accvgpr_write_b32 a24, v33
+; GFX908-NEXT: v_accvgpr_write_b32 a25, v2
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a0
+; GFX908-NEXT: v_accvgpr_read_b32 v33, a0
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a0
+; GFX908-NEXT: v_accvgpr_write_b32 a26, v3
+; GFX908-NEXT: v_accvgpr_write_b32 a27, v33
+; GFX908-NEXT: v_accvgpr_write_b32 a28, v2
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a0
+; GFX908-NEXT: v_accvgpr_read_b32 v33, a0
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a0
+; GFX908-NEXT: v_accvgpr_write_b32 a29, v3
+; GFX908-NEXT: v_accvgpr_write_b32 a30, v33
+; GFX908-NEXT: v_accvgpr_write_b32 a31, v2
+; GFX908-NEXT: .LBB8_1: ; %for.cond.preheader
+; GFX908-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
+; GFX908-NEXT: s_add_i32 s0, s0, -1
+; GFX908-NEXT: s_cmp_lg_u32 s0, 0
+; GFX908-NEXT: s_cbranch_scc1 .LBB8_1
+; GFX908-NEXT: ; %bb.2: ; %exit
+; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX908-NEXT: s_nop 7
+; GFX908-NEXT: s_nop 5
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a0
+; GFX908-NEXT: v_accvgpr_read_b32 v28, a28
+; GFX908-NEXT: v_accvgpr_read_b32 v29, a29
+; GFX908-NEXT: v_accvgpr_read_b32 v30, a30
+; GFX908-NEXT: v_accvgpr_read_b32 v31, a31
+; GFX908-NEXT: v_mov_b32_e32 v32, 0
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a1
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a2
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a3
+; GFX908-NEXT: v_accvgpr_read_b32 v4, a4
+; GFX908-NEXT: v_accvgpr_read_b32 v5, a5
+; GFX908-NEXT: v_accvgpr_read_b32 v6, a6
+; GFX908-NEXT: v_accvgpr_read_b32 v7, a7
+; GFX908-NEXT: v_accvgpr_read_b32 v8, a8
+; GFX908-NEXT: v_accvgpr_read_b32 v9, a9
+; GFX908-NEXT: v_accvgpr_read_b32 v10, a10
+; GFX908-NEXT: v_accvgpr_read_b32 v11, a11
+; GFX908-NEXT: v_accvgpr_read_b32 v12, a12
+; GFX908-NEXT: v_accvgpr_read_b32 v13, a13
+; GFX908-NEXT: v_accvgpr_read_b32 v14, a14
+; GFX908-NEXT: v_accvgpr_read_b32 v15, a15
+; GFX908-NEXT: v_accvgpr_read_b32 v16, a16
+; GFX908-NEXT: v_accvgpr_read_b32 v17, a17
+; GFX908-NEXT: v_accvgpr_read_b32 v18, a18
+; GFX908-NEXT: v_accvgpr_read_b32 v19, a19
+; GFX908-NEXT: v_accvgpr_read_b32 v20, a20
+; GFX908-NEXT: v_accvgpr_read_b32 v21, a21
+; GFX908-NEXT: v_accvgpr_read_b32 v22, a22
+; GFX908-NEXT: v_accvgpr_read_b32 v23, a23
+; GFX908-NEXT: v_accvgpr_read_b32 v24, a24
+; GFX908-NEXT: v_accvgpr_read_b32 v25, a25
+; GFX908-NEXT: v_accvgpr_read_b32 v26, a26
+; GFX908-NEXT: v_accvgpr_read_b32 v27, a27
+; GFX908-NEXT: s_waitcnt lgkmcnt(0)
+; GFX908-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112
+; GFX908-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96
+; GFX908-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80
+; GFX908-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64
+; GFX908-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48
+; GFX908-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32
+; GFX908-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16
+; GFX908-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1]
+; GFX908-NEXT: s_endpgm
+;
+; GFX90A-LABEL: test_mfma_loop_agpr_init:
+; GFX90A: ; %bb.0: ; %entry
+; GFX90A-NEXT: v_mov_b32_e32 v0, 1.0
+; GFX90A-NEXT: v_mov_b32_e32 v1, 2.0
+; GFX90A-NEXT: s_mov_b32 s0, 16
+; GFX90A-NEXT: s_nop 0
+; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, 0
+; GFX90A-NEXT: s_nop 7
+; GFX90A-NEXT: s_nop 7
+; GFX90A-NEXT: s_nop 2
+; GFX90A-NEXT: v_accvgpr_mov_b32 a1, a0
+; GFX90A-NEXT: v_accvgpr_mov_b32 a2, a0
+; GFX90A-NEXT: v_accvgpr_mov_b32 a3, a0
+; GFX90A-NEXT: v_accvgpr_mov_b32 a4, a0
+; GFX90A-NEXT: v_accvgpr_mov_b32 a5, a0
+; GFX90A-NEXT: v_accvgpr_mov_b32 a6, a0
+; GFX90A-NEXT: v_accvgpr_mov_b32 a7, a0
+; GFX90A-NEXT: v_accvgpr_mov_b32 a8, a0
+; GFX90A-NEXT: v_accvgpr_mov_b32 a9, a0
+; GFX90A-NEXT: v_accvgpr_mov_b32 a10, a0
+; GFX90A-NEXT: v_accvgpr_mov_b32 a11, a0
+; GFX90A-NEXT: v_accvgpr_mov_b32 a12, a0
+; GFX90A-NEXT: v_accvgpr_mov_b32 a13, a0
+; GFX90A-NEXT: v_accvgpr_mov_b32 a14, a0
+; GFX90A-NEXT: v_accvgpr_mov_b32 a15, a0
+; GFX90A-NEXT: v_accvgpr_mov_b32 a16, a0
+; GFX90A-NEXT: v_accvgpr_mov_b32 a17, a0
+; GFX90A-NEXT: v_accvgpr_mov_b32 a18, a0
+; GFX90A-NEXT: v_accvgpr_mov_b32 a19, a0
+; GFX90A-NEXT: v_accvgpr_mov_b32 a20, a0
+; GFX90A-NEXT: v_accvgpr_mov_b32 a21, a0
+; GFX90A-NEXT: v_accvgpr_mov_b32 a22, a0
+; GFX90A-NEXT: v_accvgpr_mov_b32 a23, a0
+; GFX90A-NEXT: v_accvgpr_mov_b32 a24, a0
+; GFX90A-NEXT: v_accvgpr_mov_b32 a25, a0
+; GFX90A-NEXT: v_accvgpr_mov_b32 a26, a0
+; GFX90A-NEXT: v_accvgpr_mov_b32 a27, a0
+; GFX90A-NEXT: v_accvgpr_mov_b32 a28, a0
+; GFX90A-NEXT: v_accvgpr_mov_b32 a29, a0
+; GFX90A-NEXT: v_accvgpr_mov_b32 a30, a0
+; GFX90A-NEXT: v_accvgpr_mov_b32 a31, a0
+; GFX90A-NEXT: .LBB8_1: ; %for.cond.preheader
+; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX90A-NEXT: s_nop 1
+; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v0, v1, a[0:31]
+; GFX90A-NEXT: s_add_i32 s0, s0, -1
+; GFX90A-NEXT: s_cmp_lg_u32 s0, 0
+; GFX90A-NEXT: s_cbranch_scc1 .LBB8_1
+; GFX90A-NEXT: ; %bb.2: ; %exit
+; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: s_nop 7
+; GFX90A-NEXT: s_nop 4
+; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
+; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
+; GFX90A-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
+; GFX90A-NEXT: global_store_dwordx4 v0, a[16:19], s[0:1] offset:64
+; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
+; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
+; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX90A-NEXT: s_endpgm
+;
+; GFX942-LABEL: test_mfma_loop_agpr_init:
+; GFX942: ; %bb.0: ; %entry
+; GFX942-NEXT: v_mov_b32_e32 v0, 1.0
+; GFX942-NEXT: v_mov_b32_e32 v1, 2.0
+; GFX942-NEXT: s_mov_b32 s0, 16
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v0, v1, 0
+; GFX942-NEXT: s_nop 7
+; GFX942-NEXT: s_nop 7
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_accvgpr_mov_b32 a1, a0
+; GFX942-NEXT: v_accvgpr_mov_b32 a2, a0
+; GFX942-NEXT: v_accvgpr_mov_b32 a3, a0
+; GFX942-NEXT: v_accvgpr_mov_b32 a4, a0
+; GFX942-NEXT: v_accvgpr_mov_b32 a5, a0
+; GFX942-NEXT: v_accvgpr_mov_b32 a6, a0
+; GFX942-NEXT: v_accvgpr_mov_b32 a7, a0
+; GFX942-NEXT: v_accvgpr_mov_b32 a8, a0
+; GFX942-NEXT: v_accvgpr_mov_b32 a9, a0
+; GFX942-NEXT: v_accvgpr_mov_b32 a10, a0
+; GFX942-NEXT: v_accvgpr_mov_b32 a11, a0
+; GFX942-NEXT: v_accvgpr_mov_b32 a12, a0
+; GFX942-NEXT: v_accvgpr_mov_b32 a13, a0
+; GFX942-NEXT: v_accvgpr_mov_b32 a14, a0
+; GFX942-NEXT: v_accvgpr_mov_b32 a15, a0
+; GFX942-NEXT: v_accvgpr_mov_b32 a16, a0
+; GFX942-NEXT: v_accvgpr_mov_b32 a17, a0
+; GFX942-NEXT: v_accvgpr_mov_b32 a18, a0
+; GFX942-NEXT: v_accvgpr_mov_b32 a19, a0
+; GFX942-NEXT: v_accvgpr_mov_b32 a20, a0
+; GFX942-NEXT: v_accvgpr_mov_b32 a21, a0
+; GFX942-NEXT: v_accvgpr_mov_b32 a22, a0
+; GFX942-NEXT: v_accvgpr_mov_b32 a23, a0
+; GFX942-NEXT: v_accvgpr_mov_b32 a24, a0
+; GFX942-NEXT: v_accvgpr_mov_b32 a25, a0
+; GFX942-NEXT: v_accvgpr_mov_b32 a26, a0
+; GFX942-NEXT: v_accvgpr_mov_b32 a27, a0
+; GFX942-NEXT: v_accvgpr_mov_b32 a28, a0
+; GFX942-NEXT: v_accvgpr_mov_b32 a29, a0
+; GFX942-NEXT: v_accvgpr_mov_b32 a30, a0
+; GFX942-NEXT: v_accvgpr_mov_b32 a31, a0
+; GFX942-NEXT: .LBB8_1: ; %for.cond.preheader
+; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX942-NEXT: s_nop 1
+; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v0, v1, a[0:31]
+; GFX942-NEXT: s_add_i32 s0, s0, -1
+; GFX942-NEXT: s_cmp_lg_u32 s0, 0
+; GFX942-NEXT: s_cbranch_scc1 .LBB8_1
+; GFX942-NEXT: ; %bb.2: ; %exit
+; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: s_nop 7
+; GFX942-NEXT: s_nop 3
+; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
+; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
+; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
+; GFX942-NEXT: global_store_dwordx4 v0, a[16:19], s[0:1] offset:64
+; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
+; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
+; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX942-NEXT: s_endpgm
entry:
%mai.0 = tail call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float 1.0, float 2.0, <32 x float> zeroinitializer, i32 0, i32 0, i32 0)
%init = extractelement <32 x float> %mai.0, i32 0
@@ -575,33 +2344,251 @@ exit:
ret void
}
-; GCN-LABEL: {{^}}test_mfma_nested_loop_zeroinit:
-
-; GFX908-COUNT-32: v_accvgpr_write_b32 a{{[0-9]+}}, 0{{$}}
-; GFX90A: v_accvgpr_write_b32 [[LEAD:a[0-9]+]], 0
-; GFX90A-COUNT-31: v_accvgpr_mov_b32 a{{[0-9]+}}, [[LEAD]]
-
; Check that we do not copy agprs to vgprs and back in an outer loop.
-
-; GCN: [[OUTER_LOOP:.LBB[0-9_]+]]:
-; GCN-NOT: v_accvgpr
-; GCN: [[INNER_LOOP:.LBB[0-9_]+]]:
-; GCN-NOT: v_accvgpr
-; GFX908_A: v_mfma_f32_32x32x1f32
-; GFX942: v_mfma_f32_32x32x1_2b_f32
-; GCN-NOT: v_accvgpr
-; GCN: s_cbranch_scc1 [[INNER_LOOP]]
-; GCN-NOT: v_accvgpr
-; GCN: s_cbranch_scc1 [[OUTER_LOOP]]
-
; Final result should be read only once after the loop.
-; GFX908-COUNT-32: v_accvgpr_read_b32
-; GFX90A-NOT: v_accvgpr_read_b32
-; GFX908-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, v[{{[0-9:]+}}]
-; GFX90A-COUNT-8: global_store_dwordx4 v{{[0-9]+}}, a[{{[0-9:]+}}]
-
define amdgpu_kernel void @test_mfma_nested_loop_zeroinit(ptr addrspace(1) %arg) #0 {
+; GFX908-LABEL: test_mfma_nested_loop_zeroinit:
+; GFX908: ; %bb.0: ; %entry
+; GFX908-NEXT: v_accvgpr_write_b32 a0, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a1, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a2, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a3, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a4, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a5, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a6, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a7, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a8, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a9, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a10, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a11, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a12, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a13, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a14, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a15, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a16, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a17, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a18, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a19, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a20, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a21, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a22, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a23, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a24, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a25, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a26, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a27, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a28, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a29, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a30, 0
+; GFX908-NEXT: v_accvgpr_write_b32 a31, 0
+; GFX908-NEXT: s_mov_b32 s0, 0
+; GFX908-NEXT: v_mov_b32_e32 v0, 2.0
+; GFX908-NEXT: v_mov_b32_e32 v1, 1.0
+; GFX908-NEXT: .LBB9_1: ; %for.cond.preheader
+; GFX908-NEXT: ; =>This Loop Header: Depth=1
+; GFX908-NEXT: ; Child Loop BB9_2 Depth 2
+; GFX908-NEXT: s_mov_b32 s1, 16
+; GFX908-NEXT: .LBB9_2: ; %inner.for.cond.preheader
+; GFX908-NEXT: ; Parent Loop BB9_1 Depth=1
+; GFX908-NEXT: ; => This Inner Loop Header: Depth=2
+; GFX908-NEXT: s_nop 0
+; GFX908-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31]
+; GFX908-NEXT: s_add_i32 s1, s1, -1
+; GFX908-NEXT: s_cmp_lg_u32 s1, 0
+; GFX908-NEXT: s_cbranch_scc1 .LBB9_2
+; GFX908-NEXT: ; %bb.3: ; %inner.exit
+; GFX908-NEXT: ; in Loop: Header=BB9_1 Depth=1
+; GFX908-NEXT: s_add_i32 s0, s0, 1
+; GFX908-NEXT: s_cmp_lg_u32 s0, 16
+; GFX908-NEXT: s_cbranch_scc1 .LBB9_1
+; GFX908-NEXT: ; %bb.4: ; %exit
+; GFX908-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX908-NEXT: s_nop 7
+; GFX908-NEXT: s_nop 2
+; GFX908-NEXT: v_accvgpr_read_b32 v0, a0
+; GFX908-NEXT: v_accvgpr_read_b32 v28, a28
+; GFX908-NEXT: v_accvgpr_read_b32 v29, a29
+; GFX908-NEXT: v_accvgpr_read_b32 v30, a30
+; GFX908-NEXT: v_accvgpr_read_b32 v31, a31
+; GFX908-NEXT: v_mov_b32_e32 v32, 0
+; GFX908-NEXT: v_accvgpr_read_b32 v1, a1
+; GFX908-NEXT: v_accvgpr_read_b32 v2, a2
+; GFX908-NEXT: v_accvgpr_read_b32 v3, a3
+; GFX908-NEXT: v_accvgpr_read_b32 v4, a4
+; GFX908-NEXT: v_accvgpr_read_b32 v5, a5
+; GFX908-NEXT: v_accvgpr_read_b32 v6, a6
+; GFX908-NEXT: v_accvgpr_read_b32 v7, a7
+; GFX908-NEXT: v_accvgpr_read_b32 v8, a8
+; GFX908-NEXT: v_accvgpr_read_b32 v9, a9
+; GFX908-NEXT: v_accvgpr_read_b32 v10, a10
+; GFX908-NEXT: v_accvgpr_read_b32 v11, a11
+; GFX908-NEXT: v_accvgpr_read_b32 v12, a12
+; GFX908-NEXT: v_accvgpr_read_b32 v13, a13
+; GFX908-NEXT: v_accvgpr_read_b32 v14, a14
+; GFX908-NEXT: v_accvgpr_read_b32 v15, a15
+; GFX908-NEXT: v_accvgpr_read_b32 v16, a16
+; GFX908-NEXT: v_accvgpr_read_b32 v17, a17
+; GFX908-NEXT: v_accvgpr_read_b32 v18, a18
+; GFX908-NEXT: v_accvgpr_read_b32 v19, a19
+; GFX908-NEXT: v_accvgpr_read_b32 v20, a20
+; GFX908-NEXT: v_accvgpr_read_b32 v21, a21
+; GFX908-NEXT: v_accvgpr_read_b32 v22, a22
+; GFX908-NEXT: v_accvgpr_read_b32 v23, a23
+; GFX908-NEXT: v_accvgpr_read_b32 v24, a24
+; GFX908-NEXT: v_accvgpr_read_b32 v25, a25
+; GFX908-NEXT: v_accvgpr_read_b32 v26, a26
+; GFX908-NEXT: v_accvgpr_read_b32 v27, a27
+; GFX908-NEXT: s_waitcnt lgkmcnt(0)
+; GFX908-NEXT: global_store_dwordx4 v32, v[28:31], s[0:1] offset:112
+; GFX908-NEXT: global_store_dwordx4 v32, v[24:27], s[0:1] offset:96
+; GFX908-NEXT: global_store_dwordx4 v32, v[20:23], s[0:1] offset:80
+; GFX908-NEXT: global_store_dwordx4 v32, v[16:19], s[0:1] offset:64
+; GFX908-NEXT: global_store_dwordx4 v32, v[12:15], s[0:1] offset:48
+; GFX908-NEXT: global_store_dwordx4 v32, v[8:11], s[0:1] offset:32
+; GFX908-NEXT: global_store_dwordx4 v32, v[4:7], s[0:1] offset:16
+; GFX908-NEXT: global_store_dwordx4 v32, v[0:3], s[0:1]
+; GFX908-NEXT: s_endpgm
+;
+; GFX90A-LABEL: test_mfma_nested_loop_zeroinit:
+; GFX90A: ; %bb.0: ; %entry
+; GFX90A-NEXT: v_accvgpr_write_b32 a0, 0
+; GFX90A-NEXT: s_mov_b32 s0, 0
+; GFX90A-NEXT: v_accvgpr_mov_b32 a1, a0
+; GFX90A-NEXT: v_accvgpr_mov_b32 a2, a0
+; GFX90A-NEXT: v_accvgpr_mov_b32 a3, a0
+; GFX90A-NEXT: v_accvgpr_mov_b32 a4, a0
+; GFX90A-NEXT: v_accvgpr_mov_b32 a5, a0
+; GFX90A-NEXT: v_accvgpr_mov_b32 a6, a0
+; GFX90A-NEXT: v_accvgpr_mov_b32 a7, a0
+; GFX90A-NEXT: v_accvgpr_mov_b32 a8, a0
+; GFX90A-NEXT: v_accvgpr_mov_b32 a9, a0
+; GFX90A-NEXT: v_accvgpr_mov_b32 a10, a0
+; GFX90A-NEXT: v_accvgpr_mov_b32 a11, a0
+; GFX90A-NEXT: v_accvgpr_mov_b32 a12, a0
+; GFX90A-NEXT: v_accvgpr_mov_b32 a13, a0
+; GFX90A-NEXT: v_accvgpr_mov_b32 a14, a0
+; GFX90A-NEXT: v_accvgpr_mov_b32 a15, a0
+; GFX90A-NEXT: v_accvgpr_mov_b32 a16, a0
+; GFX90A-NEXT: v_accvgpr_mov_b32 a17, a0
+; GFX90A-NEXT: v_accvgpr_mov_b32 a18, a0
+; GFX90A-NEXT: v_accvgpr_mov_b32 a19, a0
+; GFX90A-NEXT: v_accvgpr_mov_b32 a20, a0
+; GFX90A-NEXT: v_accvgpr_mov_b32 a21, a0
+; GFX90A-NEXT: v_accvgpr_mov_b32 a22, a0
+; GFX90A-NEXT: v_accvgpr_mov_b32 a23, a0
+; GFX90A-NEXT: v_accvgpr_mov_b32 a24, a0
+; GFX90A-NEXT: v_accvgpr_mov_b32 a25, a0
+; GFX90A-NEXT: v_accvgpr_mov_b32 a26, a0
+; GFX90A-NEXT: v_accvgpr_mov_b32 a27, a0
+; GFX90A-NEXT: v_accvgpr_mov_b32 a28, a0
+; GFX90A-NEXT: v_accvgpr_mov_b32 a29, a0
+; GFX90A-NEXT: v_accvgpr_mov_b32 a30, a0
+; GFX90A-NEXT: v_accvgpr_mov_b32 a31, a0
+; GFX90A-NEXT: v_mov_b32_e32 v0, 2.0
+; GFX90A-NEXT: v_mov_b32_e32 v1, 1.0
+; GFX90A-NEXT: .LBB9_1: ; %for.cond.preheader
+; GFX90A-NEXT: ; =>This Loop Header: Depth=1
+; GFX90A-NEXT: ; Child Loop BB9_2 Depth 2
+; GFX90A-NEXT: s_mov_b32 s1, 16
+; GFX90A-NEXT: .LBB9_2: ; %inner.for.cond.preheader
+; GFX90A-NEXT: ; Parent Loop BB9_1 Depth=1
+; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2
+; GFX90A-NEXT: s_nop 0
+; GFX90A-NEXT: v_mfma_f32_32x32x1f32 a[0:31], v1, v0, a[0:31]
+; GFX90A-NEXT: s_add_i32 s1, s1, -1
+; GFX90A-NEXT: s_cmp_lg_u32 s1, 0
+; GFX90A-NEXT: s_cbranch_scc1 .LBB9_2
+; GFX90A-NEXT: ; %bb.3: ; %inner.exit
+; GFX90A-NEXT: ; in Loop: Header=BB9_1 Depth=1
+; GFX90A-NEXT: s_add_i32 s0, s0, 1
+; GFX90A-NEXT: s_cmp_lg_u32 s0, 16
+; GFX90A-NEXT: s_cbranch_scc1 .LBB9_1
+; GFX90A-NEXT: ; %bb.4: ; %exit
+; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX90A-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: s_nop 7
+; GFX90A-NEXT: s_nop 1
+; GFX90A-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
+; GFX90A-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
+; GFX90A-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
+; GFX90A-NEXT: global_store_dwordx4 v0, a[16:19], s[0:1] offset:64
+; GFX90A-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
+; GFX90A-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
+; GFX90A-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
+; GFX90A-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX90A-NEXT: s_endpgm
+;
+; GFX942-LABEL: test_mfma_nested_loop_zeroinit:
+; GFX942: ; %bb.0: ; %entry
+; GFX942-NEXT: v_accvgpr_write_b32 a0, 0
+; GFX942-NEXT: s_mov_b32 s0, 0
+; GFX942-NEXT: v_accvgpr_mov_b32 a1, a0
+; GFX942-NEXT: v_accvgpr_mov_b32 a2, a0
+; GFX942-NEXT: v_accvgpr_mov_b32 a3, a0
+; GFX942-NEXT: v_accvgpr_mov_b32 a4, a0
+; GFX942-NEXT: v_accvgpr_mov_b32 a5, a0
+; GFX942-NEXT: v_accvgpr_mov_b32 a6, a0
+; GFX942-NEXT: v_accvgpr_mov_b32 a7, a0
+; GFX942-NEXT: v_accvgpr_mov_b32 a8, a0
+; GFX942-NEXT: v_accvgpr_mov_b32 a9, a0
+; GFX942-NEXT: v_accvgpr_mov_b32 a10, a0
+; GFX942-NEXT: v_accvgpr_mov_b32 a11, a0
+; GFX942-NEXT: v_accvgpr_mov_b32 a12, a0
+; GFX942-NEXT: v_accvgpr_mov_b32 a13, a0
+; GFX942-NEXT: v_accvgpr_mov_b32 a14, a0
+; GFX942-NEXT: v_accvgpr_mov_b32 a15, a0
+; GFX942-NEXT: v_accvgpr_mov_b32 a16, a0
+; GFX942-NEXT: v_accvgpr_mov_b32 a17, a0
+; GFX942-NEXT: v_accvgpr_mov_b32 a18, a0
+; GFX942-NEXT: v_accvgpr_mov_b32 a19, a0
+; GFX942-NEXT: v_accvgpr_mov_b32 a20, a0
+; GFX942-NEXT: v_accvgpr_mov_b32 a21, a0
+; GFX942-NEXT: v_accvgpr_mov_b32 a22, a0
+; GFX942-NEXT: v_accvgpr_mov_b32 a23, a0
+; GFX942-NEXT: v_accvgpr_mov_b32 a24, a0
+; GFX942-NEXT: v_accvgpr_mov_b32 a25, a0
+; GFX942-NEXT: v_accvgpr_mov_b32 a26, a0
+; GFX942-NEXT: v_accvgpr_mov_b32 a27, a0
+; GFX942-NEXT: v_accvgpr_mov_b32 a28, a0
+; GFX942-NEXT: v_accvgpr_mov_b32 a29, a0
+; GFX942-NEXT: v_accvgpr_mov_b32 a30, a0
+; GFX942-NEXT: v_accvgpr_mov_b32 a31, a0
+; GFX942-NEXT: v_mov_b32_e32 v0, 2.0
+; GFX942-NEXT: v_mov_b32_e32 v1, 1.0
+; GFX942-NEXT: .LBB9_1: ; %for.cond.preheader
+; GFX942-NEXT: ; =>This Loop Header: Depth=1
+; GFX942-NEXT: ; Child Loop BB9_2 Depth 2
+; GFX942-NEXT: s_mov_b32 s1, 16
+; GFX942-NEXT: .LBB9_2: ; %inner.for.cond.preheader
+; GFX942-NEXT: ; Parent Loop BB9_1 Depth=1
+; GFX942-NEXT: ; => This Inner Loop Header: Depth=2
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: v_mfma_f32_32x32x1_2b_f32 a[0:31], v1, v0, a[0:31]
+; GFX942-NEXT: s_add_i32 s1, s1, -1
+; GFX942-NEXT: s_cmp_lg_u32 s1, 0
+; GFX942-NEXT: s_cbranch_scc1 .LBB9_2
+; GFX942-NEXT: ; %bb.3: ; %inner.exit
+; GFX942-NEXT: ; in Loop: Header=BB9_1 Depth=1
+; GFX942-NEXT: s_add_i32 s0, s0, 1
+; GFX942-NEXT: s_cmp_lg_u32 s0, 16
+; GFX942-NEXT: s_cbranch_scc1 .LBB9_1
+; GFX942-NEXT: ; %bb.4: ; %exit
+; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: s_nop 7
+; GFX942-NEXT: s_nop 0
+; GFX942-NEXT: global_store_dwordx4 v0, a[28:31], s[0:1] offset:112
+; GFX942-NEXT: global_store_dwordx4 v0, a[24:27], s[0:1] offset:96
+; GFX942-NEXT: global_store_dwordx4 v0, a[20:23], s[0:1] offset:80
+; GFX942-NEXT: global_store_dwordx4 v0, a[16:19], s[0:1] offset:64
+; GFX942-NEXT: global_store_dwordx4 v0, a[12:15], s[0:1] offset:48
+; GFX942-NEXT: global_store_dwordx4 v0, a[8:11], s[0:1] offset:32
+; GFX942-NEXT: global_store_dwordx4 v0, a[4:7], s[0:1] offset:16
+; GFX942-NEXT: global_store_dwordx4 v0, a[0:3], s[0:1]
+; GFX942-NEXT: s_endpgm
entry:
br label %for.cond.preheader
More information about the llvm-commits
mailing list