[llvm-branch-commits] [llvm] [AMDGPU] Fix async operations in GlobalISel on gfx12-plus (PR #190776)
Sameer Sahasrabuddhe via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Tue Apr 7 04:22:17 PDT 2026
https://github.com/ssahasra created https://github.com/llvm/llvm-project/pull/190776
For GFX1250 async LDS intrinsics, map the LDS pointer operand to VGPR instead of SGPR. These instructions use $vdst/$vdata (VGPROp_32) for the LDS address, unlike the pre-GFX12 variants which use M0 (SGPR).
Assisted-By: Claude Opus 4.6
From 46633e61ddb4257922577b09e0af5f3a4ad16fa3 Mon Sep 17 00:00:00 2001
From: Sameer Sahasrabuddhe <sameer.sahasrabuddhe at amd.com>
Date: Tue, 7 Apr 2026 16:13:12 +0530
Subject: [PATCH] [AMDGPU] Fix async operations in GlobalISel on gfx12-plus
For GFX1250 async LDS intrinsics, map the LDS pointer operand to VGPR
instead of SGPR. These instructions use $vdst/$vdata (VGPROp_32)
for the LDS address, unlike the pre-GFX12 variants which use M0 (SGPR).
Assisted-By: Claude Opus 4.6
---
.../Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 15 +-
.../CodeGen/AMDGPU/asyncmark-gfx12plus.ll | 539 ++++++++++-----
.../CodeGen/AMDGPU/asyncmark-max-pregfx12.ll | 9 +-
.../test/CodeGen/AMDGPU/asyncmark-pregfx12.ll | 630 ++++++++++++------
4 files changed, 806 insertions(+), 387 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index f14cc8e0446dc..75c4ff2ef28df 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -5556,7 +5556,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case Intrinsic::amdgcn_cluster_load_async_to_lds_b64:
case Intrinsic::amdgcn_cluster_load_async_to_lds_b128: {
OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
- OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
+ // LDS address goes into $vdst (VGPR).
+ OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
unsigned M0Bank =
getRegBankID(MI.getOperand(5).getReg(), MRI, AMDGPU::SGPRRegBankID);
OpdsMapping[5] = AMDGPU::getValueMapping(M0Bank, 32);
@@ -5569,10 +5570,18 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case Intrinsic::amdgcn_global_load_async_to_lds_b8:
case Intrinsic::amdgcn_global_load_async_to_lds_b32:
case Intrinsic::amdgcn_global_load_async_to_lds_b64:
- case Intrinsic::amdgcn_global_load_async_to_lds_b128:
+ case Intrinsic::amdgcn_global_load_async_to_lds_b128: {
+ OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
+ // LDS address goes into $vdst/$vdata (VGPR).
+ OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
+ break;
+ }
case Intrinsic::amdgcn_load_to_lds:
- case Intrinsic::amdgcn_global_load_lds: {
+ case Intrinsic::amdgcn_load_async_to_lds:
+ case Intrinsic::amdgcn_global_load_lds:
+ case Intrinsic::amdgcn_global_load_async_lds: {
OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
+ // LDS address goes into M0 (SGPR).
OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
break;
}
diff --git a/llvm/test/CodeGen/AMDGPU/asyncmark-gfx12plus.ll b/llvm/test/CodeGen/AMDGPU/asyncmark-gfx12plus.ll
index cfb296fb2d529..4429468a3e0aa 100644
--- a/llvm/test/CodeGen/AMDGPU/asyncmark-gfx12plus.ll
+++ b/llvm/test/CodeGen/AMDGPU/asyncmark-gfx12plus.ll
@@ -1,48 +1,90 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck %s -check-prefixes=GFX1250
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck %s -check-prefixes=SDAG
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck %s -check-prefixes=GISEL
; Test async mark/wait with global_load_lds and global loads
; This version uses wave barriers to enforce program order so that unrelated vmem
; instructions do not get reordered before reaching this point.
define void @interleaved_with_wave_barrier(ptr addrspace(1) %foo, ptr addrspace(3) %lds, ptr addrspace(1) %bar, ptr addrspace(1) %out) {
-; GFX1250-LABEL: interleaved_with_wave_barrier:
-; GFX1250: ; %bb.0: ; %entry
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_dual_mov_b32 v7, v6 :: v_dual_mov_b32 v9, v4
-; GFX1250-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v6, v5
-; GFX1250-NEXT: v_add_nc_u64_e32 v[4:5], 0x54, v[0:1]
-; GFX1250-NEXT: v_add_nc_u32_e32 v3, 0x54, v2
-; GFX1250-NEXT: global_load_b32 v10, v[8:9], off offset:44
-; GFX1250-NEXT: global_load_b32 v11, v[0:1], off offset:4
-; GFX1250-NEXT: ; wave barrier
-; GFX1250-NEXT: global_load_async_to_lds_b32 v3, v[4:5], off offset:4 th:TH_LOAD_NT nv
-; GFX1250-NEXT: v_add_nc_u64_e32 v[4:5], 0x58, v[8:9]
-; GFX1250-NEXT: v_add_nc_u32_e32 v3, 0x58, v2
-; GFX1250-NEXT: ; wave barrier
-; GFX1250-NEXT: ; asyncmark
-; GFX1250-NEXT: global_load_b32 v0, v[0:1], off offset:8
-; GFX1250-NEXT: ; wave barrier
-; GFX1250-NEXT: global_load_async_to_lds_b32 v3, v[4:5], off offset:4 th:TH_LOAD_LU nv
-; GFX1250-NEXT: ; wave barrier
-; GFX1250-NEXT: global_load_b32 v1, v[8:9], off offset:48
-; GFX1250-NEXT: ; asyncmark
-; GFX1250-NEXT: ; wait_asyncmark(1)
-; GFX1250-NEXT: s_wait_asynccnt 0x1
-; GFX1250-NEXT: ds_load_b32 v3, v2 offset:84
-; GFX1250-NEXT: ; wait_asyncmark(0)
-; GFX1250-NEXT: s_wait_asynccnt 0x0
-; GFX1250-NEXT: ds_load_b32 v2, v2 offset:88
-; GFX1250-NEXT: s_wait_loadcnt 0x2
-; GFX1250-NEXT: v_add_nc_u32_e32 v4, v11, v10
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x101
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX1250-NEXT: v_add3_u32 v0, v4, v3, v0
-; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
-; GFX1250-NEXT: v_add3_u32 v0, v0, v1, v2
-; GFX1250-NEXT: global_store_b32 v[6:7], v0, off
-; GFX1250-NEXT: s_set_pc_i64 s[30:31]
+; SDAG-LABEL: interleaved_with_wave_barrier:
+; SDAG: ; %bb.0: ; %entry
+; SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; SDAG-NEXT: s_wait_kmcnt 0x0
+; SDAG-NEXT: v_dual_mov_b32 v7, v6 :: v_dual_mov_b32 v9, v4
+; SDAG-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v6, v5
+; SDAG-NEXT: v_add_nc_u64_e32 v[4:5], 0x54, v[0:1]
+; SDAG-NEXT: v_add_nc_u32_e32 v3, 0x54, v2
+; SDAG-NEXT: global_load_b32 v10, v[8:9], off offset:44
+; SDAG-NEXT: global_load_b32 v11, v[0:1], off offset:4
+; SDAG-NEXT: ; wave barrier
+; SDAG-NEXT: global_load_async_to_lds_b32 v3, v[4:5], off offset:4 th:TH_LOAD_NT nv
+; SDAG-NEXT: v_add_nc_u64_e32 v[4:5], 0x58, v[8:9]
+; SDAG-NEXT: v_add_nc_u32_e32 v3, 0x58, v2
+; SDAG-NEXT: ; wave barrier
+; SDAG-NEXT: ; asyncmark
+; SDAG-NEXT: global_load_b32 v0, v[0:1], off offset:8
+; SDAG-NEXT: ; wave barrier
+; SDAG-NEXT: global_load_async_to_lds_b32 v3, v[4:5], off offset:4 th:TH_LOAD_LU nv
+; SDAG-NEXT: ; wave barrier
+; SDAG-NEXT: global_load_b32 v1, v[8:9], off offset:48
+; SDAG-NEXT: ; asyncmark
+; SDAG-NEXT: ; wait_asyncmark(1)
+; SDAG-NEXT: s_wait_asynccnt 0x1
+; SDAG-NEXT: ds_load_b32 v3, v2 offset:84
+; SDAG-NEXT: ; wait_asyncmark(0)
+; SDAG-NEXT: s_wait_asynccnt 0x0
+; SDAG-NEXT: ds_load_b32 v2, v2 offset:88
+; SDAG-NEXT: s_wait_loadcnt 0x2
+; SDAG-NEXT: v_add_nc_u32_e32 v4, v11, v10
+; SDAG-NEXT: s_wait_loadcnt_dscnt 0x101
+; SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; SDAG-NEXT: v_add3_u32 v0, v4, v3, v0
+; SDAG-NEXT: s_wait_loadcnt_dscnt 0x0
+; SDAG-NEXT: v_add3_u32 v0, v0, v1, v2
+; SDAG-NEXT: global_store_b32 v[6:7], v0, off
+; SDAG-NEXT: s_set_pc_i64 s[30:31]
+;
+; GISEL-LABEL: interleaved_with_wave_barrier:
+; GISEL: ; %bb.0: ; %entry
+; GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GISEL-NEXT: s_wait_kmcnt 0x0
+; GISEL-NEXT: v_dual_mov_b32 v8, v3 :: v_dual_mov_b32 v9, v4
+; GISEL-NEXT: v_dual_mov_b32 v4, v5 :: v_dual_mov_b32 v5, v6
+; GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0x54, v0
+; GISEL-NEXT: v_add_nc_u32_e32 v3, 0x54, v2
+; GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v1, vcc_lo
+; GISEL-NEXT: global_load_b32 v10, v[8:9], off offset:44
+; GISEL-NEXT: global_load_b32 v11, v[0:1], off offset:4
+; GISEL-NEXT: ; wave barrier
+; GISEL-NEXT: global_load_async_to_lds_b32 v3, v[6:7], off offset:4 th:TH_LOAD_NT nv
+; GISEL-NEXT: v_add_co_u32 v6, vcc_lo, 0x58, v8
+; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GISEL-NEXT: v_add_co_ci_u32_e64 v7, null, 0, v9, vcc_lo
+; GISEL-NEXT: v_add_nc_u32_e32 v3, 0x58, v2
+; GISEL-NEXT: ; wave barrier
+; GISEL-NEXT: ; asyncmark
+; GISEL-NEXT: global_load_b32 v0, v[0:1], off offset:8
+; GISEL-NEXT: ; wave barrier
+; GISEL-NEXT: global_load_async_to_lds_b32 v3, v[6:7], off offset:4 th:TH_LOAD_LU nv
+; GISEL-NEXT: ; wave barrier
+; GISEL-NEXT: global_load_b32 v1, v[8:9], off offset:48
+; GISEL-NEXT: ; asyncmark
+; GISEL-NEXT: ; wait_asyncmark(1)
+; GISEL-NEXT: s_wait_asynccnt 0x1
+; GISEL-NEXT: ds_load_b32 v3, v2 offset:84
+; GISEL-NEXT: ; wait_asyncmark(0)
+; GISEL-NEXT: s_wait_asynccnt 0x0
+; GISEL-NEXT: ds_load_b32 v2, v2 offset:88
+; GISEL-NEXT: s_wait_loadcnt 0x2
+; GISEL-NEXT: v_add_nc_u32_e32 v6, v11, v10
+; GISEL-NEXT: s_wait_loadcnt_dscnt 0x101
+; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GISEL-NEXT: v_add3_u32 v0, v6, v3, v0
+; GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GISEL-NEXT: v_add3_u32 v0, v0, v1, v2
+; GISEL-NEXT: global_store_b32 v[4:5], v0, off
+; GISEL-NEXT: s_set_pc_i64 s[30:31]
entry:
; First batch: global load, global load, async global-to-LDS
%bar_gep11 = getelementptr i32, ptr addrspace(1) %bar, i32 11
@@ -92,58 +134,116 @@ entry:
; those outstanding operations.
define amdgpu_kernel void @test_pipelined_loop(ptr addrspace(1) %foo, ptr addrspace(3) %lds, ptr addrspace(1) %bar, ptr addrspace(1) %out, i32 %n) {
-; GFX1250-LABEL: test_pipelined_loop:
-; GFX1250: ; %bb.0: ; %prolog
-; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
-; GFX1250-NEXT: s_clause 0x1
-; GFX1250-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 nv
-; GFX1250-NEXT: s_load_b32 s3, s[4:5], 0x44 nv
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
-; GFX1250-NEXT: s_add_co_i32 s6, s2, 4
-; GFX1250-NEXT: s_mov_b32 s7, s2
-; GFX1250-NEXT: v_mov_b32_e32 v2, s6
-; GFX1250-NEXT: s_mov_b32 s6, 2
-; GFX1250-NEXT: global_load_async_to_lds_b32 v1, v0, s[0:1] offset:4 nv
-; GFX1250-NEXT: v_mov_b32_e32 v1, 4
-; GFX1250-NEXT: ; asyncmark
-; GFX1250-NEXT: global_load_async_to_lds_b32 v2, v1, s[0:1] offset:4 nv
-; GFX1250-NEXT: v_mov_b32_e32 v1, 0
-; GFX1250-NEXT: s_add_nc_u64 s[0:1], s[0:1], 8
-; GFX1250-NEXT: ; asyncmark
-; GFX1250-NEXT: .LBB1_1: ; %loop_body
-; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1250-NEXT: s_add_co_i32 s8, s7, 8
-; GFX1250-NEXT: s_add_co_i32 s6, s6, 1
-; GFX1250-NEXT: v_mov_b32_e32 v2, s8
-; GFX1250-NEXT: global_load_async_to_lds_b32 v2, v0, s[0:1] offset:4 nv
-; GFX1250-NEXT: v_mov_b32_e32 v2, s7
-; GFX1250-NEXT: ; asyncmark
-; GFX1250-NEXT: ; wait_asyncmark(2)
-; GFX1250-NEXT: s_wait_asynccnt 0x2
-; GFX1250-NEXT: s_add_co_i32 s7, s7, 4
-; GFX1250-NEXT: s_cmp_lt_i32 s6, s3
-; GFX1250-NEXT: ds_load_b32 v2, v2
-; GFX1250-NEXT: s_add_nc_u64 s[0:1], s[0:1], 4
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: v_add_nc_u32_e32 v1, v1, v2
-; GFX1250-NEXT: s_cbranch_scc1 .LBB1_1
-; GFX1250-NEXT: ; %bb.2: ; %epilog
-; GFX1250-NEXT: s_lshl2_add_u32 s0, s3, s2
-; GFX1250-NEXT: ; wait_asyncmark(1)
-; GFX1250-NEXT: s_wait_asynccnt 0x1
-; GFX1250-NEXT: s_add_co_i32 s0, s0, -8
-; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1250-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v0, s0
-; GFX1250-NEXT: s_load_b64 s[0:1], s[4:5], 0x34 nv
-; GFX1250-NEXT: ds_load_b32 v0, v0
-; GFX1250-NEXT: ; wait_asyncmark(0)
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: s_wait_asynccnt 0x0
-; GFX1250-NEXT: v_add_nc_u32_e32 v0, v1, v0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: global_store_b32 v2, v0, s[0:1]
-; GFX1250-NEXT: s_endpgm
+; SDAG-LABEL: test_pipelined_loop:
+; SDAG: ; %bb.0: ; %prolog
+; SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
+; SDAG-NEXT: s_clause 0x1
+; SDAG-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 nv
+; SDAG-NEXT: s_load_b32 s3, s[4:5], 0x44 nv
+; SDAG-NEXT: s_wait_kmcnt 0x0
+; SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2
+; SDAG-NEXT: s_add_co_i32 s6, s2, 4
+; SDAG-NEXT: s_mov_b32 s7, s2
+; SDAG-NEXT: v_mov_b32_e32 v2, s6
+; SDAG-NEXT: s_mov_b32 s6, 2
+; SDAG-NEXT: global_load_async_to_lds_b32 v1, v0, s[0:1] offset:4 nv
+; SDAG-NEXT: v_mov_b32_e32 v1, 4
+; SDAG-NEXT: ; asyncmark
+; SDAG-NEXT: global_load_async_to_lds_b32 v2, v1, s[0:1] offset:4 nv
+; SDAG-NEXT: v_mov_b32_e32 v1, 0
+; SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], 8
+; SDAG-NEXT: ; asyncmark
+; SDAG-NEXT: .LBB1_1: ; %loop_body
+; SDAG-NEXT: ; =>This Inner Loop Header: Depth=1
+; SDAG-NEXT: s_add_co_i32 s8, s7, 8
+; SDAG-NEXT: s_add_co_i32 s6, s6, 1
+; SDAG-NEXT: v_mov_b32_e32 v2, s8
+; SDAG-NEXT: global_load_async_to_lds_b32 v2, v0, s[0:1] offset:4 nv
+; SDAG-NEXT: v_mov_b32_e32 v2, s7
+; SDAG-NEXT: ; asyncmark
+; SDAG-NEXT: ; wait_asyncmark(2)
+; SDAG-NEXT: s_wait_asynccnt 0x2
+; SDAG-NEXT: s_add_co_i32 s7, s7, 4
+; SDAG-NEXT: s_cmp_lt_i32 s6, s3
+; SDAG-NEXT: ds_load_b32 v2, v2
+; SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], 4
+; SDAG-NEXT: s_wait_dscnt 0x0
+; SDAG-NEXT: v_add_nc_u32_e32 v1, v1, v2
+; SDAG-NEXT: s_cbranch_scc1 .LBB1_1
+; SDAG-NEXT: ; %bb.2: ; %epilog
+; SDAG-NEXT: s_lshl2_add_u32 s0, s3, s2
+; SDAG-NEXT: ; wait_asyncmark(1)
+; SDAG-NEXT: s_wait_asynccnt 0x1
+; SDAG-NEXT: s_add_co_i32 s0, s0, -8
+; SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v0, s0
+; SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x34 nv
+; SDAG-NEXT: ds_load_b32 v0, v0
+; SDAG-NEXT: ; wait_asyncmark(0)
+; SDAG-NEXT: s_wait_dscnt 0x0
+; SDAG-NEXT: s_wait_asynccnt 0x0
+; SDAG-NEXT: v_add_nc_u32_e32 v0, v1, v0
+; SDAG-NEXT: s_wait_kmcnt 0x0
+; SDAG-NEXT: global_store_b32 v2, v0, s[0:1]
+; SDAG-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_pipelined_loop:
+; GISEL: ; %bb.0: ; %prolog
+; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
+; GISEL-NEXT: s_clause 0x1
+; GISEL-NEXT: s_load_b96 s[0:2], s[4:5], 0x24 nv
+; GISEL-NEXT: s_load_b32 s3, s[4:5], 0x44 nv
+; GISEL-NEXT: s_mov_b32 s7, 2
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v4, s7
+; GISEL-NEXT: s_wait_kmcnt 0x0
+; GISEL-NEXT: v_dual_mov_b32 v3, s2 :: v_dual_mov_b32 v0, s2
+; GISEL-NEXT: s_add_co_u32 s6, s2, 4
+; GISEL-NEXT: global_load_async_to_lds_b32 v0, v1, s[0:1] offset:4 nv
+; GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, 4
+; GISEL-NEXT: ; asyncmark
+; GISEL-NEXT: s_mov_b32 s6, 0
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GISEL-NEXT: v_mov_b32_e32 v2, s6
+; GISEL-NEXT: global_load_async_to_lds_b32 v0, v1, s[0:1] offset:4 nv
+; GISEL-NEXT: s_add_co_u32 s0, s0, 8
+; GISEL-NEXT: s_add_co_ci_u32 s1, s1, 0
+; GISEL-NEXT: ; asyncmark
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[0:1]
+; GISEL-NEXT: .LBB1_1: ; %loop_body
+; GISEL-NEXT: ; =>This Inner Loop Header: Depth=1
+; GISEL-NEXT: v_dual_add_nc_u32 v5, 8, v3 :: v_dual_add_nc_u32 v4, 1, v4
+; GISEL-NEXT: global_load_async_to_lds_b32 v5, v[0:1], off offset:4 nv
+; GISEL-NEXT: ; asyncmark
+; GISEL-NEXT: ; wait_asyncmark(2)
+; GISEL-NEXT: s_wait_asynccnt 0x2
+; GISEL-NEXT: ds_load_b32 v5, v3
+; GISEL-NEXT: v_add_co_u32 v0, s0, v0, 4
+; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, s0
+; GISEL-NEXT: v_add_nc_u32_e32 v3, 4, v3
+; GISEL-NEXT: v_cmp_gt_i32_e32 vcc_lo, s3, v4
+; GISEL-NEXT: s_wait_dscnt 0x0
+; GISEL-NEXT: v_add_nc_u32_e32 v2, v2, v5
+; GISEL-NEXT: s_cbranch_vccnz .LBB1_1
+; GISEL-NEXT: ; %bb.2: ; %epilog
+; GISEL-NEXT: s_lshl_b32 s0, s3, 2
+; GISEL-NEXT: ; wait_asyncmark(1)
+; GISEL-NEXT: s_wait_asynccnt 0x1
+; GISEL-NEXT: s_add_co_u32 s0, s2, s0
+; GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GISEL-NEXT: s_add_co_u32 s0, s0, -8
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x34 nv
+; GISEL-NEXT: ds_load_b32 v0, v0
+; GISEL-NEXT: ; wait_asyncmark(0)
+; GISEL-NEXT: s_wait_dscnt 0x0
+; GISEL-NEXT: s_wait_asynccnt 0x0
+; GISEL-NEXT: v_add_nc_u32_e32 v0, v2, v0
+; GISEL-NEXT: s_wait_kmcnt 0x0
+; GISEL-NEXT: global_store_b32 v1, v0, s[0:1]
+; GISEL-NEXT: s_endpgm
prolog:
; Load first iteration
call void @llvm.amdgcn.global.load.async.to.lds.b32(ptr addrspace(1) %foo, ptr addrspace(3) %lds, i32 4, i32 u0x20)
@@ -201,90 +301,181 @@ epilog:
; Software pipelined loop with async global-to-LDS and global loads
define amdgpu_kernel void @test_pipelined_loop_with_global(ptr addrspace(1) %foo, ptr addrspace(3) %lds, ptr addrspace(1) %bar, ptr addrspace(1) %out, i32 %n) {
-; GFX1250-LABEL: test_pipelined_loop_with_global:
-; GFX1250: ; %bb.0: ; %prolog
-; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1
-; GFX1250-NEXT: s_clause 0x1
-; GFX1250-NEXT: s_load_b96 s[8:10], s[4:5], 0x24 nv
-; GFX1250-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 nv
-; GFX1250-NEXT: v_mov_b32_e32 v0, 0
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: s_load_b32 s6, s[8:9], 0x0
-; GFX1250-NEXT: s_load_b32 s7, s[0:1], 0x0
-; GFX1250-NEXT: v_mov_b32_e32 v1, s10
-; GFX1250-NEXT: s_add_co_i32 s11, s10, 4
-; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1250-NEXT: v_dual_mov_b32 v3, 4 :: v_dual_mov_b32 v4, s11
-; GFX1250-NEXT: s_load_b32 s11, s[4:5], 0x44 nv
-; GFX1250-NEXT: global_load_async_to_lds_b32 v1, v0, s[8:9] offset:4 nv
-; GFX1250-NEXT: ; asyncmark
-; GFX1250-NEXT: s_clause 0x1
-; GFX1250-NEXT: global_load_b32 v1, v0, s[8:9] offset:4
-; GFX1250-NEXT: global_load_b32 v2, v0, s[0:1] offset:4
-; GFX1250-NEXT: s_wait_xcnt 0x0
-; GFX1250-NEXT: s_add_nc_u64 s[0:1], s[0:1], 8
-; GFX1250-NEXT: s_add_nc_u64 s[4:5], s[8:9], 8
-; GFX1250-NEXT: s_wait_kmcnt 0x0
-; GFX1250-NEXT: v_dual_mov_b32 v5, s6 :: v_dual_mov_b32 v6, s7
-; GFX1250-NEXT: s_mov_b64 s[6:7], s[2:3]
-; GFX1250-NEXT: global_load_async_to_lds_b32 v4, v3, s[8:9] offset:4 nv
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v4, v2
-; GFX1250-NEXT: s_mov_b32 s8, 2
-; GFX1250-NEXT: s_mov_b32 s9, s10
-; GFX1250-NEXT: ; asyncmark
-; GFX1250-NEXT: .LBB2_1: ; %loop_body
-; GFX1250-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX1250-NEXT: s_add_co_i32 s12, s9, 8
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: v_dual_mov_b32 v7, v4 :: v_dual_mov_b32 v9, s12
-; GFX1250-NEXT: v_mov_b32_e32 v8, v3
-; GFX1250-NEXT: s_clause 0x1
-; GFX1250-NEXT: global_load_b32 v3, v0, s[4:5]
-; GFX1250-NEXT: global_load_b32 v4, v0, s[0:1]
-; GFX1250-NEXT: v_dual_add_nc_u32 v10, v5, v6 :: v_dual_mov_b32 v6, v2
-; GFX1250-NEXT: global_load_async_to_lds_b32 v9, v0, s[4:5] offset:4 nv
-; GFX1250-NEXT: v_mov_b32_e32 v9, s9
-; GFX1250-NEXT: ; asyncmark
-; GFX1250-NEXT: ; wait_asyncmark(2)
-; GFX1250-NEXT: s_wait_asynccnt 0x2
-; GFX1250-NEXT: s_wait_asynccnt 0x2
-; GFX1250-NEXT: s_add_co_i32 s8, s8, 1
-; GFX1250-NEXT: s_add_co_i32 s9, s9, 4
-; GFX1250-NEXT: ds_load_b32 v9, v9
-; GFX1250-NEXT: v_mov_b32_e32 v5, v1
-; GFX1250-NEXT: s_cmp_lt_i32 s8, s11
-; GFX1250-NEXT: s_wait_xcnt 0x0
-; GFX1250-NEXT: s_add_nc_u64 s[0:1], s[0:1], 4
-; GFX1250-NEXT: s_add_nc_u64 s[4:5], s[4:5], 4
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: v_add_nc_u32_e32 v9, v10, v9
-; GFX1250-NEXT: global_store_b32 v0, v9, s[6:7]
-; GFX1250-NEXT: s_wait_xcnt 0x0
-; GFX1250-NEXT: s_add_nc_u64 s[6:7], s[6:7], 4
-; GFX1250-NEXT: s_cbranch_scc1 .LBB2_1
-; GFX1250-NEXT: ; %bb.2: ; %epilog
-; GFX1250-NEXT: s_add_co_i32 s0, s11, -2
-; GFX1250-NEXT: ; wait_asyncmark(1)
-; GFX1250-NEXT: s_wait_asynccnt 0x1
-; GFX1250-NEXT: s_lshl2_add_u32 s1, s0, s10
-; GFX1250-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; GFX1250-NEXT: v_dual_add_nc_u32 v2, v8, v7 :: v_dual_mov_b32 v0, s1
-; GFX1250-NEXT: ds_load_b32 v1, v0
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: v_dual_mov_b32 v5, s0 :: v_dual_add_nc_u32 v1, v2, v1
-; GFX1250-NEXT: global_store_b32 v5, v1, s[2:3] scale_offset
-; GFX1250-NEXT: ; wait_asyncmark(0)
-; GFX1250-NEXT: s_wait_asynccnt 0x0
-; GFX1250-NEXT: ds_load_b32 v0, v0 offset:4
-; GFX1250-NEXT: s_wait_loadcnt 0x0
-; GFX1250-NEXT: s_wait_xcnt 0x0
-; GFX1250-NEXT: v_add_nc_u32_e32 v1, v3, v4
-; GFX1250-NEXT: s_wait_dscnt 0x0
-; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1250-NEXT: v_add_nc_u32_e32 v0, v1, v0
-; GFX1250-NEXT: global_store_b32 v5, v0, s[2:3] offset:4 scale_offset
-; GFX1250-NEXT: s_endpgm
+; SDAG-LABEL: test_pipelined_loop_with_global:
+; SDAG: ; %bb.0: ; %prolog
+; SDAG-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
+; SDAG-NEXT: s_clause 0x1
+; SDAG-NEXT: s_load_b96 s[8:10], s[4:5], 0x24 nv
+; SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 nv
+; SDAG-NEXT: v_mov_b32_e32 v0, 0
+; SDAG-NEXT: s_wait_kmcnt 0x0
+; SDAG-NEXT: s_load_b32 s6, s[8:9], 0x0
+; SDAG-NEXT: s_load_b32 s7, s[0:1], 0x0
+; SDAG-NEXT: v_mov_b32_e32 v1, s10
+; SDAG-NEXT: s_add_co_i32 s11, s10, 4
+; SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; SDAG-NEXT: v_dual_mov_b32 v3, 4 :: v_dual_mov_b32 v4, s11
+; SDAG-NEXT: s_load_b32 s11, s[4:5], 0x44 nv
+; SDAG-NEXT: global_load_async_to_lds_b32 v1, v0, s[8:9] offset:4 nv
+; SDAG-NEXT: ; asyncmark
+; SDAG-NEXT: s_clause 0x1
+; SDAG-NEXT: global_load_b32 v1, v0, s[8:9] offset:4
+; SDAG-NEXT: global_load_b32 v2, v0, s[0:1] offset:4
+; SDAG-NEXT: s_wait_xcnt 0x0
+; SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], 8
+; SDAG-NEXT: s_add_nc_u64 s[4:5], s[8:9], 8
+; SDAG-NEXT: s_wait_kmcnt 0x0
+; SDAG-NEXT: v_dual_mov_b32 v5, s6 :: v_dual_mov_b32 v6, s7
+; SDAG-NEXT: s_mov_b64 s[6:7], s[2:3]
+; SDAG-NEXT: global_load_async_to_lds_b32 v4, v3, s[8:9] offset:4 nv
+; SDAG-NEXT: s_wait_loadcnt 0x0
+; SDAG-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v4, v2
+; SDAG-NEXT: s_mov_b32 s8, 2
+; SDAG-NEXT: s_mov_b32 s9, s10
+; SDAG-NEXT: ; asyncmark
+; SDAG-NEXT: .LBB2_1: ; %loop_body
+; SDAG-NEXT: ; =>This Inner Loop Header: Depth=1
+; SDAG-NEXT: s_add_co_i32 s12, s9, 8
+; SDAG-NEXT: s_wait_loadcnt 0x0
+; SDAG-NEXT: v_dual_mov_b32 v7, v4 :: v_dual_mov_b32 v9, s12
+; SDAG-NEXT: v_mov_b32_e32 v8, v3
+; SDAG-NEXT: s_clause 0x1
+; SDAG-NEXT: global_load_b32 v3, v0, s[4:5]
+; SDAG-NEXT: global_load_b32 v4, v0, s[0:1]
+; SDAG-NEXT: v_dual_add_nc_u32 v10, v5, v6 :: v_dual_mov_b32 v6, v2
+; SDAG-NEXT: global_load_async_to_lds_b32 v9, v0, s[4:5] offset:4 nv
+; SDAG-NEXT: v_mov_b32_e32 v9, s9
+; SDAG-NEXT: ; asyncmark
+; SDAG-NEXT: ; wait_asyncmark(2)
+; SDAG-NEXT: s_wait_asynccnt 0x2
+; SDAG-NEXT: s_wait_asynccnt 0x2
+; SDAG-NEXT: s_add_co_i32 s8, s8, 1
+; SDAG-NEXT: s_add_co_i32 s9, s9, 4
+; SDAG-NEXT: ds_load_b32 v9, v9
+; SDAG-NEXT: v_mov_b32_e32 v5, v1
+; SDAG-NEXT: s_cmp_lt_i32 s8, s11
+; SDAG-NEXT: s_wait_xcnt 0x0
+; SDAG-NEXT: s_add_nc_u64 s[0:1], s[0:1], 4
+; SDAG-NEXT: s_add_nc_u64 s[4:5], s[4:5], 4
+; SDAG-NEXT: s_wait_dscnt 0x0
+; SDAG-NEXT: v_add_nc_u32_e32 v9, v10, v9
+; SDAG-NEXT: global_store_b32 v0, v9, s[6:7]
+; SDAG-NEXT: s_wait_xcnt 0x0
+; SDAG-NEXT: s_add_nc_u64 s[6:7], s[6:7], 4
+; SDAG-NEXT: s_cbranch_scc1 .LBB2_1
+; SDAG-NEXT: ; %bb.2: ; %epilog
+; SDAG-NEXT: s_add_co_i32 s0, s11, -2
+; SDAG-NEXT: ; wait_asyncmark(1)
+; SDAG-NEXT: s_wait_asynccnt 0x1
+; SDAG-NEXT: s_lshl2_add_u32 s1, s0, s10
+; SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; SDAG-NEXT: v_dual_add_nc_u32 v2, v8, v7 :: v_dual_mov_b32 v0, s1
+; SDAG-NEXT: ds_load_b32 v1, v0
+; SDAG-NEXT: s_wait_dscnt 0x0
+; SDAG-NEXT: v_dual_mov_b32 v5, s0 :: v_dual_add_nc_u32 v1, v2, v1
+; SDAG-NEXT: global_store_b32 v5, v1, s[2:3] scale_offset
+; SDAG-NEXT: ; wait_asyncmark(0)
+; SDAG-NEXT: s_wait_asynccnt 0x0
+; SDAG-NEXT: ds_load_b32 v0, v0 offset:4
+; SDAG-NEXT: s_wait_loadcnt 0x0
+; SDAG-NEXT: s_wait_xcnt 0x0
+; SDAG-NEXT: v_add_nc_u32_e32 v1, v3, v4
+; SDAG-NEXT: s_wait_dscnt 0x0
+; SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; SDAG-NEXT: v_add_nc_u32_e32 v0, v1, v0
+; SDAG-NEXT: global_store_b32 v5, v0, s[2:3] offset:4 scale_offset
+; SDAG-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_pipelined_loop_with_global:
+; GISEL: ; %bb.0: ; %prolog
+; GISEL-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; msbs: dst=0 src0=0 src1=0 src2=0
+; GISEL-NEXT: s_clause 0x1
+; GISEL-NEXT: s_load_b96 s[8:10], s[4:5], 0x24 nv
+; GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x34 nv
+; GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GISEL-NEXT: s_wait_xcnt 0x0
+; GISEL-NEXT: s_load_b32 s4, s[4:5], 0x44 nv
+; GISEL-NEXT: s_wait_xcnt 0x0
+; GISEL-NEXT: s_mov_b32 s5, 2
+; GISEL-NEXT: s_wait_kmcnt 0x0
+; GISEL-NEXT: s_load_b32 s11, s[8:9], 0x0
+; GISEL-NEXT: s_load_b32 s12, s[0:1], 0x0
+; GISEL-NEXT: v_mov_b32_e32 v0, s10
+; GISEL-NEXT: s_add_co_u32 s6, s10, 4
+; GISEL-NEXT: v_dual_mov_b32 v9, s10 :: v_dual_mov_b32 v11, s5
+; GISEL-NEXT: global_load_async_to_lds_b32 v0, v1, s[8:9] offset:4 nv
+; GISEL-NEXT: ; asyncmark
+; GISEL-NEXT: s_clause 0x1
+; GISEL-NEXT: global_load_b32 v6, v1, s[8:9] offset:4
+; GISEL-NEXT: global_load_b32 v7, v1, s[0:1] offset:4
+; GISEL-NEXT: s_wait_xcnt 0x0
+; GISEL-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, 4
+; GISEL-NEXT: s_add_co_u32 s0, s0, 8
+; GISEL-NEXT: s_add_co_ci_u32 s1, s1, 0
+; GISEL-NEXT: s_add_co_u32 s6, s8, 8
+; GISEL-NEXT: s_add_co_ci_u32 s7, s9, 0
+; GISEL-NEXT: v_mov_b64_e32 v[4:5], s[0:1]
+; GISEL-NEXT: v_mov_b64_e32 v[2:3], s[6:7]
+; GISEL-NEXT: s_wait_kmcnt 0x0
+; GISEL-NEXT: v_dual_mov_b32 v13, s11 :: v_dual_mov_b32 v15, s12
+; GISEL-NEXT: global_load_async_to_lds_b32 v0, v1, s[8:9] offset:4 nv
+; GISEL-NEXT: v_mov_b64_e32 v[0:1], s[2:3]
+; GISEL-NEXT: s_wait_loadcnt 0x0
+; GISEL-NEXT: v_dual_mov_b32 v8, v6 :: v_dual_mov_b32 v12, v7
+; GISEL-NEXT: ; asyncmark
+; GISEL-NEXT: .LBB2_1: ; %loop_body
+; GISEL-NEXT: ; =>This Inner Loop Header: Depth=1
+; GISEL-NEXT: s_wait_loadcnt 0x1
+; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GISEL-NEXT: v_dual_mov_b32 v10, v12 :: v_dual_add_nc_u32 v16, 8, v9
+; GISEL-NEXT: s_wait_loadcnt 0x0
+; GISEL-NEXT: v_dual_mov_b32 v14, v8 :: v_dual_add_nc_u32 v11, 1, v11
+; GISEL-NEXT: global_load_b32 v12, v[4:5], off
+; GISEL-NEXT: global_load_b32 v8, v[2:3], off
+; GISEL-NEXT: s_wait_xcnt 0x1
+; GISEL-NEXT: v_add_co_u32 v4, vcc_lo, v4, 4
+; GISEL-NEXT: global_load_async_to_lds_b32 v16, v[2:3], off offset:4 nv
+; GISEL-NEXT: ; asyncmark
+; GISEL-NEXT: ; wait_asyncmark(2)
+; GISEL-NEXT: s_wait_asynccnt 0x2
+; GISEL-NEXT: s_wait_asynccnt 0x2
+; GISEL-NEXT: ds_load_b32 v16, v9
+; GISEL-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo
+; GISEL-NEXT: s_wait_xcnt 0x0
+; GISEL-NEXT: v_add_co_u32 v2, vcc_lo, v2, 4
+; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GISEL-NEXT: v_add_co_ci_u32_e64 v3, null, 0, v3, vcc_lo
+; GISEL-NEXT: v_add_nc_u32_e32 v9, 4, v9
+; GISEL-NEXT: v_cmp_gt_i32_e32 vcc_lo, s4, v11
+; GISEL-NEXT: s_wait_dscnt 0x0
+; GISEL-NEXT: v_add3_u32 v16, v13, v15, v16
+; GISEL-NEXT: v_dual_mov_b32 v13, v6 :: v_dual_mov_b32 v15, v7
+; GISEL-NEXT: global_store_b32 v[0:1], v16, off
+; GISEL-NEXT: s_wait_xcnt 0x0
+; GISEL-NEXT: v_add_co_u32 v0, s0, v0, 4
+; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GISEL-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, s0
+; GISEL-NEXT: s_cbranch_vccnz .LBB2_1
+; GISEL-NEXT: ; %bb.2: ; %epilog
+; GISEL-NEXT: s_add_co_i32 s0, s4, -2
+; GISEL-NEXT: ; wait_asyncmark(1)
+; GISEL-NEXT: s_wait_asynccnt 0x1
+; GISEL-NEXT: s_lshl_b32 s1, s0, 2
+; GISEL-NEXT: v_mov_b32_e32 v2, s0
+; GISEL-NEXT: s_add_co_u32 s1, s10, s1
+; GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GISEL-NEXT: v_mov_b32_e32 v0, s1
+; GISEL-NEXT: ds_load_b32 v1, v0
+; GISEL-NEXT: s_wait_dscnt 0x0
+; GISEL-NEXT: v_add3_u32 v1, v14, v10, v1
+; GISEL-NEXT: global_store_b32 v2, v1, s[2:3] scale_offset
+; GISEL-NEXT: ; wait_asyncmark(0)
+; GISEL-NEXT: s_wait_asynccnt 0x0
+; GISEL-NEXT: ds_load_b32 v0, v0 offset:4
+; GISEL-NEXT: s_wait_loadcnt_dscnt 0x0
+; GISEL-NEXT: v_add3_u32 v0, v8, v12, v0
+; GISEL-NEXT: global_store_b32 v2, v0, s[2:3] offset:4 scale_offset
+; GISEL-NEXT: s_endpgm
prolog:
; Load first iteration
%v0 = load i32, ptr addrspace(1) %foo
diff --git a/llvm/test/CodeGen/AMDGPU/asyncmark-max-pregfx12.ll b/llvm/test/CodeGen/AMDGPU/asyncmark-max-pregfx12.ll
index 4fe243ba56b10..ee4f1b3982378 100644
--- a/llvm/test/CodeGen/AMDGPU/asyncmark-max-pregfx12.ll
+++ b/llvm/test/CodeGen/AMDGPU/asyncmark-max-pregfx12.ll
@@ -1,6 +1,9 @@
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx942 < %s | FileCheck %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1010 < %s | FileCheck %s
; Loop body exceeds MaxAsyncMarkers on first iteration
; Preloop: 5 markers
diff --git a/llvm/test/CodeGen/AMDGPU/asyncmark-pregfx12.ll b/llvm/test/CodeGen/AMDGPU/asyncmark-pregfx12.ll
index b8ec819f8e310..b32e883b2b535 100644
--- a/llvm/test/CodeGen/AMDGPU/asyncmark-pregfx12.ll
+++ b/llvm/test/CodeGen/AMDGPU/asyncmark-pregfx12.ll
@@ -1,5 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
-; RUN: llc -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck %s -check-prefixes=WITHASYNC
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck %s -check-prefixes=SDAG
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx900 < %s | FileCheck %s -check-prefixes=GISEL
; Demonstrate that wait.asyncmark is a code motion barrier for loads from LDS.
; This is the simplest demo possible. We don't actually use async ops, but just
@@ -7,18 +8,30 @@
; coalesced into a wider LDS load.
define void @code_barrier(ptr addrspace(1) %foo, ptr addrspace(3) %lds, ptr addrspace(3) %out) {
-; WITHASYNC-LABEL: code_barrier:
-; WITHASYNC: ; %bb.0:
-; WITHASYNC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; WITHASYNC-NEXT: ds_read_b32 v0, v2
-; WITHASYNC-NEXT: ; wait_asyncmark(0)
-; WITHASYNC-NEXT: ds_read_b32 v1, v2 offset:4
-; WITHASYNC-NEXT: s_waitcnt lgkmcnt(0)
-; WITHASYNC-NEXT: v_add_u32_e32 v0, v0, v1
-; WITHASYNC-NEXT: ds_write_b32 v3, v0
-; WITHASYNC-NEXT: s_waitcnt lgkmcnt(0)
-; WITHASYNC-NEXT: s_setpc_b64 s[30:31]
;
+; SDAG-LABEL: code_barrier:
+; SDAG: ; %bb.0:
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: ds_read_b32 v0, v2
+; SDAG-NEXT: ; wait_asyncmark(0)
+; SDAG-NEXT: ds_read_b32 v1, v2 offset:4
+; SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; SDAG-NEXT: v_add_u32_e32 v0, v0, v1
+; SDAG-NEXT: ds_write_b32 v3, v0
+; SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: code_barrier:
+; GISEL: ; %bb.0:
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: ds_read_b32 v0, v2
+; GISEL-NEXT: ; wait_asyncmark(0)
+; GISEL-NEXT: ds_read_b32 v1, v2 offset:4
+; GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL-NEXT: v_add_u32_e32 v0, v0, v1
+; GISEL-NEXT: ds_write_b32 v3, v0
+; GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
%lds_gep1 = getelementptr i32, ptr addrspace(3) %lds, i32 1
%val1 = load i32, ptr addrspace(3) %lds
call void @llvm.amdgcn.wait.asyncmark(i16 0)
@@ -34,39 +47,72 @@ define void @code_barrier(ptr addrspace(1) %foo, ptr addrspace(3) %lds, ptr addr
; instructions do not get reordered before reaching this point.
define void @interleaved_global_and_dma(ptr addrspace(1) %foo, ptr addrspace(3) %lds, ptr addrspace(1) %bar, ptr addrspace(1) %out) {
-; WITHASYNC-LABEL: interleaved_global_and_dma:
-; WITHASYNC: ; %bb.0: ; %entry
-; WITHASYNC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; WITHASYNC-NEXT: v_readfirstlane_b32 s4, v2
-; WITHASYNC-NEXT: global_load_dword v7, v[3:4], off
-; WITHASYNC-NEXT: global_load_dword v8, v[0:1], off
-; WITHASYNC-NEXT: s_mov_b32 m0, s4
-; WITHASYNC-NEXT: ; wave barrier
-; WITHASYNC-NEXT: s_nop 0
-; WITHASYNC-NEXT: global_load_dword v[3:4], off lds
-; WITHASYNC-NEXT: ; asyncmark
-; WITHASYNC-NEXT: global_load_dword v0, v[0:1], off
-; WITHASYNC-NEXT: ; wave barrier
-; WITHASYNC-NEXT: s_nop 0
-; WITHASYNC-NEXT: global_load_dword v[3:4], off lds
-; WITHASYNC-NEXT: ; wave barrier
-; WITHASYNC-NEXT: global_load_dword v1, v[3:4], off
-; WITHASYNC-NEXT: ; asyncmark
-; WITHASYNC-NEXT: ; wait_asyncmark(1)
-; WITHASYNC-NEXT: s_waitcnt vmcnt(3)
-; WITHASYNC-NEXT: ds_read_b32 v3, v2
-; WITHASYNC-NEXT: ; wait_asyncmark(0)
-; WITHASYNC-NEXT: s_waitcnt vmcnt(1)
-; WITHASYNC-NEXT: ds_read_b32 v2, v2
-; WITHASYNC-NEXT: v_add_u32_e32 v4, v8, v7
-; WITHASYNC-NEXT: s_waitcnt lgkmcnt(1)
-; WITHASYNC-NEXT: v_add3_u32 v0, v4, v3, v0
-; WITHASYNC-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; WITHASYNC-NEXT: v_add3_u32 v0, v0, v1, v2
-; WITHASYNC-NEXT: global_store_dword v[5:6], v0, off
-; WITHASYNC-NEXT: s_waitcnt vmcnt(0)
-; WITHASYNC-NEXT: s_setpc_b64 s[30:31]
;
+; SDAG-LABEL: interleaved_global_and_dma:
+; SDAG: ; %bb.0: ; %entry
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_readfirstlane_b32 s4, v2
+; SDAG-NEXT: global_load_dword v7, v[3:4], off
+; SDAG-NEXT: global_load_dword v8, v[0:1], off
+; SDAG-NEXT: s_mov_b32 m0, s4
+; SDAG-NEXT: ; wave barrier
+; SDAG-NEXT: s_nop 0
+; SDAG-NEXT: global_load_dword v[3:4], off lds
+; SDAG-NEXT: ; asyncmark
+; SDAG-NEXT: global_load_dword v0, v[0:1], off
+; SDAG-NEXT: ; wave barrier
+; SDAG-NEXT: s_nop 0
+; SDAG-NEXT: global_load_dword v[3:4], off lds
+; SDAG-NEXT: ; wave barrier
+; SDAG-NEXT: global_load_dword v1, v[3:4], off
+; SDAG-NEXT: ; asyncmark
+; SDAG-NEXT: ; wait_asyncmark(1)
+; SDAG-NEXT: s_waitcnt vmcnt(3)
+; SDAG-NEXT: ds_read_b32 v3, v2
+; SDAG-NEXT: ; wait_asyncmark(0)
+; SDAG-NEXT: s_waitcnt vmcnt(1)
+; SDAG-NEXT: ds_read_b32 v2, v2
+; SDAG-NEXT: v_add_u32_e32 v4, v8, v7
+; SDAG-NEXT: s_waitcnt lgkmcnt(1)
+; SDAG-NEXT: v_add3_u32 v0, v4, v3, v0
+; SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_add3_u32 v0, v0, v1, v2
+; SDAG-NEXT: global_store_dword v[5:6], v0, off
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: interleaved_global_and_dma:
+; GISEL: ; %bb.0: ; %entry
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_readfirstlane_b32 s4, v2
+; GISEL-NEXT: global_load_dword v7, v[3:4], off
+; GISEL-NEXT: global_load_dword v8, v[0:1], off
+; GISEL-NEXT: s_mov_b32 m0, s4
+; GISEL-NEXT: ; wave barrier
+; GISEL-NEXT: s_nop 0
+; GISEL-NEXT: global_load_dword v[3:4], off lds
+; GISEL-NEXT: ; asyncmark
+; GISEL-NEXT: global_load_dword v0, v[0:1], off
+; GISEL-NEXT: ; wave barrier
+; GISEL-NEXT: s_nop 0
+; GISEL-NEXT: global_load_dword v[3:4], off lds
+; GISEL-NEXT: ; wave barrier
+; GISEL-NEXT: global_load_dword v1, v[3:4], off
+; GISEL-NEXT: ; asyncmark
+; GISEL-NEXT: ; wait_asyncmark(1)
+; GISEL-NEXT: s_waitcnt vmcnt(3)
+; GISEL-NEXT: ds_read_b32 v3, v2
+; GISEL-NEXT: ; wait_asyncmark(0)
+; GISEL-NEXT: s_waitcnt vmcnt(1)
+; GISEL-NEXT: ds_read_b32 v2, v2
+; GISEL-NEXT: v_add_u32_e32 v4, v8, v7
+; GISEL-NEXT: s_waitcnt lgkmcnt(1)
+; GISEL-NEXT: v_add3_u32 v0, v4, v3, v0
+; GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_add3_u32 v0, v0, v1, v2
+; GISEL-NEXT: global_store_dword v[5:6], v0, off
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
entry:
; First batch: global load, global load, async global-to-LDS
%bar_v11 = load i32, ptr addrspace(1) %bar
@@ -104,39 +150,72 @@ entry:
}
define void @interleaved_buffer_and_dma(ptr addrspace(8) inreg %buf, ptr addrspace(1) %foo, ptr addrspace(3) inreg %lds, ptr addrspace(1) %bar, ptr addrspace(1) %out) {
-; WITHASYNC-LABEL: interleaved_buffer_and_dma:
-; WITHASYNC: ; %bb.0: ; %entry
-; WITHASYNC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; WITHASYNC-NEXT: s_mov_b32 m0, s20
-; WITHASYNC-NEXT: global_load_dword v6, v[2:3], off
-; WITHASYNC-NEXT: global_load_dword v7, v[0:1], off
-; WITHASYNC-NEXT: v_mov_b32_e32 v8, 0x54
-; WITHASYNC-NEXT: ; wave barrier
-; WITHASYNC-NEXT: buffer_load_dword v8, s[16:19], 0 offen lds
-; WITHASYNC-NEXT: ; asyncmark
-; WITHASYNC-NEXT: global_load_dword v0, v[0:1], off
-; WITHASYNC-NEXT: v_mov_b32_e32 v1, 0x58
-; WITHASYNC-NEXT: ; wave barrier
-; WITHASYNC-NEXT: buffer_load_dword v1, s[16:19], 0 offen lds
-; WITHASYNC-NEXT: ; wave barrier
-; WITHASYNC-NEXT: global_load_dword v1, v[2:3], off
-; WITHASYNC-NEXT: v_mov_b32_e32 v2, s20
-; WITHASYNC-NEXT: ; asyncmark
-; WITHASYNC-NEXT: ; wait_asyncmark(1)
-; WITHASYNC-NEXT: s_waitcnt vmcnt(3)
-; WITHASYNC-NEXT: ds_read_b32 v3, v2
-; WITHASYNC-NEXT: ; wait_asyncmark(0)
-; WITHASYNC-NEXT: s_waitcnt vmcnt(1)
-; WITHASYNC-NEXT: ds_read_b32 v2, v2
-; WITHASYNC-NEXT: v_add_u32_e32 v6, v7, v6
-; WITHASYNC-NEXT: s_waitcnt lgkmcnt(1)
-; WITHASYNC-NEXT: v_add3_u32 v0, v6, v3, v0
-; WITHASYNC-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; WITHASYNC-NEXT: v_add3_u32 v0, v0, v1, v2
-; WITHASYNC-NEXT: global_store_dword v[4:5], v0, off
-; WITHASYNC-NEXT: s_waitcnt vmcnt(0)
-; WITHASYNC-NEXT: s_setpc_b64 s[30:31]
;
+; SDAG-LABEL: interleaved_buffer_and_dma:
+; SDAG: ; %bb.0: ; %entry
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 m0, s20
+; SDAG-NEXT: global_load_dword v6, v[2:3], off
+; SDAG-NEXT: global_load_dword v7, v[0:1], off
+; SDAG-NEXT: v_mov_b32_e32 v8, 0x54
+; SDAG-NEXT: ; wave barrier
+; SDAG-NEXT: buffer_load_dword v8, s[16:19], 0 offen lds
+; SDAG-NEXT: ; asyncmark
+; SDAG-NEXT: global_load_dword v0, v[0:1], off
+; SDAG-NEXT: v_mov_b32_e32 v1, 0x58
+; SDAG-NEXT: ; wave barrier
+; SDAG-NEXT: buffer_load_dword v1, s[16:19], 0 offen lds
+; SDAG-NEXT: ; wave barrier
+; SDAG-NEXT: global_load_dword v1, v[2:3], off
+; SDAG-NEXT: v_mov_b32_e32 v2, s20
+; SDAG-NEXT: ; asyncmark
+; SDAG-NEXT: ; wait_asyncmark(1)
+; SDAG-NEXT: s_waitcnt vmcnt(3)
+; SDAG-NEXT: ds_read_b32 v3, v2
+; SDAG-NEXT: ; wait_asyncmark(0)
+; SDAG-NEXT: s_waitcnt vmcnt(1)
+; SDAG-NEXT: ds_read_b32 v2, v2
+; SDAG-NEXT: v_add_u32_e32 v6, v7, v6
+; SDAG-NEXT: s_waitcnt lgkmcnt(1)
+; SDAG-NEXT: v_add3_u32 v0, v6, v3, v0
+; SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_add3_u32 v0, v0, v1, v2
+; SDAG-NEXT: global_store_dword v[4:5], v0, off
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: interleaved_buffer_and_dma:
+; GISEL: ; %bb.0: ; %entry
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 m0, s20
+; GISEL-NEXT: global_load_dword v6, v[2:3], off
+; GISEL-NEXT: global_load_dword v7, v[0:1], off
+; GISEL-NEXT: v_mov_b32_e32 v8, 0x54
+; GISEL-NEXT: ; wave barrier
+; GISEL-NEXT: buffer_load_dword v8, s[16:19], 0 offen lds
+; GISEL-NEXT: ; asyncmark
+; GISEL-NEXT: global_load_dword v0, v[0:1], off
+; GISEL-NEXT: v_mov_b32_e32 v1, 0x58
+; GISEL-NEXT: ; wave barrier
+; GISEL-NEXT: buffer_load_dword v1, s[16:19], 0 offen lds
+; GISEL-NEXT: ; wave barrier
+; GISEL-NEXT: global_load_dword v1, v[2:3], off
+; GISEL-NEXT: v_mov_b32_e32 v2, s20
+; GISEL-NEXT: ; asyncmark
+; GISEL-NEXT: ; wait_asyncmark(1)
+; GISEL-NEXT: s_waitcnt vmcnt(3)
+; GISEL-NEXT: ds_read_b32 v3, v2
+; GISEL-NEXT: ; wait_asyncmark(0)
+; GISEL-NEXT: s_waitcnt vmcnt(1)
+; GISEL-NEXT: ds_read_b32 v2, v2
+; GISEL-NEXT: v_add_u32_e32 v6, v7, v6
+; GISEL-NEXT: s_waitcnt lgkmcnt(1)
+; GISEL-NEXT: v_add3_u32 v0, v6, v3, v0
+; GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_add3_u32 v0, v0, v1, v2
+; GISEL-NEXT: global_store_dword v[4:5], v0, off
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
entry:
; First batch: global load, global load, async global-to-LDS.
%bar_v11 = load i32, ptr addrspace(1) %bar
@@ -174,39 +253,72 @@ entry:
; Tests that a fence that inserts waits can be used with asyncmark.
define void @fence_with_asyncmark(ptr addrspace(8) inreg %buf, ptr addrspace(1) %foo, ptr addrspace(3) inreg %lds, ptr addrspace(1) %bar, ptr addrspace(1) %out) {
-; WITHASYNC-LABEL: fence_with_asyncmark:
-; WITHASYNC: ; %bb.0: ; %entry
-; WITHASYNC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; WITHASYNC-NEXT: s_mov_b32 m0, s20
-; WITHASYNC-NEXT: global_load_dword v6, v[2:3], off
-; WITHASYNC-NEXT: global_load_dword v7, v[0:1], off
-; WITHASYNC-NEXT: v_mov_b32_e32 v8, 0x54
-; WITHASYNC-NEXT: ; wave barrier
-; WITHASYNC-NEXT: buffer_load_dword v8, s[16:19], 0 offen lds
-; WITHASYNC-NEXT: s_waitcnt vmcnt(0)
-; WITHASYNC-NEXT: ; asyncmark
-; WITHASYNC-NEXT: global_load_dword v0, v[0:1], off
-; WITHASYNC-NEXT: v_mov_b32_e32 v1, 0x58
-; WITHASYNC-NEXT: ; wave barrier
-; WITHASYNC-NEXT: buffer_load_dword v1, s[16:19], 0 offen lds
-; WITHASYNC-NEXT: ; wave barrier
-; WITHASYNC-NEXT: global_load_dword v1, v[2:3], off
-; WITHASYNC-NEXT: v_mov_b32_e32 v2, s20
-; WITHASYNC-NEXT: ; asyncmark
-; WITHASYNC-NEXT: ; wait_asyncmark(1)
-; WITHASYNC-NEXT: ds_read_b32 v3, v2
-; WITHASYNC-NEXT: ; wait_asyncmark(0)
-; WITHASYNC-NEXT: s_waitcnt vmcnt(1)
-; WITHASYNC-NEXT: ds_read_b32 v2, v2
-; WITHASYNC-NEXT: v_add_u32_e32 v6, v7, v6
-; WITHASYNC-NEXT: s_waitcnt lgkmcnt(1)
-; WITHASYNC-NEXT: v_add3_u32 v0, v6, v3, v0
-; WITHASYNC-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; WITHASYNC-NEXT: v_add3_u32 v0, v0, v1, v2
-; WITHASYNC-NEXT: global_store_dword v[4:5], v0, off
-; WITHASYNC-NEXT: s_waitcnt vmcnt(0)
-; WITHASYNC-NEXT: s_setpc_b64 s[30:31]
;
+; SDAG-LABEL: fence_with_asyncmark:
+; SDAG: ; %bb.0: ; %entry
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: s_mov_b32 m0, s20
+; SDAG-NEXT: global_load_dword v6, v[2:3], off
+; SDAG-NEXT: global_load_dword v7, v[0:1], off
+; SDAG-NEXT: v_mov_b32_e32 v8, 0x54
+; SDAG-NEXT: ; wave barrier
+; SDAG-NEXT: buffer_load_dword v8, s[16:19], 0 offen lds
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: ; asyncmark
+; SDAG-NEXT: global_load_dword v0, v[0:1], off
+; SDAG-NEXT: v_mov_b32_e32 v1, 0x58
+; SDAG-NEXT: ; wave barrier
+; SDAG-NEXT: buffer_load_dword v1, s[16:19], 0 offen lds
+; SDAG-NEXT: ; wave barrier
+; SDAG-NEXT: global_load_dword v1, v[2:3], off
+; SDAG-NEXT: v_mov_b32_e32 v2, s20
+; SDAG-NEXT: ; asyncmark
+; SDAG-NEXT: ; wait_asyncmark(1)
+; SDAG-NEXT: ds_read_b32 v3, v2
+; SDAG-NEXT: ; wait_asyncmark(0)
+; SDAG-NEXT: s_waitcnt vmcnt(1)
+; SDAG-NEXT: ds_read_b32 v2, v2
+; SDAG-NEXT: v_add_u32_e32 v6, v7, v6
+; SDAG-NEXT: s_waitcnt lgkmcnt(1)
+; SDAG-NEXT: v_add3_u32 v0, v6, v3, v0
+; SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_add3_u32 v0, v0, v1, v2
+; SDAG-NEXT: global_store_dword v[4:5], v0, off
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: fence_with_asyncmark:
+; GISEL: ; %bb.0: ; %entry
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b32 m0, s20
+; GISEL-NEXT: global_load_dword v6, v[2:3], off
+; GISEL-NEXT: global_load_dword v7, v[0:1], off
+; GISEL-NEXT: v_mov_b32_e32 v8, 0x54
+; GISEL-NEXT: ; wave barrier
+; GISEL-NEXT: buffer_load_dword v8, s[16:19], 0 offen lds
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: ; asyncmark
+; GISEL-NEXT: global_load_dword v0, v[0:1], off
+; GISEL-NEXT: v_mov_b32_e32 v1, 0x58
+; GISEL-NEXT: ; wave barrier
+; GISEL-NEXT: buffer_load_dword v1, s[16:19], 0 offen lds
+; GISEL-NEXT: ; wave barrier
+; GISEL-NEXT: global_load_dword v1, v[2:3], off
+; GISEL-NEXT: v_mov_b32_e32 v2, s20
+; GISEL-NEXT: ; asyncmark
+; GISEL-NEXT: ; wait_asyncmark(1)
+; GISEL-NEXT: ds_read_b32 v3, v2
+; GISEL-NEXT: ; wait_asyncmark(0)
+; GISEL-NEXT: s_waitcnt vmcnt(1)
+; GISEL-NEXT: ds_read_b32 v2, v2
+; GISEL-NEXT: v_add_u32_e32 v6, v7, v6
+; GISEL-NEXT: s_waitcnt lgkmcnt(1)
+; GISEL-NEXT: v_add3_u32 v0, v6, v3, v0
+; GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_add3_u32 v0, v0, v1, v2
+; GISEL-NEXT: global_store_dword v[4:5], v0, off
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
entry:
; First batch: global load, global load, async global-to-LDS.
%bar_v11 = load i32, ptr addrspace(1) %bar
@@ -248,46 +360,88 @@ entry:
; those outstanding operations.
define void @test_pipelined_loop(ptr addrspace(1) %foo, ptr addrspace(3) %lds, ptr addrspace(1) %bar, ptr addrspace(1) %out, i32 %n) {
-; WITHASYNC-LABEL: test_pipelined_loop:
-; WITHASYNC: ; %bb.0: ; %prolog
-; WITHASYNC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; WITHASYNC-NEXT: v_readfirstlane_b32 s4, v2
-; WITHASYNC-NEXT: s_mov_b32 m0, s4
-; WITHASYNC-NEXT: v_mov_b32_e32 v5, 0
-; WITHASYNC-NEXT: global_load_dword v[0:1], off lds
-; WITHASYNC-NEXT: ; asyncmark
-; WITHASYNC-NEXT: global_load_dword v[0:1], off lds
-; WITHASYNC-NEXT: s_mov_b32 s6, 2
-; WITHASYNC-NEXT: s_mov_b64 s[4:5], 0
-; WITHASYNC-NEXT: ; asyncmark
-; WITHASYNC-NEXT: .LBB4_1: ; %loop_body
-; WITHASYNC-NEXT: ; =>This Inner Loop Header: Depth=1
-; WITHASYNC-NEXT: v_readfirstlane_b32 s7, v2
-; WITHASYNC-NEXT: s_mov_b32 m0, s7
-; WITHASYNC-NEXT: s_add_i32 s6, s6, 1
-; WITHASYNC-NEXT: global_load_dword v[0:1], off lds
-; WITHASYNC-NEXT: ; asyncmark
-; WITHASYNC-NEXT: ; wait_asyncmark(2)
-; WITHASYNC-NEXT: s_waitcnt vmcnt(2)
-; WITHASYNC-NEXT: ds_read_b32 v6, v2
-; WITHASYNC-NEXT: v_cmp_ge_i32_e32 vcc, s6, v7
-; WITHASYNC-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; WITHASYNC-NEXT: s_waitcnt lgkmcnt(0)
-; WITHASYNC-NEXT: v_add_u32_e32 v5, v5, v6
-; WITHASYNC-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; WITHASYNC-NEXT: s_cbranch_execnz .LBB4_1
-; WITHASYNC-NEXT: ; %bb.2: ; %epilog
-; WITHASYNC-NEXT: s_or_b64 exec, exec, s[4:5]
-; WITHASYNC-NEXT: ; wait_asyncmark(1)
-; WITHASYNC-NEXT: s_waitcnt vmcnt(1)
-; WITHASYNC-NEXT: ds_read_b32 v0, v2
-; WITHASYNC-NEXT: ; wait_asyncmark(0)
-; WITHASYNC-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; WITHASYNC-NEXT: v_add_u32_e32 v0, v5, v0
-; WITHASYNC-NEXT: global_store_dword v[3:4], v0, off
-; WITHASYNC-NEXT: s_waitcnt vmcnt(0)
-; WITHASYNC-NEXT: s_setpc_b64 s[30:31]
;
+; SDAG-LABEL: test_pipelined_loop:
+; SDAG: ; %bb.0: ; %prolog
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_readfirstlane_b32 s4, v2
+; SDAG-NEXT: s_mov_b32 m0, s4
+; SDAG-NEXT: v_mov_b32_e32 v5, 0
+; SDAG-NEXT: global_load_dword v[0:1], off lds
+; SDAG-NEXT: ; asyncmark
+; SDAG-NEXT: global_load_dword v[0:1], off lds
+; SDAG-NEXT: s_mov_b32 s6, 2
+; SDAG-NEXT: s_mov_b64 s[4:5], 0
+; SDAG-NEXT: ; asyncmark
+; SDAG-NEXT: .LBB4_1: ; %loop_body
+; SDAG-NEXT: ; =>This Inner Loop Header: Depth=1
+; SDAG-NEXT: v_readfirstlane_b32 s7, v2
+; SDAG-NEXT: s_mov_b32 m0, s7
+; SDAG-NEXT: s_add_i32 s6, s6, 1
+; SDAG-NEXT: global_load_dword v[0:1], off lds
+; SDAG-NEXT: ; asyncmark
+; SDAG-NEXT: ; wait_asyncmark(2)
+; SDAG-NEXT: s_waitcnt vmcnt(2)
+; SDAG-NEXT: ds_read_b32 v6, v2
+; SDAG-NEXT: v_cmp_ge_i32_e32 vcc, s6, v7
+; SDAG-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; SDAG-NEXT: v_add_u32_e32 v5, v5, v6
+; SDAG-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; SDAG-NEXT: s_cbranch_execnz .LBB4_1
+; SDAG-NEXT: ; %bb.2: ; %epilog
+; SDAG-NEXT: s_or_b64 exec, exec, s[4:5]
+; SDAG-NEXT: ; wait_asyncmark(1)
+; SDAG-NEXT: s_waitcnt vmcnt(1)
+; SDAG-NEXT: ds_read_b32 v0, v2
+; SDAG-NEXT: ; wait_asyncmark(0)
+; SDAG-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_add_u32_e32 v0, v5, v0
+; SDAG-NEXT: global_store_dword v[3:4], v0, off
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_pipelined_loop:
+; GISEL: ; %bb.0: ; %prolog
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_readfirstlane_b32 s4, v2
+; GISEL-NEXT: s_mov_b32 m0, s4
+; GISEL-NEXT: s_mov_b32 s6, 0
+; GISEL-NEXT: global_load_dword v[0:1], off lds
+; GISEL-NEXT: ; asyncmark
+; GISEL-NEXT: global_load_dword v[0:1], off lds
+; GISEL-NEXT: s_mov_b32 s7, 2
+; GISEL-NEXT: s_mov_b64 s[4:5], 0
+; GISEL-NEXT: v_mov_b32_e32 v6, s7
+; GISEL-NEXT: v_mov_b32_e32 v5, s6
+; GISEL-NEXT: ; asyncmark
+; GISEL-NEXT: .LBB4_1: ; %loop_body
+; GISEL-NEXT: ; =>This Inner Loop Header: Depth=1
+; GISEL-NEXT: v_readfirstlane_b32 s6, v2
+; GISEL-NEXT: s_mov_b32 m0, s6
+; GISEL-NEXT: v_add_u32_e32 v6, 1, v6
+; GISEL-NEXT: global_load_dword v[0:1], off lds
+; GISEL-NEXT: ; asyncmark
+; GISEL-NEXT: ; wait_asyncmark(2)
+; GISEL-NEXT: s_waitcnt vmcnt(2)
+; GISEL-NEXT: ds_read_b32 v8, v2
+; GISEL-NEXT: v_cmp_ge_i32_e32 vcc, v6, v7
+; GISEL-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL-NEXT: v_add_u32_e32 v5, v5, v8
+; GISEL-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GISEL-NEXT: s_cbranch_execnz .LBB4_1
+; GISEL-NEXT: ; %bb.2: ; %epilog
+; GISEL-NEXT: s_or_b64 exec, exec, s[4:5]
+; GISEL-NEXT: ; wait_asyncmark(1)
+; GISEL-NEXT: s_waitcnt vmcnt(1)
+; GISEL-NEXT: ds_read_b32 v0, v2
+; GISEL-NEXT: ; wait_asyncmark(0)
+; GISEL-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_add_u32_e32 v0, v5, v0
+; GISEL-NEXT: global_store_dword v[3:4], v0, off
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
prolog:
; Load first iteration
call void @llvm.amdgcn.global.load.async.lds(ptr addrspace(1) %foo, ptr addrspace(3) %lds, i32 4, i32 0, i32 0)
@@ -335,67 +489,129 @@ epilog:
; Software pipelined loop with async global-to-LDS and global loads
define void @test_pipelined_loop_with_global(ptr addrspace(1) %foo, ptr addrspace(3) %lds, ptr addrspace(1) %bar, ptr addrspace(1) %out, i32 %n) {
-; WITHASYNC-LABEL: test_pipelined_loop_with_global:
-; WITHASYNC: ; %bb.0: ; %prolog
-; WITHASYNC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; WITHASYNC-NEXT: v_readfirstlane_b32 s4, v2
-; WITHASYNC-NEXT: s_mov_b32 m0, s4
-; WITHASYNC-NEXT: global_load_dword v10, v[0:1], off
-; WITHASYNC-NEXT: global_load_dword v14, v[3:4], off
-; WITHASYNC-NEXT: s_mov_b32 s6, 2
-; WITHASYNC-NEXT: global_load_dword v[0:1], off lds
-; WITHASYNC-NEXT: ; asyncmark
-; WITHASYNC-NEXT: global_load_dword v8, v[0:1], off
-; WITHASYNC-NEXT: global_load_dword v9, v[3:4], off
-; WITHASYNC-NEXT: s_mov_b64 s[4:5], 0
-; WITHASYNC-NEXT: global_load_dword v[0:1], off lds
-; WITHASYNC-NEXT: ; asyncmark
-; WITHASYNC-NEXT: s_waitcnt vmcnt(2)
-; WITHASYNC-NEXT: v_mov_b32_e32 v13, v8
-; WITHASYNC-NEXT: s_waitcnt vmcnt(1)
-; WITHASYNC-NEXT: v_mov_b32_e32 v15, v9
-; WITHASYNC-NEXT: .LBB5_1: ; %loop_body
-; WITHASYNC-NEXT: ; =>This Inner Loop Header: Depth=1
-; WITHASYNC-NEXT: v_readfirstlane_b32 s7, v2
-; WITHASYNC-NEXT: s_waitcnt vmcnt(1)
-; WITHASYNC-NEXT: v_mov_b32_e32 v12, v15
-; WITHASYNC-NEXT: v_mov_b32_e32 v11, v13
-; WITHASYNC-NEXT: global_load_dword v13, v[0:1], off
-; WITHASYNC-NEXT: global_load_dword v15, v[3:4], off
-; WITHASYNC-NEXT: s_mov_b32 m0, s7
-; WITHASYNC-NEXT: s_add_i32 s6, s6, 1
-; WITHASYNC-NEXT: global_load_dword v[0:1], off lds
-; WITHASYNC-NEXT: v_cmp_ge_i32_e32 vcc, s6, v7
-; WITHASYNC-NEXT: v_mov_b32_e32 v16, v14
-; WITHASYNC-NEXT: v_mov_b32_e32 v17, v10
-; WITHASYNC-NEXT: v_mov_b32_e32 v10, v8
-; WITHASYNC-NEXT: v_mov_b32_e32 v14, v9
-; WITHASYNC-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; WITHASYNC-NEXT: ; asyncmark
-; WITHASYNC-NEXT: ; wait_asyncmark(2)
-; WITHASYNC-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; WITHASYNC-NEXT: s_cbranch_execnz .LBB5_1
-; WITHASYNC-NEXT: ; %bb.2: ; %epilog
-; WITHASYNC-NEXT: s_or_b64 exec, exec, s[4:5]
-; WITHASYNC-NEXT: ds_read_b32 v0, v2
-; WITHASYNC-NEXT: ; wait_asyncmark(1)
-; WITHASYNC-NEXT: s_waitcnt vmcnt(3)
-; WITHASYNC-NEXT: ds_read_b32 v1, v2
-; WITHASYNC-NEXT: ; wait_asyncmark(0)
-; WITHASYNC-NEXT: s_waitcnt vmcnt(0)
-; WITHASYNC-NEXT: ds_read_b32 v2, v2
-; WITHASYNC-NEXT: v_add_u32_e32 v3, v17, v16
-; WITHASYNC-NEXT: s_waitcnt lgkmcnt(2)
-; WITHASYNC-NEXT: v_add3_u32 v0, v3, v0, v12
-; WITHASYNC-NEXT: s_waitcnt lgkmcnt(1)
-; WITHASYNC-NEXT: v_add3_u32 v0, v11, v0, v1
-; WITHASYNC-NEXT: v_add_u32_e32 v1, v13, v15
-; WITHASYNC-NEXT: s_waitcnt lgkmcnt(0)
-; WITHASYNC-NEXT: v_add3_u32 v0, v1, v2, v0
-; WITHASYNC-NEXT: global_store_dword v[5:6], v0, off
-; WITHASYNC-NEXT: s_waitcnt vmcnt(0)
-; WITHASYNC-NEXT: s_setpc_b64 s[30:31]
;
+; SDAG-LABEL: test_pipelined_loop_with_global:
+; SDAG: ; %bb.0: ; %prolog
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_readfirstlane_b32 s4, v2
+; SDAG-NEXT: s_mov_b32 m0, s4
+; SDAG-NEXT: global_load_dword v10, v[0:1], off
+; SDAG-NEXT: global_load_dword v14, v[3:4], off
+; SDAG-NEXT: s_mov_b32 s6, 2
+; SDAG-NEXT: global_load_dword v[0:1], off lds
+; SDAG-NEXT: ; asyncmark
+; SDAG-NEXT: global_load_dword v8, v[0:1], off
+; SDAG-NEXT: global_load_dword v9, v[3:4], off
+; SDAG-NEXT: s_mov_b64 s[4:5], 0
+; SDAG-NEXT: global_load_dword v[0:1], off lds
+; SDAG-NEXT: ; asyncmark
+; SDAG-NEXT: s_waitcnt vmcnt(2)
+; SDAG-NEXT: v_mov_b32_e32 v13, v8
+; SDAG-NEXT: s_waitcnt vmcnt(1)
+; SDAG-NEXT: v_mov_b32_e32 v15, v9
+; SDAG-NEXT: .LBB5_1: ; %loop_body
+; SDAG-NEXT: ; =>This Inner Loop Header: Depth=1
+; SDAG-NEXT: v_readfirstlane_b32 s7, v2
+; SDAG-NEXT: s_waitcnt vmcnt(1)
+; SDAG-NEXT: v_mov_b32_e32 v12, v15
+; SDAG-NEXT: v_mov_b32_e32 v11, v13
+; SDAG-NEXT: global_load_dword v13, v[0:1], off
+; SDAG-NEXT: global_load_dword v15, v[3:4], off
+; SDAG-NEXT: s_mov_b32 m0, s7
+; SDAG-NEXT: s_add_i32 s6, s6, 1
+; SDAG-NEXT: global_load_dword v[0:1], off lds
+; SDAG-NEXT: v_cmp_ge_i32_e32 vcc, s6, v7
+; SDAG-NEXT: v_mov_b32_e32 v16, v14
+; SDAG-NEXT: v_mov_b32_e32 v17, v10
+; SDAG-NEXT: v_mov_b32_e32 v10, v8
+; SDAG-NEXT: v_mov_b32_e32 v14, v9
+; SDAG-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; SDAG-NEXT: ; asyncmark
+; SDAG-NEXT: ; wait_asyncmark(2)
+; SDAG-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; SDAG-NEXT: s_cbranch_execnz .LBB5_1
+; SDAG-NEXT: ; %bb.2: ; %epilog
+; SDAG-NEXT: s_or_b64 exec, exec, s[4:5]
+; SDAG-NEXT: ds_read_b32 v0, v2
+; SDAG-NEXT: ; wait_asyncmark(1)
+; SDAG-NEXT: s_waitcnt vmcnt(3)
+; SDAG-NEXT: ds_read_b32 v1, v2
+; SDAG-NEXT: ; wait_asyncmark(0)
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: ds_read_b32 v2, v2
+; SDAG-NEXT: v_add_u32_e32 v3, v17, v16
+; SDAG-NEXT: s_waitcnt lgkmcnt(2)
+; SDAG-NEXT: v_add3_u32 v0, v3, v0, v12
+; SDAG-NEXT: s_waitcnt lgkmcnt(1)
+; SDAG-NEXT: v_add3_u32 v0, v11, v0, v1
+; SDAG-NEXT: v_add_u32_e32 v1, v13, v15
+; SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; SDAG-NEXT: v_add3_u32 v0, v1, v2, v0
+; SDAG-NEXT: global_store_dword v[5:6], v0, off
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: test_pipelined_loop_with_global:
+; GISEL: ; %bb.0: ; %prolog
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_readfirstlane_b32 s4, v2
+; GISEL-NEXT: s_mov_b32 m0, s4
+; GISEL-NEXT: global_load_dword v10, v[0:1], off
+; GISEL-NEXT: global_load_dword v14, v[3:4], off
+; GISEL-NEXT: s_mov_b32 s6, 2
+; GISEL-NEXT: global_load_dword v[0:1], off lds
+; GISEL-NEXT: ; asyncmark
+; GISEL-NEXT: global_load_dword v8, v[0:1], off
+; GISEL-NEXT: global_load_dword v9, v[3:4], off
+; GISEL-NEXT: s_mov_b64 s[4:5], 0
+; GISEL-NEXT: global_load_dword v[0:1], off lds
+; GISEL-NEXT: v_mov_b32_e32 v16, s6
+; GISEL-NEXT: ; asyncmark
+; GISEL-NEXT: s_waitcnt vmcnt(2)
+; GISEL-NEXT: v_mov_b32_e32 v13, v8
+; GISEL-NEXT: s_waitcnt vmcnt(1)
+; GISEL-NEXT: v_mov_b32_e32 v15, v9
+; GISEL-NEXT: .LBB5_1: ; %loop_body
+; GISEL-NEXT: ; =>This Inner Loop Header: Depth=1
+; GISEL-NEXT: v_readfirstlane_b32 s6, v2
+; GISEL-NEXT: s_waitcnt vmcnt(1)
+; GISEL-NEXT: v_mov_b32_e32 v12, v15
+; GISEL-NEXT: v_mov_b32_e32 v11, v13
+; GISEL-NEXT: global_load_dword v13, v[0:1], off
+; GISEL-NEXT: global_load_dword v15, v[3:4], off
+; GISEL-NEXT: s_mov_b32 m0, s6
+; GISEL-NEXT: v_add_u32_e32 v16, 1, v16
+; GISEL-NEXT: global_load_dword v[0:1], off lds
+; GISEL-NEXT: v_cmp_ge_i32_e32 vcc, v16, v7
+; GISEL-NEXT: v_mov_b32_e32 v17, v14
+; GISEL-NEXT: v_mov_b32_e32 v18, v10
+; GISEL-NEXT: v_mov_b32_e32 v10, v8
+; GISEL-NEXT: v_mov_b32_e32 v14, v9
+; GISEL-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GISEL-NEXT: ; asyncmark
+; GISEL-NEXT: ; wait_asyncmark(2)
+; GISEL-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GISEL-NEXT: s_cbranch_execnz .LBB5_1
+; GISEL-NEXT: ; %bb.2: ; %epilog
+; GISEL-NEXT: s_or_b64 exec, exec, s[4:5]
+; GISEL-NEXT: ds_read_b32 v0, v2
+; GISEL-NEXT: ; wait_asyncmark(1)
+; GISEL-NEXT: s_waitcnt vmcnt(3)
+; GISEL-NEXT: ds_read_b32 v1, v2
+; GISEL-NEXT: ; wait_asyncmark(0)
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: ds_read_b32 v2, v2
+; GISEL-NEXT: v_add_u32_e32 v3, v18, v17
+; GISEL-NEXT: s_waitcnt lgkmcnt(2)
+; GISEL-NEXT: v_add3_u32 v0, v3, v0, v12
+; GISEL-NEXT: s_waitcnt lgkmcnt(1)
+; GISEL-NEXT: v_add3_u32 v0, v11, v0, v1
+; GISEL-NEXT: v_add_u32_e32 v1, v13, v15
+; GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GISEL-NEXT: v_add3_u32 v0, v1, v2, v0
+; GISEL-NEXT: global_store_dword v[5:6], v0, off
+; GISEL-NEXT: s_waitcnt vmcnt(0)
+; GISEL-NEXT: s_setpc_b64 s[30:31]
prolog:
; Load first iteration
%v0 = load i32, ptr addrspace(1) %foo
More information about the llvm-branch-commits
mailing list