[llvm] b5818e4 - [AMDGPU] Cluster stores as well as loads for GFX11
Jay Foad via llvm-commits
llvm-commits at lists.llvm.org
Mon Jun 27 08:41:51 PDT 2022
Author: Jay Foad
Date: 2022-06-27T16:41:41+01:00
New Revision: b5818e4eb439bb6a14714c14541f3d9e387d39b8
URL: https://github.com/llvm/llvm-project/commit/b5818e4eb439bb6a14714c14541f3d9e387d39b8
DIFF: https://github.com/llvm/llvm-project/commit/b5818e4eb439bb6a14714c14541f3d9e387d39b8.diff
LOG: [AMDGPU] Cluster stores as well as loads for GFX11
Differential Revision: https://reviews.llvm.org/D128517
Added:
Modified:
llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
llvm/test/CodeGen/AMDGPU/cluster_stores.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 8f01a8b8b3f10..f00d7511965a8 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -396,9 +396,12 @@ static ScheduleDAGInstrs *createSIMachineScheduler(MachineSchedContext *C) {
static ScheduleDAGInstrs *
createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
+ const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
ScheduleDAGMILive *DAG =
new GCNScheduleDAGMILive(C, std::make_unique<GCNMaxOccupancySchedStrategy>(C));
DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
+ if (ST.shouldClusterStores())
+ DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
DAG->addMutation(createIGroupLPDAGMutation());
DAG->addMutation(createSchedBarrierDAGMutation());
DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
@@ -408,9 +411,12 @@ createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
static ScheduleDAGInstrs *
createIterativeGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
+ const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
auto DAG = new GCNIterativeScheduler(C,
GCNIterativeScheduler::SCHEDULE_LEGACYMAXOCCUPANCY);
DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
+ if (ST.shouldClusterStores())
+ DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
return DAG;
}
@@ -421,9 +427,12 @@ static ScheduleDAGInstrs *createMinRegScheduler(MachineSchedContext *C) {
static ScheduleDAGInstrs *
createIterativeILPMachineScheduler(MachineSchedContext *C) {
+ const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
auto DAG = new GCNIterativeScheduler(C,
GCNIterativeScheduler::SCHEDULE_ILP);
DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
+ if (ST.shouldClusterStores())
+ DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
return DAG;
}
@@ -898,6 +907,8 @@ class GCNPassConfig final : public AMDGPUPassConfig {
ScheduleDAGMI *DAG = createGenericSchedPostRA(C);
const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
+ if (ST.shouldClusterStores())
+ DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
DAG->addMutation(ST.createFillMFMAShadowMutation(DAG->TII));
DAG->addMutation(createIGroupLPDAGMutation());
DAG->addMutation(createSchedBarrierDAGMutation());
@@ -1102,8 +1113,11 @@ bool AMDGPUPassConfig::addGCPasses() {
llvm::ScheduleDAGInstrs *
AMDGPUPassConfig::createMachineScheduler(MachineSchedContext *C) const {
+ const GCNSubtarget &ST = C->MF->getSubtarget<GCNSubtarget>();
ScheduleDAGMILive *DAG = createGenericSchedLive(C);
DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
+ if (ST.shouldClusterStores())
+ DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
return DAG;
}
diff --git a/llvm/test/CodeGen/AMDGPU/cluster_stores.ll b/llvm/test/CodeGen/AMDGPU/cluster_stores.ll
index 763ead034f612..bb96036f3c42d 100644
--- a/llvm/test/CodeGen/AMDGPU/cluster_stores.ll
+++ b/llvm/test/CodeGen/AMDGPU/cluster_stores.ll
@@ -3,6 +3,8 @@
; RUN: FileCheck --enable-var-scope --check-prefix=DBG %s < %t
; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -debug-only=machine-scheduler < %s 2> %t | FileCheck --enable-var-scope --check-prefix=GFX10 %s
; RUN: FileCheck --enable-var-scope --check-prefix=DBG %s < %t
+; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs -debug-only=machine-scheduler < %s 2> %t | FileCheck --enable-var-scope --check-prefix=GFX11 %s
+; RUN: FileCheck --enable-var-scope --check-prefixes=DBG,DBG11 %s < %t
; REQUIRES: asserts
; FIXME: Verifier error with xnack enabled.
@@ -22,6 +24,10 @@
; DBG: Cluster ld/st SU([[L2]]) - SU([[L3:[0-9]+]])
; DBG: Cluster ld/st SU([[L3]]) - SU([[L4:[0-9]+]])
+; DBG11: Cluster ld/st SU([[S1:[0-9]+]]) - SU([[S2:[0-9]+]])
+; DBG11: Cluster ld/st SU([[S2]]) - SU([[S3:[0-9]+]])
+; DBG11: Cluster ld/st SU([[S3]]) - SU([[S4:[0-9]+]])
+
; DBG-NOT: Cluster ld/st
define amdgpu_kernel void @cluster_load_cluster_store(i32* noalias %lb, i32* noalias %sb) {
@@ -93,6 +99,31 @@ define amdgpu_kernel void @cluster_load_cluster_store(i32* noalias %lb, i32* noa
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(3)
; GFX10-NEXT: flat_store_dword v[6:7], v11
; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: cluster_load_cluster_store:
+; GFX11: ; %bb.0: ; %bb
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x2c
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-NEXT: s_clause 0x3
+; GFX11-NEXT: flat_load_b32 v2, v[0:1]
+; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:8
+; GFX11-NEXT: flat_load_b32 v4, v[0:1] offset:16
+; GFX11-NEXT: flat_load_b32 v5, v[0:1] offset:24
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3)
+; GFX11-NEXT: flat_store_b32 v[0:1], v2
+; GFX11-NEXT: s_waitcnt vmcnt(2) lgkmcnt(3)
+; GFX11-NEXT: flat_store_b32 v[0:1], v3 offset:8
+; GFX11-NEXT: s_waitcnt vmcnt(1) lgkmcnt(3)
+; GFX11-NEXT: flat_store_b32 v[0:1], v4 offset:16
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(3)
+; GFX11-NEXT: flat_store_b32 v[0:1], v5 offset:24
+; GFX11-NEXT: s_endpgm
bb:
%la0 = getelementptr inbounds i32, i32* %lb, i32 0
%ld0 = load i32, i32* %la0
@@ -130,6 +161,10 @@ bb:
; DBG: Cluster ld/st SU([[L2]]) - SU([[L3:[0-9]+]])
; DBG: Cluster ld/st SU([[L3]]) - SU([[L4:[0-9]+]])
+; DBG11: Cluster ld/st SU([[S1:[0-9]+]]) - SU([[S2:[0-9]+]])
+; DBG11: Cluster ld/st SU([[S2]]) - SU([[S3:[0-9]+]])
+; DBG11: Cluster ld/st SU([[S3]]) - SU([[S4:[0-9]+]])
+
; DBG-NOT: Cluster ld/st
define amdgpu_kernel void @cluster_load_valu_cluster_store(i32* noalias %lb, i32* noalias %sb) {
@@ -203,6 +238,33 @@ define amdgpu_kernel void @cluster_load_valu_cluster_store(i32* noalias %lb, i32
; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(3)
; GFX10-NEXT: flat_store_dword v[6:7], v10
; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: cluster_load_valu_cluster_store:
+; GFX11: ; %bb.0: ; %bb
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24
+; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x2c
+; GFX11-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-NEXT: v_mov_b32_e32 v0, s2
+; GFX11-NEXT: v_mov_b32_e32 v1, s3
+; GFX11-NEXT: s_clause 0x3
+; GFX11-NEXT: flat_load_b32 v2, v[0:1] offset:8
+; GFX11-NEXT: flat_load_b32 v3, v[0:1]
+; GFX11-NEXT: flat_load_b32 v4, v[0:1] offset:16
+; GFX11-NEXT: flat_load_b32 v5, v[0:1] offset:24
+; GFX11-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-NEXT: v_mov_b32_e32 v1, s1
+; GFX11-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3)
+; GFX11-NEXT: v_add_nc_u32_e32 v2, 1, v2
+; GFX11-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2)
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: flat_store_b32 v[0:1], v3
+; GFX11-NEXT: flat_store_b32 v[0:1], v2 offset:8
+; GFX11-NEXT: s_waitcnt vmcnt(1) lgkmcnt(3)
+; GFX11-NEXT: flat_store_b32 v[0:1], v4 offset:16
+; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(3)
+; GFX11-NEXT: flat_store_b32 v[0:1], v5 offset:24
+; GFX11-NEXT: s_endpgm
bb:
%la0 = getelementptr inbounds i32, i32* %lb, i32 0
%ld0 = load i32, i32* %la0
@@ -266,6 +328,23 @@ define amdgpu_ps void @cluster_image_load(<8 x i32> inreg %src, <8 x i32> inreg
; GFX10-NEXT: v_add_f32_e32 v2, v2, v6
; GFX10-NEXT: image_store v[2:5], v[0:1], s[8:15] dmask:0xf dim:SQ_RSRC_IMG_2D unorm
; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: cluster_image_load:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: v_add_nc_u32_e32 v2, 1, v0
+; GFX11-NEXT: v_add_nc_u32_e32 v3, 1, v1
+; GFX11-NEXT: v_add_nc_u32_e32 v6, 2, v0
+; GFX11-NEXT: v_add_nc_u32_e32 v7, 2, v1
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: image_load v[2:5], v[2:3], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm
+; GFX11-NEXT: image_load v[6:9], v[6:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_add_f32_e32 v5, v5, v9
+; GFX11-NEXT: v_add_f32_e32 v4, v4, v8
+; GFX11-NEXT: v_add_f32_e32 v3, v3, v7
+; GFX11-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX11-NEXT: image_store v[2:5], v[0:1], s[8:15] dmask:0xf dim:SQ_RSRC_IMG_2D unorm
+; GFX11-NEXT: s_endpgm
entry:
%x1 = add i32 %x, 1
%y1 = add i32 %y, 1
@@ -309,6 +388,19 @@ define amdgpu_ps void @no_cluster_image_load(<8 x i32> inreg %src1, <8 x i32> in
; GFX10-NEXT: v_add_f32_e32 v2, v2, v6
; GFX10-NEXT: image_store v[2:5], v[0:1], s[16:23] dmask:0xf dim:SQ_RSRC_IMG_2D unorm
; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: no_cluster_image_load:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: v_mov_b32_e32 v6, 0
+; GFX11-NEXT: image_load_mip v[2:5], [v0, v1, v6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm
+; GFX11-NEXT: image_load_mip v[6:9], [v0, v1, v6], s[8:15] dmask:0xf dim:SQ_RSRC_IMG_2D unorm
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_add_f32_e32 v5, v5, v9
+; GFX11-NEXT: v_add_f32_e32 v4, v4, v8
+; GFX11-NEXT: v_add_f32_e32 v3, v3, v7
+; GFX11-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX11-NEXT: image_store v[2:5], v[0:1], s[16:23] dmask:0xf dim:SQ_RSRC_IMG_2D unorm
+; GFX11-NEXT: s_endpgm
entry:
%val1 = call <4 x float> @llvm.amdgcn.image.load.mip.2d.v4f32.i32(i32 15, i32 %x, i32 %y, i32 0, <8 x i32> %src1, i32 0, i32 0)
%val2 = call <4 x float> @llvm.amdgcn.image.load.mip.2d.v4f32.i32(i32 15, i32 %x, i32 %y, i32 0, <8 x i32> %src2, i32 0, i32 0)
@@ -377,6 +469,33 @@ define amdgpu_ps void @cluster_image_sample(<8 x i32> inreg %src, <4 x i32> inre
; GFX10-NEXT: v_add_f32_e32 v2, v14, v18
; GFX10-NEXT: image_store v[2:5], v[0:1], s[12:19] dmask:0xf dim:SQ_RSRC_IMG_2D unorm
; GFX10-NEXT: s_endpgm
+;
+; GFX11-LABEL: cluster_image_sample:
+; GFX11: ; %bb.0: ; %entry
+; GFX11-NEXT: v_cvt_f32_i32_e32 v8, v0
+; GFX11-NEXT: v_cvt_f32_i32_e32 v9, v1
+; GFX11-NEXT: v_mov_b32_e32 v4, 0
+; GFX11-NEXT: v_mov_b32_e32 v10, 1.0
+; GFX11-NEXT: v_add_f32_e32 v2, 1.0, v8
+; GFX11-NEXT: v_add_f32_e32 v3, 1.0, v9
+; GFX11-NEXT: v_mov_b32_e32 v5, v4
+; GFX11-NEXT: v_mov_b32_e32 v6, v4
+; GFX11-NEXT: v_mov_b32_e32 v7, v4
+; GFX11-NEXT: v_add_f32_e32 v8, 2.0, v8
+; GFX11-NEXT: v_add_f32_e32 v9, 2.0, v9
+; GFX11-NEXT: v_mov_b32_e32 v11, v10
+; GFX11-NEXT: v_mov_b32_e32 v12, v10
+; GFX11-NEXT: v_mov_b32_e32 v13, v10
+; GFX11-NEXT: s_clause 0x1
+; GFX11-NEXT: image_sample_d v[2:5], v[2:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
+; GFX11-NEXT: image_sample_d v[6:9], v[8:13], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D
+; GFX11-NEXT: s_waitcnt vmcnt(0)
+; GFX11-NEXT: v_add_f32_e32 v5, v5, v9
+; GFX11-NEXT: v_add_f32_e32 v4, v4, v8
+; GFX11-NEXT: v_add_f32_e32 v3, v3, v7
+; GFX11-NEXT: v_add_f32_e32 v2, v2, v6
+; GFX11-NEXT: image_store v[2:5], v[0:1], s[12:19] dmask:0xf dim:SQ_RSRC_IMG_2D unorm
+; GFX11-NEXT: s_endpgm
entry:
%s = sitofp i32 %x to float
%t = sitofp i32 %y to float
More information about the llvm-commits mailing list