[llvm] 0d7bd34 - [MachineScheduler] Ignore artificial edges when forming store chains
Jay Foad via llvm-commits
llvm-commits at lists.llvm.org
Wed Jan 29 08:23:18 PST 2020
Author: Jay Foad
Date: 2020-01-29T16:23:01Z
New Revision: 0d7bd343127ee161a01a1509effba0fdd480c9e1
URL: https://github.com/llvm/llvm-project/commit/0d7bd343127ee161a01a1509effba0fdd480c9e1
DIFF: https://github.com/llvm/llvm-project/commit/0d7bd343127ee161a01a1509effba0fdd480c9e1.diff
LOG: [MachineScheduler] Ignore artificial edges when forming store chains
Summary:
BaseMemOpClusterMutation::apply forms store chains by looking for
control (i.e. non-data) dependencies from one mem op to another.
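For context, SDep treats every non-data edge as a "control" dependency.
A paraphrased sketch of the predicate from
llvm/include/llvm/CodeGen/ScheduleDAG.h (not part of this patch):

  // An edge is a control dependency whenever its kind is Anti, Output
  // or Order rather than Data. Order covers both genuine memory-ordering
  // edges and the artificial edges discussed below.
  bool SDep::isCtrl() const { return getKind() != SDep::Data; }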
In the test case, clusterNeighboringMemOps successfully clusters the
loads, and then adds artificial edges to the loads' successors as
described in the comment:
// Copy successor edges from SUa to SUb. Interleaving computation
// dependent on SUa can prevent load combining due to register reuse.
The effect of this is that *data* dependencies from one load to a store
are copied as *artificial* dependencies from a different load to the
same store.
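The edge copying in clusterNeighboringMemOps looks roughly like this
(quoted from memory, so treat the exact form as approximate):

  // After clustering SUa and SUb, give SUb an artificial edge to each
  // of SUa's successors so that computation depending on SUa cannot be
  // scheduled between the clustered mem ops. A Data edge SUa->store
  // therefore reappears as an Artificial edge SUb->store.
  for (const SDep &Succ : SUa->Succs) {
    if (Succ.getSUnit() == SUb)
      continue;
    DAG->addEdge(Succ.getSUnit(), SDep(SUb, SDep::Artificial));
  }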
Then, when BaseMemOpClusterMutation::apply looks at the stores, it finds
that some of them have a control dependency on a previous load. That
breaks the chains: the stores are not all assigned to the same chain,
so they are not all clustered.
The fix is to consider only non-artificial control dependencies when
forming chains.
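After this change the chain-forming loop (the hunk below) skips those
copied edges. The distinguishing predicate, again paraphrased from
llvm/include/llvm/CodeGen/ScheduleDAG.h:

  // Only Order edges explicitly marked Artificial satisfy this, so the
  // copied cluster edges are filtered out while genuine memory-ordering
  // edges can still start a new chain.
  bool SDep::isArtificial() const {
    return getKind() == SDep::Order && Contents.OrdKind == SDep::Artificial;
  }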
Subscribers: MatzeB, jvesely, nhaehnle, hiraditya, javed.absar, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D71717
Added:
llvm/test/CodeGen/AMDGPU/cluster_stores.ll
Modified:
llvm/lib/CodeGen/MachineScheduler.cpp
Removed:
################################################################################
diff --git a/llvm/lib/CodeGen/MachineScheduler.cpp b/llvm/lib/CodeGen/MachineScheduler.cpp
index aaa189e4b8e4..ecfccaab059f 100644
--- a/llvm/lib/CodeGen/MachineScheduler.cpp
+++ b/llvm/lib/CodeGen/MachineScheduler.cpp
@@ -1620,7 +1620,7 @@ void BaseMemOpClusterMutation::apply(ScheduleDAGInstrs *DAG) {
unsigned ChainPredID = DAG->SUnits.size();
for (const SDep &Pred : SU.Preds) {
- if (Pred.isCtrl()) {
+ if (Pred.isCtrl() && !Pred.isArtificial()) {
ChainPredID = Pred.getSUnit()->NodeNum;
break;
}
diff --git a/llvm/test/CodeGen/AMDGPU/cluster_stores.ll b/llvm/test/CodeGen/AMDGPU/cluster_stores.ll
new file mode 100644
index 000000000000..466d69002af5
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/cluster_stores.ll
@@ -0,0 +1,85 @@
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -debug-only=machine-scheduler -o /dev/null %s 2>&1 | FileCheck --enable-var-scope --check-prefixes=CHECK,DBG %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck --enable-var-scope --check-prefixes=CHECK,GCN %s
+; REQUIRES: asserts
+
+; CHECK-LABEL: {{^}}cluster_load_cluster_store:
+define amdgpu_kernel void @cluster_load_cluster_store(i32* noalias %lb, i32* noalias %sb) {
+bb:
+; DBG: Cluster ld/st SU(1) - SU(2)
+
+; DBG: Cluster ld/st SU([[L1:[0-9]+]]) - SU([[L2:[0-9]+]])
+; DBG: Cluster ld/st SU([[L2]]) - SU([[L3:[0-9]+]])
+; DBG: Cluster ld/st SU([[L3]]) - SU([[L4:[0-9]+]])
+; GCN: flat_load_dword [[LD1:v[0-9]+]], v[{{[0-9:]+}}]
+; GCN-NEXT: flat_load_dword [[LD2:v[0-9]+]], v[{{[0-9:]+}}] offset:8
+; GCN-NEXT: flat_load_dword [[LD3:v[0-9]+]], v[{{[0-9:]+}}] offset:16
+; GCN-NEXT: flat_load_dword [[LD4:v[0-9]+]], v[{{[0-9:]+}}] offset:24
+ %la0 = getelementptr inbounds i32, i32* %lb, i32 0
+ %ld0 = load i32, i32* %la0
+ %la1 = getelementptr inbounds i32, i32* %lb, i32 2
+ %ld1 = load i32, i32* %la1
+ %la2 = getelementptr inbounds i32, i32* %lb, i32 4
+ %ld2 = load i32, i32* %la2
+ %la3 = getelementptr inbounds i32, i32* %lb, i32 6
+ %ld3 = load i32, i32* %la3
+
+; DBG: Cluster ld/st SU([[S1:[0-9]+]]) - SU([[S2:[0-9]+]])
+; DBG: Cluster ld/st SU([[S2]]) - SU([[S3:[0-9]+]])
+; DBG: Cluster ld/st SU([[S3]]) - SU([[S4:[0-9]+]])
+; GCN: flat_store_dword v[{{[0-9:]+}}], [[LD1]]
+; GCN-NEXT: flat_store_dword v[{{[0-9:]+}}], [[LD2]] offset:8
+; GCN-NEXT: flat_store_dword v[{{[0-9:]+}}], [[LD3]] offset:16
+; GCN-NEXT: flat_store_dword v[{{[0-9:]+}}], [[LD4]] offset:24
+ %sa0 = getelementptr inbounds i32, i32* %sb, i32 0
+ store i32 %ld0, i32* %sa0
+ %sa1 = getelementptr inbounds i32, i32* %sb, i32 2
+ store i32 %ld1, i32* %sa1
+ %sa2 = getelementptr inbounds i32, i32* %sb, i32 4
+ store i32 %ld2, i32* %sa2
+ %sa3 = getelementptr inbounds i32, i32* %sb, i32 6
+ store i32 %ld3, i32* %sa3
+
+ ret void
+}
+
+; CHECK-LABEL: {{^}}cluster_load_valu_cluster_store:
+define amdgpu_kernel void @cluster_load_valu_cluster_store(i32* noalias %lb, i32* noalias %sb) {
+bb:
+; DBG: Cluster ld/st SU(1) - SU(2)
+
+; DBG: Cluster ld/st SU([[L1:[0-9]+]]) - SU([[L2:[0-9]+]])
+; DBG: Cluster ld/st SU([[L2]]) - SU([[L3:[0-9]+]])
+; DBG: Cluster ld/st SU([[L3]]) - SU([[L4:[0-9]+]])
+; GCN: flat_load_dword [[LD1:v[0-9]+]], v[{{[0-9:]+}}]
+; GCN-NEXT: flat_load_dword [[LD2:v[0-9]+]], v[{{[0-9:]+}}] offset:8
+; GCN-NEXT: flat_load_dword [[LD3:v[0-9]+]], v[{{[0-9:]+}}] offset:16
+; GCN-NEXT: flat_load_dword [[LD4:v[0-9]+]], v[{{[0-9:]+}}] offset:24
+ %la0 = getelementptr inbounds i32, i32* %lb, i32 0
+ %ld0 = load i32, i32* %la0
+ %la1 = getelementptr inbounds i32, i32* %lb, i32 2
+ %ld1 = load i32, i32* %la1
+ %la2 = getelementptr inbounds i32, i32* %lb, i32 4
+ %ld2 = load i32, i32* %la2
+ %la3 = getelementptr inbounds i32, i32* %lb, i32 6
+ %ld3 = load i32, i32* %la3
+
+; DBG: Cluster ld/st SU([[S1:[0-9]+]]) - SU([[S2:[0-9]+]])
+; DBG: Cluster ld/st SU([[S2]]) - SU([[S3:[0-9]+]])
+; DBG: Cluster ld/st SU([[S3]]) - SU([[S4:[0-9]+]])
+; GCN: v_add_u32_e32 [[ST2:v[0-9]+]], 1, [[LD2]]
+; GCN: flat_store_dword v[{{[0-9:]+}}], [[LD1]]
+; GCN-NEXT: flat_store_dword v[{{[0-9:]+}}], [[ST2]] offset:8
+; GCN-NEXT: flat_store_dword v[{{[0-9:]+}}], [[LD3]] offset:16
+; GCN-NEXT: flat_store_dword v[{{[0-9:]+}}], [[LD4]] offset:24
+ %sa0 = getelementptr inbounds i32, i32* %sb, i32 0
+ store i32 %ld0, i32* %sa0
+ %sa1 = getelementptr inbounds i32, i32* %sb, i32 2
+ %add = add i32 %ld1, 1
+ store i32 %add, i32* %sa1
+ %sa2 = getelementptr inbounds i32, i32* %sb, i32 4
+ store i32 %ld2, i32* %sa2
+ %sa3 = getelementptr inbounds i32, i32* %sb, i32 6
+ store i32 %ld3, i32* %sa3
+
+ ret void
+}