[PATCH] D84139: [Scheduling] Improve group algorithm for store cluster

Qing Shan Zhang via Phabricator via llvm-commits llvm-commits at lists.llvm.org
Mon Jul 20 00:23:06 PDT 2020


steven.zhang created this revision.
steven.zhang added reviewers: fhahn, evandro, arsenm, jsji, kbarton, rampitec, PowerPC.
Herald added subscribers: wuzish, javed.absar, hiraditya, wdng, MatzeB.
Herald added a project: LLVM.

The scheduler will try to classify the MemOps into different groups and then clustering neighboring MemOps within each group. The current algorithm is to have the MemOps with the same ctrl(non-data) dep into the same group. That works fine for load but not well for store.

See this example: Store Addr and Store Addr+8 are clusterable pair. They have memory(ctrl) dependency on different loads. Current implementation will put these two stores into different group and miss to cluster them.

  Load X               Load Y
    ^                    ^
    |                    |
    |mem                 |mem
    |                    |
    +                    +
  Store Addr           Store Addr+8
    ^                    ^
    +--------------------+
           cluster

This will affect the case like this.

  void foo(long long *restrict a, long long *restrict b, long long *restrict c, int n) {
          for (int i =0; i<n;i++)
              a[i] += b[i]*c[i];
  }


Repository:
  rG LLVM Github Monorepo

https://reviews.llvm.org/D84139

Files:
  llvm/lib/CodeGen/MachineScheduler.cpp
  llvm/test/CodeGen/AArch64/aarch64-stp-cluster.ll


Index: llvm/test/CodeGen/AArch64/aarch64-stp-cluster.ll
===================================================================
--- llvm/test/CodeGen/AArch64/aarch64-stp-cluster.ll
+++ llvm/test/CodeGen/AArch64/aarch64-stp-cluster.ll
@@ -147,3 +147,50 @@
   ret i64 %v
 }
 
+; CHECK: ********** MI Scheduling **********
+; CHECK-LABEL: stp_i64_with_ld:%bb.0
+; CHECK:Cluster ld/st SU(5) - SU(10)
+; CHECK:Cluster ld/st SU(15) - SU(20)
+; CHECK:SU(5):   STRXui %7:gpr64, %0:gpr64common, 0 ::
+; CHECK:SU(10):   STRXui %12:gpr64, %0:gpr64common, 1 ::
+; CHECK:SU(15):   STRXui %17:gpr64, %0:gpr64common, 2 ::
+; CHECK:SU(20):   STRXui %22:gpr64, %0:gpr64common, 3 ::
+define void @stp_i64_with_ld(i64* noalias nocapture %a, i64* noalias nocapture readnone %b, i64* noalias nocapture readnone %c) {
+entry:
+  %arrayidx = getelementptr inbounds i64, i64* %a, i64 8
+  %0 = load i64, i64* %arrayidx, align 8
+  %arrayidx3 = getelementptr inbounds i64, i64* %a, i64 16
+  %1 = load i64, i64* %arrayidx3, align 8
+  %mul = mul nsw i64 %1, %0
+  %2 = load i64, i64* %a, align 8
+  %add6 = add nsw i64 %2, %mul
+  store i64 %add6, i64* %a, align 8
+  %arrayidx.1 = getelementptr inbounds i64, i64* %a, i64 9
+  %3 = load i64, i64* %arrayidx.1, align 8
+  %arrayidx3.1 = getelementptr inbounds i64, i64* %a, i64 17
+  %4 = load i64, i64* %arrayidx3.1, align 8
+  %mul.1 = mul nsw i64 %4, %3
+  %arrayidx5.1 = getelementptr inbounds i64, i64* %a, i64 1
+  %5 = load i64, i64* %arrayidx5.1, align 8
+  %add6.1 = add nsw i64 %5, %mul.1
+  store i64 %add6.1, i64* %arrayidx5.1, align 8
+  %arrayidx.2 = getelementptr inbounds i64, i64* %a, i64 10
+  %6 = load i64, i64* %arrayidx.2, align 8
+  %arrayidx3.2 = getelementptr inbounds i64, i64* %a, i64 18
+  %7 = load i64, i64* %arrayidx3.2, align 8
+  %mul.2 = mul nsw i64 %7, %6
+  %arrayidx5.2 = getelementptr inbounds i64, i64* %a, i64 2
+  %8 = load i64, i64* %arrayidx5.2, align 8
+  %add6.2 = add nsw i64 %8, %mul.2
+  store i64 %add6.2, i64* %arrayidx5.2, align 8
+  %arrayidx.3 = getelementptr inbounds i64, i64* %a, i64 11
+  %9 = load i64, i64* %arrayidx.3, align 8
+  %arrayidx3.3 = getelementptr inbounds i64, i64* %a, i64 19
+  %10 = load i64, i64* %arrayidx3.3, align 8
+  %mul.3 = mul nsw i64 %10, %9
+  %arrayidx5.3 = getelementptr inbounds i64, i64* %a, i64 3
+  %11 = load i64, i64* %arrayidx5.3, align 8
+  %add6.3 = add nsw i64 %11, %mul.3
+  store i64 %add6.3, i64* %arrayidx5.3, align 8
+  ret void
+}
Index: llvm/lib/CodeGen/MachineScheduler.cpp
===================================================================
--- llvm/lib/CodeGen/MachineScheduler.cpp
+++ llvm/lib/CodeGen/MachineScheduler.cpp
@@ -1653,7 +1653,14 @@
 
     unsigned ChainPredID = DAG->SUnits.size();
     for (const SDep &Pred : SU.Preds) {
-      if (Pred.isCtrl() && !Pred.isArtificial()) {
+      // We only want to cluster the mem ops that have the same ctrl(non-data)
+      // pred so that they didn't have ctrl dependency for each other. But for
+      // store instrs, we can still cluster them if the pred is load instr.
+      bool CtrlDepOnPred =
+          Pred.isCtrl() &&
+          (IsLoad ||
+           (Pred.getSUnit() && Pred.getSUnit()->getInstr()->mayStore()));
+      if (CtrlDepOnPred && !Pred.isArtificial()) {
         ChainPredID = Pred.getSUnit()->NodeNum;
         break;
       }


-------------- next part --------------
A non-text attachment was scrubbed...
Name: D84139.279123.patch
Type: text/x-patch
Size: 3335 bytes
Desc: not available
URL: <http://lists.llvm.org/pipermail/llvm-commits/attachments/20200720/f2f63023/attachment.bin>


More information about the llvm-commits mailing list