[llvm-branch-commits] [llvm] [llvm][CodeGen] Address the issue discovered In window scheduling (#101665) (PR #102881)
via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Mon Aug 12 04:13:19 PDT 2024
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-backend-hexagon
Author: Kai Yan (kaiyan96)
<details>
<summary>Changes</summary>
We have the following bug fixes for the window scheduler. Do we need to submit them ourselves?
* [Added a new restriction for II by pragma in window scheduler](https://github.com/llvm/llvm-project/pull/99448)
* [Fixed a bug in stall cycle calculation for window scheduler](https://github.com/llvm/llvm-project/pull/99451)
* [Added missing initialization failure information for window scheduler](https://github.com/llvm/llvm-project/pull/99449)
* [Fixed max cycle calculation with zero-cost instructions for window scheduler ](https://github.com/llvm/llvm-project/pull/99454)
* [Address the issue of multiple resource reservations In window scheduling](https://github.com/llvm/llvm-project/pull/101665)
---
Full diff: https://github.com/llvm/llvm-project/pull/102881.diff
7 Files Affected:
- (modified) llvm/lib/CodeGen/MachinePipeliner.cpp (+10-2)
- (modified) llvm/lib/CodeGen/WindowScheduler.cpp (+18-11)
- (modified) llvm/test/CodeGen/Hexagon/swp-ws-fail-2.mir (+1)
- (added) llvm/test/CodeGen/Hexagon/swp-ws-pragma-initiation-interval-fail.mir (+83)
- (added) llvm/test/CodeGen/Hexagon/swp-ws-resource-reserve.mir (+100)
- (added) llvm/test/CodeGen/Hexagon/swp-ws-stall-cycle.mir (+59)
- (added) llvm/test/CodeGen/Hexagon/swp-ws-zero-cost.mir (+45)
``````````diff
diff --git a/llvm/lib/CodeGen/MachinePipeliner.cpp b/llvm/lib/CodeGen/MachinePipeliner.cpp
index 497e282bb97682..5c68711ff61938 100644
--- a/llvm/lib/CodeGen/MachinePipeliner.cpp
+++ b/llvm/lib/CodeGen/MachinePipeliner.cpp
@@ -528,8 +528,16 @@ bool MachinePipeliner::useSwingModuloScheduler() {
}
bool MachinePipeliner::useWindowScheduler(bool Changed) {
- // WindowScheduler does not work when it is off or when SwingModuloScheduler
- // is successfully scheduled.
+ // WindowScheduler does not work for following cases:
+ // 1. when it is off.
+ // 2. when SwingModuloScheduler is successfully scheduled.
+ // 3. when pragma II is enabled.
+ if (II_setByPragma) {
+ LLVM_DEBUG(dbgs() << "Window scheduling is disabled when "
+ "llvm.loop.pipeline.initiationinterval is set.\n");
+ return false;
+ }
+
return WindowSchedulingOption == WindowSchedulingFlag::WS_Force ||
(WindowSchedulingOption == WindowSchedulingFlag::WS_On && !Changed);
}
diff --git a/llvm/lib/CodeGen/WindowScheduler.cpp b/llvm/lib/CodeGen/WindowScheduler.cpp
index 0777480499e55b..f1658e36ae1e92 100644
--- a/llvm/lib/CodeGen/WindowScheduler.cpp
+++ b/llvm/lib/CodeGen/WindowScheduler.cpp
@@ -232,8 +232,11 @@ bool WindowScheduler::initialize() {
return false;
}
for (auto &Def : MI.all_defs())
- if (Def.isReg() && Def.getReg().isPhysical())
+ if (Def.isReg() && Def.getReg().isPhysical()) {
+ LLVM_DEBUG(dbgs() << "Physical registers are not supported in "
+ "window scheduling!\n");
return false;
+ }
}
if (SchedInstrNum <= WindowRegionLimit) {
LLVM_DEBUG(dbgs() << "There are too few MIs in the window region!\n");
@@ -437,14 +440,17 @@ int WindowScheduler::calculateMaxCycle(ScheduleDAGInstrs &DAG,
int PredCycle = getOriCycle(PredMI);
ExpectCycle = std::max(ExpectCycle, PredCycle + (int)Pred.getLatency());
}
- // ResourceManager can be used to detect resource conflicts between the
- // current MI and the previously inserted MIs.
- while (!RM.canReserveResources(*SU, CurCycle) || CurCycle < ExpectCycle) {
- ++CurCycle;
- if (CurCycle == (int)WindowIILimit)
- return CurCycle;
+ // Zero cost instructions do not need to check resource.
+ if (!TII->isZeroCost(MI.getOpcode())) {
+ // ResourceManager can be used to detect resource conflicts between the
+ // current MI and the previously inserted MIs.
+ while (!RM.canReserveResources(*SU, CurCycle) || CurCycle < ExpectCycle) {
+ ++CurCycle;
+ if (CurCycle == (int)WindowIILimit)
+ return CurCycle;
+ }
+ RM.reserveResources(*SU, CurCycle);
}
- RM.reserveResources(*SU, CurCycle);
OriToCycle[getOriMI(&MI)] = CurCycle;
LLVM_DEBUG(dbgs() << "\tCycle " << CurCycle << " [S."
<< getOriStage(getOriMI(&MI), Offset) << "]: " << MI);
@@ -485,6 +491,7 @@ int WindowScheduler::calculateMaxCycle(ScheduleDAGInstrs &DAG,
// ========================================
int WindowScheduler::calculateStallCycle(unsigned Offset, int MaxCycle) {
int MaxStallCycle = 0;
+ int CurrentII = MaxCycle + 1;
auto Range = getScheduleRange(Offset, SchedInstrNum);
for (auto &MI : Range) {
auto *SU = TripleDAG->getSUnit(&MI);
@@ -492,8 +499,8 @@ int WindowScheduler::calculateStallCycle(unsigned Offset, int MaxCycle) {
for (auto &Succ : SU->Succs) {
if (Succ.isWeak() || Succ.getSUnit() == &TripleDAG->ExitSU)
continue;
- // If the expected cycle does not exceed MaxCycle, no check is needed.
- if (DefCycle + (int)Succ.getLatency() <= MaxCycle)
+ // If the expected cycle does not exceed CurrentII, no check is needed.
+ if (DefCycle + (int)Succ.getLatency() <= CurrentII)
continue;
// If the cycle of the scheduled MI A is less than that of the scheduled
// MI B, the scheduling will fail because the lifetime of the
@@ -503,7 +510,7 @@ int WindowScheduler::calculateStallCycle(unsigned Offset, int MaxCycle) {
if (DefCycle < UseCycle)
return WindowIILimit;
// Get the stall cycle introduced by the register between two trips.
- int StallCycle = DefCycle + (int)Succ.getLatency() - MaxCycle - UseCycle;
+ int StallCycle = DefCycle + (int)Succ.getLatency() - CurrentII - UseCycle;
MaxStallCycle = std::max(MaxStallCycle, StallCycle);
}
}
diff --git a/llvm/test/CodeGen/Hexagon/swp-ws-fail-2.mir b/llvm/test/CodeGen/Hexagon/swp-ws-fail-2.mir
index 601b98dca8e20b..be75301b016ed9 100644
--- a/llvm/test/CodeGen/Hexagon/swp-ws-fail-2.mir
+++ b/llvm/test/CodeGen/Hexagon/swp-ws-fail-2.mir
@@ -3,6 +3,7 @@
# RUN: -window-sched=force -filetype=null -verify-machineinstrs 2>&1 \
# RUN: | FileCheck %s
+# CHECK: Physical registers are not supported in window scheduling!
# CHECK: The WindowScheduler failed to initialize!
---
diff --git a/llvm/test/CodeGen/Hexagon/swp-ws-pragma-initiation-interval-fail.mir b/llvm/test/CodeGen/Hexagon/swp-ws-pragma-initiation-interval-fail.mir
new file mode 100644
index 00000000000000..6e69a76290fb1d
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/swp-ws-pragma-initiation-interval-fail.mir
@@ -0,0 +1,83 @@
+# RUN: llc --march=hexagon %s -run-pass=pipeliner -debug-only=pipeliner \
+# RUN: -window-sched=force -filetype=null -verify-machineinstrs 2>&1 \
+# RUN: | FileCheck %s
+# REQUIRES: asserts
+
+# Test that checks no window scheduler is performed if the II set by pragma was
+# enabled
+
+# CHECK: Window scheduling is disabled when llvm.loop.pipeline.initiationinterval is set.
+
+--- |
+ define void @test_pragma_ii_fail(ptr %a0, i32 %a1) {
+ b0:
+ %v0 = icmp sgt i32 %a1, 1
+ br i1 %v0, label %b1, label %b4
+
+ b1: ; preds = %b0
+ %v1 = load i32, ptr %a0, align 4
+ %v2 = add i32 %v1, 10
+ %v4 = add i32 %a1, -1
+ %cgep = getelementptr i32, ptr %a0, i32 1
+ br label %b2
+
+ b2: ; preds = %b2, %b1
+ %v5 = phi i32 [ %v12, %b2 ], [ %v4, %b1 ]
+ %v6 = phi ptr [ %cgep2, %b2 ], [ %cgep, %b1 ]
+ %v7 = phi i32 [ %v10, %b2 ], [ %v2, %b1 ]
+ store i32 %v7, ptr %v6, align 4
+ %v8 = add i32 %v7, 10
+ %cgep1 = getelementptr i32, ptr %v6, i32 -1
+ store i32 %v8, ptr %cgep1, align 4
+ %v10 = add i32 %v7, 10
+ %v12 = add i32 %v5, -1
+ %v13 = icmp eq i32 %v12, 0
+ %cgep2 = getelementptr i32, ptr %v6, i32 1
+ br i1 %v13, label %b4, label %b2, !llvm.loop !0
+
+ b4: ; preds = %b2, %b0
+ ret void
+ }
+
+ !0 = distinct !{!0, !1}
+ !1 = !{!"llvm.loop.pipeline.initiationinterval", i32 2}
+...
+---
+name: test_pragma_ii_fail
+tracksRegLiveness: true
+body: |
+ bb.0.b0:
+ successors: %bb.1(0x40000000), %bb.3(0x40000000)
+ liveins: $r0, $r1
+
+ %0:intregs = COPY $r1
+ %1:intregs = COPY $r0
+ %2:predregs = C2_cmpgti %0, 1
+ J2_jumpf %2, %bb.3, implicit-def dead $pc
+ J2_jump %bb.1, implicit-def dead $pc
+
+ bb.1.b1:
+ successors: %bb.2(0x80000000)
+
+ %3:intregs, %4:intregs = L2_loadri_pi %1, 4
+ %5:intregs = A2_addi killed %3, 10
+ %6:intregs = A2_addi %0, -1
+ %7:intregs = COPY %6
+ J2_loop0r %bb.2, %7, implicit-def $lc0, implicit-def $sa0, implicit-def $usr
+
+ bb.2.b2 (machine-block-address-taken):
+ successors: %bb.3(0x04000000), %bb.2(0x7c000000)
+
+ %8:intregs = PHI %4, %bb.1, %9, %bb.2
+ %10:intregs = PHI %5, %bb.1, %11, %bb.2
+ S2_storeri_io %8, 0, %10
+ %11:intregs = A2_addi %10, 10
+ S2_storeri_io %8, -4, %11
+ %9:intregs = A2_addi %8, 4
+ ENDLOOP0 %bb.2, implicit-def $pc, implicit-def $lc0, implicit $sa0, implicit $lc0
+ J2_jump %bb.3, implicit-def dead $pc
+
+ bb.3.b4:
+ PS_jmpret $r31, implicit-def dead $pc
+
+...
diff --git a/llvm/test/CodeGen/Hexagon/swp-ws-resource-reserve.mir b/llvm/test/CodeGen/Hexagon/swp-ws-resource-reserve.mir
new file mode 100644
index 00000000000000..4a9a09c4148cb1
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/swp-ws-resource-reserve.mir
@@ -0,0 +1,100 @@
+# REQUIRES: asserts
+# RUN: llc --march=hexagon %s -run-pass=pipeliner -debug-only=pipeliner \
+# RUN: -window-sched=force -filetype=null -verify-machineinstrs 2>&1 \
+# RUN: -window-search-ratio=100 -window-search-num=100 -window-diff-limit=1 \
+# RUN: | FileCheck %s
+
+# We want to verify that all three V6_vaddw instructions are emitted in the same cycle.
+# CHECK-LABEL: Current window Offset is 2
+# CHECK: Cycle [[CycleNum:[0-9]+]] [{{S.[0-9]+}}]: {{%[0-9]+}}:hvxvr = V6_vaddw {{%[0-9]+}}:hvxvr, {{%[0-9]+}}:hvxvr
+# CHECK: Cycle [[CycleNum]] [{{S.[0-9]+}}]: {{%[0-9]+}}:hvxvr = V6_vaddw {{%[0-9]+}}:hvxvr, {{%[0-9]+}}:hvxvr
+# CHECK: Cycle [[CycleNum]] [{{S.[0-9]+}}]: {{%[0-9]+}}:hvxvr = V6_vaddw {{%[0-9]+}}:hvxvr, {{%[0-9]+}}:hvxvr
+# CHECK-LABEL: Current window Offset is 3
+
+--- |
+ define void @add_parallel(i32 %N, ptr noalias %x, ptr noalias %y) {
+ entry:
+ %isZeroLength = icmp eq i32 %N, 0
+ br i1 %isZeroLength, label %loop.exit, label %loop.preheader
+
+ loop.preheader: ; preds = %entry
+ %half_splat = tail call <32 x i32> @llvm.hexagon.V6.lvsplatw.128B(i32 1056964608)
+ %one_splat = tail call <32 x i32> @llvm.hexagon.V6.lvsplatw.128B(i32 1065353216)
+ %two_splat = tail call <32 x i32> @llvm.hexagon.V6.lvsplatw.128B(i32 1073741824)
+ br label %loop.body
+
+ loop.exit: ; preds = %loop.body, %entry
+ ret void
+
+ loop.body: ; preds = %loop.body, %loop.preheader
+ %lsr.iv1 = phi ptr [ %cgep2, %loop.body ], [ %x, %loop.preheader ]
+ %lsr.iv = phi ptr [ %cgep1, %loop.body ], [ %y, %loop.preheader ]
+ %index = phi i32 [ 0, %loop.preheader ], [ %index.next, %loop.body ]
+ %vec_x1 = load <32 x i32>, ptr %lsr.iv1, align 128
+ %vec_add_1 = tail call <32 x i32> @llvm.hexagon.V6.vaddw.128B(<32 x i32> %one_splat, <32 x i32> %vec_x1)
+ %vec_add_2 = tail call <32 x i32> @llvm.hexagon.V6.vaddw.128B(<32 x i32> %half_splat, <32 x i32> %vec_x1)
+ %vec_add_3 = tail call <32 x i32> @llvm.hexagon.V6.vaddw.128B(<32 x i32> %two_splat, <32 x i32> %vec_x1)
+ %vec_add_4 = tail call <32 x i32> @llvm.hexagon.V6.vaddw.128B(<32 x i32> %vec_add_1, <32 x i32> %vec_add_2)
+ %vec_add_5 = tail call <32 x i32> @llvm.hexagon.V6.vaddw.128B(<32 x i32> %vec_add_1, <32 x i32> %vec_add_3)
+ %vec_add_6 = tail call <32 x i32> @llvm.hexagon.V6.vaddw.128B(<32 x i32> %vec_add_5, <32 x i32> %vec_add_4)
+ store <32 x i32> %vec_add_6, ptr %lsr.iv, align 128
+ %index.next = add nuw i32 %index, 32
+ %continue = icmp ult i32 %index.next, %N
+ %cgep1 = getelementptr i8, ptr %lsr.iv, i32 128
+ %cgep2 = getelementptr i8, ptr %lsr.iv1, i32 128
+ br i1 %continue, label %loop.body, label %loop.exit
+ }
+
+ declare <32 x i32> @llvm.hexagon.V6.lvsplatw.128B(i32)
+ declare <32 x i32> @llvm.hexagon.V6.vaddw.128B(<32 x i32>, <32 x i32>)
+...
+---
+name: add_parallel
+tracksRegLiveness: true
+body: |
+ bb.0.entry:
+ successors: %bb.2(0x30000000), %bb.1(0x50000000)
+ liveins: $r0, $r1, $r2
+
+ %0:intregs = COPY $r2
+ %1:intregs = COPY $r1
+ %2:intregs = COPY $r0
+ %3:predregs = C2_cmpeqi %2, 0
+ J2_jumpt killed %3, %bb.2, implicit-def dead $pc
+ J2_jump %bb.1, implicit-def dead $pc
+
+ bb.1.loop.preheader:
+ successors: %bb.3(0x80000000)
+
+ %4:intregs = A2_tfrsi 1056964608
+ %5:hvxvr = V6_lvsplatw killed %4
+ %6:intregs = A2_tfrsi 1065353216
+ %7:hvxvr = V6_lvsplatw killed %6
+ %8:intregs = A2_tfrsi 1073741824
+ %9:hvxvr = V6_lvsplatw killed %8
+ %10:intregs = A2_addi %2, 31
+ %11:intregs = S2_lsr_i_r %10, 5
+ %12:intregs = COPY %11
+ J2_loop0r %bb.3, %12, implicit-def $lc0, implicit-def $sa0, implicit-def $usr
+ J2_jump %bb.3, implicit-def dead $pc
+
+ bb.2.loop.exit:
+ PS_jmpret $r31, implicit-def dead $pc
+
+ bb.3.loop.body (machine-block-address-taken):
+ successors: %bb.3(0x7c000000), %bb.2(0x04000000)
+
+ %13:intregs = PHI %1, %bb.1, %14, %bb.3
+ %15:intregs = PHI %0, %bb.1, %16, %bb.3
+ %17:hvxvr, %14:intregs = V6_vL32b_pi %13, 128 :: (load (s1024) from %ir.lsr.iv1)
+ %18:hvxvr = V6_vaddw %7, %17
+ %19:hvxvr = V6_vaddw %5, %17
+ %20:hvxvr = V6_vaddw %9, %17
+ %21:hvxvr = V6_vaddw %18, killed %19
+ %22:hvxvr = V6_vaddw %18, killed %20
+ %23:hvxvr = V6_vaddw killed %22, killed %21
+ %16:intregs = V6_vS32b_pi %15, 128, killed %23 :: (store (s1024) into %ir.lsr.iv)
+ ENDLOOP0 %bb.3, implicit-def $pc, implicit-def $lc0, implicit $sa0, implicit $lc0
+ J2_jump %bb.2, implicit-def dead $pc
+
+...
diff --git a/llvm/test/CodeGen/Hexagon/swp-ws-stall-cycle.mir b/llvm/test/CodeGen/Hexagon/swp-ws-stall-cycle.mir
new file mode 100644
index 00000000000000..ddba67d78eb58c
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/swp-ws-stall-cycle.mir
@@ -0,0 +1,59 @@
+# REQUIRES: asserts
+# RUN: llc --march=hexagon %s -run-pass=pipeliner -debug-only=pipeliner \
+# RUN: -window-sched=force -filetype=null -verify-machineinstrs \
+# RUN: -window-region-limit=1 -window-search-ratio=100 -window-diff-limit=0 \
+# RUN: 2>&1 | FileCheck %s
+
+# CHECK-LABEL: Start analyzing II
+# CHECK: MaxStallCycle is 0
+# CHECK-LABEL: Start analyzing II
+# CHECK: MaxStallCycle is 0
+# CHECK-LABEL: Start analyzing II
+# CHECK: MaxStallCycle is 0
+
+---
+name: test_window_stall_cycle
+tracksRegLiveness: true
+body: |
+ bb.0:
+ successors: %bb.3(0x40000000), %bb.1(0x40000000)
+ liveins: $r0, $r1
+
+ %0:intregs = COPY $r1
+ %1:intregs = COPY $r0
+ %2:intregs = nsw A2_add %0, %1
+ %3:intregs = S2_lsr_i_r_acc %2, %2, 31
+ %4:intregs = S2_asr_i_r killed %3, 1
+ %5:predregs = C2_cmpgt %1, %4
+ %6:intregs = A2_tfrsi 0
+ J2_jumpt killed %5, %bb.3, implicit-def dead $pc
+ J2_jump %bb.1, implicit-def dead $pc
+
+ bb.1:
+ successors: %bb.2(0x80000000)
+
+ %7:intregs = A2_addi %4, 2
+ %8:intregs = A2_tfrsi 0
+ %9:intregs = A2_sub %4, %1
+ %10:intregs = A2_addi %9, 1
+ %11:intregs = COPY %10
+ J2_loop0r %bb.2, %11, implicit-def $lc0, implicit-def $sa0, implicit-def $usr
+
+ bb.2 (machine-block-address-taken):
+ successors: %bb.3(0x04000000), %bb.2(0x7c000000)
+
+ %12:intregs = PHI %7, %bb.1, %13, %bb.2
+ %14:intregs = PHI %8, %bb.1, %15, %bb.2
+ %16:intregs = PHI %8, %bb.1, %17, %bb.2
+ %18:intregs, %13:intregs = L2_loadri_pi %12, -4
+ %17:intregs = nsw A2_add killed %18, %16
+ %15:intregs = A2_max %17, %14
+ ENDLOOP0 %bb.2, implicit-def $pc, implicit-def $lc0, implicit $sa0, implicit $lc0
+ J2_jump %bb.3, implicit-def dead $pc
+
+ bb.3:
+ %19:intregs = PHI %6, %bb.0, %15, %bb.2
+ $r0 = COPY %19
+ PS_jmpret $r31, implicit-def dead $pc, implicit $r0
+
+...
diff --git a/llvm/test/CodeGen/Hexagon/swp-ws-zero-cost.mir b/llvm/test/CodeGen/Hexagon/swp-ws-zero-cost.mir
new file mode 100644
index 00000000000000..ecf49a83c69e15
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/swp-ws-zero-cost.mir
@@ -0,0 +1,45 @@
+# REQUIRES: asserts
+# RUN: llc --march=hexagon %s -run-pass=pipeliner -debug-only=pipeliner \
+# RUN: -window-sched=force -filetype=null -verify-machineinstrs 2>&1 \
+# RUN: | FileCheck %s
+
+# CHECK-NOT: Can't find a valid II. Keep searching...
+# CHECK: Start analyzing II
+# CHECK: Start scheduling Phis
+# CHECK: Current window Offset is {{[0-9]+}} and II is {{[0-9]+}}
+
+---
+name: relu
+tracksRegLiveness: true
+body: |
+ bb.0:
+ successors: %bb.2(0x30000000), %bb.1(0x50000000)
+ liveins: $r0, $r1, $r2
+ %0:intregs = COPY $r2
+ %1:intregs = COPY $r1
+ %2:intregs = COPY $r0
+ %3:predregs = C2_cmpeqi %2, 0
+ J2_jumpt killed %3, %bb.2, implicit-def dead $pc
+ J2_jump %bb.1, implicit-def dead $pc
+ bb.1:
+ successors: %bb.3(0x80000000)
+ %4:hvxvr = V6_vd0
+ %5:intregs = A2_addi %2, 31
+ %6:intregs = S2_lsr_i_r %5, 5
+ %7:intregs = COPY %6
+ J2_loop0r %bb.3, %7, implicit-def $lc0, implicit-def $sa0, implicit-def $usr
+ J2_jump %bb.3, implicit-def dead $pc
+ bb.2:
+ PS_jmpret $r31, implicit-def dead $pc
+ bb.3 (machine-block-address-taken):
+ successors: %bb.3(0x7c000000), %bb.2(0x04000000)
+ %8:intregs = PHI %1, %bb.1, %9, %bb.3
+ %10:intregs = PHI %0, %bb.1, %14, %bb.3
+ %11:hvxvr, %9:intregs = V6_vL32b_pi %8, 128
+ %12:intregs = COPY %10
+ %13:hvxvr = V6_vmaxw killed %11, %4
+ %14:intregs = V6_vS32b_pi %12, 128, killed %13
+ ENDLOOP0 %bb.3, implicit-def $pc, implicit-def $lc0, implicit $sa0, implicit $lc0
+ J2_jump %bb.2, implicit-def dead $pc
+...
+
``````````
</details>
https://github.com/llvm/llvm-project/pull/102881
More information about the llvm-branch-commits
mailing list