[llvm-dev] Machine Scheduler on PowerPC: Latency Limit and Register Pressure
Stefan Pintilie via llvm-dev
llvm-dev at lists.llvm.org
Fri Oct 13 13:09:23 PDT 2017
Hi,
I've been looking at the Machine Scheduler on PowerPC. I am looking only
at the pre-RA machine scheduler, and I am running it in the default
bi-directional mode (so both the top-down and bottom-up queues are
considered). I've come across an example where the scheduler picks a poor
ordering for the instructions, which results in very high register
pressure and, in turn, spills. The problem comes from the fact that the
Machine Scheduler uses a maximum latency limit when it considers
instructions to schedule. A high-latency instruction will not be
scheduled before all of the available lower-latency instructions are
scheduled. This happens regardless of register pressure, since the
high-latency instruction is not even added to the "Available" queue that
the heuristics use when picking the next instruction to schedule.
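To make the question concrete, here is a condensed paraphrase of the
logic I believe is responsible, based on my reading of
SchedBoundary::releaseNode() in lib/CodeGen/MachineScheduler.cpp
(simplified and copied by hand, so treat the details as approximate):
--
// Condensed paraphrase of SchedBoundary::releaseNode(); simplified,
// and the details are approximate.
void SchedBoundary::releaseNode(SUnit *SU, unsigned ReadyCycle) {
  if (ReadyCycle < MinReadyCycle)
    MinReadyCycle = ReadyCycle;

  // A node whose ReadyCycle is still in the future (one that is waiting
  // on a long-latency predecessor) is parked on Pending, not Available.
  bool IsBuffered = SchedModel->getMicroOpBufferSize() != 0;
  if ((!IsBuffered && ReadyCycle > CurrCycle) || checkHazard(SU))
    Pending.push(SU);
  else
    Available.push(SU);
}
--
If that reading is right, the register-pressure heuristics in pickNode()
never even see a node that is sitting on Pending until releasePending()
promotes it, which matches what I see in the debug output below.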
My question is: why do we have that latency limit in the first place? If
an instruction can be scheduled (i.e., all of the instructions it depends
on have already been scheduled), shouldn't it at least be considered?
The example is listed below:
test.c
--
long A[100];

long func(long* num, long* den) {
  // This loop is unrolled.
  for (int i = 0; i < 6; i++) {
    A[i] = num[i] / den[i];
  }
  return 0;
}
--
Compile commands
--
clang -c -m64 -O3 -target powerpc64le-unknown-linux-gnu -mcpu=pwr9 \
  -fexperimental-new-pass-manager test.c -S -emit-llvm
llc test.ll -O3 -ppc-asm-full-reg-names -debug-only=machine-scheduler \
  -o test-p9.s > listing.out 2>&1
--
Looking at the listing.out file, I've noticed that all of the loads are
grouped together at the start of the function. The six unrolled
iterations each load two operands, so those loads occupy 12 registers
before any of the divides are scheduled. As a result, register pressure
after all the loads is significantly higher than an interleaved ordering
would require.
--
0B   BB#0: derived from LLVM BB %entry
         Live Ins: %X3 %X4
16B      %vreg1<def> = COPY %X4; G8RC_and_G8RC_NOX0:%vreg1
32B      %vreg0<def> = COPY %X3; G8RC_and_G8RC_NOX0:%vreg0
48B      %vreg2<def> = LD 0, %vreg0; mem:LD8[%num](tbaa=!4) G8RC:%vreg2 G8RC_and_G8RC_NOX0:%vreg0
64B      %vreg3<def> = LD 0, %vreg1; mem:LD8[%den](tbaa=!4) G8RC:%vreg3 G8RC_and_G8RC_NOX0:%vreg1
144B     %vreg7<def> = LD 8, %vreg0; mem:LD8[%arrayidx.1](tbaa=!4) G8RC:%vreg7 G8RC_and_G8RC_NOX0:%vreg0
160B     %vreg8<def> = LD 8, %vreg1; mem:LD8[%arrayidx2.1](tbaa=!4) G8RC:%vreg8 G8RC_and_G8RC_NOX0:%vreg1
208B     %vreg10<def> = LD 16, %vreg0; mem:LD8[%arrayidx.2](tbaa=!4) G8RC:%vreg10 G8RC_and_G8RC_NOX0:%vreg0
224B     %vreg11<def> = LD 16, %vreg1; mem:LD8[%arrayidx2.2](tbaa=!4) G8RC:%vreg11 G8RC_and_G8RC_NOX0:%vreg1
272B     %vreg13<def> = LD 24, %vreg0; mem:LD8[%arrayidx.3](tbaa=!4) G8RC:%vreg13 G8RC_and_G8RC_NOX0:%vreg0
288B     %vreg14<def> = LD 24, %vreg1; mem:LD8[%arrayidx2.3](tbaa=!4) G8RC:%vreg14 G8RC_and_G8RC_NOX0:%vreg1
336B     %vreg16<def> = LD 32, %vreg0; mem:LD8[%arrayidx.4](tbaa=!4) G8RC:%vreg16 G8RC_and_G8RC_NOX0:%vreg0
352B     %vreg17<def> = LD 32, %vreg1; mem:LD8[%arrayidx2.4](tbaa=!4) G8RC:%vreg17 G8RC_and_G8RC_NOX0:%vreg1
400B     %vreg19<def> = LD 40, %vreg0; mem:LD8[%arrayidx.5](tbaa=!4) G8RC:%vreg19 G8RC_and_G8RC_NOX0:%vreg0
416B     %vreg20<def> = LD 40, %vreg1; mem:LD8[%arrayidx2.5](tbaa=!4) G8RC:%vreg20 G8RC_and_G8RC_NOX0:%vreg1
424B     %vreg4<def> = DIVD %vreg2, %vreg3; G8RC:%vreg4,%vreg2,%vreg3
432B     %vreg9<def> = DIVD %vreg7, %vreg8; G8RC:%vreg9,%vreg7,%vreg8
440B     %vreg12<def> = DIVD %vreg10, %vreg11; G8RC:%vreg12,%vreg10,%vreg11
448B     %vreg15<def> = DIVD %vreg13, %vreg14; G8RC:%vreg15,%vreg13,%vreg14
456B     %vreg18<def> = DIVD %vreg16, %vreg17; G8RC:%vreg18,%vreg16,%vreg17
464B     %vreg21<def> = DIVD %vreg19, %vreg20; G8RC:%vreg21,%vreg19,%vreg20
472B     %vreg5<def> = ADDIStocHA %X2, <ga:@A>; G8RC_and_G8RC_NOX0:%vreg5
480B     %vreg6<def> = LDtocL <ga:@A>, %vreg5, %X2<imp-use>; mem:LD8[GOT] G8RC_and_G8RC_NOX0:%vreg6,%vreg5
504B     %X3<def> = LI8 0
512B     STD %vreg4, 0, %vreg6; mem:ST8[getelementptr inbounds ([100 x i64], [100 x i64]* @A, i64 0, i64 0)](tbaa=!4) G8RC:%vreg4 G8RC_and_G8RC_NOX0:%vreg6
520B     STD %vreg9, 8, %vreg6; mem:ST8[getelementptr inbounds ([100 x i64], [100 x i64]* @A, i64 0, i64 1)](tbaa=!4) G8RC:%vreg9 G8RC_and_G8RC_NOX0:%vreg6
528B     STD %vreg12, 16, %vreg6; mem:ST8[getelementptr inbounds ([100 x i64], [100 x i64]* @A, i64 0, i64 2)](tbaa=!4) G8RC:%vreg12 G8RC_and_G8RC_NOX0:%vreg6
536B     STD %vreg15, 24, %vreg6; mem:ST8[getelementptr inbounds ([100 x i64], [100 x i64]* @A, i64 0, i64 3)](tbaa=!4) G8RC:%vreg15 G8RC_and_G8RC_NOX0:%vreg6
544B     STD %vreg18, 32, %vreg6; mem:ST8[getelementptr inbounds ([100 x i64], [100 x i64]* @A, i64 0, i64 4)](tbaa=!4) G8RC:%vreg18 G8RC_and_G8RC_NOX0:%vreg6
552B     STD %vreg21, 40, %vreg6; mem:ST8[getelementptr inbounds ([100 x i64], [100 x i64]* @A, i64 0, i64 5)](tbaa=!4) G8RC:%vreg21 G8RC_and_G8RC_NOX0:%vreg6
560B     BLR8 %LR8<imp-use>, %RM<imp-use>, %X3<imp-use>
--
Due to all of the register pressure built up by those loads, we are
forced to spill. Here is the final assembly:
--
# BB#0: # %entry
std r30, -16(r1) # 8-byte Folded Spill
ld r5, 0(r3)
ld r6, 0(r4)
ld r7, 8(r3)
ld r8, 8(r4)
ld r9, 16(r3)
ld r10, 16(r4)
ld r11, 24(r3)
ld r0, 32(r3)
ld r12, 24(r4)
ld r30, 32(r4)
ld r3, 40(r3)
ld r4, 40(r4)
divd r5, r5, r6
divd r6, r7, r8
divd r7, r9, r10
divd r9, r0, r30
divd r4, r3, r4
divd r8, r11, r12
addis r3, r2, .LC0@toc@ha
ld r30, -16(r1) # 8-byte Folded Reload
ld r10, .LC0@toc@l(r3)
li r3, 0
std r5, 0(r10)
std r6, 8(r10)
std r7, 16(r10)
std r9, 32(r10)
std r8, 24(r10)
std r4, 40(r10)
blr
--
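For contrast, the ordering I was hoping for interleaves each divide with
the pair of loads that feeds it. The following is hand-written to
illustrate the point (it is not compiler output): the divide results
still accumulate, but only a couple of load temporaries are live at any
one time, so nothing needs to spill.
--
ld   r5, 0(r3)        # num[0]
ld   r6, 0(r4)        # den[0]
divd r5, r5, r6       # r6 dies here and can be reused
ld   r6, 8(r3)        # num[1]
ld   r7, 8(r4)        # den[1]
divd r6, r6, r7       # and so on for the remaining iterations
...
--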
Thank you,
Stefan Pintilie