[llvm-dev] [PATCH] D69437: [RAGreedy] Enable -consider-local-interval-cost by default
Sanne Wouda via llvm-dev
llvm-dev at lists.llvm.org
Mon Oct 28 09:12:45 PDT 2019
Hi all,
We're planning to turn on -consider-local-interval-cost for all targets, which fixes sub-optimal register allocation in certain cases. Since this is a target-independent change, we'd like to give people the opportunity to run their own numbers or raise any concerns.
The option enables a more accurate consideration of local interval costs when selecting a split candidate. It is already enabled on X86 and we've seen the same issue (see below) on AArch64. We expect that this would also be a (latent) issue for other targets so enabling the option for all seems like the right thing to do.
The tl;dr is that turning on this option has a small impact on compile time and shows some positives and negatives on individual benchmarks, but no change in geomean for either SPEC2017 or the LLVM test suite on AArch64. The full details are below and on https://reviews.llvm.org/D69437. The commit that added -consider-local-interval-cost is here: https://reviews.llvm.org/rL323870.
Please let us know what you think.
Cheers,
Sanne
________________________________
From: Sanne Wouda via Phabricator <reviews at reviews.llvm.org>
Sent: 25 October 2019 18:53
To: Sanne Wouda <Sanne.Wouda at arm.com>
Cc: matze at braunis.de <matze at braunis.de>; quentin.colombet at gmail.com <quentin.colombet at gmail.com>; Kristof Beyls <Kristof.Beyls at arm.com>; hiraditya at msn.com <hiraditya at msn.com>; llvm-commits at lists.llvm.org <llvm-commits at lists.llvm.org>; t.p.northover at gmail.com <t.p.northover at gmail.com>; mcrosier at codeaurora.org <mcrosier at codeaurora.org>; john.reagan at vmssoftware.com <john.reagan at vmssoftware.com>; jji at us.ibm.com <jji at us.ibm.com>; yuanfang.chen at sony.com <yuanfang.chen at sony.com>; David Green <David.Green at arm.com>
Subject: [PATCH] D69437: [RAGreedy] Enable -consider-local-interval-cost by default
sanwou01 created this revision.
Herald added subscribers: llvm-commits, hiraditya, kristof.beyls, qcolombet, MatzeB.
Herald added a project: LLVM.
sanwou01 added reviewers: SjoerdMeijer, samparker, dmgreen.
The greedy register allocator occasionally decides to insert a large number of
unnecessary copies, see below for an example. The -consider-local-interval-cost
option (which X86 already enables by default) fixes this. We enable this option
for all target backends. To turn the new default behaviour off, use
-consider-local-interval-cost=false.
We evaluated the impact of this change on compile time, code size and
performance benchmarks.
This option has a small impact on compile time, measured on CTMark. A 0.1%
geomean regression on -O1 and -O2, and 0.2% geomean for -O3, with at most 0.5%
on individual benchmarks.
The effect on both code size and performance on AArch64 for the LLVM test suite
is nil on the geomean with individual outliers (ignoring short exec_times)
between:
best worst
size..text -3.3% +0.0%
exec_time -5.8% +2.3%
On SPEC CPU® 2017 (compiled for AArch64) there is a minor reduction (-0.2% at
most) in code size on some benchmarks, with a tiny movement (-0.01%) on the
geomean. Neither intrate nor fprate show any change in performance.
This patch makes the following changes.
- For all targets, enableAdvancedRASplitCost() returns true. Individual targets can still override enableAdvancedRASplitCost() to force the previous behaviour.
- Remove X86's now unnecessary override.
- Ensure that -consider-local-interval-cost=false overrides the new default behaviour.
This matrix multiply example:
$ cat test.c
long A[8][8];
long B[8][8];
long C[8][8];
void run_test() {
for (int k = 0; k < 8; k++) {
for (int i = 0; i < 8; i++) {
for (int j = 0; j < 8; j++) {
C[i][j] += A[i][k] * B[k][j];
}
}
}
}
results in the following generated code on AArch64:
$ clang --target=aarch64-arm-none-eabi -O3 -S test.c -o -
[...]
// %for.cond1.preheader
// =>This Inner Loop Header: Depth=1
add x14, x11, x9
str q0, [sp, #16] // 16-byte Folded Spill
ldr q0, [x14]
mov v2.16b, v15.16b
mov v15.16b, v14.16b
mov v14.16b, v13.16b
mov v13.16b, v12.16b
mov v12.16b, v11.16b
mov v11.16b, v10.16b
mov v10.16b, v9.16b
mov v9.16b, v8.16b
mov v8.16b, v31.16b
mov v31.16b, v30.16b
mov v30.16b, v29.16b
mov v29.16b, v28.16b
mov v28.16b, v27.16b
mov v27.16b, v26.16b
mov v26.16b, v25.16b
mov v25.16b, v24.16b
mov v24.16b, v23.16b
mov v23.16b, v22.16b
mov v22.16b, v21.16b
mov v21.16b, v20.16b
mov v20.16b, v19.16b
mov v19.16b, v18.16b
mov v18.16b, v17.16b
mov v17.16b, v16.16b
mov v16.16b, v7.16b
mov v7.16b, v6.16b
mov v6.16b, v5.16b
mov v5.16b, v4.16b
mov v4.16b, v3.16b
mov v3.16b, v1.16b
mov x12, v0.d[1]
fmov x15, d0
ldp q1, q0, [x14, #16]
ldur x1, [x10, #-256]
ldur x2, [x10, #-192]
add x9, x9, #64 // =64
mov x13, v1.d[1]
fmov x16, d1
ldr q1, [x14, #48]
mul x3, x15, x1
mov x14, v0.d[1]
fmov x17, d0
mov x18, v1.d[1]
fmov x0, d1
mov v1.16b, v3.16b
mov v3.16b, v4.16b
mov v4.16b, v5.16b
mov v5.16b, v6.16b
mov v6.16b, v7.16b
mov v7.16b, v16.16b
mov v16.16b, v17.16b
mov v17.16b, v18.16b
mov v18.16b, v19.16b
mov v19.16b, v20.16b
mov v20.16b, v21.16b
mov v21.16b, v22.16b
mov v22.16b, v23.16b
mov v23.16b, v24.16b
mov v24.16b, v25.16b
mov v25.16b, v26.16b
mov v26.16b, v27.16b
mov v27.16b, v28.16b
mov v28.16b, v29.16b
mov v29.16b, v30.16b
mov v30.16b, v31.16b
mov v31.16b, v8.16b
mov v8.16b, v9.16b
mov v9.16b, v10.16b
mov v10.16b, v11.16b
mov v11.16b, v12.16b
mov v12.16b, v13.16b
mov v13.16b, v14.16b
mov v14.16b, v15.16b
mov v15.16b, v2.16b
ldr q2, [sp] // 16-byte Folded Reload
fmov d0, x3
mul x3, x12, x1
[...]
With -consider-local-interval-cost the same section of code results in the
following:
$ clang --target=aarch64-arm-none-eabi -mllvm -consider-local-interval-cost -O3 -S test.c -o -
[...]
.LBB0_1: // %for.cond1.preheader
// =>This Inner Loop Header: Depth=1
add x14, x11, x9
ldp q0, q1, [x14]
ldur x1, [x10, #-256]
ldur x2, [x10, #-192]
add x9, x9, #64 // =64
mov x12, v0.d[1]
fmov x15, d0
mov x13, v1.d[1]
fmov x16, d1
ldp q0, q1, [x14, #32]
mul x3, x15, x1
cmp x9, #512 // =512
mov x14, v0.d[1]
fmov x17, d0
fmov d0, x3
mul x3, x12, x1
[...]
Repository:
rG LLVM Github Monorepo
https://reviews.llvm.org/D69437
Files:
llvm/lib/CodeGen/RegAllocGreedy.cpp
llvm/lib/CodeGen/TargetSubtargetInfo.cpp
llvm/lib/Target/X86/X86Subtarget.h
llvm/test/CodeGen/AArch64/ragreedy-local-interval-cost.ll
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/llvm-dev/attachments/20191028/63ca09d4/attachment.html>
More information about the llvm-dev
mailing list