[all-commits] [llvm/llvm-project] f649f2: [RAGreedy] Enable -consider-local-interval-cost fo...
Sanne Wouda via All-commits
all-commits at lists.llvm.org
Fri Nov 8 02:20:49 PST 2019
Branch: refs/heads/master
Home: https://github.com/llvm/llvm-project
Commit: f649f24d388c745d20fab5573d27b822b92818ed
https://github.com/llvm/llvm-project/commit/f649f24d388c745d20fab5573d27b822b92818ed
Author: Sanne Wouda <Sanne.Wouda at arm.com>
Date: 2019-11-08 (Fri, 08 Nov 2019)
Changed paths:
M llvm/lib/CodeGen/RegAllocGreedy.cpp
M llvm/lib/Target/AArch64/AArch64Subtarget.h
M llvm/test/CodeGen/AArch64/ragreedy-local-interval-cost.ll
Log Message:
-----------
[RAGreedy] Enable -consider-local-interval-cost for AArch64
Summary:
The greedy register allocator occasionally decides to insert a large number of
unnecessary copies, see below for an example. The -consider-local-interval-cost
option (which X86 already enables by default) fixes this. We enable this option
for AArch64 only after receiving feedback that this change is not beneficial for
PowerPC.
We evaluated the impact of this change on compile time, code size and
performance benchmarks.
This option has a small impact on compile time, measured on CTMark. A 0.1%
geomean regression on -O1 and -O2, and 0.2% geomean for -O3, with at most 0.5%
on individual benchmarks.
The effect on both code size and performance on AArch64 for the LLVM test suite
is nil on the geomean with individual outliers (ignoring short exec_times)
between:
best worst
size..text -3.3% +0.0%
exec_time -5.8% +2.3%
On SPEC CPU® 2017 (compiled for AArch64) there is a minor reduction (-0.2% at
most) in code size on some benchmarks, with a tiny movement (-0.01%) on the
geomean. Neither intrate nor fprate show any change in performance.
This patch makes the following changes.
- For the AArch64 target, enableAdvancedRASplitCost() now returns true.
- Ensures that -consider-local-interval-cost=false can disable the new
behaviour if necessary.
This matrix multiply example:
$ cat test.c
long A[8][8];
long B[8][8];
long C[8][8];
void run_test() {
for (int k = 0; k < 8; k++) {
for (int i = 0; i < 8; i++) {
for (int j = 0; j < 8; j++) {
C[i][j] += A[i][k] * B[k][j];
}
}
}
}
results in the following generated code on AArch64:
$ clang --target=aarch64-arm-none-eabi -O3 -S test.c -o -
[...]
// %for.cond1.preheader
// =>This Inner Loop Header: Depth=1
add x14, x11, x9
str q0, [sp, #16] // 16-byte Folded Spill
ldr q0, [x14]
mov v2.16b, v15.16b
mov v15.16b, v14.16b
mov v14.16b, v13.16b
mov v13.16b, v12.16b
mov v12.16b, v11.16b
mov v11.16b, v10.16b
mov v10.16b, v9.16b
mov v9.16b, v8.16b
mov v8.16b, v31.16b
mov v31.16b, v30.16b
mov v30.16b, v29.16b
mov v29.16b, v28.16b
mov v28.16b, v27.16b
mov v27.16b, v26.16b
mov v26.16b, v25.16b
mov v25.16b, v24.16b
mov v24.16b, v23.16b
mov v23.16b, v22.16b
mov v22.16b, v21.16b
mov v21.16b, v20.16b
mov v20.16b, v19.16b
mov v19.16b, v18.16b
mov v18.16b, v17.16b
mov v17.16b, v16.16b
mov v16.16b, v7.16b
mov v7.16b, v6.16b
mov v6.16b, v5.16b
mov v5.16b, v4.16b
mov v4.16b, v3.16b
mov v3.16b, v1.16b
mov x12, v0.d[1]
fmov x15, d0
ldp q1, q0, [x14, #16]
ldur x1, [x10, #-256]
ldur x2, [x10, #-192]
add x9, x9, #64 // =64
mov x13, v1.d[1]
fmov x16, d1
ldr q1, [x14, #48]
mul x3, x15, x1
mov x14, v0.d[1]
fmov x17, d0
mov x18, v1.d[1]
fmov x0, d1
mov v1.16b, v3.16b
mov v3.16b, v4.16b
mov v4.16b, v5.16b
mov v5.16b, v6.16b
mov v6.16b, v7.16b
mov v7.16b, v16.16b
mov v16.16b, v17.16b
mov v17.16b, v18.16b
mov v18.16b, v19.16b
mov v19.16b, v20.16b
mov v20.16b, v21.16b
mov v21.16b, v22.16b
mov v22.16b, v23.16b
mov v23.16b, v24.16b
mov v24.16b, v25.16b
mov v25.16b, v26.16b
mov v26.16b, v27.16b
mov v27.16b, v28.16b
mov v28.16b, v29.16b
mov v29.16b, v30.16b
mov v30.16b, v31.16b
mov v31.16b, v8.16b
mov v8.16b, v9.16b
mov v9.16b, v10.16b
mov v10.16b, v11.16b
mov v11.16b, v12.16b
mov v12.16b, v13.16b
mov v13.16b, v14.16b
mov v14.16b, v15.16b
mov v15.16b, v2.16b
ldr q2, [sp] // 16-byte Folded Reload
fmov d0, x3
mul x3, x12, x1
[...]
With -consider-local-interval-cost the same section of code results in the
following:
$ clang --target=aarch64-arm-none-eabi -mllvm -consider-local-interval-cost -O3 -S test.c -o -
[...]
.LBB0_1: // %for.cond1.preheader
// =>This Inner Loop Header: Depth=1
add x14, x11, x9
ldp q0, q1, [x14]
ldur x1, [x10, #-256]
ldur x2, [x10, #-192]
add x9, x9, #64 // =64
mov x12, v0.d[1]
fmov x15, d0
mov x13, v1.d[1]
fmov x16, d1
ldp q0, q1, [x14, #32]
mul x3, x15, x1
cmp x9, #512 // =512
mov x14, v0.d[1]
fmov x17, d0
fmov d0, x3
mul x3, x12, x1
[...]
Reviewers: SjoerdMeijer, samparker, dmgreen, qcolombet
Reviewed By: dmgreen
Subscribers: ZhangKang, jsji, wuzish, ppc-slack, lkail, steven.zhang, MatzeB, qcolombet, kristof.beyls, hiraditya, llvm-commits
Tags: #llvm
Differential Revision: https://reviews.llvm.org/D69437
More information about the All-commits
mailing list