[LLVMdev] How to remove a pesky store?
Fernando Magno Quintao Pereira
pronesto at gmail.com
Thu Apr 16 07:49:25 PDT 2015
Dear LLVMers,
I am stumbling upon a pesky store instruction that maybe you could
hoist out of a loop. So, I am reporting my "story" here, to either
understand why the store must stay, or warn you that perhaps you could
improve some of your optimizations to hoist/sunk it away. At the end
of this e-mail, I am sending you a benchmark that has four different
versions of the following kernel, which performs the sum of prefixes:
void prefix_sum_orig(struct array *src, struct array *dst) {
int i, j;
for (i = 0; i < src->s; i++) {
dst->v[i] = 0;
for (j = 0; j < i; j++) {
dst->v[i] += src->v[j];
}
}
}
The second and third versions are the kernels that we get with
clang -O3, if we using "restrict" in the arguments:
void prefix_sum_rest(struct array src[restrict], struct array dst[restrict]) {
int i, j;
for (i = 0; i < src->s; i++) {
dst->v[i] = 0;
for (j = 0; j < i; j++) {
dst->v[i] += src->v[j];
}
}
}
At -O3, this kernel is more-or-less equivalent to this one below,
which is the third version in the benchmark:
void prefix_sum_o3(struct array *src, struct array *dst) {
int i, j;
int size = src->s;
if (0 < size) {
int* dst_v = dst->v;
i = 0;
do {
int tmp = 0;
j = 0;
if (j < i) {
int* src_v = src->v;
do {
tmp += src_v[j];
dst_v[i] = tmp;
j++;
} while (j < i);
}
i++;
} while (i < size);
}
}
Both these kernels, prefix_sum_rest and prefix_sum_o3, have very
similar runtime. Look at how LLVM has been able to hoist or sink
almost every load. However, the store into dst_v[i] remains at the
innermost loop. Why has LLVM not moved that store outside the
innermost loop? I have not found any reason why it would not do
otherwise. If we perform this optimization, than the gains are
substantial in this example. Below is the kernel that I wish LLVM
could produce:
void prefix_sum_fer(struct array *src, struct array *dst) {
int i, j;
int size = src->s;
if (0 < size) {
int* dst_v = dst->v;
i = 0;
do {
int tmp = 0;
j = 0;
if (j < i) {
int* src_v = src->v;
do {
tmp += src_v[j];
j++;
} while (j < i);
}
dst_v[i] = tmp;
i++;
} while (i < size);
}
}
And below is the commands that I use to benchmark these tests on
an Intel 1.4GHz Core i5 with 4 GH, 1,600 MHz DDR3 running OSX 10.9.5:
$> clang -v
clang version 3.6.0 (trunk 224772)
Target: x86_64-apple-darwin13.4.0
$> clang -c -emit-llvm a.c -o a.bc ; opt -O3 -disable-inlining a.bc -o
a.opt ; llc a.opt -o a.s ; clang a.s -o a.out
$> ./a.out 100000 100 1 ; ./a.out 100000 100 2 ; ./a.out 100000 100 3
; ./a.out 100000 100 4
Kernel orig
Time spent = 13.248573
Final answer = -290229404
Kernel o3
Time spent = 2.513803
Final answer = -290229404
Kernel restrict
Time spent = 2.483664
Final answer = -290229404
Kernel fer
Time spent = 0.622676
Final answer = -290229404
If I do not disable inlining, then there is no difference between
orig, o3 and restrict; however fer is still the fastest:
$> ./a.out 100000 100 1 ; ./a.out 100000 100 2 ; ./a.out 100000 100 3
; ./a.out 100000 100 4
Kernel orig
Time spent = 2.491038
Final answer = -290229404
Kernel o3
Time spent = 2.481961
Final answer = -290229404
Kernel restrict
Time spent = 2.482067
Final answer = -290229404
Kernel fer
Time spent = 0.626108
Final answer = -290229404
The benchmark that I have used in embedded in the text below:
--------- Begin Bench -----------
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
/*
Benchmark that shows the annoying store in the innermost loop.
author: fernando at dcc.ufmg.br
date: Apr 16th 2015
*/
struct array {
int s;
int* v;
};
struct array* new_array(int N) {
struct array *a = (struct array*)malloc(sizeof(struct array*));
a->v = (int*)malloc(N * sizeof(int));
a->s = N;
return a;
}
void fill_array(struct array *a, int max) {
const int up = 32767;
int i;
for (i = 0; i < a->s; i++) {
a->v[i] = i * up % max;
}
}
void print_array(struct array *a) {
int i;
for (i = 0; i < a->s; i++) {
if (i % 10 == 0) {
printf("\n");
}
printf("%8d", a->v[i]);
}
printf("\n");
}
// This is the original kernel, which performs a sum of prefixes.
void prefix_sum_orig(struct array *src, struct array *dst) {
int i, j;
for (i = 0; i < src->s; i++) {
dst->v[i] = 0;
for (j = 0; j < i; j++) {
dst->v[i] += src->v[j];
}
}
}
// This is, more or less, the kernel that we get with restrict and -O3:
void prefix_sum_o3(struct array *src, struct array *dst) {
int i, j;
int size = src->s;
if (0 < size) {
int* dst_v = dst->v;
i = 0;
do {
int tmp = 0;
j = 0;
if (j < i) {
int* src_v = src->v;
do {
tmp += src_v[j];
dst_v[i] = tmp;
j++;
} while (j < i);
}
i++;
} while (i < size);
}
}
// Just to show that prefix_sum_o3 is sound, here I have the same kernel, with
// the "restrict" keywork.
void prefix_sum_rest(struct array src[restrict], struct array dst[restrict]) {
int i, j;
for (i = 0; i < src->s; i++) {
dst->v[i] = 0;
for (j = 0; j < i; j++) {
dst->v[i] += src->v[j];
}
}
}
// Here is the kernel that I wish I could get with LLVM -O3 + restrict. I
// wish the store in the innermost loop could go away.
void prefix_sum_fer(struct array *src, struct array *dst) {
int i, j;
int size = src->s;
if (0 < size) {
int* dst_v = dst->v;
i = 0;
do {
int tmp = 0;
j = 0;
if (j < i) {
int* src_v = src->v;
do {
tmp += src_v[j];
j++;
} while (j < i);
}
dst_v[i] = tmp;
i++;
} while (i < size);
}
}
// This function is just to check if the kernels are producing equivalent
// results.
int sum_array(struct array *src) {
int sum = 0, i;
int* src_v = src->v;
int size = src->s;
for (i = 0; i < size; i++) {
sum += src_v[i];
}
return sum;
}
int main(int argc, char** argv) {
if (argc < 3) {
printf("Syntax: %s <array_size> <max_elem> <kernel>\n", argv[0]);
return 1;
} else {
clock_t begin, end;
double time_spent;
struct array *a0, *a1;
const int N = atoi(argv[1]);
const int M = atoi(argv[2]);
const int K = atoi(argv[3]);
printf("\n");
a0 = new_array(N);
a1 = new_array(N);
fill_array(a0, M);
begin = clock();
switch(K) {
case 1: printf("Kernel orig\n"); prefix_sum_orig(a0, a1); break;
case 2: printf("Kernel o3\n"); prefix_sum_o3(a0, a1); break;
case 3: printf("Kernel restrict\n"); prefix_sum_rest(a0, a1); break;
case 4: printf("Kernel fer\n"); prefix_sum_fer(a0, a1); break;
}
end = clock();
time_spent = (double)(end - begin) / CLOCKS_PER_SEC;
printf("Time spent = %lf\n", time_spent);
printf("Final answer = %d\n", sum_array(a1));
}
}
--------- End Bench -----------
Regards,
Fernando
More information about the llvm-dev
mailing list