[LLVMdev] How to remove a pesky store?

Thu Apr 16 07:49:25 PDT 2015

Dear LLVMers,

    I am stumbling upon a pesky store instruction that maybe you could
hoist out of a loop. So, I am reporting my "story" here, to either
understand why the store must stay, or warn you that perhaps you could
improve some of your optimizations to hoist/sunk it away. At the end
of this e-mail, I am sending you a benchmark that has four different
versions of the following kernel, which performs the sum of prefixes:

void prefix_sum_orig(struct array *src, struct array *dst) {
  int i, j;
  for (i = 0; i < src->s; i++) {
    dst->v[i] = 0;
    for (j = 0; j < i; j++) {
      dst->v[i] += src->v[j];
    }
  }
}

   The second and third versions are the kernels that we get with
clang -O3, if we using "restrict" in the arguments:

void prefix_sum_rest(struct array src[restrict], struct array dst[restrict]) {
  int i, j;
  for (i = 0; i < src->s; i++) {
    dst->v[i] = 0;
    for (j = 0; j < i; j++) {
      dst->v[i] += src->v[j];
    }
  }
}

   At -O3, this kernel is more-or-less equivalent to this one below,
which is the third version in the benchmark:

void prefix_sum_o3(struct array *src, struct array *dst) {
  int i, j;
  int size = src->s;
  if (0 < size) {
    int* dst_v = dst->v;
    i = 0;
    do {
      int tmp = 0;
      j = 0;
      if (j < i) {
        int* src_v = src->v;
        do  {
          tmp += src_v[j];
          dst_v[i] = tmp;
          j++;
        } while (j < i);
      }
      i++;
    } while (i < size);
  }
}

   Both these kernels, prefix_sum_rest and prefix_sum_o3, have very
similar runtime. Look at how LLVM has been able to hoist or sink
almost every load. However, the store into dst_v[i] remains at the
innermost loop. Why has LLVM not moved that store outside the
innermost loop? I have not found any reason why it would not do
otherwise. If we perform this optimization, than the gains are
substantial in this example. Below is the kernel that I wish LLVM
could produce:

void prefix_sum_fer(struct array *src, struct array *dst) {
  int i, j;
  int size = src->s;
  if (0 < size) {
    int* dst_v = dst->v;
    i = 0;
    do {
      int tmp = 0;
      j = 0;
      if (j < i) {
        int* src_v = src->v;
        do  {
          tmp += src_v[j];
          j++;
        } while (j < i);
      }
      dst_v[i] = tmp;
      i++;
    } while (i < size);
  }
}

    And below is the commands that I use to benchmark these tests on
an Intel 1.4GHz Core i5 with 4 GH, 1,600 MHz DDR3 running OSX 10.9.5:

$> clang -v
clang version 3.6.0 (trunk 224772)
Target: x86_64-apple-darwin13.4.0

$> clang -c -emit-llvm a.c -o a.bc ; opt -O3 -disable-inlining a.bc -o
a.opt ; llc a.opt -o a.s ; clang a.s -o a.out

$> ./a.out 100000 100 1 ; ./a.out 100000 100 2 ;  ./a.out 100000 100 3
;  ./a.out 100000 100 4

Kernel orig
Time spent = 13.248573
Final answer = -290229404

Kernel o3
Time spent = 2.513803
Final answer = -290229404

Kernel restrict
Time spent = 2.483664
Final answer = -290229404

Kernel fer
Time spent = 0.622676
Final answer = -290229404

    If I do not disable inlining, then there is no difference between
orig, o3 and restrict; however fer is still the fastest:

$> ./a.out 100000 100 1 ; ./a.out 100000 100 2 ;  ./a.out 100000 100 3
;  ./a.out 100000 100 4

Kernel orig
Time spent = 2.491038
Final answer = -290229404

Kernel o3
Time spent = 2.481961
Final answer = -290229404

Kernel restrict
Time spent = 2.482067
Final answer = -290229404

Kernel fer
Time spent = 0.626108
Final answer = -290229404

The benchmark that I have used in embedded in the text below:

---------  Begin Bench  -----------
#include <stdio.h>
#include <stdlib.h>
#include <time.h>

/*
   Benchmark that shows the annoying store in the innermost loop.
   author: fernando at dcc.ufmg.br
   date: Apr 16th 2015
 */

struct array {
  int s;
  int* v;
};

struct array* new_array(int N) {
  struct array *a = (struct array*)malloc(sizeof(struct array*));
  a->v = (int*)malloc(N * sizeof(int));
  a->s = N;
  return a;
}

void fill_array(struct array *a, int max) {
  const int up = 32767;
  int i;
  for (i = 0; i < a->s; i++) {
    a->v[i] = i * up % max;
  }
}

void print_array(struct array *a) {
  int i;
  for (i = 0; i < a->s; i++) {
    if (i % 10 == 0) {
      printf("\n");
    }
    printf("%8d", a->v[i]);
  }
  printf("\n");
}

// This is the original kernel, which performs a sum of prefixes.
void prefix_sum_orig(struct array *src, struct array *dst) {
  int i, j;
  for (i = 0; i < src->s; i++) {
    dst->v[i] = 0;
    for (j = 0; j < i; j++) {
      dst->v[i] += src->v[j];
    }
  }
}

// This is, more or less, the kernel that we get with restrict and -O3:
void prefix_sum_o3(struct array *src, struct array *dst) {
  int i, j;
  int size = src->s;
  if (0 < size) {
    int* dst_v = dst->v;
    i = 0;
    do {
      int tmp = 0;
      j = 0;
      if (j < i) {
        int* src_v = src->v;
        do  {
          tmp += src_v[j];
          dst_v[i] = tmp;
          j++;
        } while (j < i);
      }
      i++;
    } while (i < size);
  }
}

// Just to show that prefix_sum_o3 is sound, here I have the same kernel, with
// the "restrict" keywork.
void prefix_sum_rest(struct array src[restrict], struct array dst[restrict]) {
  int i, j;
  for (i = 0; i < src->s; i++) {
    dst->v[i] = 0;
    for (j = 0; j < i; j++) {
      dst->v[i] += src->v[j];
    }
  }
}

// Here is the kernel that I wish I could get with LLVM -O3 + restrict. I
// wish the store in the innermost loop could go away.
void prefix_sum_fer(struct array *src, struct array *dst) {
  int i, j;
  int size = src->s;
  if (0 < size) {
    int* dst_v = dst->v;
    i = 0;
    do {
      int tmp = 0;
      j = 0;
      if (j < i) {
        int* src_v = src->v;
        do  {
          tmp += src_v[j];
          j++;
        } while (j < i);
      }
      dst_v[i] = tmp;
      i++;
    } while (i < size);
  }
}

// This function is just to check if the kernels are producing equivalent
// results.
int sum_array(struct array *src) {
  int sum = 0, i;
  int* src_v = src->v;
  int size = src->s;
  for (i = 0; i < size; i++) {
    sum += src_v[i];
  }
  return sum;
}

int main(int argc, char** argv) {
  if (argc < 3) {
    printf("Syntax: %s <array_size> <max_elem> <kernel>\n", argv[0]);
    return 1;
  } else {
    clock_t begin, end;
    double time_spent;
    struct array *a0, *a1;
    const int N = atoi(argv[1]);
    const int M = atoi(argv[2]);
    const int K = atoi(argv[3]);
    printf("\n");
    a0 = new_array(N);
    a1 = new_array(N);
    fill_array(a0, M);
    begin = clock();
    switch(K) {
      case 1: printf("Kernel orig\n"); prefix_sum_orig(a0, a1); break;
      case 2: printf("Kernel o3\n"); prefix_sum_o3(a0, a1); break;
      case 3: printf("Kernel restrict\n"); prefix_sum_rest(a0, a1); break;
      case 4: printf("Kernel fer\n"); prefix_sum_fer(a0, a1); break;
    }
    end = clock();
    time_spent = (double)(end - begin) / CLOCKS_PER_SEC;
    printf("Time spent = %lf\n", time_spent);
    printf("Final answer = %d\n", sum_array(a1));
  }
}
---------  End Bench  -----------

Regards,

Fernando