Fri Aug 21 13:50:46 PDT 2020

anhtuyen added a comment.

To avoid any issue with NDA, I wrote a simple test as follows. This test program demonstrates that replacing a store with memset in the Loop Idiom Recognize Pass (LIRP) does not always have a positive impact on performance.

1. Compile **work.c** to an IR file called **work.ll**. We will use this IR file **work.ll** for both **LIRP** and **LIRP --disable-loop-idiom=memset**
2. Call opt with **-loop-idiom** to produce an IR file **containing memset** instruction

  opt -basic-aa -loop-idiom -S work.ll -o work.yes.ll

3. Call opt with **-loop-idiom --disable-loop-idiom=memset** to produce an IR file **without memset** instruction

  opt -basic-aa -loop-idiom --disable-loop-idiom=memset -S work.ll -o work.no.ll

4. Inspect to make sure LIRP did **replace store with memset in work.yes.ll, but not in work.no.ll**
5. Compile **test.c**, and link it with the IR from (2) and then the IR from (3).

  clang++ -c test.c
  clang++ test.o work.yes.ll -o yes
  clang++ test.o work.no.ll -o no

6. Run both the executables on a quiet machine. On my performance machine, times spent are:

With memset: **Time elapsed: 1.4215**
Without memset: **Time elapsed: 1.3611**


  #include <stdio.h>
  #include <time.h>
  int work(int A[], int sizeI, int sizeL);
  /*
   * Benchmark driver: times a single call to work() and prints the elapsed
   * processor time in seconds.
   *
   * Returns whatever work() returns (the last element of A after the call).
   */
  int main() {
    /* Three elements, matching the element count passed to work() below. */
    int A[3] = {1, 2, 3};

    /* Only the call to work() sits between the two clock() samples. */
    clock_t begin = clock();
    int res = work(A, 9999, 3);
    clock_t end = clock();

    double time_spent = (double)(end - begin) / CLOCKS_PER_SEC;
    printf("Time elapsed: %4.4f\n", time_spent);
    return res;
  }


  /*
   * Stores 0 into A[0 .. arraySize-1], repeating that inner loop
   * sizeI * sizeI times, then returns A[arraySize - 1].
   *
   * A         - array with at least arraySize elements
   * sizeI     - trip count of each of the two outer loops
   * arraySize - number of elements zeroed by the innermost loop; must be
   *             >= 1, because the return statement reads A[arraySize - 1]
   *
   * The innermost loop is the store pattern under discussion, so the loop
   * nest is intentionally left exactly as written.
   */
  int work(int A[], int sizeI, int arraySize) {
    for (int i = 0; i < sizeI; ++i)
      for (int j = 0; j < sizeI; ++j)
        for (int k = 0; k < arraySize; ++k)
          A[k] = 0;
    return A[arraySize - 1];
  }

IR before calling opt with **-loop-idiom**

  ; ModuleID = './work.ll'
  source_filename = "work.c"
  target datalayout = "e-m:e-i64:64-n32:64"
  target triple = "powerpc64le-unknown-linux-gnu"
  ; work(A, sizeI, arraySize): three nested counted loops. The innermost loop
  ; (for.body6) stores 0 to A[k] for k in [0, arraySize); the two outer loops
  ; (for.body, for.body3) each count up to sizeI. The function returns
  ; A[arraySize - 1].
  ; Function Attrs: noinline nounwind
  define dso_local signext i32 @_Z4workPiii(i32* %A, i32 signext %sizeI, i32 signext %arraySize)  #0 {
  entry:
    ; Guard for the outer i loop: skip everything when sizeI <= 0.
    %cmp6 = icmp slt i32 0, %sizeI
    br i1 %cmp6, label %for.body.preheader, label %for.end12
  for.body.preheader:                               ; preds = %entry
    br label %for.body
  for.body:                                         ; preds = %for.body.preheader, %for.inc10
    %i.07 = phi i32 [ %inc11, %for.inc10 ], [ 0, %for.body.preheader ]
    ; Guard for the middle j loop.
    %cmp23 = icmp slt i32 0, %sizeI
    br i1 %cmp23, label %for.body3.preheader, label %for.inc10
  for.body3.preheader:                              ; preds = %for.body
    br label %for.body3
  for.body3:                                        ; preds = %for.body3.preheader, %for.inc7
    %j.04 = phi i32 [ %inc8, %for.inc7 ], [ 0, %for.body3.preheader ]
    ; Guard for the inner k loop: skip it when arraySize <= 0.
    %cmp51 = icmp slt i32 0, %arraySize
    br i1 %cmp51, label %for.body6.preheader, label %for.inc7
  for.body6.preheader:                              ; preds = %for.body3
    br label %for.body6
  for.body6:                                        ; preds = %for.body6.preheader, %for.body6
    ; Inner loop body: A[k] = 0.
    %k.02 = phi i32 [ %inc, %for.body6 ], [ 0, %for.body6.preheader ]
    %idxprom = sext i32 %k.02 to i64
    %arrayidx = getelementptr inbounds i32, i32* %A, i64 %idxprom
    store i32 0, i32* %arrayidx, align 4
    %inc = add nsw i32 %k.02, 1
    %cmp5 = icmp slt i32 %inc, %arraySize
    br i1 %cmp5, label %for.body6, label %for.inc7.loopexit
  for.inc7.loopexit:                                ; preds = %for.body6
    br label %for.inc7
  for.inc7:                                         ; preds = %for.inc7.loopexit, %for.body3
    %inc8 = add nsw i32 %j.04, 1
    %cmp2 = icmp slt i32 %inc8, %sizeI
    br i1 %cmp2, label %for.body3, label %for.inc10.loopexit
  for.inc10.loopexit:                               ; preds = %for.inc7
    br label %for.inc10
  for.inc10:                                        ; preds = %for.inc10.loopexit, %for.body
    %inc11 = add nsw i32 %i.07, 1
    %cmp = icmp slt i32 %inc11, %sizeI
    br i1 %cmp, label %for.body, label %for.end12.loopexit
  for.end12.loopexit:                               ; preds = %for.inc10
    br label %for.end12
  for.end12:                                        ; preds = %for.end12.loopexit, %entry
    ; return A[arraySize - 1]
    %sub = sub nsw i32 %arraySize, 1
    %idxprom13 = sext i32 %sub to i64
    %arrayidx14 = getelementptr inbounds i32, i32* %A, i64 %idxprom13
    %0 = load i32, i32* %arrayidx14, align 4
    ret i32 %0
  }
  attributes #0 = { noinline nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="pwr9" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+htm,+power8-vector,+power9-vector,+vsx,-spe" "unsafe-fp-math"="false" "use-soft-float"="false" }

Extract from IR **with memset**

  ; Inner-loop extract with memset: the preheader now calls llvm.memset, and
  ; the for.body6 loop below no longer contains a store; only the induction
  ; variable update, the address computation and the exit test remain.
  ; NOTE(review): %A1 and %1 are defined outside this extract - presumably the
  ; i8* cast of %A and the byte count; confirm against work.yes.ll.
  for.body6.preheader:                              ; preds = %for.body3
    call void @llvm.memset.p0i8.i64(i8* align 4 %A1, i8 0, i64 %1, i1 false)
    br label %for.body6
  for.body6:                                        ; preds = %for.body6, %for.body6.preheader
    %k.02 = phi i32 [ %inc, %for.body6 ], [ 0, %for.body6.preheader ]
    %idxprom = sext i32 %k.02 to i64
    %arrayidx = getelementptr inbounds i32, i32* %A, i64 %idxprom
    %inc = add nsw i32 %k.02, 1
    %cmp5 = icmp slt i32 %inc, %arraySize
    br i1 %cmp5, label %for.body6, label %for.inc7.loopexit

Extract from IR **without memset**

  ; Inner-loop extract without memset: the preheader only branches into the
  ; loop, and the store of 0 to A[k] stays inside the for.body6 loop body.
  for.body6.preheader:                              ; preds = %for.body3
    br label %for.body6
  for.body6:                                        ; preds = %for.body6, %for.body6.preheader
    %k.02 = phi i32 [ %inc, %for.body6 ], [ 0, %for.body6.preheader ]
    %idxprom = sext i32 %k.02 to i64
    %arrayidx = getelementptr inbounds i32, i32* %A, i64 %idxprom
    ; A[k] = 0
    store i32 0, i32* %arrayidx, align 4
    %inc = add nsw i32 %k.02, 1
    %cmp5 = icmp slt i32 %inc, %arraySize
    br i1 %cmp5, label %for.body6, label %for.inc7.loopexit

