[PATCH] D106450: [InstCombine] Fold (gep (oneuse(gep Ptr, Idx0)), Idx1) -> (gep Ptr, (add Idx0, Idx1)) (PR51069)

Thu Jul 29 12:15:10 PDT 2021

Whitney added a comment.

This change is causing performance degradation on one of our important benchmarks. Here is an example to show the potential reason:

  @block = global i32 zeroinitializer, align 4

  define void @foo(i32* %j, i32 %k) {
  entry:
    br label %for.body

  for.body:
    %i.01 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
    %invariant0 = getelementptr i32, i32* @block, i32 %k
    %x = getelementptr i32, i32* %invariant0, i32 %i.01
    store i32 2, i32* %x, align 4
    %invariant1 = getelementptr i32, i32* @block, i32 %k
    %0 = load i32, i32* %j
    %y = getelementptr i32, i32* %invariant1, i32 %0
    store i32 1, i32* %y, align 4
    %inc = add nsw i32 %i.01, 1
    %cmp = icmp slt i32 %inc, 100
    br i1 %cmp, label %for.body, label %for.end

  for.end:
    ret void
  }

Without this change, %invariant0 and %invariant1 are CSE. And %invariant0 is LICM out of the loop.

  bash-5.0$ opt t.ll -S -early-cse -licm
  @block = global i32 0, align 4

  define void @foo(i32* %j, i32 %k) {
  entry:
    %invariant0 = getelementptr i32, i32* @block, i32 %k
    br label %for.body

  for.body:                                         ; preds = %for.body, %entry
    %i.01 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
    %x = getelementptr i32, i32* %invariant0, i32 %i.01
    store i32 2, i32* %x, align 4
    %0 = load i32, i32* %j, align 4
    %y = getelementptr i32, i32* %invariant0, i32 %0
    store i32 1, i32* %y, align 4
    %inc = add nsw i32 %i.01, 1
    %cmp = icmp slt i32 %inc, 100
    br i1 %cmp, label %for.body, label %for.end

  for.end:                                          ; preds = %for.body
    ret void
  }

After this change, %invariant0 and %invariant1 are inst-combined and turn into add instructions. There is no longer invariance instruction to LICM or common subexpressions to eliminate.

  bash-5.0$ opt t.ll -S -instcombine -early-cse -licm
  @block = global i32 0, align 4

  define void @foo(i32* %j, i32 %k) {
  entry:
    %0 = sext i32 %k to i64
    br label %for.body

  for.body:                                         ; preds = %for.body, %entry
    %i.01 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
    %1 = zext i32 %i.01 to i64
    %x.idx = add nsw i64 %1, %0
    %x = getelementptr i32, i32* @block, i64 %x.idx
    store i32 2, i32* %x, align 4
    %2 = load i32, i32* %j, align 4
    %3 = sext i32 %2 to i64
    %y.idx = add nsw i64 %3, %0
    %y = getelementptr i32, i32* @block, i64 %y.idx
    store i32 1, i32* %y, align 4
    %inc = add nuw nsw i32 %i.01, 1
    %cmp = icmp ult i32 %i.01, 99
    br i1 %cmp, label %for.body, label %for.end

  for.end:                                          ; preds = %for.body
    ret void
  }

The problem is more significant when the instructions are in a deeply nested loop, or the loop has a lot of iterations.

Can we either revert this change or guard it under an option and disable by default, until there is a solution?

Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D106450/new/

https://reviews.llvm.org/D106450