[llvm] [clang] [flang] [InstCombine] Canonicalize constant GEPs to i8 source element type (PR #68882)

Fri Feb 2 15:42:51 PST 2024

Artem-B wrote:

Another corner case here. The untyped GEP resulted in SimplifyCFG producing `load(gep(argptr, cond ? 24 : 0))` instead of the `load(cond ? gep(argptr, 24) : argptr)` it produced before the patch, and that eventually prevented SROA from processing that load.

While it's not a bug in this patch, the consequence is a pretty serious performance regression in some GPU code. And we do not have a workaround. :-/ 

Minimized reproducer:
```
# opt -passes='inline,simplifycfg<bonus-inst-threshold=1;no-forward-switch-cond;switch-range-to-icmp;no-switch-to-lookup;keep-loops;no-hoist-common-insts;no-sink-common-insts;speculate-blocks;simplify-cond-branch>,instcombine<max-iterations=1;no-use-loop-info;no-verify-fixpoint>,sroa' -S <reproducer.ll
```

```
target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
target triple = "nvptx64-nvidia-cuda"

%0 = type { [3 x i64], [17 x i64] }

; Function Attrs: nocallback nofree nounwind willreturn memory(argmem: readwrite)
declare void @llvm.memcpy.p0.p0.i64(ptr noalias nocapture writeonly, ptr noalias nocapture readonly, i64, i1 immarg) #0

define i64 @foo3() {
bb:
  %alloca = alloca %0, align 8
  call void @llvm.memcpy.p0.p0.i64(ptr %alloca, ptr null, i64 160, i1 false)
  %call = call i64 @zot6(ptr %alloca)
  ret i64 0
}

define i64 @zot6(ptr %arg) {
bb:
  %load = load i32, ptr %arg, align 4
  %icmp = icmp eq i32 %load, 0
  br i1 %icmp, label %bb11, label %bb13

bb11:                                             ; preds = %bb
  %getelementptr = getelementptr %0, ptr %arg, i32 0, i32 1
  %load12 = load i32, ptr %getelementptr, align 4
  br label %bb15

bb13:                                             ; preds = %bb
  %load14 = load i32, ptr %arg, align 4
  br label %bb15

bb15:                                             ; preds = %bb13, %bb11
  %phi = phi i32 [ %load12, %bb11 ], [ %load14, %bb13 ]
  %icmp16 = icmp ne i32 %phi, 0
  call void @llvm.assume(i1 %icmp16)
  ret i64 0
}

; Function Attrs: nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write)
declare void @llvm.assume(i1 noundef) #1

attributes #0 = { nocallback nofree nounwind willreturn memory(argmem: readwrite) }
attributes #1 = { nocallback nofree nosync nounwind willreturn memory(inaccessiblemem: write) }
```

Before the change, foo3 would have the alloca split into two:

```
define i64 @foo3() {
bb:
  %alloca.sroa.2 = alloca [20 x i8], align 4
  %alloca.sroa.3 = alloca [132 x i8], align 4
  %alloca.sroa.0.0.copyload = load i32, ptr null, align 4294967296
```
After the change, the alloca remains untouched (it is no longer split).

https://github.com/llvm/llvm-project/pull/68882