[PATCH] D97667: [loop-idiom] Hoist loop memcpys to loop preheader

Wed Apr 28 04:13:35 PDT 2021

tpopp added a comment.

lebedev.ri's concerns seem to have been valid, so I'll be rolling this back. A test case in XLA that reverses data across certain dimensions of a multidimensional array fails with this patch. A sequence of loads and stores is converted into a single memcpy even though the loads and the stores traverse the data in different orders.

Un-optimized input:

  ; ModuleID = '__compute_module'
  source_filename = "__compute_module"
  target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
  target triple = "x86_64-grtev4-linux-gnu"

  @0 = external dso_local unnamed_addr constant [96 x i8], align 16

  ; Function Attrs: uwtable
  ; Un-optimized kernel: four nested counted loops (trip counts 4, 3, 2, 1)
  ; whose induction variables are kept in allocas. The innermost body copies
  ; one float from the constant @0, viewed as [4 x [3 x [2 x [1 x float]]]],
  ; at index [3 - i0][2 - i1][i2][0] to the output buffer at [i0][i1][i2][0],
  ; i.e. it reverses the data along dimensions 0 and 1 only.
  ; The output buffer pointer is loaded from slot 0 of %buffer_table; %retval,
  ; %run_options, %params and %prof_counters are not referenced in this body.
  define void @Reverse4DFloatArrayOnDim01.3(i8* %retval, i8* noalias %run_options, i8** noalias %params, i8** noalias %buffer_table, i64* noalias %prof_counters) #0 {
  entry:
    ; One stack slot per loop dimension holds that loop's induction variable.
    %reverse.2.invar_address.dim.3 = alloca i64, align 8
    %reverse.2.invar_address.dim.2 = alloca i64, align 8
    %reverse.2.invar_address.dim.1 = alloca i64, align 8
    %reverse.2.invar_address.dim.0 = alloca i64, align 8
    ; Destination: buffer_table[0], reinterpreted as the 4-D float array.
    %0 = getelementptr inbounds i8*, i8** %buffer_table, i64 0
    %1 = load i8*, i8** %0, align 8, !invariant.load !0, !dereferenceable !1, !align !2
    %reverse.2 = bitcast i8* %1 to [4 x [3 x [2 x [1 x float]]]]*
    store i64 0, i64* %reverse.2.invar_address.dim.0, align 8
    br label %reverse.2.loop_header.dim.0

  ; Dimension 0: i0 in [0, 4).
  reverse.2.loop_header.dim.0:                      ; preds = %reverse.2.loop_exit.dim.1, %entry
    %reverse.2.indvar.dim.0 = load i64, i64* %reverse.2.invar_address.dim.0, align 8
    %2 = icmp uge i64 %reverse.2.indvar.dim.0, 4
    br i1 %2, label %reverse.2.loop_exit.dim.0, label %reverse.2.loop_body.dim.0

  reverse.2.loop_body.dim.0:                        ; preds = %reverse.2.loop_header.dim.0
    store i64 0, i64* %reverse.2.invar_address.dim.1, align 8
    br label %reverse.2.loop_header.dim.1

  ; Dimension 1: i1 in [0, 3).
  reverse.2.loop_header.dim.1:                      ; preds = %reverse.2.loop_exit.dim.2, %reverse.2.loop_body.dim.0
    %reverse.2.indvar.dim.1 = load i64, i64* %reverse.2.invar_address.dim.1, align 8
    %3 = icmp uge i64 %reverse.2.indvar.dim.1, 3
    br i1 %3, label %reverse.2.loop_exit.dim.1, label %reverse.2.loop_body.dim.1

  reverse.2.loop_body.dim.1:                        ; preds = %reverse.2.loop_header.dim.1
    store i64 0, i64* %reverse.2.invar_address.dim.2, align 8
    br label %reverse.2.loop_header.dim.2

  ; Dimension 2: i2 in [0, 2).
  reverse.2.loop_header.dim.2:                      ; preds = %reverse.2.loop_exit.dim.3, %reverse.2.loop_body.dim.1
    %reverse.2.indvar.dim.2 = load i64, i64* %reverse.2.invar_address.dim.2, align 8
    %4 = icmp uge i64 %reverse.2.indvar.dim.2, 2
    br i1 %4, label %reverse.2.loop_exit.dim.2, label %reverse.2.loop_body.dim.2

  reverse.2.loop_body.dim.2:                        ; preds = %reverse.2.loop_header.dim.2
    store i64 0, i64* %reverse.2.invar_address.dim.3, align 8
    br label %reverse.2.loop_header.dim.3

  ; Dimension 3: a single iteration (i3 in [0, 1)).
  reverse.2.loop_header.dim.3:                      ; preds = %reverse.2.loop_body.dim.3, %reverse.2.loop_body.dim.2
    %reverse.2.indvar.dim.3 = load i64, i64* %reverse.2.invar_address.dim.3, align 8
    %5 = icmp uge i64 %reverse.2.indvar.dim.3, 1
    br i1 %5, label %reverse.2.loop_exit.dim.3, label %reverse.2.loop_body.dim.3

  reverse.2.loop_body.dim.3:                        ; preds = %reverse.2.loop_header.dim.3
    ; Mirrored source indices for the two reversed dimensions.
    %6 = sub i64 3, %reverse.2.indvar.dim.0
    %7 = sub i64 2, %reverse.2.indvar.dim.1
    ; Load @0[3 - i0][2 - i1][i2][0]; the last index is the constant 0, not i3.
    %8 = getelementptr inbounds [4 x [3 x [2 x [1 x float]]]], [4 x [3 x [2 x [1 x float]]]]* bitcast ([96 x i8]* @0 to [4 x [3 x [2 x [1 x float]]]]*), i64 0, i64 %6, i64 %7, i64 %reverse.2.indvar.dim.2, i64 0
    %9 = load float, float* %8, align 4, !alias.scope !3, !noalias !6
    ; Store to out[i0][i1][i2][0]: the destination is walked forwards while the
    ; source is walked backwards in dimensions 0 and 1.
    %10 = getelementptr inbounds [4 x [3 x [2 x [1 x float]]]], [4 x [3 x [2 x [1 x float]]]]* %reverse.2, i64 0, i64 %reverse.2.indvar.dim.0, i64 %reverse.2.indvar.dim.1, i64 %reverse.2.indvar.dim.2, i64 0
    store float %9, float* %10, align 4, !alias.scope !6, !noalias !3
    %invar.inc3 = add nuw nsw i64 %reverse.2.indvar.dim.3, 1
    store i64 %invar.inc3, i64* %reverse.2.invar_address.dim.3, align 8
    br label %reverse.2.loop_header.dim.3

  ; Each loop_exit block below increments the enclosing loop's counter and
  ; branches back to that loop's header.
  reverse.2.loop_exit.dim.3:                        ; preds = %reverse.2.loop_header.dim.3
    %invar.inc2 = add nuw nsw i64 %reverse.2.indvar.dim.2, 1
    store i64 %invar.inc2, i64* %reverse.2.invar_address.dim.2, align 8
    br label %reverse.2.loop_header.dim.2

  reverse.2.loop_exit.dim.2:                        ; preds = %reverse.2.loop_header.dim.2
    %invar.inc1 = add nuw nsw i64 %reverse.2.indvar.dim.1, 1
    store i64 %invar.inc1, i64* %reverse.2.invar_address.dim.1, align 8
    br label %reverse.2.loop_header.dim.1

  reverse.2.loop_exit.dim.1:                        ; preds = %reverse.2.loop_header.dim.1
    %invar.inc = add nuw nsw i64 %reverse.2.indvar.dim.0, 1
    store i64 %invar.inc, i64* %reverse.2.invar_address.dim.0, align 8
    br label %reverse.2.loop_header.dim.0

  reverse.2.loop_exit.dim.0:                        ; preds = %reverse.2.loop_header.dim.0
    ret void
  }

  attributes #0 = { uwtable "denormal-fp-math"="preserve-sign" "no-frame-pointer-elim"="false" }

  !0 = !{}
  !1 = !{i64 96}
  !2 = !{i64 16}
  !3 = !{!4}
  !4 = !{!"buffer: {index:1, offset:0, size:96}", !5}
  !5 = !{!"XLA global AA domain"}
  !6 = !{!7}
  !7 = !{!"buffer: {index:0, offset:0, size:96}", !5}

Before this patch:

  ; ModuleID = '__compute_module'
  source_filename = "__compute_module"
  target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
  target triple = "x86_64-grtev4-linux-gnu"

  ; Function Attrs: nofree norecurse nosync nounwind uwtable
  ; Optimized result without the patch: the loop nest has been fully evaluated
  ; into three 32-byte vector stores of constants, written at byte offsets 0,
  ; 32 and 64 of the 96-byte output buffer loaded from %buffer_table.
  define void @Reverse4DFloatArrayOnDim01.3(i8* nocapture readnone %retval, i8* noalias nocapture readnone %run_options, i8** noalias nocapture readnone %params, i8** noalias nocapture readonly %buffer_table, i64* noalias nocapture readnone %prof_counters) local_unnamed_addr #0 {
  entry:
    %0 = bitcast i8** %buffer_table to [4 x [3 x [2 x [1 x float]]]]**
    %1 = load [4 x [3 x [2 x [1 x float]]]]*, [4 x [3 x [2 x [1 x float]]]]** %0, align 8, !invariant.load !0, !dereferenceable !1, !align !2
    %2 = bitcast [4 x [3 x [2 x [1 x float]]]]* %1 to <4 x i64>*
    ; Each i64 packs two floats; the first one, 0x41C0000041B80000, is 23.0 in
    ; the low half and 24.0 in the high half, so the output begins 23.0, 24.0.
    store <4 x i64> <i64 4737786809096339456, i64 4733283209467920384, i64 4728779609839501312, i64 4724276010211082240>, <4 x i64>* %2, align 16
    ; Element [1][1][0][0] is 8 floats in, i.e. byte offset 32.
    %scevgep.1.1 = getelementptr [4 x [3 x [2 x [1 x float]]]], [4 x [3 x [2 x [1 x float]]]]* %1, i64 0, i64 1, i64 1, i64 0, i64 0
    %3 = bitcast float* %scevgep.1.1 to <4 x i64>*
    store <4 x i64> <i64 4719772410582138880, i64 4710765211325300736, i64 4701758012068462592, i64 4692750812811624448>, <4 x i64>* %3, align 16
    ; Element [2][2][0][0] is 16 floats in, i.e. byte offset 64.
    %scevgep.2.2 = getelementptr [4 x [3 x [2 x [1 x float]]]], [4 x [3 x [2 x [1 x float]]]]* %1, i64 0, i64 2, i64 2, i64 0, i64 0
    %4 = bitcast float* %scevgep.2.2 to <4 x i64>*
    ; The final i64, 0x400000003F800000, is 1.0 (low) and 2.0 (high), so the
    ; output ends 1.0, 2.0.
    store <4 x i64> <i64 4683743613553737728, i64 4665729215040061440, i64 4647714816524288000, i64 4611686019492741120>, <4 x i64>* %4, align 16
    ret void
  }

  attributes #0 = { nofree norecurse nosync nounwind uwtable "denormal-fp-math"="preserve-sign" "no-frame-pointer-elim"="false" }

  !0 = !{}
  !1 = !{i64 96}
  !2 = !{i64 16}

After this patch:

  ; ModuleID = '__compute_module'
  source_filename = "__compute_module"
  target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
  target triple = "x86_64-grtev4-linux-gnu"

  ; 96 bytes holding 24 little-endian IEEE-754 single-precision values,
  ; 1.0 through 24.0 in order; this is the input the kernel reads.
  @0 = private unnamed_addr constant [96 x i8] c"\00\00\80?\00\00\00@\00\00@@\00\00\80@\00\00\A0@\00\00\C0@\00\00\E0@\00\00\00A\00\00\10A\00\00 A\00\000A\00\00@A\00\00PA\00\00`A\00\00pA\00\00\80A\00\00\88A\00\00\90A\00\00\98A\00\00\A0A\00\00\A8A\00\00\B0A\00\00\B8A\00\00\C0A", align 16

  ; Function Attrs: nofree norecurse nosync nounwind uwtable
  ; Optimized result with the patch applied: the whole loop nest has been
  ; replaced by one 96-byte memcpy into the output buffer loaded from
  ; %buffer_table. This is the output reported as incorrect.
  define void @Reverse4DFloatArrayOnDim01.3(i8* nocapture readnone %retval, i8* noalias nocapture readnone %run_options, i8** noalias nocapture readnone %params, i8** noalias nocapture readonly %buffer_table, i64* noalias nocapture readnone %prof_counters) local_unnamed_addr #0 {
  entry:
    %0 = load i8*, i8** %buffer_table, align 8, !invariant.load !0, !dereferenceable !1, !align !2
    ; The source pointer is @0 + 88, yet the length is 96: the copy starts 8
    ; bytes before the end of the 96-byte constant and proceeds forwards, so it
    ; cannot reproduce the element-wise reversal performed by the original loops.
    call void @llvm.memcpy.p0i8.p0i8.i64(i8* noundef nonnull align 16 dereferenceable(96) %0, i8* noundef nonnull align 8 dereferenceable(96) getelementptr inbounds ([96 x i8], [96 x i8]* @0, i64 0, i64 88), i64 96, i1 false)
    ret void
  }

  ; Function Attrs: argmemonly nofree nosync nounwind willreturn
  declare void @llvm.memcpy.p0i8.p0i8.i64(i8* noalias nocapture writeonly, i8* noalias nocapture readonly, i64, i1 immarg) #1

  attributes #0 = { nofree norecurse nosync nounwind uwtable "denormal-fp-math"="preserve-sign" "no-frame-pointer-elim"="false" }
  attributes #1 = { argmemonly nofree nosync nounwind willreturn }

  !0 = !{}
  !1 = !{i64 96}
  !2 = !{i64 16}

Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D97667/new/

https://reviews.llvm.org/D97667