[PATCH] D97667: [loop-idiom] Hoist loop memcpys to loop preheader
Theodore Popp via Phabricator via llvm-commits
llvm-commits at lists.llvm.org
Wed Apr 28 04:13:35 PDT 2021
tpopp added a comment.
lebedev.ri's concerns seem to have been valid, so I'll be rolling this back. A test case in XLA that reverses data across certain dimensions of a multidimensional array fails with this patch. A sequence of loads and stores is converted into a single memcpy even though the loads and the stores walk their buffers in different orders (the loads are reversed along dimensions 0 and 1, the stores are not).
Un-optimized input:
; ModuleID = '__compute_module'
source_filename = "__compute_module"
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-grtev4-linux-gnu"
@0 = external dso_local unnamed_addr constant [96 x i8], align 16
; Function Attrs: uwtable
; Un-optimized loop nest: four nested loops over dims 0..3 with bounds 4, 3, 2
; and 1. Each loop counter lives in its own alloca and is reloaded in the
; corresponding loop header. The innermost body copies one float from the
; constant @0 at [3 - i0][2 - i1][i2][0] to the buffer loaded from
; %buffer_table[0] at [i0][i1][i2][0], so dims 0 and 1 are read mirrored
; relative to the order in which they are written.
define void @Reverse4DFloatArrayOnDim01.3(i8* %retval, i8* noalias %run_options, i8** noalias %params, i8** noalias %buffer_table, i64* noalias %prof_counters) #0 {
entry:
; One stack slot per loop induction variable (dims 3 down to 0).
%reverse.2.invar_address.dim.3 = alloca i64, align 8
%reverse.2.invar_address.dim.2 = alloca i64, align 8
%reverse.2.invar_address.dim.1 = alloca i64, align 8
%reverse.2.invar_address.dim.0 = alloca i64, align 8
; Destination buffer: entry 0 of %buffer_table, viewed as a 4x3x2x1 float array.
%0 = getelementptr inbounds i8*, i8** %buffer_table, i64 0
%1 = load i8*, i8** %0, align 8, !invariant.load !0, !dereferenceable !1, !align !2
%reverse.2 = bitcast i8* %1 to [4 x [3 x [2 x [1 x float]]]]*
; i0 = 0
store i64 0, i64* %reverse.2.invar_address.dim.0, align 8
br label %reverse.2.loop_header.dim.0
; dim 0 loop: exits once i0 >= 4.
reverse.2.loop_header.dim.0: ; preds = %reverse.2.loop_exit.dim.1, %entry
%reverse.2.indvar.dim.0 = load i64, i64* %reverse.2.invar_address.dim.0, align 8
%2 = icmp uge i64 %reverse.2.indvar.dim.0, 4
br i1 %2, label %reverse.2.loop_exit.dim.0, label %reverse.2.loop_body.dim.0
reverse.2.loop_body.dim.0: ; preds = %reverse.2.loop_header.dim.0
; i1 = 0
store i64 0, i64* %reverse.2.invar_address.dim.1, align 8
br label %reverse.2.loop_header.dim.1
; dim 1 loop: exits once i1 >= 3.
reverse.2.loop_header.dim.1: ; preds = %reverse.2.loop_exit.dim.2, %reverse.2.loop_body.dim.0
%reverse.2.indvar.dim.1 = load i64, i64* %reverse.2.invar_address.dim.1, align 8
%3 = icmp uge i64 %reverse.2.indvar.dim.1, 3
br i1 %3, label %reverse.2.loop_exit.dim.1, label %reverse.2.loop_body.dim.1
reverse.2.loop_body.dim.1: ; preds = %reverse.2.loop_header.dim.1
; i2 = 0
store i64 0, i64* %reverse.2.invar_address.dim.2, align 8
br label %reverse.2.loop_header.dim.2
; dim 2 loop: exits once i2 >= 2.
reverse.2.loop_header.dim.2: ; preds = %reverse.2.loop_exit.dim.3, %reverse.2.loop_body.dim.1
%reverse.2.indvar.dim.2 = load i64, i64* %reverse.2.invar_address.dim.2, align 8
%4 = icmp uge i64 %reverse.2.indvar.dim.2, 2
br i1 %4, label %reverse.2.loop_exit.dim.2, label %reverse.2.loop_body.dim.2
reverse.2.loop_body.dim.2: ; preds = %reverse.2.loop_header.dim.2
; i3 = 0
store i64 0, i64* %reverse.2.invar_address.dim.3, align 8
br label %reverse.2.loop_header.dim.3
; dim 3 loop: exits once i3 >= 1, i.e. the body runs once per (i0, i1, i2).
reverse.2.loop_header.dim.3: ; preds = %reverse.2.loop_body.dim.3, %reverse.2.loop_body.dim.2
%reverse.2.indvar.dim.3 = load i64, i64* %reverse.2.invar_address.dim.3, align 8
%5 = icmp uge i64 %reverse.2.indvar.dim.3, 1
br i1 %5, label %reverse.2.loop_exit.dim.3, label %reverse.2.loop_body.dim.3
reverse.2.loop_body.dim.3: ; preds = %reverse.2.loop_header.dim.3
; Mirrored source indices for dims 0 and 1: 3 - i0 and 2 - i1.
%6 = sub i64 3, %reverse.2.indvar.dim.0
%7 = sub i64 2, %reverse.2.indvar.dim.1
; Load from the constant @0 at the mirrored position ...
%8 = getelementptr inbounds [4 x [3 x [2 x [1 x float]]]], [4 x [3 x [2 x [1 x float]]]]* bitcast ([96 x i8]* @0 to [4 x [3 x [2 x [1 x float]]]]*), i64 0, i64 %6, i64 %7, i64 %reverse.2.indvar.dim.2, i64 0
%9 = load float, float* %8, align 4, !alias.scope !3, !noalias !6
; ... and store to the destination at the un-mirrored position.
%10 = getelementptr inbounds [4 x [3 x [2 x [1 x float]]]], [4 x [3 x [2 x [1 x float]]]]* %reverse.2, i64 0, i64 %reverse.2.indvar.dim.0, i64 %reverse.2.indvar.dim.1, i64 %reverse.2.indvar.dim.2, i64 0
store float %9, float* %10, align 4, !alias.scope !6, !noalias !3
; ++i3
%invar.inc3 = add nuw nsw i64 %reverse.2.indvar.dim.3, 1
store i64 %invar.inc3, i64* %reverse.2.invar_address.dim.3, align 8
br label %reverse.2.loop_header.dim.3
reverse.2.loop_exit.dim.3: ; preds = %reverse.2.loop_header.dim.3
; ++i2
%invar.inc2 = add nuw nsw i64 %reverse.2.indvar.dim.2, 1
store i64 %invar.inc2, i64* %reverse.2.invar_address.dim.2, align 8
br label %reverse.2.loop_header.dim.2
reverse.2.loop_exit.dim.2: ; preds = %reverse.2.loop_header.dim.2
; ++i1
%invar.inc1 = add nuw nsw i64 %reverse.2.indvar.dim.1, 1
store i64 %invar.inc1, i64* %reverse.2.invar_address.dim.1, align 8
br label %reverse.2.loop_header.dim.1
reverse.2.loop_exit.dim.1: ; preds = %reverse.2.loop_header.dim.1
; ++i0
%invar.inc = add nuw nsw i64 %reverse.2.indvar.dim.0, 1
store i64 %invar.inc, i64* %reverse.2.invar_address.dim.0, align 8
br label %reverse.2.loop_header.dim.0
reverse.2.loop_exit.dim.0: ; preds = %reverse.2.loop_header.dim.0
ret void
}
attributes #0 = { uwtable "denormal-fp-math"="preserve-sign" "no-frame-pointer-elim"="false" }
!0 = !{}
!1 = !{i64 96}
!2 = !{i64 16}
!3 = !{!4}
!4 = !{!"buffer: {index:1, offset:0, size:96}", !5}
!5 = !{!"XLA global AA domain"}
!6 = !{!7}
!7 = !{!"buffer: {index:0, offset:0, size:96}", !5}
Before this patch:
; ModuleID = '__compute_module'
source_filename = "__compute_module"
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-grtev4-linux-gnu"
; Function Attrs: nofree norecurse nosync nounwind uwtable
; Optimized output without the patch: no loops remain. The destination buffer
; loaded from %buffer_table[0] is filled by three <4 x i64> (32-byte) stores of
; immediate constants, at byte offsets 0, 32 and 64, covering all 96 bytes.
define void @Reverse4DFloatArrayOnDim01.3(i8* nocapture readnone %retval, i8* noalias nocapture readnone %run_options, i8** noalias nocapture readnone %params, i8** noalias nocapture readonly %buffer_table, i64* noalias nocapture readnone %prof_counters) local_unnamed_addr #0 {
entry:
; Load the destination pointer from buffer_table[0].
%0 = bitcast i8** %buffer_table to [4 x [3 x [2 x [1 x float]]]]**
%1 = load [4 x [3 x [2 x [1 x float]]]]*, [4 x [3 x [2 x [1 x float]]]]** %0, align 8, !invariant.load !0, !dereferenceable !1, !align !2
; Bytes 0..31 of the destination.
%2 = bitcast [4 x [3 x [2 x [1 x float]]]]* %1 to <4 x i64>*
store <4 x i64> <i64 4737786809096339456, i64 4733283209467920384, i64 4728779609839501312, i64 4724276010211082240>, <4 x i64>* %2, align 16
; Element [1][1][0][0] is float index 8, i.e. bytes 32..63.
%scevgep.1.1 = getelementptr [4 x [3 x [2 x [1 x float]]]], [4 x [3 x [2 x [1 x float]]]]* %1, i64 0, i64 1, i64 1, i64 0, i64 0
%3 = bitcast float* %scevgep.1.1 to <4 x i64>*
store <4 x i64> <i64 4719772410582138880, i64 4710765211325300736, i64 4701758012068462592, i64 4692750812811624448>, <4 x i64>* %3, align 16
; Element [2][2][0][0] is float index 16, i.e. bytes 64..95.
; The final lane, 4611686019492741120, is 0x400000003F800000: the float bit
; patterns for 1.0 (low half) and 2.0 (high half).
%scevgep.2.2 = getelementptr [4 x [3 x [2 x [1 x float]]]], [4 x [3 x [2 x [1 x float]]]]* %1, i64 0, i64 2, i64 2, i64 0, i64 0
%4 = bitcast float* %scevgep.2.2 to <4 x i64>*
store <4 x i64> <i64 4683743613553737728, i64 4665729215040061440, i64 4647714816524288000, i64 4611686019492741120>, <4 x i64>* %4, align 16
ret void
}
attributes #0 = { nofree norecurse nosync nounwind uwtable "denormal-fp-math"="preserve-sign" "no-frame-pointer-elim"="false" }
!0 = !{}
!1 = !{i64 96}
!2 = !{i64 16}
After this patch:
; ModuleID = '__compute_module'
source_filename = "__compute_module"
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-grtev4-linux-gnu"
; 96-byte source constant. Read as little-endian 32-bit floats (the datalayout
; is "e"), the bytes are the 24 values 1.0, 2.0, ..., 24.0.
@0 = private unnamed_addr constant [96 x i8] c"\00\00\80?\00\00\00@\00\00@@\00\00\80@\00\00\A0@\00\00\C0@\00\00\E0@\00\00\00A\00\00\10A\00\00 A\00\000A\00\00@A\00\00PA\00\00`A\00\00pA\00\00\80A\00\00\88A\00\00\90A\00\00\98A\00\00\A0A\00\00\A8A\00\00\B0A\00\00\B8A\00\00\C0A", align 16
; Function Attrs: nofree norecurse nosync nounwind uwtable
; Optimized output with the patch applied: the whole loop nest has been
; replaced by a single 96-byte llvm.memcpy into the buffer loaded from
; %buffer_table[0].
; NOTE(review): the memcpy source is @0 + 88 bytes with a length of 96, so it
; spans bytes 88..183 of a 96-byte constant. This is the failing output that
; the surrounding message reports; it is reproduced here unchanged.
define void @Reverse4DFloatArrayOnDim01.3(i8* nocapture readnone %retval, i8* noalias nocapture readnone %run_options, i8** noalias nocapture readnone %params, i8** noalias nocapture readonly %buffer_table, i64* noalias nocapture readnone %prof_counters) local_unnamed_addr #0 {
entry:
; Destination pointer: buffer_table[0].
%0 = load i8*, i8** %buffer_table, align 8, !invariant.load !0, !dereferenceable !1, !align !2
; Forward copy of 96 bytes starting at the last 8 bytes of @0.
call void @llvm.memcpy.p0i8.p0i8.i64(i8* noundef nonnull align 16 dereferenceable(96) %0, i8* noundef nonnull align 8 dereferenceable(96) getelementptr inbounds ([96 x i8], [96 x i8]* @0, i64 0, i64 88), i64 96, i1 false)
ret void
}
; Function Attrs: argmemonly nofree nosync nounwind willreturn
declare void @llvm.memcpy.p0i8.p0i8.i64(i8* noalias nocapture writeonly, i8* noalias nocapture readonly, i64, i1 immarg) #1
attributes #0 = { nofree norecurse nosync nounwind uwtable "denormal-fp-math"="preserve-sign" "no-frame-pointer-elim"="false" }
attributes #1 = { argmemonly nofree nosync nounwind willreturn }
!0 = !{}
!1 = !{i64 96}
!2 = !{i64 16}
Repository:
rG LLVM Github Monorepo
CHANGES SINCE LAST ACTION
https://reviews.llvm.org/D97667/new/
https://reviews.llvm.org/D97667
More information about the llvm-commits
mailing list