[PATCH] D70516: Revert "[DependenceAnalysis] Dependecies for loads marked with "ivnariant.load" should not be shared with general accesses. Fix for https://bugs.llvm.org/show_bug.cgi?id=42151"
George Karpenkov via Phabricator via llvm-commits
llvm-commits at lists.llvm.org
Wed Nov 20 15:30:41 PST 2019
george.karpenkov created this revision.
george.karpenkov added reviewers: sanjoy.google, sanjoy, ebrevnov, jdoerfert, reames, chandlerc.
Herald added subscribers: Charusso, hiraditya.
Herald added a project: LLVM.
Revert "[DependenceAnalysis] Dependecies for loads marked with "ivnariant.load" should not be shared with general accesses. Fix for https://bugs.llvm.org/show_bug.cgi?id=42151"
This reverts commit 5f026b6d9e882941fde9b7e5dc0a2d807f7f24f5 <https://reviews.llvm.org/rG5f026b6d9e882941fde9b7e5dc0a2d807f7f24f5>.
We (the tensorflow.org/xla team) are seeing some miscompiles with the new change.
I'm still trying to come up with a useful/small/external example, but for now, the following IR:
; ModuleID = '__compute_module'
source_filename = "__compute_module"
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-grtev4-linux-gnu"
@0 = private unnamed_addr constant [4 x i8] c"\DB\0F\C9@"
@1 = private unnamed_addr constant [4 x i8] c"\00\00\00?"
; Function Attrs: uwtable
define void @jit_wrapped_fun.31(i8* %retval, i8* noalias %run_options, i8** noalias %params, i8** noalias %buffer_table, i64* noalias %prof_counters) #0 {
entry:
%fusion.invar_address.dim.2 = alloca i64
%fusion.invar_address.dim.1 = alloca i64
%fusion.invar_address.dim.0 = alloca i64
%fusion.1.invar_address.dim.2 = alloca i64
%fusion.1.invar_address.dim.1 = alloca i64
%fusion.1.invar_address.dim.0 = alloca i64
%0 = getelementptr inbounds i8*, i8** %buffer_table, i64 1
%1 = load i8*, i8** %0, !invariant.load !0, !dereferenceable !1, !align !2
%parameter.3 = bitcast i8* %1 to [2 x [1 x [4 x float]]]*
%2 = getelementptr inbounds i8*, i8** %buffer_table, i64 5
%3 = load i8*, i8** %2, !invariant.load !0, !dereferenceable !1, !align !2
%fusion.1 = bitcast i8* %3 to [2 x [1 x [4 x float]]]*
store i64 0, i64* %fusion.1.invar_address.dim.0
br label %fusion.1.loop_header.dim.0
fusion.1.loop_header.dim.0: ; preds = %fusion.1.loop_exit.dim.1, %entry
%fusion.1.indvar.dim.0 = load i64, i64* %fusion.1.invar_address.dim.0
%4 = icmp uge i64 %fusion.1.indvar.dim.0, 2
br i1 %4, label %fusion.1.loop_exit.dim.0, label %fusion.1.loop_body.dim.0
fusion.1.loop_body.dim.0: ; preds = %fusion.1.loop_header.dim.0
store i64 0, i64* %fusion.1.invar_address.dim.1
br label %fusion.1.loop_header.dim.1
fusion.1.loop_header.dim.1: ; preds = %fusion.1.loop_exit.dim.2, %fusion.1.loop_body.dim.0
%fusion.1.indvar.dim.1 = load i64, i64* %fusion.1.invar_address.dim.1
%5 = icmp uge i64 %fusion.1.indvar.dim.1, 1
br i1 %5, label %fusion.1.loop_exit.dim.1, label %fusion.1.loop_body.dim.1
fusion.1.loop_body.dim.1: ; preds = %fusion.1.loop_header.dim.1
store i64 0, i64* %fusion.1.invar_address.dim.2
br label %fusion.1.loop_header.dim.2
fusion.1.loop_header.dim.2: ; preds = %fusion.1.loop_body.dim.2, %fusion.1.loop_body.dim.1
%fusion.1.indvar.dim.2 = load i64, i64* %fusion.1.invar_address.dim.2
%6 = icmp uge i64 %fusion.1.indvar.dim.2, 4
br i1 %6, label %fusion.1.loop_exit.dim.2, label %fusion.1.loop_body.dim.2
fusion.1.loop_body.dim.2: ; preds = %fusion.1.loop_header.dim.2
%7 = load float, float* bitcast ([4 x i8]* @0 to float*)
%8 = getelementptr inbounds [2 x [1 x [4 x float]]], [2 x [1 x [4 x float]]]* %parameter.3, i64 0, i64 %fusion.1.indvar.dim.0, i64 0, i64 %fusion.1.indvar.dim.2
%9 = load float, float* %8, !invariant.load !0, !noalias !3
%10 = getelementptr inbounds [2 x [1 x [4 x float]]], [2 x [1 x [4 x float]]]* %parameter.3, i64 0, i64 %fusion.1.indvar.dim.0, i64 0, i64 %fusion.1.indvar.dim.2
%11 = load float, float* %10, !invariant.load !0, !noalias !3
%12 = fmul float %9, %11
%13 = fmul float %7, %12
%14 = call float @llvm.log.f32(float %13)
%15 = getelementptr inbounds [2 x [1 x [4 x float]]], [2 x [1 x [4 x float]]]* %fusion.1, i64 0, i64 %fusion.1.indvar.dim.0, i64 0, i64 %fusion.1.indvar.dim.2
store float %14, float* %15, !alias.scope !7, !noalias !8
%invar.inc2 = add nuw nsw i64 %fusion.1.indvar.dim.2, 1
store i64 %invar.inc2, i64* %fusion.1.invar_address.dim.2
br label %fusion.1.loop_header.dim.2
fusion.1.loop_exit.dim.2: ; preds = %fusion.1.loop_header.dim.2
%invar.inc1 = add nuw nsw i64 %fusion.1.indvar.dim.1, 1
store i64 %invar.inc1, i64* %fusion.1.invar_address.dim.1
br label %fusion.1.loop_header.dim.1
fusion.1.loop_exit.dim.1: ; preds = %fusion.1.loop_header.dim.1
%invar.inc = add nuw nsw i64 %fusion.1.indvar.dim.0, 1
store i64 %invar.inc, i64* %fusion.1.invar_address.dim.0
br label %fusion.1.loop_header.dim.0
fusion.1.loop_exit.dim.0: ; preds = %fusion.1.loop_header.dim.0
%16 = getelementptr inbounds i8*, i8** %buffer_table, i64 4
%17 = load i8*, i8** %16, !invariant.load !0, !dereferenceable !9, !align !2
%parameter.1 = bitcast i8* %17 to float*
%18 = getelementptr inbounds i8*, i8** %buffer_table, i64 2
%19 = load i8*, i8** %18, !invariant.load !0, !dereferenceable !10, !align !2
%parameter.2 = bitcast i8* %19 to [3 x [1 x float]]*
%20 = getelementptr inbounds i8*, i8** %buffer_table, i64 0
%21 = load i8*, i8** %20, !invariant.load !0, !dereferenceable !11, !align !2
%fusion = bitcast i8* %21 to [2 x [3 x [4 x float]]]*
store i64 0, i64* %fusion.invar_address.dim.0
br label %fusion.loop_header.dim.0
fusion.loop_header.dim.0: ; preds = %fusion.loop_exit.dim.1, %fusion.1.loop_exit.dim.0
%fusion.indvar.dim.0 = load i64, i64* %fusion.invar_address.dim.0
%22 = icmp uge i64 %fusion.indvar.dim.0, 2
br i1 %22, label %fusion.loop_exit.dim.0, label %fusion.loop_body.dim.0
fusion.loop_body.dim.0: ; preds = %fusion.loop_header.dim.0
store i64 0, i64* %fusion.invar_address.dim.1
br label %fusion.loop_header.dim.1
fusion.loop_header.dim.1: ; preds = %fusion.loop_exit.dim.2, %fusion.loop_body.dim.0
%fusion.indvar.dim.1 = load i64, i64* %fusion.invar_address.dim.1
%23 = icmp uge i64 %fusion.indvar.dim.1, 3
br i1 %23, label %fusion.loop_exit.dim.1, label %fusion.loop_body.dim.1
fusion.loop_body.dim.1: ; preds = %fusion.loop_header.dim.1
store i64 0, i64* %fusion.invar_address.dim.2
br label %fusion.loop_header.dim.2
fusion.loop_header.dim.2: ; preds = %fusion.loop_body.dim.2, %fusion.loop_body.dim.1
%fusion.indvar.dim.2 = load i64, i64* %fusion.invar_address.dim.2
%24 = icmp uge i64 %fusion.indvar.dim.2, 4
br i1 %24, label %fusion.loop_exit.dim.2, label %fusion.loop_body.dim.2
fusion.loop_body.dim.2: ; preds = %fusion.loop_header.dim.2
%25 = mul nuw nsw i64 %fusion.indvar.dim.2, 1
%26 = add nuw nsw i64 0, %25
%27 = udiv i64 %26, 4
%28 = mul nuw nsw i64 %fusion.indvar.dim.0, 1
%29 = add nuw nsw i64 0, %28
%30 = udiv i64 %29, 2
%31 = getelementptr inbounds [2 x [1 x [4 x float]]], [2 x [1 x [4 x float]]]* %fusion.1, i64 0, i64 %29, i64 0, i64 %26
%32 = load float, float* %31, !alias.scope !7, !noalias !8
%33 = mul nuw nsw i64 %fusion.indvar.dim.1, 1
%34 = add nuw nsw i64 0, %33
%35 = udiv i64 %34, 3
%36 = load float, float* %parameter.1, !invariant.load !0, !noalias !3
%37 = getelementptr inbounds [3 x [1 x float]], [3 x [1 x float]]* %parameter.2, i64 0, i64 %34, i64 0
%38 = load float, float* %37, !invariant.load !0, !noalias !3
%39 = fsub float %36, %38
%40 = fmul float %39, %39
%41 = mul nuw nsw i64 %fusion.indvar.dim.2, 1
%42 = add nuw nsw i64 0, %41
%43 = udiv i64 %42, 4
%44 = mul nuw nsw i64 %fusion.indvar.dim.0, 1
%45 = add nuw nsw i64 0, %44
%46 = udiv i64 %45, 2
%47 = getelementptr inbounds [2 x [1 x [4 x float]]], [2 x [1 x [4 x float]]]* %parameter.3, i64 0, i64 %45, i64 0, i64 %42
%48 = load float, float* %47, !invariant.load !0, !noalias !3
%49 = getelementptr inbounds [2 x [1 x [4 x float]]], [2 x [1 x [4 x float]]]* %parameter.3, i64 0, i64 %45, i64 0, i64 %42
%50 = load float, float* %49, !invariant.load !0, !noalias !3
%51 = fmul float %48, %50
%52 = fdiv float %40, %51
%53 = fadd float %32, %52
%54 = fneg float %53
%55 = load float, float* bitcast ([4 x i8]* @1 to float*)
%56 = fmul float %54, %55
%57 = getelementptr inbounds [2 x [3 x [4 x float]]], [2 x [3 x [4 x float]]]* %fusion, i64 0, i64 %fusion.indvar.dim.0, i64 %fusion.indvar.dim.1, i64 %fusion.indvar.dim.2
store float %56, float* %57, !alias.scope !8, !noalias !12
%invar.inc5 = add nuw nsw i64 %fusion.indvar.dim.2, 1
store i64 %invar.inc5, i64* %fusion.invar_address.dim.2
br label %fusion.loop_header.dim.2
fusion.loop_exit.dim.2: ; preds = %fusion.loop_header.dim.2
%invar.inc4 = add nuw nsw i64 %fusion.indvar.dim.1, 1
store i64 %invar.inc4, i64* %fusion.invar_address.dim.1
br label %fusion.loop_header.dim.1
fusion.loop_exit.dim.1: ; preds = %fusion.loop_header.dim.1
%invar.inc3 = add nuw nsw i64 %fusion.indvar.dim.0, 1
store i64 %invar.inc3, i64* %fusion.invar_address.dim.0
br label %fusion.loop_header.dim.0
fusion.loop_exit.dim.0: ; preds = %fusion.loop_header.dim.0
%58 = getelementptr inbounds i8*, i8** %buffer_table, i64 3
%59 = load i8*, i8** %58, !invariant.load !0, !dereferenceable !2, !align !2
%tuple.30 = bitcast i8* %59 to [1 x i8*]*
%60 = bitcast [2 x [3 x [4 x float]]]* %fusion to i8*
%61 = getelementptr inbounds [1 x i8*], [1 x i8*]* %tuple.30, i64 0, i64 0
store i8* %60, i8** %61, !alias.scope !14, !noalias !8
ret void
}
; Function Attrs: nounwind readnone speculatable willreturn
declare float @llvm.log.f32(float) #1
attributes #0 = { uwtable "no-frame-pointer-elim"="false" }
attributes #1 = { nounwind readnone speculatable willreturn }
!0 = !{}
!1 = !{i64 32}
!2 = !{i64 8}
!3 = !{!4, !6}
!4 = !{!"buffer: {index:0, offset:0, size:96}", !5}
!5 = !{!"XLA global AA domain"}
!6 = !{!"buffer: {index:5, offset:0, size:32}", !5}
!7 = !{!6}
!8 = !{!4}
!9 = !{i64 4}
!10 = !{i64 12}
!11 = !{i64 96}
!12 = !{!13, !6}
!13 = !{!"buffer: {index:3, offset:0, size:8}", !5}
!14 = !{!13}
gets (correctly) optimized to the one below without the change:
; ModuleID = '__compute_module'
source_filename = "__compute_module"
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-grtev4-linux-gnu"
; Function Attrs: nofree nounwind uwtable
define void @jit_wrapped_fun.31(i8* nocapture readnone %retval, i8* noalias nocapture readnone %run_options, i8** noalias nocapture readnone %params, i8** noalias nocapture readonly %buffer_table, i64* noalias nocapture readnone %prof_counters) local_unnamed_addr #0 {
entry:
%0 = getelementptr inbounds i8*, i8** %buffer_table, i64 1
%1 = bitcast i8** %0 to [2 x [1 x [4 x float]]]**
%2 = load [2 x [1 x [4 x float]]]*, [2 x [1 x [4 x float]]]** %1, align 8, !invariant.load !0, !dereferenceable !1, !align !2
%3 = getelementptr inbounds i8*, i8** %buffer_table, i64 5
%4 = bitcast i8** %3 to [2 x [1 x [4 x float]]]**
%5 = load [2 x [1 x [4 x float]]]*, [2 x [1 x [4 x float]]]** %4, align 8, !invariant.load !0, !dereferenceable !1, !align !2
%6 = bitcast [2 x [1 x [4 x float]]]* %2 to <4 x float>*
%7 = load <4 x float>, <4 x float>* %6, align 8, !invariant.load !0, !noalias !3
%8 = fmul <4 x float> %7, %7
%9 = fmul <4 x float> %8, <float 0x401921FB60000000, float 0x401921FB60000000, float 0x401921FB60000000, float 0x401921FB60000000>
%10 = call <4 x float> @llvm.log.v4f32(<4 x float> %9)
%11 = bitcast [2 x [1 x [4 x float]]]* %5 to <4 x float>*
store <4 x float> %10, <4 x float>* %11, align 8, !alias.scope !7, !noalias !8
%12 = getelementptr inbounds [2 x [1 x [4 x float]]], [2 x [1 x [4 x float]]]* %2, i64 0, i64 1, i64 0, i64 0
%13 = getelementptr inbounds [2 x [1 x [4 x float]]], [2 x [1 x [4 x float]]]* %5, i64 0, i64 1, i64 0, i64 0
%14 = bitcast float* %12 to <4 x float>*
%15 = load <4 x float>, <4 x float>* %14, align 8, !invariant.load !0, !noalias !3
%16 = fmul <4 x float> %15, %15
%17 = fmul <4 x float> %16, <float 0x401921FB60000000, float 0x401921FB60000000, float 0x401921FB60000000, float 0x401921FB60000000>
%18 = call <4 x float> @llvm.log.v4f32(<4 x float> %17)
%19 = bitcast float* %13 to <4 x float>*
store <4 x float> %18, <4 x float>* %19, align 8, !alias.scope !7, !noalias !8
%20 = getelementptr inbounds i8*, i8** %buffer_table, i64 4
%21 = bitcast i8** %20 to float**
%22 = load float*, float** %21, align 8, !invariant.load !0, !dereferenceable !9, !align !2
%23 = getelementptr inbounds i8*, i8** %buffer_table, i64 2
%24 = bitcast i8** %23 to [3 x [1 x float]]**
%25 = load [3 x [1 x float]]*, [3 x [1 x float]]** %24, align 8, !invariant.load !0, !dereferenceable !10, !align !2
%26 = load i8*, i8** %buffer_table, align 8, !invariant.load !0, !dereferenceable !11, !align !2
%27 = load float, float* %22, align 8, !invariant.load !0, !noalias !3
%.phi.trans.insert28 = getelementptr inbounds [3 x [1 x float]], [3 x [1 x float]]* %25, i64 0, i64 2, i64 0
%.pre29 = load float, float* %.phi.trans.insert28, align 8, !invariant.load !0, !noalias !3
%28 = bitcast [3 x [1 x float]]* %25 to <2 x float>*
%29 = load <2 x float>, <2 x float>* %28, align 8, !invariant.load !0, !noalias !3
%30 = insertelement <2 x float> undef, float %27, i32 0
%31 = shufflevector <2 x float> %30, <2 x float> undef, <2 x i32> zeroinitializer
%32 = fsub <2 x float> %31, %29
%33 = fmul <2 x float> %32, %32
%shuffle30 = shufflevector <2 x float> %33, <2 x float> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1>
%34 = fsub float %27, %.pre29
%35 = fmul float %34, %34
%36 = insertelement <4 x float> undef, float %35, i32 0
%37 = shufflevector <4 x float> %36, <4 x float> undef, <4 x i32> zeroinitializer
%shuffle = shufflevector <4 x float> %10, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
%38 = fmul <4 x float> %7, %7
%shuffle31 = shufflevector <4 x float> %38, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
%39 = fdiv <8 x float> %shuffle30, %shuffle31
%40 = fadd <8 x float> %shuffle, %39
%41 = fmul <8 x float> %40, <float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01>
%42 = bitcast i8* %26 to <8 x float>*
store <8 x float> %41, <8 x float>* %42, align 8, !alias.scope !8, !noalias !12
%43 = getelementptr inbounds i8, i8* %26, i64 32
%44 = fdiv <4 x float> %37, %38
%45 = fadd <4 x float> %10, %44
%46 = fmul <4 x float> %45, <float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01>
%47 = bitcast i8* %43 to <4 x float>*
store <4 x float> %46, <4 x float>* %47, align 8, !alias.scope !8, !noalias !12
%.phi.trans.insert = getelementptr inbounds [2 x [1 x [4 x float]]], [2 x [1 x [4 x float]]]* %5, i64 0, i64 1, i64 0, i64 0
%.phi.trans.insert12 = getelementptr inbounds [2 x [1 x [4 x float]]], [2 x [1 x [4 x float]]]* %2, i64 0, i64 1, i64 0, i64 0
%48 = bitcast float* %.phi.trans.insert to <4 x float>*
%49 = load <4 x float>, <4 x float>* %48, align 8, !alias.scope !7, !noalias !8
%50 = bitcast float* %.phi.trans.insert12 to <4 x float>*
%51 = load <4 x float>, <4 x float>* %50, align 8, !invariant.load !0, !noalias !3
%shuffle.1 = shufflevector <4 x float> %49, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
%52 = getelementptr inbounds i8, i8* %26, i64 48
%53 = fmul <4 x float> %51, %51
%shuffle31.1 = shufflevector <4 x float> %53, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
%54 = fdiv <8 x float> %shuffle30, %shuffle31.1
%55 = fadd <8 x float> %shuffle.1, %54
%56 = fmul <8 x float> %55, <float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01>
%57 = bitcast i8* %52 to <8 x float>*
store <8 x float> %56, <8 x float>* %57, align 8, !alias.scope !8, !noalias !12
%58 = getelementptr inbounds i8, i8* %26, i64 80
%59 = fdiv <4 x float> %37, %53
%60 = fadd <4 x float> %49, %59
%61 = fmul <4 x float> %60, <float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01>
%62 = bitcast i8* %58 to <4 x float>*
store <4 x float> %61, <4 x float>* %62, align 8, !alias.scope !8, !noalias !12
%63 = getelementptr inbounds i8*, i8** %buffer_table, i64 3
%64 = bitcast i8** %63 to [1 x i8*]**
%65 = load [1 x i8*]*, [1 x i8*]** %64, align 8, !invariant.load !0, !dereferenceable !2, !align !2
%66 = getelementptr inbounds [1 x i8*], [1 x i8*]* %65, i64 0, i64 0
store i8* %26, i8** %66, align 8, !alias.scope !14, !noalias !8
ret void
}
; Function Attrs: nounwind readnone speculatable willreturn
declare <4 x float> @llvm.log.v4f32(<4 x float>) #1
attributes #0 = { nofree nounwind uwtable "no-frame-pointer-elim"="false" }
attributes #1 = { nounwind readnone speculatable willreturn }
!0 = !{}
!1 = !{i64 32}
!2 = !{i64 8}
!3 = !{!4, !6}
!4 = !{!"buffer: {index:0, offset:0, size:96}", !5}
!5 = !{!"XLA global AA domain"}
!6 = !{!"buffer: {index:5, offset:0, size:32}", !5}
!7 = !{!6}
!8 = !{!4}
!9 = !{i64 4}
!10 = !{i64 12}
!11 = !{i64 96}
!12 = !{!13, !6}
!13 = !{!"buffer: {index:3, offset:0, size:8}", !5}
!14 = !{!13}
and (incorrectly) optimized to the one below with the change:
; ModuleID = '__compute_module'
source_filename = "__compute_module"
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-grtev4-linux-gnu"
; Function Attrs: nofree nounwind uwtable
define void @jit_wrapped_fun.31(i8* nocapture readnone %retval, i8* noalias nocapture readnone %run_options, i8** noalias nocapture readnone %params, i8** noalias nocapture readonly %buffer_table, i64* noalias nocapture readnone %prof_counters) local_unnamed_addr #0 {
entry:
%0 = getelementptr inbounds i8*, i8** %buffer_table, i64 1
%1 = bitcast i8** %0 to [2 x [1 x [4 x float]]]**
%2 = load [2 x [1 x [4 x float]]]*, [2 x [1 x [4 x float]]]** %1, align 8, !invariant.load !0, !dereferenceable !1, !align !2
%3 = getelementptr inbounds i8*, i8** %buffer_table, i64 5
%4 = bitcast i8** %3 to [2 x [1 x [4 x float]]]**
%5 = load [2 x [1 x [4 x float]]]*, [2 x [1 x [4 x float]]]** %4, align 8, !invariant.load !0, !dereferenceable !1, !align !2
%6 = bitcast [2 x [1 x [4 x float]]]* %2 to <4 x float>*
%7 = load <4 x float>, <4 x float>* %6, align 8, !invariant.load !0, !noalias !3
%8 = fmul <4 x float> %7, %7
%9 = fmul <4 x float> %8, <float 0x401921FB60000000, float 0x401921FB60000000, float 0x401921FB60000000, float 0x401921FB60000000>
%10 = call <4 x float> @llvm.log.v4f32(<4 x float> %9)
%11 = bitcast [2 x [1 x [4 x float]]]* %5 to <4 x float>*
store <4 x float> %10, <4 x float>* %11, align 8, !alias.scope !7, !noalias !8
%12 = getelementptr inbounds [2 x [1 x [4 x float]]], [2 x [1 x [4 x float]]]* %2, i64 0, i64 1, i64 0, i64 0
%13 = getelementptr inbounds [2 x [1 x [4 x float]]], [2 x [1 x [4 x float]]]* %5, i64 0, i64 1, i64 0, i64 0
%14 = bitcast float* %12 to <4 x float>*
%15 = load <4 x float>, <4 x float>* %14, align 8, !invariant.load !0, !noalias !3
%16 = fmul <4 x float> %15, %15
%17 = fmul <4 x float> %16, <float 0x401921FB60000000, float 0x401921FB60000000, float 0x401921FB60000000, float 0x401921FB60000000>
%18 = call <4 x float> @llvm.log.v4f32(<4 x float> %17)
%19 = bitcast float* %13 to <4 x float>*
store <4 x float> %18, <4 x float>* %19, align 8, !alias.scope !7, !noalias !8
%20 = getelementptr inbounds i8*, i8** %buffer_table, i64 4
%21 = bitcast i8** %20 to float**
%22 = load float*, float** %21, align 8, !invariant.load !0, !dereferenceable !9, !align !2
%23 = getelementptr inbounds i8*, i8** %buffer_table, i64 2
%24 = bitcast i8** %23 to [3 x [1 x float]]**
%25 = load [3 x [1 x float]]*, [3 x [1 x float]]** %24, align 8, !invariant.load !0, !dereferenceable !10, !align !2
%26 = load i8*, i8** %buffer_table, align 8, !invariant.load !0, !dereferenceable !11, !align !2
%27 = load float, float* %22, align 8, !invariant.load !0, !noalias !3
%.phi.trans.insert28 = getelementptr inbounds [3 x [1 x float]], [3 x [1 x float]]* %25, i64 0, i64 2, i64 0
%.pre29 = load float, float* %.phi.trans.insert28, align 8, !invariant.load !0, !noalias !3
%28 = bitcast [3 x [1 x float]]* %25 to <2 x float>*
%29 = load <2 x float>, <2 x float>* %28, align 8, !invariant.load !0, !noalias !3
%30 = insertelement <2 x float> undef, float %27, i32 0
%31 = shufflevector <2 x float> %30, <2 x float> undef, <2 x i32> zeroinitializer
%32 = fsub <2 x float> %31, %29
%33 = fmul <2 x float> %32, %32
%shuffle32 = shufflevector <2 x float> %33, <2 x float> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1>
%34 = fsub float %27, %.pre29
%35 = fmul float %34, %34
%36 = insertelement <4 x float> undef, float %35, i32 0
%37 = shufflevector <4 x float> %36, <4 x float> undef, <4 x i32> zeroinitializer
%shuffle = shufflevector <4 x float> %10, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
%38 = getelementptr inbounds [2 x [1 x [4 x float]]], [2 x [1 x [4 x float]]]* %5, i64 0, i64 0, i64 0, i64 3
%39 = getelementptr inbounds [2 x [1 x [4 x float]]], [2 x [1 x [4 x float]]]* %2, i64 0, i64 0, i64 0, i64 3
%40 = fmul <4 x float> %7, %7
%41 = shufflevector <4 x float> %40, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
%42 = fdiv <8 x float> %shuffle32, %41
%43 = fadd <8 x float> %shuffle, %42
%44 = fmul <8 x float> %43, <float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01>
%45 = bitcast i8* %26 to <8 x float>*
store <8 x float> %44, <8 x float>* %45, align 8, !alias.scope !8, !noalias !12
%46 = extractelement <4 x float> %10, i32 0
%47 = getelementptr inbounds i8, i8* %26, i64 32
%48 = extractelement <4 x float> %10, i32 1
%49 = extractelement <4 x float> %10, i32 2
%50 = load float, float* %38, align 4, !alias.scope !7, !noalias !8
%51 = load float, float* %39, align 4, !invariant.load !0, !noalias !3
%52 = fmul float %51, %51
%53 = insertelement <4 x float> undef, float %52, i32 3
%54 = fdiv <4 x float> %37, %53
%55 = insertelement <4 x float> undef, float %46, i32 0
%56 = insertelement <4 x float> %55, float %48, i32 1
%57 = insertelement <4 x float> %56, float %49, i32 2
%58 = insertelement <4 x float> %57, float %50, i32 3
%59 = fadd <4 x float> %58, %54
%60 = fmul <4 x float> %59, <float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01>
%61 = bitcast i8* %47 to <4 x float>*
store <4 x float> %60, <4 x float>* %61, align 8, !alias.scope !8, !noalias !12
%.phi.trans.insert = getelementptr inbounds [2 x [1 x [4 x float]]], [2 x [1 x [4 x float]]]* %5, i64 0, i64 1, i64 0, i64 0
%.phi.trans.insert12 = getelementptr inbounds [2 x [1 x [4 x float]]], [2 x [1 x [4 x float]]]* %2, i64 0, i64 1, i64 0, i64 0
%62 = bitcast float* %.phi.trans.insert to <4 x float>*
%63 = load <4 x float>, <4 x float>* %62, align 8, !alias.scope !7, !noalias !8
%64 = bitcast float* %.phi.trans.insert12 to <4 x float>*
%65 = load <4 x float>, <4 x float>* %64, align 8, !invariant.load !0, !noalias !3
%shuffle.1 = shufflevector <4 x float> %63, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
%66 = getelementptr inbounds i8, i8* %26, i64 48
%67 = getelementptr inbounds [2 x [1 x [4 x float]]], [2 x [1 x [4 x float]]]* %5, i64 0, i64 1, i64 0, i64 3
%68 = getelementptr inbounds [2 x [1 x [4 x float]]], [2 x [1 x [4 x float]]]* %2, i64 0, i64 1, i64 0, i64 3
%69 = fmul <4 x float> %65, %65
%70 = shufflevector <4 x float> %69, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
%71 = fdiv <8 x float> %shuffle32, %70
%72 = fadd <8 x float> %shuffle.1, %71
%73 = fmul <8 x float> %72, <float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01>
%74 = bitcast i8* %66 to <8 x float>*
store <8 x float> %73, <8 x float>* %74, align 8, !alias.scope !8, !noalias !12
%75 = extractelement <4 x float> %69, i32 0
%76 = extractelement <4 x float> %63, i32 0
%77 = getelementptr inbounds i8, i8* %26, i64 80
%78 = extractelement <4 x float> %69, i32 1
%79 = extractelement <4 x float> %63, i32 1
%80 = extractelement <4 x float> %69, i32 2
%81 = extractelement <4 x float> %63, i32 2
%82 = load float, float* %67, align 4, !alias.scope !7, !noalias !8
%83 = load float, float* %68, align 4, !invariant.load !0, !noalias !3
%84 = fmul float %83, %83
%85 = insertelement <4 x float> undef, float %75, i32 0
%86 = insertelement <4 x float> %85, float %78, i32 1
%87 = insertelement <4 x float> %86, float %80, i32 2
%88 = insertelement <4 x float> %87, float %84, i32 3
%89 = fdiv <4 x float> %37, %88
%90 = insertelement <4 x float> undef, float %76, i32 0
%91 = insertelement <4 x float> %90, float %79, i32 1
%92 = insertelement <4 x float> %91, float %81, i32 2
%93 = insertelement <4 x float> %92, float %82, i32 3
%94 = fadd <4 x float> %93, %89
%95 = fmul <4 x float> %94, <float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01>
%96 = bitcast i8* %77 to <4 x float>*
store <4 x float> %95, <4 x float>* %96, align 8, !alias.scope !8, !noalias !12
%97 = getelementptr inbounds i8*, i8** %buffer_table, i64 3
%98 = bitcast i8** %97 to [1 x i8*]**
%99 = load [1 x i8*]*, [1 x i8*]** %98, align 8, !invariant.load !0, !dereferenceable !2, !align !2
%100 = getelementptr inbounds [1 x i8*], [1 x i8*]* %99, i64 0, i64 0
store i8* %26, i8** %100, align 8, !alias.scope !14, !noalias !8
ret void
}
; Function Attrs: nounwind readnone speculatable willreturn
declare <4 x float> @llvm.log.v4f32(<4 x float>) #1
attributes #0 = { nofree nounwind uwtable "no-frame-pointer-elim"="false" }
attributes #1 = { nounwind readnone speculatable willreturn }
!0 = !{}
!1 = !{i64 32}
!2 = !{i64 8}
!3 = !{!4, !6}
!4 = !{!"buffer: {index:0, offset:0, size:96}", !5}
!5 = !{!"XLA global AA domain"}
!6 = !{!"buffer: {index:5, offset:0, size:32}", !5}
!7 = !{!6}
!8 = !{!4}
!9 = !{i64 4}
!10 = !{i64 12}
!11 = !{i64 96}
!12 = !{!13, !6}
!13 = !{!"buffer: {index:3, offset:0, size:8}", !5}
!14 = !{!13}
This results in bad numerical answers when used through XLA.
The miscompile is only present at -O3.
Repository:
rG LLVM Github Monorepo
https://reviews.llvm.org/D70516
Files:
llvm/lib/Analysis/MemoryDependenceAnalysis.cpp
Index: llvm/lib/Analysis/MemoryDependenceAnalysis.cpp
===================================================================
--- llvm/lib/Analysis/MemoryDependenceAnalysis.cpp
+++ llvm/lib/Analysis/MemoryDependenceAnalysis.cpp
@@ -979,11 +979,6 @@
Instruction *QueryInst, const MemoryLocation &Loc, bool isLoad,
BasicBlock *BB, NonLocalDepInfo *Cache, unsigned NumSortedEntries) {
- bool isInvariantLoad = false;
-
- if (LoadInst *LI = dyn_cast_or_null<LoadInst>(QueryInst))
- isInvariantLoad = LI->getMetadata(LLVMContext::MD_invariant_load);
-
// Do a binary search to see if we already have an entry for this block in
// the cache set. If so, find it.
NonLocalDepInfo::iterator Entry = std::upper_bound(
@@ -995,13 +990,6 @@
if (Entry != Cache->begin() + NumSortedEntries && Entry->getBB() == BB)
ExistingResult = &*Entry;
- // Use cached result for invariant load only if there is no dependency for non
- // invariant load. In this case invariant load can not have any dependency as
- // well.
- if (ExistingResult && isInvariantLoad &&
- !ExistingResult->getResult().isNonFuncLocal())
- ExistingResult = nullptr;
-
// If we have a cached entry, and it is non-dirty, use it as the value for
// this dependency.
if (ExistingResult && !ExistingResult->getResult().isDirty()) {
@@ -1030,10 +1018,6 @@
MemDepResult Dep =
getPointerDependencyFrom(Loc, isLoad, ScanPos, BB, QueryInst);
- // Don't cache results for invariant load.
- if (isInvariantLoad)
- return Dep;
-
// If we had a dirty entry for the block, update it. Otherwise, just add
// a new entry.
if (ExistingResult)
@@ -1470,6 +1454,7 @@
if (SkipFirstBlock)
return false;
+ bool foundBlock = false;
for (NonLocalDepEntry &I : llvm::reverse(*Cache)) {
if (I.getBB() != BB)
continue;
@@ -1477,12 +1462,14 @@
assert((GotWorklistLimit || I.getResult().isNonLocal() ||
!DT.isReachableFromEntry(BB)) &&
"Should only be here with transparent block");
+ foundBlock = true;
I.setResult(MemDepResult::getUnknown());
+ Result.push_back(
+ NonLocalDepResult(I.getBB(), I.getResult(), Pointer.getAddr()));
break;
}
- // Go ahead and report unknown dependence.
- Result.push_back(
- NonLocalDepResult(BB, MemDepResult::getUnknown(), Pointer.getAddr()));
+ (void)foundBlock; (void)GotWorklistLimit;
+ assert((foundBlock || GotWorklistLimit) && "Current block not in cache?");
}
// Okay, we're done now. If we added new values to the cache, re-sort it.
-------------- next part --------------
A non-text attachment was scrubbed...
Name: D70516.230344.patch
Type: text/x-patch
Size: 2620 bytes
Desc: not available
URL: <http://lists.llvm.org/pipermail/llvm-commits/attachments/20191120/a7b4f9d8/attachment.bin>
More information about the llvm-commits
mailing list