[PATCH] D70516: Revert "[DependenceAnalysis] Dependecies for loads marked with "ivnariant.load" should not be shared with general accesses. Fix for https://bugs.llvm.org/show_bug.cgi?id=42151"

George Karpenkov via Phabricator via llvm-commits llvm-commits at lists.llvm.org
Wed Nov 20 15:30:41 PST 2019


george.karpenkov created this revision.
george.karpenkov added reviewers: sanjoy.google, sanjoy, ebrevnov, jdoerfert, reames, chandlerc.
Herald added subscribers: Charusso, hiraditya.
Herald added a project: LLVM.

Revert "[DependenceAnalysis] Dependecies for loads marked with "ivnariant.load" should not be shared with general accesses. Fix for https://bugs.llvm.org/show_bug.cgi?id=42151"

This reverts commit 5f026b6d9e882941fde9b7e5dc0a2d807f7f24f5 <https://reviews.llvm.org/rG5f026b6d9e882941fde9b7e5dc0a2d807f7f24f5>.

We're (tensorflow.org/xla team) seeing some miscompiles with the new change.
I'm still trying to come up with a useful/small/external example, but for now, the following IR:

  ; ModuleID = '__compute_module'
  source_filename = "__compute_module"
  target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
  target triple = "x86_64-grtev4-linux-gnu"
  
  @0 = private unnamed_addr constant [4 x i8] c"\DB\0F\C9@"
  @1 = private unnamed_addr constant [4 x i8] c"\00\00\00?"
  
  ; Function Attrs: uwtable
  define void @jit_wrapped_fun.31(i8* %retval, i8* noalias %run_options, i8** noalias %params, i8** noalias %buffer_table, i64* noalias %prof_counters) #0 {
  entry:
    %fusion.invar_address.dim.2 = alloca i64
    %fusion.invar_address.dim.1 = alloca i64
    %fusion.invar_address.dim.0 = alloca i64
    %fusion.1.invar_address.dim.2 = alloca i64
    %fusion.1.invar_address.dim.1 = alloca i64
    %fusion.1.invar_address.dim.0 = alloca i64
    %0 = getelementptr inbounds i8*, i8** %buffer_table, i64 1
    %1 = load i8*, i8** %0, !invariant.load !0, !dereferenceable !1, !align !2
    %parameter.3 = bitcast i8* %1 to [2 x [1 x [4 x float]]]*
    %2 = getelementptr inbounds i8*, i8** %buffer_table, i64 5
    %3 = load i8*, i8** %2, !invariant.load !0, !dereferenceable !1, !align !2
    %fusion.1 = bitcast i8* %3 to [2 x [1 x [4 x float]]]*
    store i64 0, i64* %fusion.1.invar_address.dim.0
    br label %fusion.1.loop_header.dim.0
  
  fusion.1.loop_header.dim.0:                       ; preds = %fusion.1.loop_exit.dim.1, %entry
    %fusion.1.indvar.dim.0 = load i64, i64* %fusion.1.invar_address.dim.0
    %4 = icmp uge i64 %fusion.1.indvar.dim.0, 2
    br i1 %4, label %fusion.1.loop_exit.dim.0, label %fusion.1.loop_body.dim.0
  
  fusion.1.loop_body.dim.0:                         ; preds = %fusion.1.loop_header.dim.0
    store i64 0, i64* %fusion.1.invar_address.dim.1
    br label %fusion.1.loop_header.dim.1
  
  fusion.1.loop_header.dim.1:                       ; preds = %fusion.1.loop_exit.dim.2, %fusion.1.loop_body.dim.0
    %fusion.1.indvar.dim.1 = load i64, i64* %fusion.1.invar_address.dim.1
    %5 = icmp uge i64 %fusion.1.indvar.dim.1, 1
    br i1 %5, label %fusion.1.loop_exit.dim.1, label %fusion.1.loop_body.dim.1
  
  fusion.1.loop_body.dim.1:                         ; preds = %fusion.1.loop_header.dim.1
    store i64 0, i64* %fusion.1.invar_address.dim.2
    br label %fusion.1.loop_header.dim.2
  
  fusion.1.loop_header.dim.2:                       ; preds = %fusion.1.loop_body.dim.2, %fusion.1.loop_body.dim.1
    %fusion.1.indvar.dim.2 = load i64, i64* %fusion.1.invar_address.dim.2
    %6 = icmp uge i64 %fusion.1.indvar.dim.2, 4
    br i1 %6, label %fusion.1.loop_exit.dim.2, label %fusion.1.loop_body.dim.2
  
  fusion.1.loop_body.dim.2:                         ; preds = %fusion.1.loop_header.dim.2
    %7 = load float, float* bitcast ([4 x i8]* @0 to float*)
    %8 = getelementptr inbounds [2 x [1 x [4 x float]]], [2 x [1 x [4 x float]]]* %parameter.3, i64 0, i64 %fusion.1.indvar.dim.0, i64 0, i64 %fusion.1.indvar.dim.2
    %9 = load float, float* %8, !invariant.load !0, !noalias !3
    %10 = getelementptr inbounds [2 x [1 x [4 x float]]], [2 x [1 x [4 x float]]]* %parameter.3, i64 0, i64 %fusion.1.indvar.dim.0, i64 0, i64 %fusion.1.indvar.dim.2
    %11 = load float, float* %10, !invariant.load !0, !noalias !3
    %12 = fmul float %9, %11
    %13 = fmul float %7, %12
    %14 = call float @llvm.log.f32(float %13)
    %15 = getelementptr inbounds [2 x [1 x [4 x float]]], [2 x [1 x [4 x float]]]* %fusion.1, i64 0, i64 %fusion.1.indvar.dim.0, i64 0, i64 %fusion.1.indvar.dim.2
    store float %14, float* %15, !alias.scope !7, !noalias !8
    %invar.inc2 = add nuw nsw i64 %fusion.1.indvar.dim.2, 1
    store i64 %invar.inc2, i64* %fusion.1.invar_address.dim.2
    br label %fusion.1.loop_header.dim.2
  
  fusion.1.loop_exit.dim.2:                         ; preds = %fusion.1.loop_header.dim.2
    %invar.inc1 = add nuw nsw i64 %fusion.1.indvar.dim.1, 1
    store i64 %invar.inc1, i64* %fusion.1.invar_address.dim.1
    br label %fusion.1.loop_header.dim.1
  
  fusion.1.loop_exit.dim.1:                         ; preds = %fusion.1.loop_header.dim.1
    %invar.inc = add nuw nsw i64 %fusion.1.indvar.dim.0, 1
    store i64 %invar.inc, i64* %fusion.1.invar_address.dim.0
    br label %fusion.1.loop_header.dim.0
  
  fusion.1.loop_exit.dim.0:                         ; preds = %fusion.1.loop_header.dim.0
    %16 = getelementptr inbounds i8*, i8** %buffer_table, i64 4
    %17 = load i8*, i8** %16, !invariant.load !0, !dereferenceable !9, !align !2
    %parameter.1 = bitcast i8* %17 to float*
    %18 = getelementptr inbounds i8*, i8** %buffer_table, i64 2
    %19 = load i8*, i8** %18, !invariant.load !0, !dereferenceable !10, !align !2
    %parameter.2 = bitcast i8* %19 to [3 x [1 x float]]*
    %20 = getelementptr inbounds i8*, i8** %buffer_table, i64 0
    %21 = load i8*, i8** %20, !invariant.load !0, !dereferenceable !11, !align !2
    %fusion = bitcast i8* %21 to [2 x [3 x [4 x float]]]*
    store i64 0, i64* %fusion.invar_address.dim.0
    br label %fusion.loop_header.dim.0
  
  fusion.loop_header.dim.0:                         ; preds = %fusion.loop_exit.dim.1, %fusion.1.loop_exit.dim.0
    %fusion.indvar.dim.0 = load i64, i64* %fusion.invar_address.dim.0
    %22 = icmp uge i64 %fusion.indvar.dim.0, 2
    br i1 %22, label %fusion.loop_exit.dim.0, label %fusion.loop_body.dim.0
  
  fusion.loop_body.dim.0:                           ; preds = %fusion.loop_header.dim.0
    store i64 0, i64* %fusion.invar_address.dim.1
    br label %fusion.loop_header.dim.1
  
  fusion.loop_header.dim.1:                         ; preds = %fusion.loop_exit.dim.2, %fusion.loop_body.dim.0
    %fusion.indvar.dim.1 = load i64, i64* %fusion.invar_address.dim.1
    %23 = icmp uge i64 %fusion.indvar.dim.1, 3
    br i1 %23, label %fusion.loop_exit.dim.1, label %fusion.loop_body.dim.1
  
  fusion.loop_body.dim.1:                           ; preds = %fusion.loop_header.dim.1
    store i64 0, i64* %fusion.invar_address.dim.2
    br label %fusion.loop_header.dim.2
  
  fusion.loop_header.dim.2:                         ; preds = %fusion.loop_body.dim.2, %fusion.loop_body.dim.1
    %fusion.indvar.dim.2 = load i64, i64* %fusion.invar_address.dim.2
    %24 = icmp uge i64 %fusion.indvar.dim.2, 4
    br i1 %24, label %fusion.loop_exit.dim.2, label %fusion.loop_body.dim.2
  
  fusion.loop_body.dim.2:                           ; preds = %fusion.loop_header.dim.2
    %25 = mul nuw nsw i64 %fusion.indvar.dim.2, 1
    %26 = add nuw nsw i64 0, %25
    %27 = udiv i64 %26, 4
    %28 = mul nuw nsw i64 %fusion.indvar.dim.0, 1
    %29 = add nuw nsw i64 0, %28
    %30 = udiv i64 %29, 2
    %31 = getelementptr inbounds [2 x [1 x [4 x float]]], [2 x [1 x [4 x float]]]* %fusion.1, i64 0, i64 %29, i64 0, i64 %26
    %32 = load float, float* %31, !alias.scope !7, !noalias !8
    %33 = mul nuw nsw i64 %fusion.indvar.dim.1, 1
    %34 = add nuw nsw i64 0, %33
    %35 = udiv i64 %34, 3
    %36 = load float, float* %parameter.1, !invariant.load !0, !noalias !3
    %37 = getelementptr inbounds [3 x [1 x float]], [3 x [1 x float]]* %parameter.2, i64 0, i64 %34, i64 0
    %38 = load float, float* %37, !invariant.load !0, !noalias !3
    %39 = fsub float %36, %38
    %40 = fmul float %39, %39
    %41 = mul nuw nsw i64 %fusion.indvar.dim.2, 1
    %42 = add nuw nsw i64 0, %41
    %43 = udiv i64 %42, 4
    %44 = mul nuw nsw i64 %fusion.indvar.dim.0, 1
    %45 = add nuw nsw i64 0, %44
    %46 = udiv i64 %45, 2
    %47 = getelementptr inbounds [2 x [1 x [4 x float]]], [2 x [1 x [4 x float]]]* %parameter.3, i64 0, i64 %45, i64 0, i64 %42
    %48 = load float, float* %47, !invariant.load !0, !noalias !3
    %49 = getelementptr inbounds [2 x [1 x [4 x float]]], [2 x [1 x [4 x float]]]* %parameter.3, i64 0, i64 %45, i64 0, i64 %42
    %50 = load float, float* %49, !invariant.load !0, !noalias !3
    %51 = fmul float %48, %50
    %52 = fdiv float %40, %51
    %53 = fadd float %32, %52
    %54 = fneg float %53
    %55 = load float, float* bitcast ([4 x i8]* @1 to float*)
    %56 = fmul float %54, %55
    %57 = getelementptr inbounds [2 x [3 x [4 x float]]], [2 x [3 x [4 x float]]]* %fusion, i64 0, i64 %fusion.indvar.dim.0, i64 %fusion.indvar.dim.1, i64 %fusion.indvar.dim.2
    store float %56, float* %57, !alias.scope !8, !noalias !12
    %invar.inc5 = add nuw nsw i64 %fusion.indvar.dim.2, 1
    store i64 %invar.inc5, i64* %fusion.invar_address.dim.2
    br label %fusion.loop_header.dim.2
  
  fusion.loop_exit.dim.2:                           ; preds = %fusion.loop_header.dim.2
    %invar.inc4 = add nuw nsw i64 %fusion.indvar.dim.1, 1
    store i64 %invar.inc4, i64* %fusion.invar_address.dim.1
    br label %fusion.loop_header.dim.1
  
  fusion.loop_exit.dim.1:                           ; preds = %fusion.loop_header.dim.1
    %invar.inc3 = add nuw nsw i64 %fusion.indvar.dim.0, 1
    store i64 %invar.inc3, i64* %fusion.invar_address.dim.0
    br label %fusion.loop_header.dim.0
  
  fusion.loop_exit.dim.0:                           ; preds = %fusion.loop_header.dim.0
    %58 = getelementptr inbounds i8*, i8** %buffer_table, i64 3
    %59 = load i8*, i8** %58, !invariant.load !0, !dereferenceable !2, !align !2
    %tuple.30 = bitcast i8* %59 to [1 x i8*]*
    %60 = bitcast [2 x [3 x [4 x float]]]* %fusion to i8*
    %61 = getelementptr inbounds [1 x i8*], [1 x i8*]* %tuple.30, i64 0, i64 0
    store i8* %60, i8** %61, !alias.scope !14, !noalias !8
    ret void
  }
  
  ; Function Attrs: nounwind readnone speculatable willreturn
  declare float @llvm.log.f32(float) #1
  
  attributes #0 = { uwtable "no-frame-pointer-elim"="false" }
  attributes #1 = { nounwind readnone speculatable willreturn }
  
  !0 = !{}
  !1 = !{i64 32}
  !2 = !{i64 8}
  !3 = !{!4, !6}
  !4 = !{!"buffer: {index:0, offset:0, size:96}", !5}
  !5 = !{!"XLA global AA domain"}
  !6 = !{!"buffer: {index:5, offset:0, size:32}", !5}
  !7 = !{!6}
  !8 = !{!4}
  !9 = !{i64 4}
  !10 = !{i64 12}
  !11 = !{i64 96}
  !12 = !{!13, !6}
  !13 = !{!"buffer: {index:3, offset:0, size:8}", !5}
  !14 = !{!13}

gets (correctly) optimized to the one below without the change:

  ; ModuleID = '__compute_module'
  source_filename = "__compute_module"
  target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
  target triple = "x86_64-grtev4-linux-gnu"
  
  ; Function Attrs: nofree nounwind uwtable
  define void @jit_wrapped_fun.31(i8* nocapture readnone %retval, i8* noalias nocapture readnone %run_options, i8** noalias nocapture readnone %params, i8** noalias nocapture readonly %buffer_table, i64* noalias nocapture readnone %prof_counters) local_unnamed_addr #0 {
  entry:
    %0 = getelementptr inbounds i8*, i8** %buffer_table, i64 1
    %1 = bitcast i8** %0 to [2 x [1 x [4 x float]]]**
    %2 = load [2 x [1 x [4 x float]]]*, [2 x [1 x [4 x float]]]** %1, align 8, !invariant.load !0, !dereferenceable !1, !align !2
    %3 = getelementptr inbounds i8*, i8** %buffer_table, i64 5
    %4 = bitcast i8** %3 to [2 x [1 x [4 x float]]]**
    %5 = load [2 x [1 x [4 x float]]]*, [2 x [1 x [4 x float]]]** %4, align 8, !invariant.load !0, !dereferenceable !1, !align !2
    %6 = bitcast [2 x [1 x [4 x float]]]* %2 to <4 x float>*
    %7 = load <4 x float>, <4 x float>* %6, align 8, !invariant.load !0, !noalias !3
    %8 = fmul <4 x float> %7, %7
    %9 = fmul <4 x float> %8, <float 0x401921FB60000000, float 0x401921FB60000000, float 0x401921FB60000000, float 0x401921FB60000000>
    %10 = call <4 x float> @llvm.log.v4f32(<4 x float> %9)
    %11 = bitcast [2 x [1 x [4 x float]]]* %5 to <4 x float>*
    store <4 x float> %10, <4 x float>* %11, align 8, !alias.scope !7, !noalias !8
    %12 = getelementptr inbounds [2 x [1 x [4 x float]]], [2 x [1 x [4 x float]]]* %2, i64 0, i64 1, i64 0, i64 0
    %13 = getelementptr inbounds [2 x [1 x [4 x float]]], [2 x [1 x [4 x float]]]* %5, i64 0, i64 1, i64 0, i64 0
    %14 = bitcast float* %12 to <4 x float>*
    %15 = load <4 x float>, <4 x float>* %14, align 8, !invariant.load !0, !noalias !3
    %16 = fmul <4 x float> %15, %15
    %17 = fmul <4 x float> %16, <float 0x401921FB60000000, float 0x401921FB60000000, float 0x401921FB60000000, float 0x401921FB60000000>
    %18 = call <4 x float> @llvm.log.v4f32(<4 x float> %17)
    %19 = bitcast float* %13 to <4 x float>*
    store <4 x float> %18, <4 x float>* %19, align 8, !alias.scope !7, !noalias !8
    %20 = getelementptr inbounds i8*, i8** %buffer_table, i64 4
    %21 = bitcast i8** %20 to float**
    %22 = load float*, float** %21, align 8, !invariant.load !0, !dereferenceable !9, !align !2
    %23 = getelementptr inbounds i8*, i8** %buffer_table, i64 2
    %24 = bitcast i8** %23 to [3 x [1 x float]]**
    %25 = load [3 x [1 x float]]*, [3 x [1 x float]]** %24, align 8, !invariant.load !0, !dereferenceable !10, !align !2
    %26 = load i8*, i8** %buffer_table, align 8, !invariant.load !0, !dereferenceable !11, !align !2
    %27 = load float, float* %22, align 8, !invariant.load !0, !noalias !3
    %.phi.trans.insert28 = getelementptr inbounds [3 x [1 x float]], [3 x [1 x float]]* %25, i64 0, i64 2, i64 0
    %.pre29 = load float, float* %.phi.trans.insert28, align 8, !invariant.load !0, !noalias !3
    %28 = bitcast [3 x [1 x float]]* %25 to <2 x float>*
    %29 = load <2 x float>, <2 x float>* %28, align 8, !invariant.load !0, !noalias !3
    %30 = insertelement <2 x float> undef, float %27, i32 0
    %31 = shufflevector <2 x float> %30, <2 x float> undef, <2 x i32> zeroinitializer
    %32 = fsub <2 x float> %31, %29
    %33 = fmul <2 x float> %32, %32
    %shuffle30 = shufflevector <2 x float> %33, <2 x float> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1>
    %34 = fsub float %27, %.pre29
    %35 = fmul float %34, %34
    %36 = insertelement <4 x float> undef, float %35, i32 0
    %37 = shufflevector <4 x float> %36, <4 x float> undef, <4 x i32> zeroinitializer
    %shuffle = shufflevector <4 x float> %10, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
    %38 = fmul <4 x float> %7, %7
    %shuffle31 = shufflevector <4 x float> %38, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
    %39 = fdiv <8 x float> %shuffle30, %shuffle31
    %40 = fadd <8 x float> %shuffle, %39
    %41 = fmul <8 x float> %40, <float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01>
    %42 = bitcast i8* %26 to <8 x float>*
    store <8 x float> %41, <8 x float>* %42, align 8, !alias.scope !8, !noalias !12
    %43 = getelementptr inbounds i8, i8* %26, i64 32
    %44 = fdiv <4 x float> %37, %38
    %45 = fadd <4 x float> %10, %44
    %46 = fmul <4 x float> %45, <float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01>
    %47 = bitcast i8* %43 to <4 x float>*
    store <4 x float> %46, <4 x float>* %47, align 8, !alias.scope !8, !noalias !12
    %.phi.trans.insert = getelementptr inbounds [2 x [1 x [4 x float]]], [2 x [1 x [4 x float]]]* %5, i64 0, i64 1, i64 0, i64 0
    %.phi.trans.insert12 = getelementptr inbounds [2 x [1 x [4 x float]]], [2 x [1 x [4 x float]]]* %2, i64 0, i64 1, i64 0, i64 0
    %48 = bitcast float* %.phi.trans.insert to <4 x float>*
    %49 = load <4 x float>, <4 x float>* %48, align 8, !alias.scope !7, !noalias !8
    %50 = bitcast float* %.phi.trans.insert12 to <4 x float>*
    %51 = load <4 x float>, <4 x float>* %50, align 8, !invariant.load !0, !noalias !3
    %shuffle.1 = shufflevector <4 x float> %49, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
    %52 = getelementptr inbounds i8, i8* %26, i64 48
    %53 = fmul <4 x float> %51, %51
    %shuffle31.1 = shufflevector <4 x float> %53, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
    %54 = fdiv <8 x float> %shuffle30, %shuffle31.1
    %55 = fadd <8 x float> %shuffle.1, %54
    %56 = fmul <8 x float> %55, <float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01>
    %57 = bitcast i8* %52 to <8 x float>*
    store <8 x float> %56, <8 x float>* %57, align 8, !alias.scope !8, !noalias !12
    %58 = getelementptr inbounds i8, i8* %26, i64 80
    %59 = fdiv <4 x float> %37, %53
    %60 = fadd <4 x float> %49, %59
    %61 = fmul <4 x float> %60, <float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01>
    %62 = bitcast i8* %58 to <4 x float>*
    store <4 x float> %61, <4 x float>* %62, align 8, !alias.scope !8, !noalias !12
    %63 = getelementptr inbounds i8*, i8** %buffer_table, i64 3
    %64 = bitcast i8** %63 to [1 x i8*]**
    %65 = load [1 x i8*]*, [1 x i8*]** %64, align 8, !invariant.load !0, !dereferenceable !2, !align !2
    %66 = getelementptr inbounds [1 x i8*], [1 x i8*]* %65, i64 0, i64 0
    store i8* %26, i8** %66, align 8, !alias.scope !14, !noalias !8
    ret void
  }
  
  ; Function Attrs: nounwind readnone speculatable willreturn
  declare <4 x float> @llvm.log.v4f32(<4 x float>) #1
  
  attributes #0 = { nofree nounwind uwtable "no-frame-pointer-elim"="false" }
  attributes #1 = { nounwind readnone speculatable willreturn }
  
  !0 = !{}
  !1 = !{i64 32}
  !2 = !{i64 8}
  !3 = !{!4, !6}
  !4 = !{!"buffer: {index:0, offset:0, size:96}", !5}
  !5 = !{!"XLA global AA domain"}
  !6 = !{!"buffer: {index:5, offset:0, size:32}", !5}
  !7 = !{!6}
  !8 = !{!4}
  !9 = !{i64 4}
  !10 = !{i64 12}
  !11 = !{i64 96}
  !12 = !{!13, !6}
  !13 = !{!"buffer: {index:3, offset:0, size:8}", !5}
  !14 = !{!13}

and (incorrectly) optimized to the one below with the change:

  ; ModuleID = '__compute_module'
  source_filename = "__compute_module"
  target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
  target triple = "x86_64-grtev4-linux-gnu"
  
  ; Function Attrs: nofree nounwind uwtable
  define void @jit_wrapped_fun.31(i8* nocapture readnone %retval, i8* noalias nocapture readnone %run_options, i8** noalias nocapture readnone %params, i8** noalias nocapture readonly %buffer_table, i64* noalias nocapture readnone %prof_counters) local_unnamed_addr #0 {
  entry:
    %0 = getelementptr inbounds i8*, i8** %buffer_table, i64 1
    %1 = bitcast i8** %0 to [2 x [1 x [4 x float]]]**
    %2 = load [2 x [1 x [4 x float]]]*, [2 x [1 x [4 x float]]]** %1, align 8, !invariant.load !0, !dereferenceable !1, !align !2
    %3 = getelementptr inbounds i8*, i8** %buffer_table, i64 5
    %4 = bitcast i8** %3 to [2 x [1 x [4 x float]]]**
    %5 = load [2 x [1 x [4 x float]]]*, [2 x [1 x [4 x float]]]** %4, align 8, !invariant.load !0, !dereferenceable !1, !align !2
    %6 = bitcast [2 x [1 x [4 x float]]]* %2 to <4 x float>*
    %7 = load <4 x float>, <4 x float>* %6, align 8, !invariant.load !0, !noalias !3
    %8 = fmul <4 x float> %7, %7
    %9 = fmul <4 x float> %8, <float 0x401921FB60000000, float 0x401921FB60000000, float 0x401921FB60000000, float 0x401921FB60000000>
    %10 = call <4 x float> @llvm.log.v4f32(<4 x float> %9)
    %11 = bitcast [2 x [1 x [4 x float]]]* %5 to <4 x float>*
    store <4 x float> %10, <4 x float>* %11, align 8, !alias.scope !7, !noalias !8
    %12 = getelementptr inbounds [2 x [1 x [4 x float]]], [2 x [1 x [4 x float]]]* %2, i64 0, i64 1, i64 0, i64 0
    %13 = getelementptr inbounds [2 x [1 x [4 x float]]], [2 x [1 x [4 x float]]]* %5, i64 0, i64 1, i64 0, i64 0
    %14 = bitcast float* %12 to <4 x float>*
    %15 = load <4 x float>, <4 x float>* %14, align 8, !invariant.load !0, !noalias !3
    %16 = fmul <4 x float> %15, %15
    %17 = fmul <4 x float> %16, <float 0x401921FB60000000, float 0x401921FB60000000, float 0x401921FB60000000, float 0x401921FB60000000>
    %18 = call <4 x float> @llvm.log.v4f32(<4 x float> %17)
    %19 = bitcast float* %13 to <4 x float>*
    store <4 x float> %18, <4 x float>* %19, align 8, !alias.scope !7, !noalias !8
    %20 = getelementptr inbounds i8*, i8** %buffer_table, i64 4
    %21 = bitcast i8** %20 to float**
    %22 = load float*, float** %21, align 8, !invariant.load !0, !dereferenceable !9, !align !2
    %23 = getelementptr inbounds i8*, i8** %buffer_table, i64 2
    %24 = bitcast i8** %23 to [3 x [1 x float]]**
    %25 = load [3 x [1 x float]]*, [3 x [1 x float]]** %24, align 8, !invariant.load !0, !dereferenceable !10, !align !2
    %26 = load i8*, i8** %buffer_table, align 8, !invariant.load !0, !dereferenceable !11, !align !2
    %27 = load float, float* %22, align 8, !invariant.load !0, !noalias !3
    %.phi.trans.insert28 = getelementptr inbounds [3 x [1 x float]], [3 x [1 x float]]* %25, i64 0, i64 2, i64 0
    %.pre29 = load float, float* %.phi.trans.insert28, align 8, !invariant.load !0, !noalias !3
    %28 = bitcast [3 x [1 x float]]* %25 to <2 x float>*
    %29 = load <2 x float>, <2 x float>* %28, align 8, !invariant.load !0, !noalias !3
    %30 = insertelement <2 x float> undef, float %27, i32 0
    %31 = shufflevector <2 x float> %30, <2 x float> undef, <2 x i32> zeroinitializer
    %32 = fsub <2 x float> %31, %29
    %33 = fmul <2 x float> %32, %32
    %shuffle32 = shufflevector <2 x float> %33, <2 x float> undef, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 1, i32 1, i32 1, i32 1>
    %34 = fsub float %27, %.pre29
    %35 = fmul float %34, %34
    %36 = insertelement <4 x float> undef, float %35, i32 0
    %37 = shufflevector <4 x float> %36, <4 x float> undef, <4 x i32> zeroinitializer
    %shuffle = shufflevector <4 x float> %10, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
    %38 = getelementptr inbounds [2 x [1 x [4 x float]]], [2 x [1 x [4 x float]]]* %5, i64 0, i64 0, i64 0, i64 3
    %39 = getelementptr inbounds [2 x [1 x [4 x float]]], [2 x [1 x [4 x float]]]* %2, i64 0, i64 0, i64 0, i64 3
    %40 = fmul <4 x float> %7, %7
    %41 = shufflevector <4 x float> %40, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
    %42 = fdiv <8 x float> %shuffle32, %41
    %43 = fadd <8 x float> %shuffle, %42
    %44 = fmul <8 x float> %43, <float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01>
    %45 = bitcast i8* %26 to <8 x float>*
    store <8 x float> %44, <8 x float>* %45, align 8, !alias.scope !8, !noalias !12
    %46 = extractelement <4 x float> %10, i32 0
    %47 = getelementptr inbounds i8, i8* %26, i64 32
    %48 = extractelement <4 x float> %10, i32 1
    %49 = extractelement <4 x float> %10, i32 2
    %50 = load float, float* %38, align 4, !alias.scope !7, !noalias !8
    %51 = load float, float* %39, align 4, !invariant.load !0, !noalias !3
    %52 = fmul float %51, %51
    %53 = insertelement <4 x float> undef, float %52, i32 3
    %54 = fdiv <4 x float> %37, %53
    %55 = insertelement <4 x float> undef, float %46, i32 0
    %56 = insertelement <4 x float> %55, float %48, i32 1
    %57 = insertelement <4 x float> %56, float %49, i32 2
    %58 = insertelement <4 x float> %57, float %50, i32 3
    %59 = fadd <4 x float> %58, %54
    %60 = fmul <4 x float> %59, <float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01>
    %61 = bitcast i8* %47 to <4 x float>*
    store <4 x float> %60, <4 x float>* %61, align 8, !alias.scope !8, !noalias !12
    %.phi.trans.insert = getelementptr inbounds [2 x [1 x [4 x float]]], [2 x [1 x [4 x float]]]* %5, i64 0, i64 1, i64 0, i64 0
    %.phi.trans.insert12 = getelementptr inbounds [2 x [1 x [4 x float]]], [2 x [1 x [4 x float]]]* %2, i64 0, i64 1, i64 0, i64 0
    %62 = bitcast float* %.phi.trans.insert to <4 x float>*
    %63 = load <4 x float>, <4 x float>* %62, align 8, !alias.scope !7, !noalias !8
    %64 = bitcast float* %.phi.trans.insert12 to <4 x float>*
    %65 = load <4 x float>, <4 x float>* %64, align 8, !invariant.load !0, !noalias !3
    %shuffle.1 = shufflevector <4 x float> %63, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
    %66 = getelementptr inbounds i8, i8* %26, i64 48
    %67 = getelementptr inbounds [2 x [1 x [4 x float]]], [2 x [1 x [4 x float]]]* %5, i64 0, i64 1, i64 0, i64 3
    %68 = getelementptr inbounds [2 x [1 x [4 x float]]], [2 x [1 x [4 x float]]]* %2, i64 0, i64 1, i64 0, i64 3
    %69 = fmul <4 x float> %65, %65
    %70 = shufflevector <4 x float> %69, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
    %71 = fdiv <8 x float> %shuffle32, %70
    %72 = fadd <8 x float> %shuffle.1, %71
    %73 = fmul <8 x float> %72, <float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01>
    %74 = bitcast i8* %66 to <8 x float>*
    store <8 x float> %73, <8 x float>* %74, align 8, !alias.scope !8, !noalias !12
    %75 = extractelement <4 x float> %69, i32 0
    %76 = extractelement <4 x float> %63, i32 0
    %77 = getelementptr inbounds i8, i8* %26, i64 80
    %78 = extractelement <4 x float> %69, i32 1
    %79 = extractelement <4 x float> %63, i32 1
    %80 = extractelement <4 x float> %69, i32 2
    %81 = extractelement <4 x float> %63, i32 2
    %82 = load float, float* %67, align 4, !alias.scope !7, !noalias !8
    %83 = load float, float* %68, align 4, !invariant.load !0, !noalias !3
    %84 = fmul float %83, %83
    %85 = insertelement <4 x float> undef, float %75, i32 0
    %86 = insertelement <4 x float> %85, float %78, i32 1
    %87 = insertelement <4 x float> %86, float %80, i32 2
    %88 = insertelement <4 x float> %87, float %84, i32 3
    %89 = fdiv <4 x float> %37, %88
    %90 = insertelement <4 x float> undef, float %76, i32 0
    %91 = insertelement <4 x float> %90, float %79, i32 1
    %92 = insertelement <4 x float> %91, float %81, i32 2
    %93 = insertelement <4 x float> %92, float %82, i32 3
    %94 = fadd <4 x float> %93, %89
    %95 = fmul <4 x float> %94, <float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01>
    %96 = bitcast i8* %77 to <4 x float>*
    store <4 x float> %95, <4 x float>* %96, align 8, !alias.scope !8, !noalias !12
    %97 = getelementptr inbounds i8*, i8** %buffer_table, i64 3
    %98 = bitcast i8** %97 to [1 x i8*]**
    %99 = load [1 x i8*]*, [1 x i8*]** %98, align 8, !invariant.load !0, !dereferenceable !2, !align !2
    %100 = getelementptr inbounds [1 x i8*], [1 x i8*]* %99, i64 0, i64 0
    store i8* %26, i8** %100, align 8, !alias.scope !14, !noalias !8
    ret void
  }
  
  ; Function Attrs: nounwind readnone speculatable willreturn
  declare <4 x float> @llvm.log.v4f32(<4 x float>) #1
  
  attributes #0 = { nofree nounwind uwtable "no-frame-pointer-elim"="false" }
  attributes #1 = { nounwind readnone speculatable willreturn }
  
  !0 = !{}
  !1 = !{i64 32}
  !2 = !{i64 8}
  !3 = !{!4, !6}
  !4 = !{!"buffer: {index:0, offset:0, size:96}", !5}
  !5 = !{!"XLA global AA domain"}
  !6 = !{!"buffer: {index:5, offset:0, size:32}", !5}
  !7 = !{!6}
  !8 = !{!4}
  !9 = !{i64 4}
  !10 = !{i64 12}
  !11 = !{i64 96}
  !12 = !{!13, !6}
  !13 = !{!"buffer: {index:3, offset:0, size:8}", !5}
  !14 = !{!13}

This results in bad numerical answers when used through XLA.

The miscompile is only present at -O3.


Repository:
  rG LLVM Github Monorepo

https://reviews.llvm.org/D70516

Files:
  llvm/lib/Analysis/MemoryDependenceAnalysis.cpp


Index: llvm/lib/Analysis/MemoryDependenceAnalysis.cpp
===================================================================
--- llvm/lib/Analysis/MemoryDependenceAnalysis.cpp
+++ llvm/lib/Analysis/MemoryDependenceAnalysis.cpp
@@ -979,11 +979,6 @@
     Instruction *QueryInst, const MemoryLocation &Loc, bool isLoad,
     BasicBlock *BB, NonLocalDepInfo *Cache, unsigned NumSortedEntries) {
 
-  bool isInvariantLoad = false;
-
-  if (LoadInst *LI = dyn_cast_or_null<LoadInst>(QueryInst))
-    isInvariantLoad = LI->getMetadata(LLVMContext::MD_invariant_load);
-
   // Do a binary search to see if we already have an entry for this block in
   // the cache set.  If so, find it.
   NonLocalDepInfo::iterator Entry = std::upper_bound(
@@ -995,13 +990,6 @@
   if (Entry != Cache->begin() + NumSortedEntries && Entry->getBB() == BB)
     ExistingResult = &*Entry;
 
-  // Use cached result for invariant load only if there is no dependency for non
-  // invariant load. In this case invariant load can not have any dependency as
-  // well.
-  if (ExistingResult && isInvariantLoad &&
-      !ExistingResult->getResult().isNonFuncLocal())
-    ExistingResult = nullptr;
-
   // If we have a cached entry, and it is non-dirty, use it as the value for
   // this dependency.
   if (ExistingResult && !ExistingResult->getResult().isDirty()) {
@@ -1030,10 +1018,6 @@
   MemDepResult Dep =
       getPointerDependencyFrom(Loc, isLoad, ScanPos, BB, QueryInst);
 
-  // Don't cache results for invariant load.
-  if (isInvariantLoad)
-    return Dep;
-
   // If we had a dirty entry for the block, update it.  Otherwise, just add
   // a new entry.
   if (ExistingResult)
@@ -1470,6 +1454,7 @@
     if (SkipFirstBlock)
       return false;
 
+    bool foundBlock = false;
     for (NonLocalDepEntry &I : llvm::reverse(*Cache)) {
       if (I.getBB() != BB)
         continue;
@@ -1477,12 +1462,14 @@
       assert((GotWorklistLimit || I.getResult().isNonLocal() ||
               !DT.isReachableFromEntry(BB)) &&
              "Should only be here with transparent block");
+      foundBlock = true;
       I.setResult(MemDepResult::getUnknown());
+      Result.push_back(
+          NonLocalDepResult(I.getBB(), I.getResult(), Pointer.getAddr()));
       break;
     }
-    // Go ahead and report unknown dependence.
-    Result.push_back(
-        NonLocalDepResult(BB, MemDepResult::getUnknown(), Pointer.getAddr()));
+    (void)foundBlock; (void)GotWorklistLimit;
+    assert((foundBlock || GotWorklistLimit) && "Current block not in cache?");
   }
 
   // Okay, we're done now.  If we added new values to the cache, re-sort it.


-------------- next part --------------
A non-text attachment was scrubbed...
Name: D70516.230344.patch
Type: text/x-patch
Size: 2620 bytes
Desc: not available
URL: <http://lists.llvm.org/pipermail/llvm-commits/attachments/20191120/a7b4f9d8/attachment.bin>


More information about the llvm-commits mailing list