[PATCH] D97667: [loop-idiom] Hoist loop memcpys to loop preheader

Wed Apr 28 11:45:22 PDT 2021

zhuhan0 added a comment.

@tpopp I cannot reproduce your test failure with either `opt -O2` or `opt -O3`. My patch only affects memcpy intrinsics in a loop body, so running your test case should not hit my code at all. Here is the output of `opt -O3`:

  ; ModuleID = 'reverse_4d_float_array.ll'
  source_filename = "__compute_module"
  target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
  target triple = "x86_64-grtev4-linux-gnu"

  @0 = external dso_local unnamed_addr constant [96 x i8], align 16

  ; Function Attrs: nofree norecurse nosync nounwind uwtable
  define void @Reverse4DFloatArrayOnDim01.3(i8* nocapture readnone %retval, i8* noalias nocapture readnone %run_options, i8** noalias nocapture readnone %params, i8** noalias nocapture readonly %buffer_table, i64* noalias nocapture readnone %prof_counters) local_unnamed_addr #0 {
  entry:
    %0 = bitcast i8** %buffer_table to [4 x [3 x [2 x [1 x float]]]]**
    %1 = load [4 x [3 x [2 x [1 x float]]]]*, [4 x [3 x [2 x [1 x float]]]]** %0, align 8, !invariant.load !0, !dereferenceable !1, !align !2
    %2 = load float, float* bitcast (i8* getelementptr inbounds ([96 x i8], [96 x i8]* @0, i64 0, i64 88) to float*), align 8, !alias.scope !3, !noalias !6
    %3 = getelementptr inbounds [4 x [3 x [2 x [1 x float]]]], [4 x [3 x [2 x [1 x float]]]]* %1, i64 0, i64 0, i64 0, i64 0, i64 0
    store float %2, float* %3, align 16, !alias.scope !6, !noalias !3
    %4 = load float, float* bitcast (i8* getelementptr inbounds ([96 x i8], [96 x i8]* @0, i64 0, i64 92) to float*), align 4, !alias.scope !3, !noalias !6
    %5 = getelementptr inbounds [4 x [3 x [2 x [1 x float]]]], [4 x [3 x [2 x [1 x float]]]]* %1, i64 0, i64 0, i64 0, i64 1, i64 0
    store float %4, float* %5, align 4, !alias.scope !6, !noalias !3
    %6 = load float, float* bitcast (i8* getelementptr inbounds ([96 x i8], [96 x i8]* @0, i64 0, i64 80) to float*), align 16, !alias.scope !3, !noalias !6
    %7 = getelementptr inbounds [4 x [3 x [2 x [1 x float]]]], [4 x [3 x [2 x [1 x float]]]]* %1, i64 0, i64 0, i64 1, i64 0, i64 0
    store float %6, float* %7, align 8, !alias.scope !6, !noalias !3
    %8 = load float, float* bitcast (i8* getelementptr inbounds ([96 x i8], [96 x i8]* @0, i64 0, i64 84) to float*), align 4, !alias.scope !3, !noalias !6
    %9 = getelementptr inbounds [4 x [3 x [2 x [1 x float]]]], [4 x [3 x [2 x [1 x float]]]]* %1, i64 0, i64 0, i64 1, i64 1, i64 0
    store float %8, float* %9, align 4, !alias.scope !6, !noalias !3
    %10 = load float, float* bitcast (i8* getelementptr inbounds ([96 x i8], [96 x i8]* @0, i64 0, i64 72) to float*), align 8, !alias.scope !3, !noalias !6
    %11 = getelementptr inbounds [4 x [3 x [2 x [1 x float]]]], [4 x [3 x [2 x [1 x float]]]]* %1, i64 0, i64 0, i64 2, i64 0, i64 0
    store float %10, float* %11, align 16, !alias.scope !6, !noalias !3
    %12 = load float, float* bitcast (i8* getelementptr inbounds ([96 x i8], [96 x i8]* @0, i64 0, i64 76) to float*), align 4, !alias.scope !3, !noalias !6
    %13 = getelementptr inbounds [4 x [3 x [2 x [1 x float]]]], [4 x [3 x [2 x [1 x float]]]]* %1, i64 0, i64 0, i64 2, i64 1, i64 0
    store float %12, float* %13, align 4, !alias.scope !6, !noalias !3
    %14 = load float, float* bitcast (i8* getelementptr inbounds ([96 x i8], [96 x i8]* @0, i64 0, i64 64) to float*), align 16, !alias.scope !3, !noalias !6
    %15 = getelementptr inbounds [4 x [3 x [2 x [1 x float]]]], [4 x [3 x [2 x [1 x float]]]]* %1, i64 0, i64 1, i64 0, i64 0, i64 0
    store float %14, float* %15, align 8, !alias.scope !6, !noalias !3
    %16 = load float, float* bitcast (i8* getelementptr inbounds ([96 x i8], [96 x i8]* @0, i64 0, i64 68) to float*), align 4, !alias.scope !3, !noalias !6
    %17 = getelementptr inbounds [4 x [3 x [2 x [1 x float]]]], [4 x [3 x [2 x [1 x float]]]]* %1, i64 0, i64 1, i64 0, i64 1, i64 0
    store float %16, float* %17, align 4, !alias.scope !6, !noalias !3
    %18 = load float, float* bitcast (i8* getelementptr inbounds ([96 x i8], [96 x i8]* @0, i64 0, i64 56) to float*), align 8, !alias.scope !3, !noalias !6
    %19 = getelementptr inbounds [4 x [3 x [2 x [1 x float]]]], [4 x [3 x [2 x [1 x float]]]]* %1, i64 0, i64 1, i64 1, i64 0, i64 0
    store float %18, float* %19, align 16, !alias.scope !6, !noalias !3
    %20 = load float, float* bitcast (i8* getelementptr inbounds ([96 x i8], [96 x i8]* @0, i64 0, i64 60) to float*), align 4, !alias.scope !3, !noalias !6
    %21 = getelementptr inbounds [4 x [3 x [2 x [1 x float]]]], [4 x [3 x [2 x [1 x float]]]]* %1, i64 0, i64 1, i64 1, i64 1, i64 0
    store float %20, float* %21, align 4, !alias.scope !6, !noalias !3
    %22 = load float, float* bitcast (i8* getelementptr inbounds ([96 x i8], [96 x i8]* @0, i64 0, i64 48) to float*), align 16, !alias.scope !3, !noalias !6
    %23 = getelementptr inbounds [4 x [3 x [2 x [1 x float]]]], [4 x [3 x [2 x [1 x float]]]]* %1, i64 0, i64 1, i64 2, i64 0, i64 0
    store float %22, float* %23, align 8, !alias.scope !6, !noalias !3
    %24 = load float, float* bitcast (i8* getelementptr inbounds ([96 x i8], [96 x i8]* @0, i64 0, i64 52) to float*), align 4, !alias.scope !3, !noalias !6
    %25 = getelementptr inbounds [4 x [3 x [2 x [1 x float]]]], [4 x [3 x [2 x [1 x float]]]]* %1, i64 0, i64 1, i64 2, i64 1, i64 0
    store float %24, float* %25, align 4, !alias.scope !6, !noalias !3
    %26 = getelementptr inbounds [4 x [3 x [2 x [1 x float]]]], [4 x [3 x [2 x [1 x float]]]]* %1, i64 0, i64 2, i64 0, i64 0, i64 0
    %27 = load <4 x float>, <4 x float>* bitcast (i8* getelementptr inbounds ([96 x i8], [96 x i8]* @0, i64 0, i64 32) to <4 x float>*), align 16, !alias.scope !3, !noalias !6
    %shuffle = shufflevector <4 x float> %27, <4 x float> poison, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
    %28 = bitcast float* %26 to <4 x float>*
    store <4 x float> %shuffle, <4 x float>* %28, align 16, !alias.scope !6, !noalias !3
    %29 = getelementptr inbounds [4 x [3 x [2 x [1 x float]]]], [4 x [3 x [2 x [1 x float]]]]* %1, i64 0, i64 2, i64 2, i64 0, i64 0
    %30 = load <4 x float>, <4 x float>* bitcast (i8* getelementptr inbounds ([96 x i8], [96 x i8]* @0, i64 0, i64 16) to <4 x float>*), align 16, !alias.scope !3, !noalias !6
    %shuffle7 = shufflevector <4 x float> %30, <4 x float> poison, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
    %31 = bitcast float* %29 to <4 x float>*
    store <4 x float> %shuffle7, <4 x float>* %31, align 16, !alias.scope !6, !noalias !3
    %32 = getelementptr inbounds [4 x [3 x [2 x [1 x float]]]], [4 x [3 x [2 x [1 x float]]]]* %1, i64 0, i64 3, i64 1, i64 0, i64 0
    %33 = load <4 x float>, <4 x float>* bitcast ([96 x i8]* @0 to <4 x float>*), align 16, !alias.scope !3, !noalias !6
    %shuffle8 = shufflevector <4 x float> %33, <4 x float> poison, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
    %34 = bitcast float* %32 to <4 x float>*
    store <4 x float> %shuffle8, <4 x float>* %34, align 16, !alias.scope !6, !noalias !3
    ret void
  }

  attributes #0 = { nofree norecurse nosync nounwind uwtable "denormal-fp-math"="preserve-sign" "no-frame-pointer-elim"="false" }

  !0 = !{}
  !1 = !{i64 96}
  !2 = !{i64 16}
  !3 = !{!4}
  !4 = !{!"buffer: {index:1, offset:0, size:96}", !5}
  !5 = !{!"XLA global AA domain"}
  !6 = !{!7}
  !7 = !{!"buffer: {index:0, offset:0, size:96}", !5}

Are you using different compiler arguments to trigger this test failure? Or is this not an LLVM test case at all?

Repository:
  rG LLVM Github Monorepo

CHANGES SINCE LAST ACTION
  https://reviews.llvm.org/D97667/new/

https://reviews.llvm.org/D97667