[PATCH] D73756: Revert of [InstCombine] Revert rL341831: relax one-use check in foldICmpAddCons… …tant() (PR44100)

Thu Jan 30 17:37:38 PST 2020

george.karpenkov created this revision.
Herald added subscribers: llvm-commits, sanjoy.google, Charusso, jfb, javed.absar, zzheng, hiraditya.
Herald added a project: LLVM.

Original commit message:

> rL341831 <https://reviews.llvm.org/rL341831> moved one-use check higher up, restricting a few folds
>  that produced a single instruction from two instructions to the case
>  where the inner instruction would go away.
> 
> Original commit message:
> 
>> InstCombine: move hasOneUse check to the top of foldICmpAddConstant
>> 
>> There were two combines not covered by the check before now,
>>  neither of which actually differed from normal in the benefit analysis.
>> 
>> The most recent seems to be because it was just added at the top of the
>>  function (naturally). The older is from way back in 2008 (r46687)
>>  when we just didn't put those checks in so routinely, and has been
>>  diligently maintained since.
> 
> From the commit message alone, there doesn't seem to be a
>  deeper motivation, deeper problem that was trying to solve,
>  other than 'fixing the wrong one-use check'.
> 
> As i have briefly discusses in IRC with Tim, the original motivation
>  can no longer be recovered, too much time has passed.
> 
> However i believe that the original fold was doing the right thing,
>  we should be performing such a transformation even if the inner `add`
>  will not go away - that will still unchain the comparison from `add`,
>  it will no longer need to wait for `add` to compute.
> 
> Doing so doesn't seem to break any particular idioms,
>  as least as far as i can see.
> 
> References https://bugs.llvm.org/show_bug.cgi?id=44100

The reverted causes OOB accesses in GPU code (for NVidia) compiled with LLVM.

For the input:

  ; ModuleID = 'ReduceR2_1000x1500_To_R1.8'
  source_filename = "ReduceR2_1000x1500_To_R1.8"
  target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
  target triple = "nvptx64-nvidia-cuda"

  @buffer_for_constant_2 = constant [4 x i8] zeroinitializer, align 64

  define void @reduce_7(i8* align 16 dereferenceable(1164) %alloc0, i8* align 64 dereferenceable(12) %alloc1) {
  entry:
    %output_y_in_tile.invar_address = alloca i32
    %input.1.raw = getelementptr inbounds i8, i8* %alloc0, i64 0
    %input.1.typed = bitcast i8* %input.1.raw to [97 x [3 x float]]*
    %reduce.7.raw = getelementptr inbounds i8, i8* %alloc1, i64 0
    %reduce.7.typed = bitcast i8* %reduce.7.raw to [3 x float]*
    %0 = alloca float
    %partial_reduction_result.0 = alloca float
    %1 = load float, float* bitcast ([4 x i8]* @buffer_for_constant_2 to float*), !invariant.load !2, !alias.scope !3, !noalias !6
    %2 = getelementptr inbounds float, float* %partial_reduction_result.0, i32 0
    store float %1, float* %2
    %3 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !8
    %thread_id.x = urem i32 %3, 32
    %thread_id.y = udiv i32 %3, 32
    %lane_id = urem i32 %3, 32
    %4 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range !9
    %5 = udiv i32 %4, 1
    %6 = urem i32 %5, 1
    %7 = udiv i32 %4, 1
    %8 = urem i32 %7, 1
    %9 = udiv i32 %4, 1
    %block_origin.z = mul i32 %9, 1
    %10 = icmp eq i32 %8, 0
    %tile_bound = select i1 %10, i32 97, i32 4096
    %11 = icmp eq i32 %6, 0
    %tile_bound1 = select i1 %11, i32 3, i32 32
    %tile_origin.1 = mul i32 %8, 4096
    %tile_origin.2 = mul i32 %6, 32
    %12 = add i32 %tile_origin.2, %thread_id.x
    store i32 %thread_id.y, i32* %output_y_in_tile.invar_address
    br label %output_y_in_tile.loop_header

  output_y_in_tile.loop_header:                     ; preds = %output_x_in_tile-after, %entry
    %output_y_in_tile.indvar = load i32, i32* %output_y_in_tile.invar_address
    %13 = icmp uge i32 %output_y_in_tile.indvar, %tile_bound
    br i1 %13, label %output_y_in_tile.loop_exit, label %output_y_in_tile.loop_body

  output_y_in_tile.loop_body:                       ; preds = %output_y_in_tile.loop_header
    %invar.inc = add nuw nsw i32 %output_y_in_tile.indvar, 32
    store i32 %invar.inc, i32* %output_y_in_tile.invar_address
    %14 = icmp eq i32 %output_y_in_tile.indvar, %thread_id.y
    %15 = add i32 %tile_origin.1, %output_y_in_tile.indvar
    %x_loc = add i32 0, %thread_id.x
    %16 = add i32 %12, 0
    %17 = icmp ult i32 %x_loc, %tile_bound1
    br i1 %17, label %output_x_in_tile-true, label %output_x_in_tile-false

  output_x_in_tile-after:                           ; preds = %output_x_in_tile-false, %output_x_in_tile-true
    br label %output_y_in_tile.loop_header, !llvm.loop !10

  output_y_in_tile.loop_exit:                       ; preds = %output_y_in_tile.loop_header
    %18 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !8
    %thread_id.x2 = urem i32 %18, 32
    %thread_id.y3 = udiv i32 %18, 32
    %lane_id4 = urem i32 %18, 32
    %19 = add i32 %tile_origin.1, %thread_id.y3
    %20 = add i32 %tile_origin.2, %thread_id.x2
    %21 = icmp ult i32 %thread_id.x2, %tile_bound1
    %22 = icmp ult i32 %thread_id.y3, %tile_bound
    %23 = and i1 %21, %22
    br i1 %23, label %output_inbound-true, label %output_inbound-false

  output_inbound-after:                             ; preds = %output_inbound-false, %output_inbound-true
    ret void

  output_x_in_tile-true:                            ; preds = %output_y_in_tile.loop_body
    %24 = mul nuw nsw i32 %16, 1
    %25 = add nuw nsw i32 0, %24
    %26 = mul nuw nsw i32 %15, 3
    %27 = add nuw nsw i32 %25, %26
    %28 = mul nuw nsw i32 %block_origin.z, 291
    %29 = add nuw nsw i32 %27, %28
    %30 = udiv i32 %29, 1
    %31 = urem i32 %30, 3
    %32 = udiv i32 %29, 3
    %33 = bitcast [97 x [3 x float]]* %input.1.typed to float*
    %34 = getelementptr inbounds float, float* %33, i32 %29
    %35 = load float, float* %34, !invariant.load !2, !noalias !12
    store float %35, float* %0
    %36 = getelementptr inbounds float, float* %partial_reduction_result.0, i32 0
    call void @add_F32_3(float* %36, float* %0, float* %36, i8* null)
    %37 = load float, float* %36
    br label %output_x_in_tile-after

  output_x_in_tile-false:                           ; preds = %output_y_in_tile.loop_body
    br label %output_x_in_tile-after

  output_inbound-true:                              ; preds = %output_y_in_tile.loop_exit
    %38 = add i32 %20, 0
    %39 = mul i32 %block_origin.z, 3
    %40 = add i32 %39, %38
    %41 = udiv i32 %40, 1
    %output_element_address = getelementptr inbounds [3 x float], [3 x float]* %reduce.7.typed, i32 0, i32 %41
    %42 = getelementptr inbounds float, float* %partial_reduction_result.0, i32 0
    %source = load float, float* %42
    %43 = atomicrmw fadd float* %output_element_address, float %source seq_cst
    br label %output_inbound-after

  output_inbound-false:                             ; preds = %output_y_in_tile.loop_exit
    br label %output_inbound-after
  }

  ; Function Attrs: nounwind readnone
  declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0

  ; Function Attrs: nounwind readnone
  declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #0

  define internal void @add_F32_3(float* dereferenceable(4) %lhs.4.typed, float* dereferenceable(4) %rhs.5.typed, float* dereferenceable(4) %output_arg, i8* %temp_buffer) {
  entry:
    %add.6.typed = alloca float
    %0 = load float, float* %lhs.4.typed, !alias.scope !13, !noalias !15
    %1 = load float, float* %rhs.5.typed, !alias.scope !17, !noalias !15
    %2 = fadd float %0, %1
    store float %2, float* %add.6.typed, !alias.scope !15
    %load_ret_value = load float, float* %add.6.typed
    store float %load_ret_value, float* %output_arg
    ret void
  }

  attributes #0 = { nounwind readnone }

  !nvvm.annotations = !{!0, !1}

  !0 = !{void (i8*, i8*)* @reduce_7, !"kernel", i32 1}
  !1 = !{void (i8*, i8*)* @reduce_7, !"reqntidx", i32 1024}
  !2 = !{}
  !3 = !{!4}
  !4 = !{!"buffer: {index:2, offset:0, size:4}", !5}
  !5 = !{!"XLA global AA domain"}
  !6 = !{!7}
  !7 = !{!"buffer: {index:1, offset:0, size:12}", !5}
  !8 = !{i32 0, i32 1024}
  !9 = !{i32 0, i32 1}
  !10 = distinct !{!10, !11}
  !11 = !{!"llvm.loop.vectorize.enable", i1 false}
  !12 = !{!7, !4}
  !13 = !{!14}
  !14 = !{!"buffer: {index:3, offset:0, size:4}", !5}
  !15 = !{!16}
  !16 = !{!"buffer: {index:5, offset:0, size:4}", !5}
  !17 = !{!18}
  !18 = !{!"buffer: {index:4, offset:0, size:4}", !5}

with the revert (good) I get:

  ; ModuleID = 'ReduceR2_1000x1500_To_R1.8'
  source_filename = "ReduceR2_1000x1500_To_R1.8"
  target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
  target triple = "nvptx64-nvidia-cuda"

  @buffer_for_constant_2 = local_unnamed_addr addrspace(1) constant [4 x i8] zeroinitializer, align 64

  ; Function Attrs: nofree nounwind
  define void @reduce_7(i8* nocapture readonly align 16 dereferenceable(1164) %alloc0, i8* nocapture align 64 dereferenceable(12) %alloc1) local_unnamed_addr #0 {
  output_y_in_tile.loop_body.lr.ph:
    %alloc111 = addrspacecast i8* %alloc1 to i8 addrspace(1)*
    %alloc09 = addrspacecast i8* %alloc0 to i8 addrspace(1)*
    %0 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !3
    %thread_id.x = and i32 %0, 31
    %1 = icmp ult i32 %thread_id.x, 3
    br i1 %1, label %output_y_in_tile.loop_body.us.preheader, label %output_inbound-after

  output_y_in_tile.loop_body.us.preheader:          ; preds = %output_y_in_tile.loop_body.lr.ph
    %thread_id.y = lshr i32 %0, 5
    %2 = mul nuw nsw i32 %thread_id.y, 3
    %3 = add i32 %2, %thread_id.x
    %4 = zext i32 %3 to i64
    %5 = shl nuw nsw i64 %4, 2
    %scevgep = getelementptr i8, i8 addrspace(1)* %alloc09, i64 %5
    br label %output_y_in_tile.loop_body.us

  output_y_in_tile.loop_body.us:                    ; preds = %output_y_in_tile.loop_body.us.preheader, %output_y_in_tile.loop_body.us
    %lsr.iv = phi i8 addrspace(1)* [ %scevgep, %output_y_in_tile.loop_body.us.preheader ], [ %scevgep13, %output_y_in_tile.loop_body.us ]
    %output_y_in_tile.invar_address.06.us = phi i32 [ %invar.inc.us, %output_y_in_tile.loop_body.us ], [ %thread_id.y, %output_y_in_tile.loop_body.us.preheader ]
    %partial_reduction_result.0.05.us = phi float [ %7, %output_y_in_tile.loop_body.us ], [ 0.000000e+00, %output_y_in_tile.loop_body.us.preheader ]
    %lsr.iv14 = bitcast i8 addrspace(1)* %lsr.iv to float addrspace(1)*
    %invar.inc.us = add nuw nsw i32 %output_y_in_tile.invar_address.06.us, 32
    %6 = load float, float addrspace(1)* %lsr.iv14, align 4, !invariant.load !4, !noalias !5
    %7 = fadd float %partial_reduction_result.0.05.us, %6
    %scevgep13 = getelementptr i8, i8 addrspace(1)* %lsr.iv, i64 384
    %8 = icmp ugt i32 %invar.inc.us, 96
    br i1 %8, label %output_y_in_tile.loop_exit, label %output_y_in_tile.loop_body.us, !llvm.loop !9

  output_y_in_tile.loop_exit:                       ; preds = %output_y_in_tile.loop_body.us
    %9 = bitcast i8 addrspace(1)* %alloc111 to [3 x float] addrspace(1)*
    %10 = zext i32 %thread_id.x to i64
    %output_element_address = getelementptr inbounds [3 x float], [3 x float] addrspace(1)* %9, i64 0, i64 %10
    %11 = atomicrmw fadd float addrspace(1)* %output_element_address, float %7 seq_cst
    br label %output_inbound-after

  output_inbound-after:                             ; preds = %output_y_in_tile.loop_body.lr.ph, %output_y_in_tile.loop_exit
    ret void
  }

  ; Function Attrs: nounwind readnone
  declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1

  ; Function Attrs: nounwind
  declare void @llvm.stackprotector(i8*, i8**) #2

  attributes #0 = { nofree nounwind }
  attributes #1 = { nounwind readnone }
  attributes #2 = { nounwind }

  !nvvm.annotations = !{!0, !1}
  !llvm.module.flags = !{!2}

  !0 = !{void (i8*, i8*)* @reduce_7, !"kernel", i32 1}
  !1 = !{void (i8*, i8*)* @reduce_7, !"reqntidx", i32 1024}
  !2 = !{i32 4, !"nvvm-reflect-ftz", i32 0}
  !3 = !{i32 0, i32 1024}
  !4 = !{}
  !5 = !{!6, !8}
  !6 = !{!"buffer: {index:1, offset:0, size:12}", !7}
  !7 = !{!"XLA global AA domain"}
  !8 = !{!"buffer: {index:2, offset:0, size:4}", !7}
  !9 = distinct !{!9, !10}
  !10 = !{!"llvm.loop.vectorize.enable", i1 false}

and without the revert (bad) I get:

  ; ModuleID = 'ReduceR2_1000x1500_To_R1.8'
  source_filename = "ReduceR2_1000x1500_To_R1.8"
  target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
  target triple = "nvptx64-nvidia-cuda"

  @buffer_for_constant_2 = local_unnamed_addr addrspace(1) constant [4 x i8] zeroinitializer, align 64

  ; Function Attrs: nofree nounwind
  define void @reduce_7(i8* nocapture readonly align 16 dereferenceable(1164) %alloc0, i8* nocapture align 64 dereferenceable(12) %alloc1) local_unnamed_addr #0 {
  output_y_in_tile.loop_body.lr.ph:
    %alloc111 = addrspacecast i8* %alloc1 to i8 addrspace(1)*
    %alloc09 = addrspacecast i8* %alloc0 to i8 addrspace(1)*
    %0 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !3
    %thread_id.x = and i32 %0, 31
    %1 = icmp ult i32 %thread_id.x, 3
    br i1 %1, label %output_y_in_tile.loop_body.us.preheader, label %output_inbound-after

  output_y_in_tile.loop_body.us.preheader:          ; preds = %output_y_in_tile.loop_body.lr.ph
    %thread_id.y = lshr i32 %0, 5
    %2 = add nuw nsw i32 %thread_id.y, -32
    %3 = mul nuw nsw i32 %thread_id.y, 3
    %4 = add i32 %3, %thread_id.x
    %5 = zext i32 %4 to i64
    %6 = shl nuw nsw i64 %5, 2
    %scevgep = getelementptr i8, i8 addrspace(1)* %alloc09, i64 %6
    br label %output_y_in_tile.loop_body.us

  output_y_in_tile.loop_body.us:                    ; preds = %output_y_in_tile.loop_body.us.preheader, %output_y_in_tile.loop_body.us
    %lsr.iv13 = phi i8 addrspace(1)* [ %scevgep, %output_y_in_tile.loop_body.us.preheader ], [ %scevgep14, %output_y_in_tile.loop_body.us ]
    %lsr.iv = phi i32 [ %2, %output_y_in_tile.loop_body.us.preheader ], [ %lsr.iv.next, %output_y_in_tile.loop_body.us ]
    %partial_reduction_result.0.05.us = phi float [ %8, %output_y_in_tile.loop_body.us ], [ 0.000000e+00, %output_y_in_tile.loop_body.us.preheader ]
    %lsr.iv1315 = bitcast i8 addrspace(1)* %lsr.iv13 to float addrspace(1)*
    %7 = load float, float addrspace(1)* %lsr.iv1315, align 4, !invariant.load !4, !noalias !5
    %8 = fadd float %partial_reduction_result.0.05.us, %7
    %lsr.iv.next = add nsw i32 %lsr.iv, 32
    %scevgep14 = getelementptr i8, i8 addrspace(1)* %lsr.iv13, i64 384
    %9 = icmp ugt i32 %lsr.iv.next, 64
    br i1 %9, label %output_y_in_tile.loop_exit, label %output_y_in_tile.loop_body.us, !llvm.loop !9

  output_y_in_tile.loop_exit:                       ; preds = %output_y_in_tile.loop_body.us
    %10 = bitcast i8 addrspace(1)* %alloc111 to [3 x float] addrspace(1)*
    %11 = zext i32 %thread_id.x to i64
    %output_element_address = getelementptr inbounds [3 x float], [3 x float] addrspace(1)* %10, i64 0, i64 %11
    %12 = atomicrmw fadd float addrspace(1)* %output_element_address, float %8 seq_cst
    br label %output_inbound-after

  output_inbound-after:                             ; preds = %output_y_in_tile.loop_body.lr.ph, %output_y_in_tile.loop_exit
    ret void
  }

  ; Function Attrs: nounwind readnone
  declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1

  ; Function Attrs: nounwind
  declare void @llvm.stackprotector(i8*, i8**) #2

  attributes #0 = { nofree nounwind }
  attributes #1 = { nounwind readnone }
  attributes #2 = { nounwind }

  !nvvm.annotations = !{!0, !1}
  !llvm.module.flags = !{!2}

  !0 = !{void (i8*, i8*)* @reduce_7, !"kernel", i32 1}
  !1 = !{void (i8*, i8*)* @reduce_7, !"reqntidx", i32 1024}
  !2 = !{i32 4, !"nvvm-reflect-ftz", i32 0}
  !3 = !{i32 0, i32 1024}
  !4 = !{}
  !5 = !{!6, !8}
  !6 = !{!"buffer: {index:1, offset:0, size:12}", !7}
  !7 = !{!"XLA global AA domain"}
  !8 = !{!"buffer: {index:2, offset:0, size:4}", !7}
  !9 = distinct !{!9, !10}
  !10 = !{!"llvm.loop.vectorize.enable", i1 false}

https://reviews.llvm.org/D73756

Files:
  llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
  llvm/test/Transforms/InstCombine/icmp-add.ll
  llvm/test/Transforms/LoopUnroll/runtime-loop-multiple-exits.ll
  llvm/test/Transforms/LoopVectorize/if-conversion-nest.ll
  llvm/test/Transforms/LoopVectorize/runtime-check.ll

-------------- next part --------------
A non-text attachment was scrubbed...
Name: D73756.241616.patch
Type: text/x-patch
Size: 14623 bytes
Desc: not available
URL: <http://lists.llvm.org/pipermail/llvm-commits/attachments/20200131/8e9ab68c/attachment.bin>