[PATCH] D73756: Revert of [InstCombine] Revert rL341831: relax one-use check in foldICmpAddConstant() (PR44100)
George Karpenkov via Phabricator via llvm-commits
llvm-commits at lists.llvm.org
Thu Jan 30 17:37:38 PST 2020
george.karpenkov created this revision.
Herald added subscribers: llvm-commits, sanjoy.google, Charusso, jfb, javed.absar, zzheng, hiraditya.
Herald added a project: LLVM.
Original commit message:
> rL341831 <https://reviews.llvm.org/rL341831> moved one-use check higher up, restricting a few folds
> that produced a single instruction from two instructions to the case
> where the inner instruction would go away.
>
> Original commit message:
>
>> InstCombine: move hasOneUse check to the top of foldICmpAddConstant
>>
>> There were two combines not covered by the check before now,
>> neither of which actually differed from normal in the benefit analysis.
>>
>> The most recent seems to be because it was just added at the top of the
>> function (naturally). The older is from way back in 2008 (r46687)
>> when we just didn't put those checks in so routinely, and has been
>> diligently maintained since.
>
> From the commit message alone, there doesn't seem to be a
> deeper motivation, deeper problem that was trying to solve,
> other than 'fixing the wrong one-use check'.
>
> As i have briefly discussed in IRC with Tim, the original motivation
> can no longer be recovered, too much time has passed.
>
> However i believe that the original fold was doing the right thing,
> we should be performing such a transformation even if the inner `add`
> will not go away - that will still unchain the comparison from `add`,
> it will no longer need to wait for `add` to compute.
>
> Doing so doesn't seem to break any particular idioms,
> at least as far as i can see.
>
> References https://bugs.llvm.org/show_bug.cgi?id=44100
The reverted commit causes OOB accesses in GPU code (for NVIDIA) compiled with LLVM.
For the input:
; ModuleID = 'ReduceR2_1000x1500_To_R1.8'
source_filename = "ReduceR2_1000x1500_To_R1.8"
target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
target triple = "nvptx64-nvidia-cuda"
@buffer_for_constant_2 = constant [4 x i8] zeroinitializer, align 64
define void @reduce_7(i8* align 16 dereferenceable(1164) %alloc0, i8* align 64 dereferenceable(12) %alloc1) {
entry:
%output_y_in_tile.invar_address = alloca i32
%input.1.raw = getelementptr inbounds i8, i8* %alloc0, i64 0
%input.1.typed = bitcast i8* %input.1.raw to [97 x [3 x float]]*
%reduce.7.raw = getelementptr inbounds i8, i8* %alloc1, i64 0
%reduce.7.typed = bitcast i8* %reduce.7.raw to [3 x float]*
%0 = alloca float
%partial_reduction_result.0 = alloca float
%1 = load float, float* bitcast ([4 x i8]* @buffer_for_constant_2 to float*), !invariant.load !2, !alias.scope !3, !noalias !6
%2 = getelementptr inbounds float, float* %partial_reduction_result.0, i32 0
store float %1, float* %2
%3 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !8
%thread_id.x = urem i32 %3, 32
%thread_id.y = udiv i32 %3, 32
%lane_id = urem i32 %3, 32
%4 = call i32 @llvm.nvvm.read.ptx.sreg.ctaid.x(), !range !9
%5 = udiv i32 %4, 1
%6 = urem i32 %5, 1
%7 = udiv i32 %4, 1
%8 = urem i32 %7, 1
%9 = udiv i32 %4, 1
%block_origin.z = mul i32 %9, 1
%10 = icmp eq i32 %8, 0
%tile_bound = select i1 %10, i32 97, i32 4096
%11 = icmp eq i32 %6, 0
%tile_bound1 = select i1 %11, i32 3, i32 32
%tile_origin.1 = mul i32 %8, 4096
%tile_origin.2 = mul i32 %6, 32
%12 = add i32 %tile_origin.2, %thread_id.x
store i32 %thread_id.y, i32* %output_y_in_tile.invar_address
br label %output_y_in_tile.loop_header
output_y_in_tile.loop_header: ; preds = %output_x_in_tile-after, %entry
%output_y_in_tile.indvar = load i32, i32* %output_y_in_tile.invar_address
%13 = icmp uge i32 %output_y_in_tile.indvar, %tile_bound
br i1 %13, label %output_y_in_tile.loop_exit, label %output_y_in_tile.loop_body
output_y_in_tile.loop_body: ; preds = %output_y_in_tile.loop_header
%invar.inc = add nuw nsw i32 %output_y_in_tile.indvar, 32
store i32 %invar.inc, i32* %output_y_in_tile.invar_address
%14 = icmp eq i32 %output_y_in_tile.indvar, %thread_id.y
%15 = add i32 %tile_origin.1, %output_y_in_tile.indvar
%x_loc = add i32 0, %thread_id.x
%16 = add i32 %12, 0
%17 = icmp ult i32 %x_loc, %tile_bound1
br i1 %17, label %output_x_in_tile-true, label %output_x_in_tile-false
output_x_in_tile-after: ; preds = %output_x_in_tile-false, %output_x_in_tile-true
br label %output_y_in_tile.loop_header, !llvm.loop !10
output_y_in_tile.loop_exit: ; preds = %output_y_in_tile.loop_header
%18 = call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !8
%thread_id.x2 = urem i32 %18, 32
%thread_id.y3 = udiv i32 %18, 32
%lane_id4 = urem i32 %18, 32
%19 = add i32 %tile_origin.1, %thread_id.y3
%20 = add i32 %tile_origin.2, %thread_id.x2
%21 = icmp ult i32 %thread_id.x2, %tile_bound1
%22 = icmp ult i32 %thread_id.y3, %tile_bound
%23 = and i1 %21, %22
br i1 %23, label %output_inbound-true, label %output_inbound-false
output_inbound-after: ; preds = %output_inbound-false, %output_inbound-true
ret void
output_x_in_tile-true: ; preds = %output_y_in_tile.loop_body
%24 = mul nuw nsw i32 %16, 1
%25 = add nuw nsw i32 0, %24
%26 = mul nuw nsw i32 %15, 3
%27 = add nuw nsw i32 %25, %26
%28 = mul nuw nsw i32 %block_origin.z, 291
%29 = add nuw nsw i32 %27, %28
%30 = udiv i32 %29, 1
%31 = urem i32 %30, 3
%32 = udiv i32 %29, 3
%33 = bitcast [97 x [3 x float]]* %input.1.typed to float*
%34 = getelementptr inbounds float, float* %33, i32 %29
%35 = load float, float* %34, !invariant.load !2, !noalias !12
store float %35, float* %0
%36 = getelementptr inbounds float, float* %partial_reduction_result.0, i32 0
call void @add_F32_3(float* %36, float* %0, float* %36, i8* null)
%37 = load float, float* %36
br label %output_x_in_tile-after
output_x_in_tile-false: ; preds = %output_y_in_tile.loop_body
br label %output_x_in_tile-after
output_inbound-true: ; preds = %output_y_in_tile.loop_exit
%38 = add i32 %20, 0
%39 = mul i32 %block_origin.z, 3
%40 = add i32 %39, %38
%41 = udiv i32 %40, 1
%output_element_address = getelementptr inbounds [3 x float], [3 x float]* %reduce.7.typed, i32 0, i32 %41
%42 = getelementptr inbounds float, float* %partial_reduction_result.0, i32 0
%source = load float, float* %42
%43 = atomicrmw fadd float* %output_element_address, float %source seq_cst
br label %output_inbound-after
output_inbound-false: ; preds = %output_y_in_tile.loop_exit
br label %output_inbound-after
}
; Function Attrs: nounwind readnone
declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() #0
; Function Attrs: nounwind readnone
declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #0
define internal void @add_F32_3(float* dereferenceable(4) %lhs.4.typed, float* dereferenceable(4) %rhs.5.typed, float* dereferenceable(4) %output_arg, i8* %temp_buffer) {
entry:
%add.6.typed = alloca float
%0 = load float, float* %lhs.4.typed, !alias.scope !13, !noalias !15
%1 = load float, float* %rhs.5.typed, !alias.scope !17, !noalias !15
%2 = fadd float %0, %1
store float %2, float* %add.6.typed, !alias.scope !15
%load_ret_value = load float, float* %add.6.typed
store float %load_ret_value, float* %output_arg
ret void
}
attributes #0 = { nounwind readnone }
!nvvm.annotations = !{!0, !1}
!0 = !{void (i8*, i8*)* @reduce_7, !"kernel", i32 1}
!1 = !{void (i8*, i8*)* @reduce_7, !"reqntidx", i32 1024}
!2 = !{}
!3 = !{!4}
!4 = !{!"buffer: {index:2, offset:0, size:4}", !5}
!5 = !{!"XLA global AA domain"}
!6 = !{!7}
!7 = !{!"buffer: {index:1, offset:0, size:12}", !5}
!8 = !{i32 0, i32 1024}
!9 = !{i32 0, i32 1}
!10 = distinct !{!10, !11}
!11 = !{!"llvm.loop.vectorize.enable", i1 false}
!12 = !{!7, !4}
!13 = !{!14}
!14 = !{!"buffer: {index:3, offset:0, size:4}", !5}
!15 = !{!16}
!16 = !{!"buffer: {index:5, offset:0, size:4}", !5}
!17 = !{!18}
!18 = !{!"buffer: {index:4, offset:0, size:4}", !5}
with the revert (good) I get:
; ModuleID = 'ReduceR2_1000x1500_To_R1.8'
source_filename = "ReduceR2_1000x1500_To_R1.8"
target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
target triple = "nvptx64-nvidia-cuda"
@buffer_for_constant_2 = local_unnamed_addr addrspace(1) constant [4 x i8] zeroinitializer, align 64
; Function Attrs: nofree nounwind
define void @reduce_7(i8* nocapture readonly align 16 dereferenceable(1164) %alloc0, i8* nocapture align 64 dereferenceable(12) %alloc1) local_unnamed_addr #0 {
output_y_in_tile.loop_body.lr.ph:
%alloc111 = addrspacecast i8* %alloc1 to i8 addrspace(1)*
%alloc09 = addrspacecast i8* %alloc0 to i8 addrspace(1)*
%0 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !3
%thread_id.x = and i32 %0, 31
%1 = icmp ult i32 %thread_id.x, 3
br i1 %1, label %output_y_in_tile.loop_body.us.preheader, label %output_inbound-after
output_y_in_tile.loop_body.us.preheader: ; preds = %output_y_in_tile.loop_body.lr.ph
%thread_id.y = lshr i32 %0, 5
%2 = mul nuw nsw i32 %thread_id.y, 3
%3 = add i32 %2, %thread_id.x
%4 = zext i32 %3 to i64
%5 = shl nuw nsw i64 %4, 2
%scevgep = getelementptr i8, i8 addrspace(1)* %alloc09, i64 %5
br label %output_y_in_tile.loop_body.us
output_y_in_tile.loop_body.us: ; preds = %output_y_in_tile.loop_body.us.preheader, %output_y_in_tile.loop_body.us
%lsr.iv = phi i8 addrspace(1)* [ %scevgep, %output_y_in_tile.loop_body.us.preheader ], [ %scevgep13, %output_y_in_tile.loop_body.us ]
%output_y_in_tile.invar_address.06.us = phi i32 [ %invar.inc.us, %output_y_in_tile.loop_body.us ], [ %thread_id.y, %output_y_in_tile.loop_body.us.preheader ]
%partial_reduction_result.0.05.us = phi float [ %7, %output_y_in_tile.loop_body.us ], [ 0.000000e+00, %output_y_in_tile.loop_body.us.preheader ]
%lsr.iv14 = bitcast i8 addrspace(1)* %lsr.iv to float addrspace(1)*
%invar.inc.us = add nuw nsw i32 %output_y_in_tile.invar_address.06.us, 32
%6 = load float, float addrspace(1)* %lsr.iv14, align 4, !invariant.load !4, !noalias !5
%7 = fadd float %partial_reduction_result.0.05.us, %6
%scevgep13 = getelementptr i8, i8 addrspace(1)* %lsr.iv, i64 384
%8 = icmp ugt i32 %invar.inc.us, 96
br i1 %8, label %output_y_in_tile.loop_exit, label %output_y_in_tile.loop_body.us, !llvm.loop !9
output_y_in_tile.loop_exit: ; preds = %output_y_in_tile.loop_body.us
%9 = bitcast i8 addrspace(1)* %alloc111 to [3 x float] addrspace(1)*
%10 = zext i32 %thread_id.x to i64
%output_element_address = getelementptr inbounds [3 x float], [3 x float] addrspace(1)* %9, i64 0, i64 %10
%11 = atomicrmw fadd float addrspace(1)* %output_element_address, float %7 seq_cst
br label %output_inbound-after
output_inbound-after: ; preds = %output_y_in_tile.loop_body.lr.ph, %output_y_in_tile.loop_exit
ret void
}
; Function Attrs: nounwind readnone
declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1
; Function Attrs: nounwind
declare void @llvm.stackprotector(i8*, i8**) #2
attributes #0 = { nofree nounwind }
attributes #1 = { nounwind readnone }
attributes #2 = { nounwind }
!nvvm.annotations = !{!0, !1}
!llvm.module.flags = !{!2}
!0 = !{void (i8*, i8*)* @reduce_7, !"kernel", i32 1}
!1 = !{void (i8*, i8*)* @reduce_7, !"reqntidx", i32 1024}
!2 = !{i32 4, !"nvvm-reflect-ftz", i32 0}
!3 = !{i32 0, i32 1024}
!4 = !{}
!5 = !{!6, !8}
!6 = !{!"buffer: {index:1, offset:0, size:12}", !7}
!7 = !{!"XLA global AA domain"}
!8 = !{!"buffer: {index:2, offset:0, size:4}", !7}
!9 = distinct !{!9, !10}
!10 = !{!"llvm.loop.vectorize.enable", i1 false}
and without the revert (bad) I get:
; ModuleID = 'ReduceR2_1000x1500_To_R1.8'
source_filename = "ReduceR2_1000x1500_To_R1.8"
target datalayout = "e-i64:64-i128:128-v16:16-v32:32-n16:32:64"
target triple = "nvptx64-nvidia-cuda"
@buffer_for_constant_2 = local_unnamed_addr addrspace(1) constant [4 x i8] zeroinitializer, align 64
; Function Attrs: nofree nounwind
define void @reduce_7(i8* nocapture readonly align 16 dereferenceable(1164) %alloc0, i8* nocapture align 64 dereferenceable(12) %alloc1) local_unnamed_addr #0 {
output_y_in_tile.loop_body.lr.ph:
%alloc111 = addrspacecast i8* %alloc1 to i8 addrspace(1)*
%alloc09 = addrspacecast i8* %alloc0 to i8 addrspace(1)*
%0 = tail call i32 @llvm.nvvm.read.ptx.sreg.tid.x(), !range !3
%thread_id.x = and i32 %0, 31
%1 = icmp ult i32 %thread_id.x, 3
br i1 %1, label %output_y_in_tile.loop_body.us.preheader, label %output_inbound-after
output_y_in_tile.loop_body.us.preheader: ; preds = %output_y_in_tile.loop_body.lr.ph
%thread_id.y = lshr i32 %0, 5
%2 = add nuw nsw i32 %thread_id.y, -32
%3 = mul nuw nsw i32 %thread_id.y, 3
%4 = add i32 %3, %thread_id.x
%5 = zext i32 %4 to i64
%6 = shl nuw nsw i64 %5, 2
%scevgep = getelementptr i8, i8 addrspace(1)* %alloc09, i64 %6
br label %output_y_in_tile.loop_body.us
output_y_in_tile.loop_body.us: ; preds = %output_y_in_tile.loop_body.us.preheader, %output_y_in_tile.loop_body.us
%lsr.iv13 = phi i8 addrspace(1)* [ %scevgep, %output_y_in_tile.loop_body.us.preheader ], [ %scevgep14, %output_y_in_tile.loop_body.us ]
%lsr.iv = phi i32 [ %2, %output_y_in_tile.loop_body.us.preheader ], [ %lsr.iv.next, %output_y_in_tile.loop_body.us ]
%partial_reduction_result.0.05.us = phi float [ %8, %output_y_in_tile.loop_body.us ], [ 0.000000e+00, %output_y_in_tile.loop_body.us.preheader ]
%lsr.iv1315 = bitcast i8 addrspace(1)* %lsr.iv13 to float addrspace(1)*
%7 = load float, float addrspace(1)* %lsr.iv1315, align 4, !invariant.load !4, !noalias !5
%8 = fadd float %partial_reduction_result.0.05.us, %7
%lsr.iv.next = add nsw i32 %lsr.iv, 32
%scevgep14 = getelementptr i8, i8 addrspace(1)* %lsr.iv13, i64 384
%9 = icmp ugt i32 %lsr.iv.next, 64
br i1 %9, label %output_y_in_tile.loop_exit, label %output_y_in_tile.loop_body.us, !llvm.loop !9
output_y_in_tile.loop_exit: ; preds = %output_y_in_tile.loop_body.us
%10 = bitcast i8 addrspace(1)* %alloc111 to [3 x float] addrspace(1)*
%11 = zext i32 %thread_id.x to i64
%output_element_address = getelementptr inbounds [3 x float], [3 x float] addrspace(1)* %10, i64 0, i64 %11
%12 = atomicrmw fadd float addrspace(1)* %output_element_address, float %8 seq_cst
br label %output_inbound-after
output_inbound-after: ; preds = %output_y_in_tile.loop_body.lr.ph, %output_y_in_tile.loop_exit
ret void
}
; Function Attrs: nounwind readnone
declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1
; Function Attrs: nounwind
declare void @llvm.stackprotector(i8*, i8**) #2
attributes #0 = { nofree nounwind }
attributes #1 = { nounwind readnone }
attributes #2 = { nounwind }
!nvvm.annotations = !{!0, !1}
!llvm.module.flags = !{!2}
!0 = !{void (i8*, i8*)* @reduce_7, !"kernel", i32 1}
!1 = !{void (i8*, i8*)* @reduce_7, !"reqntidx", i32 1024}
!2 = !{i32 4, !"nvvm-reflect-ftz", i32 0}
!3 = !{i32 0, i32 1024}
!4 = !{}
!5 = !{!6, !8}
!6 = !{!"buffer: {index:1, offset:0, size:12}", !7}
!7 = !{!"XLA global AA domain"}
!8 = !{!"buffer: {index:2, offset:0, size:4}", !7}
!9 = distinct !{!9, !10}
!10 = !{!"llvm.loop.vectorize.enable", i1 false}
https://reviews.llvm.org/D73756
Files:
llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
llvm/test/Transforms/InstCombine/icmp-add.ll
llvm/test/Transforms/LoopUnroll/runtime-loop-multiple-exits.ll
llvm/test/Transforms/LoopVectorize/if-conversion-nest.ll
llvm/test/Transforms/LoopVectorize/runtime-check.ll
-------------- next part --------------
A non-text attachment was scrubbed...
Name: D73756.241616.patch
Type: text/x-patch
Size: 14623 bytes
Desc: not available
URL: <http://lists.llvm.org/pipermail/llvm-commits/attachments/20200131/8e9ab68c/attachment.bin>
More information about the llvm-commits
mailing list