[LLVMdev] How to unroll reduction loop with caching accumulator on register?
Dmitry Mikushin
dmitry at kernelgen.org
Tue Mar 26 17:11:26 PDT 2013
Just for the record, here's what I was doing wrong:
!0 = metadata !{metadata !"output", null}
!1 = metadata !{metadata !"input1", null}
!2 = metadata !{metadata !"input2", null}
should be
!0 = metadata !{ }
!1 = metadata !{ metadata !"output", metadata !0 }
!2 = metadata !{ metadata !"input1", metadata !0 }
!3 = metadata !{ metadata !"input2", metadata !0 }
with the corresponding renaming of nodes.
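For clarity, here is a minimal sketch (using made-up pointer names %out.ptr, %in1.ptr, %in2.ptr, not taken from the module below) of how the accesses are then tagged against the nodes as numbered just above: the store carries the "output" leaf, each load carries the leaf of its own input array, and since all three leaves hang off the same root node !0, TBAA can conclude the store does not alias the loads.

store float 0.000000e+00, float* %out.ptr, align 4, !tbaa !1   ; "output" leaf
%a = load float* %in1.ptr, align 4, !tbaa !2                   ; "input1" leaf
%b = load float* %in2.ptr, align 4, !tbaa !3                   ; "input2" leaf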
With this metadata, opt -O3 successfully pulls the store out of the loop:
; ModuleID = 'check.ll'
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
target triple = "nvptx64-unknown-unknown"
@__kernelgen_version = constant [15 x i8] c"0.2/1654:1675M\00"
define ptx_kernel void @__kernelgen_matvec_loop_7(i32* nocapture) nounwind alwaysinline {
"Loop Function Root":
%tid.x = tail call ptx_device i32 @llvm.nvvm.read.ptx.sreg.tid.x()
%ctaid.x = tail call ptx_device i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
%PositionOfBlockInGrid.x = shl i32 %ctaid.x, 9
%BlockLB.Add.ThreadPosInBlock.x = add i32 %PositionOfBlockInGrid.x, %tid.x
%isThreadLBgtLoopUB.x = icmp sgt i32 %BlockLB.Add.ThreadPosInBlock.x, 65535
br i1 %isThreadLBgtLoopUB.x, label %CUDA.AfterLoop.x, label %CUDA.LoopHeader.x.preheader
CUDA.LoopHeader.x.preheader: ; preds = %"Loop Function Root"
%1 = sext i32 %BlockLB.Add.ThreadPosInBlock.x to i64
store float 0.000000e+00, float* inttoptr (i64 47380979712 to float*), align 8192, !tbaa !0
%p_.moved.to.4.cloned = shl nsw i64 %1, 9
br label %polly.loop_body
CUDA.AfterLoop.x.loopexit: ; preds = %polly.loop_body
store float %p_8, float* inttoptr (i64 47380979712 to float*), align 8192
br label %CUDA.AfterLoop.x
CUDA.AfterLoop.x: ; preds = %CUDA.AfterLoop.x.loopexit, %"Loop Function Root"
ret void
polly.loop_body: ; preds = %polly.loop_body, %CUDA.LoopHeader.x.preheader
%_p_scalar_ = phi float [ 0.000000e+00, %CUDA.LoopHeader.x.preheader ], [ %p_8, %polly.loop_body ]
%polly.loopiv10 = phi i64 [ 0, %CUDA.LoopHeader.x.preheader ], [ %polly.next_loopiv, %polly.loop_body ]
%polly.next_loopiv = add i64 %polly.loopiv10, 1
%p_ = add i64 %polly.loopiv10, %p_.moved.to.4.cloned
%p_newGEPInst9.cloned = getelementptr float* inttoptr (i64 47246749696 to float*), i64 %p_
%p_newGEPInst12.cloned = getelementptr float* inttoptr (i64 47380971520 to float*), i64 %polly.loopiv10
%_p_scalar_5 = load float* %p_newGEPInst9.cloned, align 4, !tbaa !2
%_p_scalar_6 = load float* %p_newGEPInst12.cloned, align 4, !tbaa !3
%p_7 = fmul float %_p_scalar_5, %_p_scalar_6
%p_8 = fadd float %_p_scalar_, %p_7
%exitcond = icmp eq i64 %polly.next_loopiv, 512
br i1 %exitcond, label %CUDA.AfterLoop.x.loopexit, label %polly.loop_body
}
declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() nounwind readnone
declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() nounwind readnone
!0 = metadata !{metadata !"output", metadata !1}
!1 = metadata !{}
!2 = metadata !{metadata !"input1", metadata !1}
!3 = metadata !{metadata !"input2", metadata !1}
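To double-check the metadata, the alias-set dump from the quoted message below can be re-run on the fixed module (same flags as before, minus -time-passes):

opt -enable-tbaa -tbaa -print-alias-sets -O3 check.ll -o - -S

With all three leaves sharing a common root, the three pointer values should no longer be folded into a single may-alias set, which is what lets -O3 (presumably LICM's scalar promotion) keep the accumulator in a register and sink the store into CUDA.AfterLoop.x.loopexit, as in the module above.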
2013/3/11 Dmitry Mikushin <dmitry at kernelgen.org>
> I tried to manually assign each of the 3 arrays a unique TBAA node, but it
> does not seem to help: alias analysis still considers the arrays as
> may-alias, which most likely prevents the desired optimization. Below is
> the sample code with the TBAA metadata inserted. Could you please suggest
> what might be wrong with it?
>
> Many thanks,
> - D.
>
> marcusmae at M17xR4:~/forge/llvm$ opt -time-passes -enable-tbaa -tbaa -print-alias-sets -O3 check.ll -o - -S
> Alias Set Tracker: 1 alias sets for 3 pointer values.
> AliasSet[0x39046c0, 3] may alias, Mod/Ref Pointers: (float* inttoptr (i64 47380979712 to float*), 4), (float* %p_newGEPInst9.cloned, 4), (float* %p_newGEPInst12.cloned, 4)
>
> ; ModuleID = 'check.ll'
> target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
> target triple = "nvptx64-unknown-unknown"
>
> @__kernelgen_version = constant [15 x i8] c"0.2/1654:1675M\00"
>
> define ptx_kernel void @__kernelgen_matvec_loop_7(i32* nocapture) #0 {
> "Loop Function Root":
> %tid.x = tail call ptx_device i32 @llvm.nvvm.read.ptx.sreg.tid.x()
> %ctaid.x = tail call ptx_device i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
> %PositionOfBlockInGrid.x = shl i32 %ctaid.x, 9
> %BlockLB.Add.ThreadPosInBlock.x = add i32 %PositionOfBlockInGrid.x, %tid.x
> %isThreadLBgtLoopUB.x = icmp sgt i32 %BlockLB.Add.ThreadPosInBlock.x, 65535
> br i1 %isThreadLBgtLoopUB.x, label %CUDA.AfterLoop.x, label %CUDA.LoopHeader.x.preheader
>
> CUDA.LoopHeader.x.preheader: ; preds = %"Loop Function Root"
> %1 = sext i32 %BlockLB.Add.ThreadPosInBlock.x to i64
> store float 0.000000e+00, float* inttoptr (i64 47380979712 to float*), align 8192, !tbaa !0
> %p_.moved.to.4.cloned = shl nsw i64 %1, 9
> br label %polly.loop_body
>
> CUDA.AfterLoop.x: ; preds = %polly.loop_body, %"Loop Function Root"
> ret void
>
> polly.loop_body: ; preds = %polly.loop_body, %CUDA.LoopHeader.x.preheader
> %_p_scalar_ = phi float [ 0.000000e+00, %CUDA.LoopHeader.x.preheader ], [ %p_8, %polly.loop_body ]
> %polly.loopiv10 = phi i64 [ 0, %CUDA.LoopHeader.x.preheader ], [ %polly.next_loopiv, %polly.loop_body ]
> %polly.next_loopiv = add i64 %polly.loopiv10, 1
> %p_ = add i64 %polly.loopiv10, %p_.moved.to.4.cloned
> %p_newGEPInst9.cloned = getelementptr float* inttoptr (i64 47246749696 to float*), i64 %p_
> %p_newGEPInst12.cloned = getelementptr float* inttoptr (i64 47380971520 to float*), i64 %polly.loopiv10
> %_p_scalar_5 = load float* %p_newGEPInst9.cloned, align 4, !tbaa !1
> %_p_scalar_6 = load float* %p_newGEPInst12.cloned, align 4, !tbaa !2
> %p_7 = fmul float %_p_scalar_5, %_p_scalar_6
> %p_8 = fadd float %_p_scalar_, %p_7
> store float %p_8, float* inttoptr (i64 47380979712 to float*), align 8192, !tbaa !0
> %exitcond = icmp eq i64 %polly.next_loopiv, 512
> br i1 %exitcond, label %CUDA.AfterLoop.x, label %polly.loop_body
> }
>
> declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1
>
> declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1
>
> attributes #0 = { alwaysinline nounwind }
> attributes #1 = { nounwind readnone }
>
> !0 = metadata !{metadata !"output", null}
> !1 = metadata !{metadata !"input1", null}
> !2 = metadata !{metadata !"input2", null}
>
> ===-------------------------------------------------------------------------===
> ... Pass execution timing report ...
>
> ===-------------------------------------------------------------------------===
> Total Execution Time: 0.0080 seconds (0.0082 wall clock)
>
> ---User Time--- --User+System-- ---Wall Time--- --- Name ---
> 0.0000 ( 0.0%) 0.0000 ( 0.0%) 0.0020 ( 24.5%) Print module to stderr
> 0.0000 ( 0.0%) 0.0000 ( 0.0%) 0.0006 ( 7.9%) Induction Variable Simplification
> 0.0000 ( 0.0%) 0.0000 ( 0.0%) 0.0006 ( 7.7%) Combine redundant instructions
> 0.0000 ( 0.0%) 0.0000 ( 0.0%) 0.0004 ( 5.2%) Combine redundant instructions
> 0.0000 ( 0.0%) 0.0000 ( 0.0%) 0.0004 ( 5.1%) Alias Set Printer
> 0.0000 ( 0.0%) 0.0000 ( 0.0%) 0.0003 ( 3.8%) Combine redundant instructions
> 0.0040 ( 50.0%) 0.0040 ( 50.0%) 0.0003 ( 3.8%) Combine redundant instructions
> 0.0000 ( 0.0%) 0.0000 ( 0.0%) 0.0003 ( 3.8%) Global Value Numbering
> 0.0040 ( 50.0%) 0.0040 ( 50.0%) 0.0003 ( 3.7%) Combine redundant instructions
> 0.0000 ( 0.0%) 0.0000 ( 0.0%) 0.0002 ( 2.9%) Early CSE
> 0.0000 ( 0.0%) 0.0000 ( 0.0%) 0.0002 ( 2.0%) Reassociate expressions
> 0.0000 ( 0.0%) 0.0000 ( 0.0%) 0.0001 ( 1.7%) Early CSE
> 0.0000 ( 0.0%) 0.0000 ( 0.0%) 0.0001 ( 1.6%) Natural Loop Information
> 0.0000 ( 0.0%) 0.0000 ( 0.0%) 0.0001 ( 1.6%) Interprocedural Sparse Conditional Constant Propagation
> 0.0000 ( 0.0%) 0.0000 ( 0.0%) 0.0001 ( 1.4%) Loop Invariant Code Motion
> 0.0000 ( 0.0%) 0.0000 ( 0.0%) 0.0001 ( 1.4%) Module Verifier
> 0.0000 ( 0.0%) 0.0000 ( 0.0%) 0.0001 ( 1.2%) Simplify the CFG
> 0.0000 ( 0.0%) 0.0000 ( 0.0%) 0.0001 ( 1.1%) Value Propagation
> 0.0000 ( 0.0%) 0.0000 ( 0.0%) 0.0001 ( 1.0%) Sparse Conditional Constant Propagation
> 0.0000 ( 0.0%) 0.0000 ( 0.0%) 0.0001 ( 1.0%) Canonicalize natural loops
> 0.0000 ( 0.0%) 0.0000 ( 0.0%) 0.0001 ( 1.0%) Dead Store Elimination
> 0.0000 ( 0.0%) 0.0000 ( 0.0%) 0.0001 ( 0.9%) Module Verifier
> 0.0000 ( 0.0%) 0.0000 ( 0.0%) 0.0001 ( 0.8%) Value Propagation
> 0.0000 ( 0.0%) 0.0000 ( 0.0%) 0.0001 ( 0.8%) Simplify the CFG
> 0.0000 ( 0.0%) 0.0000 ( 0.0%) 0.0001 ( 0.7%) Deduce function attributes
> 0.0000 ( 0.0%) 0.0000 ( 0.0%) 0.0001 ( 0.7%) Remove unused exception handling info
> 0.0000 ( 0.0%) 0.0000 ( 0.0%) 0.0001 ( 0.6%) Simplify the CFG
> 0.0000 ( 0.0%) 0.0000 ( 0.0%) 0.0001 ( 0.6%) Jump Threading
> 0.0000 ( 0.0%) 0.0000 ( 0.0%) 0.0000 ( 0.6%) Simplify the CFG
> 0.0000 ( 0.0%) 0.0000 ( 0.0%) 0.0000 ( 0.6%) Simplify the CFG
> 0.0000 ( 0.0%) 0.0000 ( 0.0%) 0.0000 ( 0.6%) Dominator Tree Construction
> 0.0000 ( 0.0%) 0.0000 ( 0.0%) 0.0000 ( 0.6%) Function Integration/Inlining
> 0.0000 ( 0.0%) 0.0000 ( 0.0%) 0.0000 ( 0.5%) Jump Threading
> 0.0000 ( 0.0%) 0.0000 ( 0.0%) 0.0000 ( 0.5%) Canonicalize natural loops
> 0.0000 ( 0.0%) 0.0000 ( 0.0%) 0.0000 ( 0.5%) Unswitch loops
> 0.0000 ( 0.0%) 0.0000 ( 0.0%) 0.0000 ( 0.4%) MemCpy Optimization
> 0.0000 ( 0.0%) 0.0000 ( 0.0%) 0.0000 ( 0.4%) Dominator Tree Construction
> 0.0000 ( 0.0%) 0.0000 ( 0.0%) 0.0000 ( 0.4%) Loop-Closed SSA Form Pass
> 0.0000 ( 0.0%) 0.0000 ( 0.0%) 0.0000 ( 0.3%) Recognize loop idioms
> 0.0000 ( 0.0%) 0.0000 ( 0.0%) 0.0000 ( 0.3%) Dominator Tree Construction
> 0.0000 ( 0.0%) 0.0000 ( 0.0%) 0.0000 ( 0.3%) Scalar Evolution Analysis
> 0.0000 ( 0.0%) 0.0000 ( 0.0%) 0.0000 ( 0.3%) Dominator Tree Construction
> 0.0000 ( 0.0%) 0.0000 ( 0.0%) 0.0000 ( 0.3%) Basic CallGraph Construction
> 0.0000 ( 0.0%) 0.0000 ( 0.0%) 0.0000 ( 0.3%) Dominator Tree Construction
> 0.0000 ( 0.0%) 0.0000 ( 0.0%) 0.0000 ( 0.3%) Dominator Tree Construction
> 0.0000 ( 0.0%) 0.0000 ( 0.0%) 0.0000 ( 0.3%) Unroll loops
> 0.0000 ( 0.0%) 0.0000 ( 0.0%) 0.0000 ( 0.3%) Aggressive Dead Code Elimination
> 0.0000 ( 0.0%) 0.0000 ( 0.0%) 0.0000 ( 0.3%) Global Variable Optimizer
> 0.0000 ( 0.0%) 0.0000 ( 0.0%) 0.0000 ( 0.3%) Loop-Closed SSA Form Pass
> 0.0000 ( 0.0%) 0.0000 ( 0.0%) 0.0000 ( 0.2%) Loop-Closed SSA Form Pass
> 0.0000 ( 0.0%) 0.0000 ( 0.0%) 0.0000 ( 0.2%) Inline Cost Analysis
> 0.0000 ( 0.0%) 0.0000 ( 0.0%) 0.0000 ( 0.2%) Tail Call Elimination
> 0.0000 ( 0.0%) 0.0000 ( 0.0%) 0.0000 ( 0.2%) Lazy Value Information Analysis
> 0.0000 ( 0.0%) 0.0000 ( 0.0%) 0.0000 ( 0.2%) Lazy Value Information Analysis
> 0.0000 ( 0.0%) 0.0000 ( 0.0%) 0.0000 ( 0.2%) Dead Argument Elimination
> 0.0000 ( 0.0%) 0.0000 ( 0.0%) 0.0000 ( 0.2%) Dead Global Elimination
> 0.0000 ( 0.0%) 0.0000 ( 0.0%) 0.0000 ( 0.1%) No target information
> 0.0000 ( 0.0%) 0.0000 ( 0.0%) 0.0000 ( 0.1%) Target independent code generator's TTI
> 0.0000 ( 0.0%) 0.0000 ( 0.0%) 0.0000 ( 0.1%) Merge Duplicate Global Constants
> 0.0000 ( 0.0%) 0.0000 ( 0.0%) 0.0000 ( 0.1%) Simplify well-known library calls
> 0.0000 ( 0.0%) 0.0000 ( 0.0%) 0.0000 ( 0.1%) Memory Dependence Analysis
> 0.0000 ( 0.0%) 0.0000 ( 0.0%) 0.0000 ( 0.1%) Delete dead loops
> 0.0000 ( 0.0%) 0.0000 ( 0.0%) 0.0000 ( 0.1%) SROA
> 0.0000 ( 0.0%) 0.0000 ( 0.0%) 0.0000 ( 0.1%) Memory Dependence Analysis
> 0.0000 ( 0.0%) 0.0000 ( 0.0%) 0.0000 ( 0.1%) Basic Alias Analysis (stateless AA impl)
> 0.0000 ( 0.0%) 0.0000 ( 0.0%) 0.0000 ( 0.1%) SROA
> 0.0000 ( 0.0%) 0.0000 ( 0.0%) 0.0000 ( 0.1%) Memory Dependence Analysis
> 0.0000 ( 0.0%) 0.0000 ( 0.0%) 0.0000 ( 0.1%) Lower 'expect' Intrinsics
> 0.0000 ( 0.0%) 0.0000 ( 0.0%) 0.0000 ( 0.1%) Rotate Loops
> 0.0000 ( 0.0%) 0.0000 ( 0.0%) 0.0000 ( 0.1%) Promote 'by reference' arguments to scalars
> 0.0000 ( 0.0%) 0.0000 ( 0.0%) 0.0000 ( 0.1%) Preliminary module verification
> 0.0000 ( 0.0%) 0.0000 ( 0.0%) 0.0000 ( 0.1%) No Alias Analysis (always returns 'may' alias)
> 0.0000 ( 0.0%) 0.0000 ( 0.0%) 0.0000 ( 0.0%) No target information
> 0.0000 ( 0.0%) 0.0000 ( 0.0%) 0.0000 ( 0.0%) Target Library Information
> 0.0000 ( 0.0%) 0.0000 ( 0.0%) 0.0000 ( 0.0%) Strip Unused Function Prototypes
> 0.0000 ( 0.0%) 0.0000 ( 0.0%) 0.0000 ( 0.0%) No Alias Analysis (always returns 'may' alias)
> 0.0000 ( 0.0%) 0.0000 ( 0.0%) 0.0000 ( 0.0%) Type-Based Alias Analysis
> 0.0000 ( 0.0%) 0.0000 ( 0.0%) 0.0000 ( 0.0%) Preliminary module verification
> 0.0080 (100.0%) 0.0080 (100.0%) 0.0082 (100.0%) Total
>
>
> 2013/3/11 Dmitry Mikushin <dmitry at kernelgen.org>
>
>> Dear all,
>>
>> Attached notunrolled.ll is a module containing a reduction kernel. What
>> I'm trying to do is to unroll it in such a way that the partial reduction
>> over the unrolled iterations is performed in a register and then stored to
>> memory only once. Currently, LLVM's unroller together with all the
>> standard optimizations produces code that stores the value to memory after
>> every unrolled iteration, which is much less efficient. Do you have an
>> idea which combination of opt passes may help to cache the unrolled loop's
>> stores in a register?
>>
>> Many thanks,
>> - D.
>>
>
>