[LLVMdev] How to unroll reduction loop with caching accumulator on register?

Dmitry Mikushin dmitry at kernelgen.org
Tue Mar 26 17:11:26 PDT 2013


Just for the record, here's what I was doing wrong.

!0 = metadata !{metadata !"output", null}
!1 = metadata !{metadata !"input1", null}
!2 = metadata !{metadata !"input2", null}

should be

!0 = metadata !{ }
!1 = metadata !{ metadata !"output", metadata !0 }
!2 = metadata !{ metadata !"input1", metadata !0 }
!3 = metadata !{ metadata !"input2", metadata !0 }

with the corresponding renaming of nodes.

With this metadata, opt -O3 successfully pulls the store out of the loop:

; ModuleID = 'check.ll'
target datalayout =
"e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
target triple = "nvptx64-unknown-unknown"

@__kernelgen_version = constant [15 x i8] c"0.2/1654:1675M\00"

define ptx_kernel void @__kernelgen_matvec_loop_7(i32* nocapture) nounwind
alwaysinline {
"Loop Function Root":
  %tid.x = tail call ptx_device i32 @llvm.nvvm.read.ptx.sreg.tid.x()
  %ctaid.x = tail call ptx_device i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
  %PositionOfBlockInGrid.x = shl i32 %ctaid.x, 9
  %BlockLB.Add.ThreadPosInBlock.x = add i32 %PositionOfBlockInGrid.x, %tid.x
  %isThreadLBgtLoopUB.x = icmp sgt i32 %BlockLB.Add.ThreadPosInBlock.x,
65535
  br i1 %isThreadLBgtLoopUB.x, label %CUDA.AfterLoop.x, label
%CUDA.LoopHeader.x.preheader

CUDA.LoopHeader.x.preheader:                      ; preds = %"Loop Function
Root"
  %1 = sext i32 %BlockLB.Add.ThreadPosInBlock.x to i64
  store float 0.000000e+00, float* inttoptr (i64 47380979712 to float*),
align 8192, !tbaa !0
  %p_.moved.to.4.cloned = shl nsw i64 %1, 9
  br label %polly.loop_body

CUDA.AfterLoop.x.loopexit:                        ; preds = %polly.loop_body
  store float %p_8, float* inttoptr (i64 47380979712 to float*), align 8192
  br label %CUDA.AfterLoop.x

CUDA.AfterLoop.x:                                 ; preds =
%CUDA.AfterLoop.x.loopexit, %"Loop Function Root"
  ret void

polly.loop_body:                                  ; preds =
%polly.loop_body, %CUDA.LoopHeader.x.preheader
  %_p_scalar_ = phi float [ 0.000000e+00, %CUDA.LoopHeader.x.preheader ], [
%p_8, %polly.loop_body ]
  %polly.loopiv10 = phi i64 [ 0, %CUDA.LoopHeader.x.preheader ], [
%polly.next_loopiv, %polly.loop_body ]
  %polly.next_loopiv = add i64 %polly.loopiv10, 1
  %p_ = add i64 %polly.loopiv10, %p_.moved.to.4.cloned
  %p_newGEPInst9.cloned = getelementptr float* inttoptr (i64 47246749696 to
float*), i64 %p_
  %p_newGEPInst12.cloned = getelementptr float* inttoptr (i64 47380971520
to float*), i64 %polly.loopiv10
  %_p_scalar_5 = load float* %p_newGEPInst9.cloned, align 4, !tbaa !2
  %_p_scalar_6 = load float* %p_newGEPInst12.cloned, align 4, !tbaa !3
  %p_7 = fmul float %_p_scalar_5, %_p_scalar_6
  %p_8 = fadd float %_p_scalar_, %p_7
  %exitcond = icmp eq i64 %polly.next_loopiv, 512
  br i1 %exitcond, label %CUDA.AfterLoop.x.loopexit, label %polly.loop_body
}

declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() nounwind readnone

declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() nounwind readnone

!0 = metadata !{metadata !"output", metadata !1}
!1 = metadata !{}
!2 = metadata !{metadata !"input1", metadata !1}
!3 = metadata !{metadata !"input2", metadata !1}

2013/3/11 Dmitry Mikushin <dmitry at kernelgen.org>

> I tried to manually assign each of 3 arrays a unique TBAA node. But it
> does not seem to help: alias analysis still considers arrays as may-alias,
> which most likely prevents the desired optimization. Below is the sample
> code with TBAA metadata inserted. Could you please suggest what might be
> wrong with it?
>
> Many thanks,
> - D.
>
> marcusmae at M17xR4:~/forge/llvm$ opt -time-passes -enable-tbaa -tbaa
> -print-alias-sets -O3 check.ll -o - -S
> Alias Set Tracker: 1 alias sets for 3 pointer values.
>   AliasSet[0x39046c0, 3] may alias, Mod/Ref   Pointers: (float* inttoptr
> (i64 47380979712 to float*), 4), (float* %p_newGEPInst9.cloned, 4), (float*
> %p_newGEPInst12.cloned, 4)
>
> ; ModuleID = 'check.ll'
> target datalayout =
> "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
> target triple = "nvptx64-unknown-unknown"
>
> @__kernelgen_version = constant [15 x i8] c"0.2/1654:1675M\00"
>
> define ptx_kernel void @__kernelgen_matvec_loop_7(i32* nocapture) #0 {
> "Loop Function Root":
>   %tid.x = tail call ptx_device i32 @llvm.nvvm.read.ptx.sreg.tid.x()
>   %ctaid.x = tail call ptx_device i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
>   %PositionOfBlockInGrid.x = shl i32 %ctaid.x, 9
>   %BlockLB.Add.ThreadPosInBlock.x = add i32 %PositionOfBlockInGrid.x,
> %tid.x
>   %isThreadLBgtLoopUB.x = icmp sgt i32 %BlockLB.Add.ThreadPosInBlock.x,
> 65535
>   br i1 %isThreadLBgtLoopUB.x, label %CUDA.AfterLoop.x, label
> %CUDA.LoopHeader.x.preheader
>
> CUDA.LoopHeader.x.preheader:                      ; preds = %"Loop
> Function Root"
>   %1 = sext i32 %BlockLB.Add.ThreadPosInBlock.x to i64
>   store float 0.000000e+00, float* inttoptr (i64 47380979712 to float*),
> align 8192, !tbaa !0
>   %p_.moved.to.4.cloned = shl nsw i64 %1, 9
>   br label %polly.loop_body
>
> CUDA.AfterLoop.x:                                 ; preds =
> %polly.loop_body, %"Loop Function Root"
>   ret void
>
> polly.loop_body:                                  ; preds =
> %polly.loop_body, %CUDA.LoopHeader.x.preheader
>   %_p_scalar_ = phi float [ 0.000000e+00, %CUDA.LoopHeader.x.preheader ],
> [ %p_8, %polly.loop_body ]
>   %polly.loopiv10 = phi i64 [ 0, %CUDA.LoopHeader.x.preheader ], [
> %polly.next_loopiv, %polly.loop_body ]
>   %polly.next_loopiv = add i64 %polly.loopiv10, 1
>   %p_ = add i64 %polly.loopiv10, %p_.moved.to.4.cloned
>   %p_newGEPInst9.cloned = getelementptr float* inttoptr (i64 47246749696
> to float*), i64 %p_
>   %p_newGEPInst12.cloned = getelementptr float* inttoptr (i64 47380971520
> to float*), i64 %polly.loopiv10
>   %_p_scalar_5 = load float* %p_newGEPInst9.cloned, align 4, !tbaa !1
>   %_p_scalar_6 = load float* %p_newGEPInst12.cloned, align 4, !tbaa !2
>   %p_7 = fmul float %_p_scalar_5, %_p_scalar_6
>   %p_8 = fadd float %_p_scalar_, %p_7
>   store float %p_8, float* inttoptr (i64 47380979712 to float*), align
> 8192, !tbaa !0
>   %exitcond = icmp eq i64 %polly.next_loopiv, 512
>   br i1 %exitcond, label %CUDA.AfterLoop.x, label %polly.loop_body
> }
>
> declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1
>
> declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1
>
> attributes #0 = { alwaysinline nounwind }
> attributes #1 = { nounwind readnone }
>
> !0 = metadata !{metadata !"output", null}
> !1 = metadata !{metadata !"input1", null}
> !2 = metadata !{metadata !"input2", null}
>
> ===-------------------------------------------------------------------------===
>                       ... Pass execution timing report ...
>
> ===-------------------------------------------------------------------------===
>   Total Execution Time: 0.0080 seconds (0.0082 wall clock)
>
>    ---User Time---   --User+System--   ---Wall Time---  --- Name ---
>    0.0000 (  0.0%)   0.0000 (  0.0%)   0.0020 ( 24.5%)  Print module to
> stderr
>    0.0000 (  0.0%)   0.0000 (  0.0%)   0.0006 (  7.9%)  Induction Variable
> Simplification
>    0.0000 (  0.0%)   0.0000 (  0.0%)   0.0006 (  7.7%)  Combine redundant
> instructions
>    0.0000 (  0.0%)   0.0000 (  0.0%)   0.0004 (  5.2%)  Combine redundant
> instructions
>    0.0000 (  0.0%)   0.0000 (  0.0%)   0.0004 (  5.1%)  Alias Set Printer
>    0.0000 (  0.0%)   0.0000 (  0.0%)   0.0003 (  3.8%)  Combine redundant
> instructions
>    0.0040 ( 50.0%)   0.0040 ( 50.0%)   0.0003 (  3.8%)  Combine redundant
> instructions
>    0.0000 (  0.0%)   0.0000 (  0.0%)   0.0003 (  3.8%)  Global Value
> Numbering
>    0.0040 ( 50.0%)   0.0040 ( 50.0%)   0.0003 (  3.7%)  Combine redundant
> instructions
>    0.0000 (  0.0%)   0.0000 (  0.0%)   0.0002 (  2.9%)  Early CSE
>    0.0000 (  0.0%)   0.0000 (  0.0%)   0.0002 (  2.0%)  Reassociate
> expressions
>    0.0000 (  0.0%)   0.0000 (  0.0%)   0.0001 (  1.7%)  Early CSE
>    0.0000 (  0.0%)   0.0000 (  0.0%)   0.0001 (  1.6%)  Natural Loop
> Information
>    0.0000 (  0.0%)   0.0000 (  0.0%)   0.0001 (  1.6%)  Interprocedural
> Sparse Conditional Constant Propagation
>    0.0000 (  0.0%)   0.0000 (  0.0%)   0.0001 (  1.4%)  Loop Invariant
> Code Motion
>    0.0000 (  0.0%)   0.0000 (  0.0%)   0.0001 (  1.4%)  Module Verifier
>    0.0000 (  0.0%)   0.0000 (  0.0%)   0.0001 (  1.2%)  Simplify the CFG
>    0.0000 (  0.0%)   0.0000 (  0.0%)   0.0001 (  1.1%)  Value Propagation
>    0.0000 (  0.0%)   0.0000 (  0.0%)   0.0001 (  1.0%)  Sparse Conditional
> Constant Propagation
>    0.0000 (  0.0%)   0.0000 (  0.0%)   0.0001 (  1.0%)  Canonicalize
> natural loops
>    0.0000 (  0.0%)   0.0000 (  0.0%)   0.0001 (  1.0%)  Dead Store
> Elimination
>    0.0000 (  0.0%)   0.0000 (  0.0%)   0.0001 (  0.9%)  Module Verifier
>    0.0000 (  0.0%)   0.0000 (  0.0%)   0.0001 (  0.8%)  Value Propagation
>    0.0000 (  0.0%)   0.0000 (  0.0%)   0.0001 (  0.8%)  Simplify the CFG
>    0.0000 (  0.0%)   0.0000 (  0.0%)   0.0001 (  0.7%)  Deduce function
> attributes
>    0.0000 (  0.0%)   0.0000 (  0.0%)   0.0001 (  0.7%)  Remove unused
> exception handling info
>    0.0000 (  0.0%)   0.0000 (  0.0%)   0.0001 (  0.6%)  Simplify the CFG
>    0.0000 (  0.0%)   0.0000 (  0.0%)   0.0001 (  0.6%)  Jump Threading
>    0.0000 (  0.0%)   0.0000 (  0.0%)   0.0000 (  0.6%)  Simplify the CFG
>    0.0000 (  0.0%)   0.0000 (  0.0%)   0.0000 (  0.6%)  Simplify the CFG
>    0.0000 (  0.0%)   0.0000 (  0.0%)   0.0000 (  0.6%)  Dominator Tree
> Construction
>    0.0000 (  0.0%)   0.0000 (  0.0%)   0.0000 (  0.6%)  Function
> Integration/Inlining
>    0.0000 (  0.0%)   0.0000 (  0.0%)   0.0000 (  0.5%)  Jump Threading
>    0.0000 (  0.0%)   0.0000 (  0.0%)   0.0000 (  0.5%)  Canonicalize
> natural loops
>    0.0000 (  0.0%)   0.0000 (  0.0%)   0.0000 (  0.5%)  Unswitch loops
>    0.0000 (  0.0%)   0.0000 (  0.0%)   0.0000 (  0.4%)  MemCpy Optimization
>    0.0000 (  0.0%)   0.0000 (  0.0%)   0.0000 (  0.4%)  Dominator Tree
> Construction
>    0.0000 (  0.0%)   0.0000 (  0.0%)   0.0000 (  0.4%)  Loop-Closed SSA
> Form Pass
>    0.0000 (  0.0%)   0.0000 (  0.0%)   0.0000 (  0.3%)  Recognize loop
> idioms
>    0.0000 (  0.0%)   0.0000 (  0.0%)   0.0000 (  0.3%)  Dominator Tree
> Construction
>    0.0000 (  0.0%)   0.0000 (  0.0%)   0.0000 (  0.3%)  Scalar Evolution
> Analysis
>    0.0000 (  0.0%)   0.0000 (  0.0%)   0.0000 (  0.3%)  Dominator Tree
> Construction
>    0.0000 (  0.0%)   0.0000 (  0.0%)   0.0000 (  0.3%)  Basic CallGraph
> Construction
>    0.0000 (  0.0%)   0.0000 (  0.0%)   0.0000 (  0.3%)  Dominator Tree
> Construction
>    0.0000 (  0.0%)   0.0000 (  0.0%)   0.0000 (  0.3%)  Dominator Tree
> Construction
>    0.0000 (  0.0%)   0.0000 (  0.0%)   0.0000 (  0.3%)  Unroll loops
>    0.0000 (  0.0%)   0.0000 (  0.0%)   0.0000 (  0.3%)  Aggressive Dead
> Code Elimination
>    0.0000 (  0.0%)   0.0000 (  0.0%)   0.0000 (  0.3%)  Global Variable
> Optimizer
>    0.0000 (  0.0%)   0.0000 (  0.0%)   0.0000 (  0.3%)  Loop-Closed SSA
> Form Pass
>    0.0000 (  0.0%)   0.0000 (  0.0%)   0.0000 (  0.2%)  Loop-Closed SSA
> Form Pass
>    0.0000 (  0.0%)   0.0000 (  0.0%)   0.0000 (  0.2%)  Inline Cost
> Analysis
>    0.0000 (  0.0%)   0.0000 (  0.0%)   0.0000 (  0.2%)  Tail Call
> Elimination
>    0.0000 (  0.0%)   0.0000 (  0.0%)   0.0000 (  0.2%)  Lazy Value
> Information Analysis
>    0.0000 (  0.0%)   0.0000 (  0.0%)   0.0000 (  0.2%)  Lazy Value
> Information Analysis
>    0.0000 (  0.0%)   0.0000 (  0.0%)   0.0000 (  0.2%)  Dead Argument
> Elimination
>    0.0000 (  0.0%)   0.0000 (  0.0%)   0.0000 (  0.2%)  Dead Global
> Elimination
>    0.0000 (  0.0%)   0.0000 (  0.0%)   0.0000 (  0.1%)  No target
> information
>    0.0000 (  0.0%)   0.0000 (  0.0%)   0.0000 (  0.1%)  Target independent
> code generator's TTI
>    0.0000 (  0.0%)   0.0000 (  0.0%)   0.0000 (  0.1%)  Merge Duplicate
> Global Constants
>    0.0000 (  0.0%)   0.0000 (  0.0%)   0.0000 (  0.1%)  Simplify
> well-known library calls
>    0.0000 (  0.0%)   0.0000 (  0.0%)   0.0000 (  0.1%)  Memory Dependence
> Analysis
>    0.0000 (  0.0%)   0.0000 (  0.0%)   0.0000 (  0.1%)  Delete dead loops
>    0.0000 (  0.0%)   0.0000 (  0.0%)   0.0000 (  0.1%)  SROA
>    0.0000 (  0.0%)   0.0000 (  0.0%)   0.0000 (  0.1%)  Memory Dependence
> Analysis
>    0.0000 (  0.0%)   0.0000 (  0.0%)   0.0000 (  0.1%)  Basic Alias
> Analysis (stateless AA impl)
>    0.0000 (  0.0%)   0.0000 (  0.0%)   0.0000 (  0.1%)  SROA
>    0.0000 (  0.0%)   0.0000 (  0.0%)   0.0000 (  0.1%)  Memory Dependence
> Analysis
>    0.0000 (  0.0%)   0.0000 (  0.0%)   0.0000 (  0.1%)  Lower 'expect'
> Intrinsics
>    0.0000 (  0.0%)   0.0000 (  0.0%)   0.0000 (  0.1%)  Rotate Loops
>    0.0000 (  0.0%)   0.0000 (  0.0%)   0.0000 (  0.1%)  Promote 'by
> reference' arguments to scalars
>    0.0000 (  0.0%)   0.0000 (  0.0%)   0.0000 (  0.1%)  Preliminary module
> verification
>    0.0000 (  0.0%)   0.0000 (  0.0%)   0.0000 (  0.1%)  No Alias Analysis
> (always returns 'may' alias)
>    0.0000 (  0.0%)   0.0000 (  0.0%)   0.0000 (  0.0%)  No target
> information
>    0.0000 (  0.0%)   0.0000 (  0.0%)   0.0000 (  0.0%)  Target Library
> Information
>    0.0000 (  0.0%)   0.0000 (  0.0%)   0.0000 (  0.0%)  Strip Unused
> Function Prototypes
>    0.0000 (  0.0%)   0.0000 (  0.0%)   0.0000 (  0.0%)  No Alias Analysis
> (always returns 'may' alias)
>    0.0000 (  0.0%)   0.0000 (  0.0%)   0.0000 (  0.0%)  Type-Based Alias
> Analysis
>    0.0000 (  0.0%)   0.0000 (  0.0%)   0.0000 (  0.0%)  Preliminary module
> verification
>    0.0080 (100.0%)   0.0080 (100.0%)   0.0082 (100.0%)  Total
>
>
> 2013/3/11 Dmitry Mikushin <dmitry at kernelgen.org>
>
>> Dear all,
>>
>> Attached notunrolled.ll is a module containing reduction kernel. What I'm
>> trying to do is to unroll it in such way, that partial reduction on
>> unrolled iterations would be performed on register, and then stored to
>> memory only once. Currently llvm's unroller together with all standard
>> optimizations produce code, which stores value to memory after every
>> unrolled iteration, which is much less efficient. Do you have an idea which
>> combination of opt passes may help to cache unrolled loop stores on a
>> register?
>>
>> Many thanks,
>> - D.
>>
>
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/llvm-dev/attachments/20130327/cbc95b17/attachment.html>


More information about the llvm-dev mailing list