[LLVMdev] How to unroll reduction loop with caching accumulator on register?

Mon Mar 11 11:33:51 PDT 2013

I tried to manually assign each of the 3 arrays a unique TBAA node, but it does
not seem to help: alias analysis still considers the arrays as may-alias, which
most likely prevents the desired optimization. Below is the sample code
with TBAA metadata inserted. Could you please suggest what might be wrong
with it?

Many thanks,
- D.

marcusmae@M17xR4:~/forge/llvm$ opt -time-passes -enable-tbaa -tbaa -print-alias-sets -O3 check.ll -o - -S
Alias Set Tracker: 1 alias sets for 3 pointer values.
  AliasSet[0x39046c0, 3] may alias, Mod/Ref   Pointers: (float* inttoptr
(i64 47380979712 to float*), 4), (float* %p_newGEPInst9.cloned, 4), (float*
%p_newGEPInst12.cloned, 4)

; ModuleID = 'check.ll'
; Data layout: little-endian ("e") with 64-bit pointers ("p:64:64:64").
target datalayout =
"e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
; 64-bit NVPTX target.
target triple = "nvptx64-unknown-unknown"

; NUL-terminated 15-byte version string; not referenced by the kernel below.
@__kernelgen_version = constant [15 x i8] c"0.2/1654:1675M\00"

; PTX kernel: each thread whose global x index (ctaid.x * 512 + tid.x) is
; <= 65535 runs a 512-iteration multiply-accumulate over two float arrays at
; fixed (inttoptr) addresses and writes the running sum to a third fixed
; address. The i32* argument is unused in the body.
;
; Lines that the mail archive had hard-wrapped are rejoined here; in
; particular the "; preds = ..." comments were split across lines, which
; left stray tokens outside the comment and made the IR unparseable.
define ptx_kernel void @__kernelgen_matvec_loop_7(i32* nocapture) #0 {
"Loop Function Root":
  %tid.x = tail call ptx_device i32 @llvm.nvvm.read.ptx.sreg.tid.x()
  %ctaid.x = tail call ptx_device i32 @llvm.nvvm.read.ptx.sreg.ctaid.x()
  ; Global index = ctaid.x * 512 + tid.x (shl by 9 == multiply by 512).
  %PositionOfBlockInGrid.x = shl i32 %ctaid.x, 9
  %BlockLB.Add.ThreadPosInBlock.x = add i32 %PositionOfBlockInGrid.x, %tid.x
  ; Threads past index 65535 skip the loop entirely.
  %isThreadLBgtLoopUB.x = icmp sgt i32 %BlockLB.Add.ThreadPosInBlock.x, 65535
  br i1 %isThreadLBgtLoopUB.x, label %CUDA.AfterLoop.x, label %CUDA.LoopHeader.x.preheader

CUDA.LoopHeader.x.preheader:                      ; preds = %"Loop Function Root"
  %1 = sext i32 %BlockLB.Add.ThreadPosInBlock.x to i64
  ; Zero the output location (tagged !tbaa !0) before the reduction starts.
  store float 0.000000e+00, float* inttoptr (i64 47380979712 to float*), align 8192, !tbaa !0
  ; Base element offset into the first input: global index * 512.
  %p_.moved.to.4.cloned = shl nsw i64 %1, 9
  br label %polly.loop_body

CUDA.AfterLoop.x:                                 ; preds = %polly.loop_body, %"Loop Function Root"
  ret void

polly.loop_body:                                  ; preds = %polly.loop_body, %CUDA.LoopHeader.x.preheader
  ; %_p_scalar_ carries the running sum across iterations as an SSA value.
  %_p_scalar_ = phi float [ 0.000000e+00, %CUDA.LoopHeader.x.preheader ], [ %p_8, %polly.loop_body ]
  %polly.loopiv10 = phi i64 [ 0, %CUDA.LoopHeader.x.preheader ], [ %polly.next_loopiv, %polly.loop_body ]
  %polly.next_loopiv = add i64 %polly.loopiv10, 1
  ; First input is indexed by (global index * 512 + iv), second input by iv.
  %p_ = add i64 %polly.loopiv10, %p_.moved.to.4.cloned
  %p_newGEPInst9.cloned = getelementptr float* inttoptr (i64 47246749696 to float*), i64 %p_
  %p_newGEPInst12.cloned = getelementptr float* inttoptr (i64 47380971520 to float*), i64 %polly.loopiv10
  %_p_scalar_5 = load float* %p_newGEPInst9.cloned, align 4, !tbaa !1
  %_p_scalar_6 = load float* %p_newGEPInst12.cloned, align 4, !tbaa !2
  %p_7 = fmul float %_p_scalar_5, %_p_scalar_6
  %p_8 = fadd float %_p_scalar_, %p_7
  ; The running sum is written back to the output address on every
  ; iteration; this is the in-loop store the message is asking about.
  store float %p_8, float* inttoptr (i64 47380979712 to float*), align 8192, !tbaa !0
  ; Loop runs exactly 512 iterations (iv = 0 .. 511).
  %exitcond = icmp eq i64 %polly.next_loopiv, 512
  br i1 %exitcond, label %CUDA.AfterLoop.x, label %polly.loop_body
}

; NVVM special-register intrinsics (tid.x / ctaid.x) called from the kernel's entry block.
declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1

declare i32 @llvm.nvvm.read.ptx.sreg.ctaid.x() #1

; #0 is attached to the kernel definition, #1 to the two intrinsic declarations.
attributes #0 = { alwaysinline nounwind }
attributes #1 = { nounwind readnone }

; TBAA tags: !0 is on both stores to the fixed output address, !1 and !2 are
; on the two loads inside the loop.
; NOTE(review): every node here has a null parent operand, so the three tags
; share no common root. LLVM's TBAA presumably only proves no-alias between
; nodes under the same root; giving them one shared root node is likely the
; fix - verify against the LangRef 'tbaa' section.
!0 = metadata !{metadata !"output", null}
!1 = metadata !{metadata !"input1", null}
!2 = metadata !{metadata !"input2", null}
===-------------------------------------------------------------------------===
                      ... Pass execution timing report ...
===-------------------------------------------------------------------------===
  Total Execution Time: 0.0080 seconds (0.0082 wall clock)

   ---User Time---   --User+System--   ---Wall Time---  --- Name ---
   0.0000 (  0.0%)   0.0000 (  0.0%)   0.0020 ( 24.5%)  Print module to
stderr
   0.0000 (  0.0%)   0.0000 (  0.0%)   0.0006 (  7.9%)  Induction Variable
Simplification
   0.0000 (  0.0%)   0.0000 (  0.0%)   0.0006 (  7.7%)  Combine redundant
instructions
   0.0000 (  0.0%)   0.0000 (  0.0%)   0.0004 (  5.2%)  Combine redundant
instructions
   0.0000 (  0.0%)   0.0000 (  0.0%)   0.0004 (  5.1%)  Alias Set Printer
   0.0000 (  0.0%)   0.0000 (  0.0%)   0.0003 (  3.8%)  Combine redundant
instructions
   0.0040 ( 50.0%)   0.0040 ( 50.0%)   0.0003 (  3.8%)  Combine redundant
instructions
   0.0000 (  0.0%)   0.0000 (  0.0%)   0.0003 (  3.8%)  Global Value
Numbering
   0.0040 ( 50.0%)   0.0040 ( 50.0%)   0.0003 (  3.7%)  Combine redundant
instructions
   0.0000 (  0.0%)   0.0000 (  0.0%)   0.0002 (  2.9%)  Early CSE
   0.0000 (  0.0%)   0.0000 (  0.0%)   0.0002 (  2.0%)  Reassociate
expressions
   0.0000 (  0.0%)   0.0000 (  0.0%)   0.0001 (  1.7%)  Early CSE
   0.0000 (  0.0%)   0.0000 (  0.0%)   0.0001 (  1.6%)  Natural Loop
Information
   0.0000 (  0.0%)   0.0000 (  0.0%)   0.0001 (  1.6%)  Interprocedural
Sparse Conditional Constant Propagation
   0.0000 (  0.0%)   0.0000 (  0.0%)   0.0001 (  1.4%)  Loop Invariant Code
Motion
   0.0000 (  0.0%)   0.0000 (  0.0%)   0.0001 (  1.4%)  Module Verifier
   0.0000 (  0.0%)   0.0000 (  0.0%)   0.0001 (  1.2%)  Simplify the CFG
   0.0000 (  0.0%)   0.0000 (  0.0%)   0.0001 (  1.1%)  Value Propagation
   0.0000 (  0.0%)   0.0000 (  0.0%)   0.0001 (  1.0%)  Sparse Conditional
Constant Propagation
   0.0000 (  0.0%)   0.0000 (  0.0%)   0.0001 (  1.0%)  Canonicalize
natural loops
   0.0000 (  0.0%)   0.0000 (  0.0%)   0.0001 (  1.0%)  Dead Store
Elimination
   0.0000 (  0.0%)   0.0000 (  0.0%)   0.0001 (  0.9%)  Module Verifier
   0.0000 (  0.0%)   0.0000 (  0.0%)   0.0001 (  0.8%)  Value Propagation
   0.0000 (  0.0%)   0.0000 (  0.0%)   0.0001 (  0.8%)  Simplify the CFG
   0.0000 (  0.0%)   0.0000 (  0.0%)   0.0001 (  0.7%)  Deduce function
attributes
   0.0000 (  0.0%)   0.0000 (  0.0%)   0.0001 (  0.7%)  Remove unused
exception handling info
   0.0000 (  0.0%)   0.0000 (  0.0%)   0.0001 (  0.6%)  Simplify the CFG
   0.0000 (  0.0%)   0.0000 (  0.0%)   0.0001 (  0.6%)  Jump Threading
   0.0000 (  0.0%)   0.0000 (  0.0%)   0.0000 (  0.6%)  Simplify the CFG
   0.0000 (  0.0%)   0.0000 (  0.0%)   0.0000 (  0.6%)  Simplify the CFG
   0.0000 (  0.0%)   0.0000 (  0.0%)   0.0000 (  0.6%)  Dominator Tree
Construction
   0.0000 (  0.0%)   0.0000 (  0.0%)   0.0000 (  0.6%)  Function
Integration/Inlining
   0.0000 (  0.0%)   0.0000 (  0.0%)   0.0000 (  0.5%)  Jump Threading
   0.0000 (  0.0%)   0.0000 (  0.0%)   0.0000 (  0.5%)  Canonicalize
natural loops
   0.0000 (  0.0%)   0.0000 (  0.0%)   0.0000 (  0.5%)  Unswitch loops
   0.0000 (  0.0%)   0.0000 (  0.0%)   0.0000 (  0.4%)  MemCpy Optimization
   0.0000 (  0.0%)   0.0000 (  0.0%)   0.0000 (  0.4%)  Dominator Tree
Construction
   0.0000 (  0.0%)   0.0000 (  0.0%)   0.0000 (  0.4%)  Loop-Closed SSA
Form Pass
   0.0000 (  0.0%)   0.0000 (  0.0%)   0.0000 (  0.3%)  Recognize loop
idioms
   0.0000 (  0.0%)   0.0000 (  0.0%)   0.0000 (  0.3%)  Dominator Tree
Construction
   0.0000 (  0.0%)   0.0000 (  0.0%)   0.0000 (  0.3%)  Scalar Evolution
Analysis
   0.0000 (  0.0%)   0.0000 (  0.0%)   0.0000 (  0.3%)  Dominator Tree
Construction
   0.0000 (  0.0%)   0.0000 (  0.0%)   0.0000 (  0.3%)  Basic CallGraph
Construction
   0.0000 (  0.0%)   0.0000 (  0.0%)   0.0000 (  0.3%)  Dominator Tree
Construction
   0.0000 (  0.0%)   0.0000 (  0.0%)   0.0000 (  0.3%)  Dominator Tree
Construction
   0.0000 (  0.0%)   0.0000 (  0.0%)   0.0000 (  0.3%)  Unroll loops
   0.0000 (  0.0%)   0.0000 (  0.0%)   0.0000 (  0.3%)  Aggressive Dead
Code Elimination
   0.0000 (  0.0%)   0.0000 (  0.0%)   0.0000 (  0.3%)  Global Variable
Optimizer
   0.0000 (  0.0%)   0.0000 (  0.0%)   0.0000 (  0.3%)  Loop-Closed SSA
Form Pass
   0.0000 (  0.0%)   0.0000 (  0.0%)   0.0000 (  0.2%)  Loop-Closed SSA
Form Pass
   0.0000 (  0.0%)   0.0000 (  0.0%)   0.0000 (  0.2%)  Inline Cost Analysis
   0.0000 (  0.0%)   0.0000 (  0.0%)   0.0000 (  0.2%)  Tail Call
Elimination
   0.0000 (  0.0%)   0.0000 (  0.0%)   0.0000 (  0.2%)  Lazy Value
Information Analysis
   0.0000 (  0.0%)   0.0000 (  0.0%)   0.0000 (  0.2%)  Lazy Value
Information Analysis
   0.0000 (  0.0%)   0.0000 (  0.0%)   0.0000 (  0.2%)  Dead Argument
Elimination
   0.0000 (  0.0%)   0.0000 (  0.0%)   0.0000 (  0.2%)  Dead Global
Elimination
   0.0000 (  0.0%)   0.0000 (  0.0%)   0.0000 (  0.1%)  No target
information
   0.0000 (  0.0%)   0.0000 (  0.0%)   0.0000 (  0.1%)  Target independent
code generator's TTI
   0.0000 (  0.0%)   0.0000 (  0.0%)   0.0000 (  0.1%)  Merge Duplicate
Global Constants
   0.0000 (  0.0%)   0.0000 (  0.0%)   0.0000 (  0.1%)  Simplify well-known
library calls
   0.0000 (  0.0%)   0.0000 (  0.0%)   0.0000 (  0.1%)  Memory Dependence
Analysis
   0.0000 (  0.0%)   0.0000 (  0.0%)   0.0000 (  0.1%)  Delete dead loops
   0.0000 (  0.0%)   0.0000 (  0.0%)   0.0000 (  0.1%)  SROA
   0.0000 (  0.0%)   0.0000 (  0.0%)   0.0000 (  0.1%)  Memory Dependence
Analysis
   0.0000 (  0.0%)   0.0000 (  0.0%)   0.0000 (  0.1%)  Basic Alias
Analysis (stateless AA impl)
   0.0000 (  0.0%)   0.0000 (  0.0%)   0.0000 (  0.1%)  SROA
   0.0000 (  0.0%)   0.0000 (  0.0%)   0.0000 (  0.1%)  Memory Dependence
Analysis
   0.0000 (  0.0%)   0.0000 (  0.0%)   0.0000 (  0.1%)  Lower 'expect'
Intrinsics
   0.0000 (  0.0%)   0.0000 (  0.0%)   0.0000 (  0.1%)  Rotate Loops
   0.0000 (  0.0%)   0.0000 (  0.0%)   0.0000 (  0.1%)  Promote 'by
reference' arguments to scalars
   0.0000 (  0.0%)   0.0000 (  0.0%)   0.0000 (  0.1%)  Preliminary module
verification
   0.0000 (  0.0%)   0.0000 (  0.0%)   0.0000 (  0.1%)  No Alias Analysis
(always returns 'may' alias)
   0.0000 (  0.0%)   0.0000 (  0.0%)   0.0000 (  0.0%)  No target
information
   0.0000 (  0.0%)   0.0000 (  0.0%)   0.0000 (  0.0%)  Target Library
Information
   0.0000 (  0.0%)   0.0000 (  0.0%)   0.0000 (  0.0%)  Strip Unused
Function Prototypes
   0.0000 (  0.0%)   0.0000 (  0.0%)   0.0000 (  0.0%)  No Alias Analysis
(always returns 'may' alias)
   0.0000 (  0.0%)   0.0000 (  0.0%)   0.0000 (  0.0%)  Type-Based Alias
Analysis
   0.0000 (  0.0%)   0.0000 (  0.0%)   0.0000 (  0.0%)  Preliminary module
verification
   0.0080 (100.0%)   0.0080 (100.0%)   0.0082 (100.0%)  Total

2013/3/11 Dmitry Mikushin <dmitry at kernelgen.org>

> Dear all,
>
> Attached notunrolled.ll is a module containing a reduction kernel. What I'm
> trying to do is to unroll it in such a way that the partial reduction over
> the unrolled iterations is performed in a register and then stored to
> memory only once. Currently LLVM's unroller together with all standard
> optimizations produces code which stores the value to memory after every
> unrolled iteration, which is much less efficient. Do you have an idea which
> combination of opt passes may help to cache the unrolled loop's stores in a
> register?
>
> Many thanks,
> - D.
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <http://lists.llvm.org/pipermail/llvm-dev/attachments/20130311/2141e8f6/attachment.html>