[llvm] r335645 - LoopUnroll: Allow analyzing intrinsic call costs

Tue Jun 26 11:51:17 PDT 2018

Author: arsenm
Date: Tue Jun 26 11:51:17 2018
New Revision: 335645

URL: http://llvm.org/viewvc/llvm-project?rev=335645&view=rev
Log:
LoopUnroll: Allow analyzing intrinsic call costs

I'm not sure why the code here is skipping calls since
TTI does try to do something for general calls, but it
at least should allow intrinsics.

Skip the bailout for intrinsics that are not emitted as calls, which
is by far the most common case on AMDGPU.

Added:
    llvm/trunk/test/Transforms/LoopUnroll/AMDGPU/unroll-cost-call.ll
Modified:
    llvm/trunk/lib/Transforms/Scalar/LoopUnrollPass.cpp

Modified: llvm/trunk/lib/Transforms/Scalar/LoopUnrollPass.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/Scalar/LoopUnrollPass.cpp?rev=335645&r1=335644&r2=335645&view=diff
==============================================================================

--- llvm/trunk/lib/Transforms/Scalar/LoopUnrollPass.cpp (original)
+++ llvm/trunk/lib/Transforms/Scalar/LoopUnrollPass.cpp Tue Jun 26 11:51:17 2018
@@ -515,8 +515,13 @@ static Optional<EstimatedUnrollCost> ana
 
         // Can't properly model a cost of a call.
         // FIXME: With a proper cost model we should be able to do it.
-        if(isa<CallInst>(&I))
-          return None;
+        if (auto *CI = dyn_cast<CallInst>(&I)) {
+          const Function *Callee = CI->getCalledFunction();
+          if (!Callee || TTI.isLoweredToCall(Callee)) {
+            LLVM_DEBUG(dbgs() << "Can't analyze cost of loop with call\n");
+            return None;
+          }
+        }
 
         // If the instruction might have a side-effect recursively account for
         // the cost of it and all the instructions leading up to it.

Added: llvm/trunk/test/Transforms/LoopUnroll/AMDGPU/unroll-cost-call.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/Transforms/LoopUnroll/AMDGPU/unroll-cost-call.ll?rev=335645&view=auto
==============================================================================
--- llvm/trunk/test/Transforms/LoopUnroll/AMDGPU/unroll-cost-call.ll (added)
+++ llvm/trunk/test/Transforms/LoopUnroll/AMDGPU/unroll-cost-call.ll Tue Jun 26 11:51:17 2018
@@ -0,0 +1,77 @@
+; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -mcpu=hawaii -loop-unroll -unroll-threshold=100 -unroll-peel-count=0 -unroll-allow-partial=false -unroll-max-iteration-count-to-analyze=16 < %s | FileCheck %s
+
+; CHECK-LABEL: @test_intrinsic_call_cost(
+; CHECK-NOT: br i1
+define amdgpu_kernel void @test_intrinsic_call_cost(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture %in) #0 {
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i32 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %sum.02 = phi float [ %fmul, %for.body ], [ 0.0, %entry ]
+  %arrayidx.in = getelementptr inbounds float, float addrspace(1)* %in, i32 %indvars.iv
+  %arrayidx.out = getelementptr inbounds float, float addrspace(1)* %out, i32 %indvars.iv
+  %load = load float, float addrspace(1)* %arrayidx.in
+  %call = call float @llvm.minnum.f32(float %load, float 1.0);
+  %fmul = fmul float %call, %sum.02
+  store float %fmul, float addrspace(1)* %arrayidx.out
+  %indvars.iv.next = add i32 %indvars.iv, 1
+  %exitcond = icmp eq i32 %indvars.iv.next, 16
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+; CHECK-LABEL: @test_func_call_cost(
+; CHECK: br i1 %exitcond
+define amdgpu_kernel void @test_func_call_cost(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture %in) #0 {
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i32 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %sum.02 = phi float [ %fmul, %for.body ], [ 0.0, %entry ]
+  %arrayidx.in = getelementptr inbounds float, float addrspace(1)* %in, i32 %indvars.iv
+  %arrayidx.out = getelementptr inbounds float, float addrspace(1)* %out, i32 %indvars.iv
+  %load = load float, float addrspace(1)* %arrayidx.in
+  %fptr = load float(float, float)*, float(float, float )* addrspace(4)* null
+  %call = tail call float %fptr(float %load, float 1.0)
+  %fmul = fmul float %call, %sum.02
+  store float %fmul, float addrspace(1)* %arrayidx.out
+  %indvars.iv.next = add i32 %indvars.iv, 1
+  %exitcond = icmp eq i32 %indvars.iv.next, 16
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+; CHECK-LABEL: @test_indirect_call_cost(
+; CHECK: br i1 %exitcond
+define amdgpu_kernel void @test_indirect_call_cost(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture %in) #0 {
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i32 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %sum.02 = phi float [ %fmul, %for.body ], [ 0.0, %entry ]
+  %arrayidx.in = getelementptr inbounds float, float addrspace(1)* %in, i32 %indvars.iv
+  %arrayidx.out = getelementptr inbounds float, float addrspace(1)* %out, i32 %indvars.iv
+  %load = load float, float addrspace(1)* %arrayidx.in
+  %min = call float @func(float %load, float 1.0);
+  %fmul = fmul float %min, %sum.02
+  store float %fmul, float addrspace(1)* %arrayidx.out
+  %indvars.iv.next = add i32 %indvars.iv, 1
+  %exitcond = icmp eq i32 %indvars.iv.next, 16
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+declare float @llvm.minnum.f32(float, float) #1
+declare float @func(float, float) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone speculatable }