No subject

Tue Jun 18 14:47:27 PDT 2013

[quote]
alwaysinline
    This attribute indicates that the inliner should attempt to inline
    this function into callers whenever possible, ignoring any active
    inlining size threshold for this caller.

...snip...

noduplicate

This attribute indicates that calls to the function cannot be
duplicated. A call to a noduplicate function may be moved within its
parent function, but may not be duplicated within its parent function.

A function containing a noduplicate call may still be an inlining
candidate, provided that the call is not duplicated by inlining. That
implies that the function has internal linkage and only has one call
site, so the original call is dead after inlining.
[/quote]

If that isn't an option, then there's also inlinehint.  I'm not
NAK'ing this patch, just asking for clarification (if you happen to
know the answer).

Also, is there an upcoming patch to implement the intrinsic for
llvm.AMDGPU.barrier.global?  I scanned recent llvm-commits posts, but
if there's one there, I must've missed it.  If there's nothing yet,
let me know and I'll take a crack at it.  I had started looking at
the necessary plumbing for this a while ago, but I hadn't written any
code except a few tests.

--Aaron

On Fri, Oct 11, 2013 at 12:35 PM, Tom Stellard <tom at stellard.net> wrote:
> From: Tom Stellard <thomas.stellard at amd.com>
>
> This will prevent LLVM optimization passes from creating illegal uses
> of the barrier() intrinsic (e.g. calling barrier() from a conditional
> that is not executed by all threads).
> ---
>  r600/lib/SOURCES                         |  1 -
>  r600/lib/synchronization/barrier.cl      | 15 +++++----------
>  r600/lib/synchronization/barrier_impl.ll | 33 ++++++++++++++++++++++++--------
>  3 files changed, 30 insertions(+), 19 deletions(-)
>
> diff --git a/r600/lib/SOURCES b/r600/lib/SOURCES
> index aac6d8f..d9fc897 100644
> --- a/r600/lib/SOURCES
> +++ b/r600/lib/SOURCES
> @@ -8,4 +8,3 @@ workitem/get_global_size.ll
>  synchronization/barrier.cl
>  synchronization/barrier_impl.ll
>  shared/vload.cl
> -shared/vstore.cl
> \ No newline at end of file
> diff --git a/r600/lib/synchronization/barrier.cl b/r600/lib/synchronization/barrier.cl
> index ac0b4b3..6f2900b 100644
> --- a/r600/lib/synchronization/barrier.cl
> +++ b/r600/lib/synchronization/barrier.cl
> @@ -1,15 +1,10 @@
>
>  #include <clc/clc.h>
>
> -void barrier_local(void);
> -void barrier_global(void);
> -
> -void barrier(cl_mem_fence_flags flags) {
> -  if (flags & CLK_LOCAL_MEM_FENCE) {
> -    barrier_local();
> -  }
> +_CLC_DEF int __clc_clk_local_mem_fence() {
> +  return CLK_LOCAL_MEM_FENCE;
> +}
>
> -  if (flags & CLK_GLOBAL_MEM_FENCE) {
> -    barrier_global();
> -  }
> +_CLC_DEF int __clc_clk_global_mem_fence() {
> +  return CLK_GLOBAL_MEM_FENCE;
>  }
> diff --git a/r600/lib/synchronization/barrier_impl.ll b/r600/lib/synchronization/barrier_impl.ll
> index 99ac018..3d8ee66 100644
> --- a/r600/lib/synchronization/barrier_impl.ll
> +++ b/r600/lib/synchronization/barrier_impl.ll
> @@ -1,12 +1,29 @@
> -declare void @llvm.AMDGPU.barrier.local() nounwind
> -declare void @llvm.AMDGPU.barrier.global() nounwind
> +declare i32 @__clc_clk_local_mem_fence() nounwind alwaysinline
> +declare i32 @__clc_clk_global_mem_fence() nounwind alwaysinline
> +declare void @llvm.AMDGPU.barrier.local() nounwind noduplicate
> +declare void @llvm.AMDGPU.barrier.global() nounwind noduplicate
>
> -define void @barrier_local() nounwind alwaysinline {
> -  call void @llvm.AMDGPU.barrier.local()
> -  ret void
> -}
> +define void @barrier(i32 %flags) nounwind noduplicate alwaysinline {
> +barrier_local_test:
> +  %CLK_LOCAL_MEM_FENCE = call i32 @__clc_clk_local_mem_fence()
> +  %0 = and i32 %flags, %CLK_LOCAL_MEM_FENCE
> +  %1 = icmp ne i32 %0, 0
> +  br i1 %1, label %barrier_local, label %barrier_global_test
> +
> +barrier_local:
> +  call void @llvm.AMDGPU.barrier.local() noduplicate
> +  br label %barrier_global_test
> +
> +barrier_global_test:
> +  %CLK_GLOBAL_MEM_FENCE = call i32 @__clc_clk_global_mem_fence()
> +  %2 = and i32 %flags, %CLK_GLOBAL_MEM_FENCE
> +  %3 = icmp ne i32 %2, 0
> +  br i1 %3, label %barrier_global, label %done
> +
> +barrier_global:
> +  call void @llvm.AMDGPU.barrier.global() noduplicate
> +  br label %done
>
> -define void @barrier_global() nounwind alwaysinline {
> -  call void @llvm.AMDGPU.barrier.global()
> +done:
>    ret void
>  }
> --
> 1.7.11.4
>
>
> _______________________________________________
> Libclc-dev mailing list
> Libclc-dev at pcc.me.uk
> http://www.pcc.me.uk/cgi-bin/mailman/listinfo/libclc-dev