[Libclc-dev] [PATCH 2/4] mem_fence() support for R600 targets

Tom Stellard tom at stellard.net
Fri Aug 22 10:01:42 PDT 2014


On Fri, Aug 22, 2014 at 02:13:30AM +0200, Hilloulin Damien wrote:
>  This patch introduces three new intrinsics and therefore
>  must be used in conjunction with the patches to the LLVM backend.
>  It adds the mem_fence built-in function, and an implementation for
>  r600 using the newly introduced LLVM intrinsics. (It is mainly
>  copy-paste from barrier_impl.ll but if flags equals 0, we do nothing).
> 
> Signed-off-by: Damien Hilloulin <damien.hilloulin at supelec.fr>
> ---
>  generic/include/clc/clc.h                          |  3 ++
>  .../include/clc/explicitmemoryfence/mem_fence.h    |  1 +
>  r600/lib/SOURCES                                   |  1 +
>  r600/lib/explicitmemoryfence/mem_fence_impl.ll     | 44
> ++++++++++++++++++++++
>  4 files changed, 49 insertions(+)
>  create mode 100644 generic/include/clc/explicitmemoryfence/mem_fence.h
>  create mode 100644 r600/lib/explicitmemoryfence/mem_fence_impl.ll
> 
> diff --git a/generic/include/clc/clc.h b/generic/include/clc/clc.h
> index 84b25ac..13d9c11 100644
> --- a/generic/include/clc/clc.h
> +++ b/generic/include/clc/clc.h
> @@ -126,6 +126,9 @@
>  #include <clc/synchronization/cl_mem_fence_flags.h>
>  #include <clc/synchronization/barrier.h>
> 
> +/* 6.11.9 Explicit memory fence Functions */
> +#include <clc/explicitmemoryfence/mem_fence.h>
> +
>  /* 6.11.10 Async Copy and Prefetch Functions */
>  #include <clc/async/prefetch.h>
> 

I would really like to see a generic implementation of this
that uses barrier().

Would you be able to split this patch into two parts?  The first
part would add the header and the generic implementation, and
the second part would add the r600 implementation.

> diff --git a/generic/include/clc/explicitmemoryfence/mem_fence.h
> b/generic/include/clc/explicitmemoryfence/mem_fence.h
> new file mode 100644
> index 0000000..15f4b39
> --- /dev/null
> +++ b/generic/include/clc/explicitmemoryfence/mem_fence.h
> @@ -0,0 +1 @@
> +_CLC_DECL void mem_fence(cl_mem_fence_flags flags);
> diff --git a/r600/lib/SOURCES b/r600/lib/SOURCES
> index ba56605..1a38b76 100644
> --- a/r600/lib/SOURCES
> +++ b/r600/lib/SOURCES
> @@ -1,4 +1,5 @@
>  atomic/atomic.cl
> +explicitmemoryfence/mem_fence_impl.ll
>  math/nextafter.cl
>  workitem/get_num_groups.ll
>  workitem/get_group_id.ll
> diff --git a/r600/lib/explicitmemoryfence/mem_fence_impl.ll
> b/r600/lib/explicitmemoryfence/mem_fence_impl.ll
> new file mode 100644
> index 0000000..b04be89
> --- /dev/null
> +++ b/r600/lib/explicitmemoryfence/mem_fence_impl.ll
> @@ -0,0 +1,44 @@
> +declare i32 @__clc_clk_local_mem_fence() nounwind alwaysinline
> +declare i32 @__clc_clk_global_mem_fence() nounwind alwaysinline
> +declare void @llvm.AMDGPU.mem_fence.local() nounwind noduplicate
> +declare void @llvm.AMDGPU.mem_fence.global() nounwind noduplicate
> +declare void @llvm.AMDGPU.mem_fence.localglobal() nounwind noduplicate
> +

Same comment as the other patch.  I don't think we need a special localglobal
intrinsic.  The backend should be responsible for merging them if allowed.

-Tom

> +define void @mem_fence(i32 %flags) nounwind noduplicate alwaysinline {
> +
> +;flags_masking:
> +  %CLK_LOCAL_MEM_FENCE = call i32 @__clc_clk_local_mem_fence()
> +  %CLK_GLOBAL_MEM_FENCE = call i32 @__clc_clk_global_mem_fence()
> +  %CLK_LOCAL_GLOBAL_MEM_FENCE = or i32 %CLK_LOCAL_MEM_FENCE,
> %CLK_GLOBAL_MEM_FENCE
> +  %FLAGS_MEM_FENCE_LOCAL_MASKED = and i32 %flags, %CLK_LOCAL_MEM_FENCE
> +  %FLAGS_MEM_FENCE_GLOBAL_MASKED = and i32 %flags, %CLK_GLOBAL_MEM_FENCE
> +
> +;mem_fence_local_and_global_test:
> +  %MEM_FENCE_LOCAL_AND_GLOBAL_TEST_RESULT = icmp eq i32 %flags,
> %CLK_LOCAL_GLOBAL_MEM_FENCE
> +  br i1 %MEM_FENCE_LOCAL_AND_GLOBAL_TEST_RESULT, label
> %mem_fence_local_and_global, label %mem_fence_local_test
> +
> +mem_fence_local_and_global:
> +  call void @llvm.AMDGPU.mem_fence.localglobal() noduplicate
> +  br label %done
> +
> +mem_fence_local_test:
> +  %MEM_FENCE_LOCAL_TEST_RESULT = icmp ne i32
> %FLAGS_MEM_FENCE_LOCAL_MASKED, 0
> +  br i1 %MEM_FENCE_LOCAL_TEST_RESULT, label %mem_fence_local, label
> %mem_fence_global_test
> +
> +mem_fence_local:
> +  call void @llvm.AMDGPU.mem_fence.local() noduplicate
> +  br label %done
> +
> +mem_fence_global_test:
> +  %MEM_FENCE_GLOBAL_TEST_RESULT = icmp ne i32
> %FLAGS_MEM_FENCE_GLOBAL_MASKED, 0
> +  br i1 %MEM_FENCE_GLOBAL_TEST_RESULT, label %mem_fence_global, label %done
> +
> +mem_fence_global:
> +  call void @llvm.AMDGPU.mem_fence.global() noduplicate
> +  br label %done
> +
> +; nothing to do if flags equals 0
> +
> +done:
> +  ret void
> +}
> -- 
> 1.9.1
> 
> 
> _______________________________________________
> Libclc-dev mailing list
> Libclc-dev at pcc.me.uk
> http://www.pcc.me.uk/cgi-bin/mailman/listinfo/libclc-dev




More information about the Libclc-dev mailing list