[Libclc-dev] [PATCH 1/1] r600: Add fence implementation, rework barrier
Tom Stellard
tom at stellard.net
Tue Apr 29 16:42:22 PDT 2014
On Sun, Apr 27, 2014 at 02:26:36PM -0400, Jan Vesely wrote:
> Hw flushes caches after each mem clause (Ch 8.2 EG manual),
> so we need to pass the information that indicates requested
> clause boundaries.
> Use existing llvm fence instruction for that.
>
> Signed-off-by: Jan Vesely <jan.vesely at rutgers.edu>
> ---
>
> I'm not sure whether using 'acquire' ordering for reads and 'release'
> for writes is the correct thing with respect to OpenCL specs.
> It seems to get read-write-relation correct, but I', not sure about
> read ordering, or write ordering. Making everything use the strongest mem_fence
> might be safer.
>
It looks like the LLVM memory orderings are designed for c++11
memory ordering:
http://en.cppreference.com/w/cpp/atomic/memory_order
The OpenCL spec is really unclear as to what mem_fence means,
so I think we may have to use seq_cst, but I think you should
pose this question to the cfe-dev mailing list to see what others
do.
Other than that, the patch looks good to me.
> This patch changes global-memory.cl piglit crash from:
> i32 = GlobalAddress<void ()* @llvm.AMDGPU.barrier.global> 0 [ORD=11]
> Undefined function
> to:
> LLVM ERROR: Cannot select: 0x24b9f58: i32,ch = load 0x2090108, 0x24b9718, 0x24b9c40<LD1[@barrier_test_local_int.test(addrspace=3)], zext from i1> [ORD=4] [ID=13]
> 0x24b9718: i32 = Constant<0> [ID=7]
> 0x24b9c40: i32 = undef [ID=2]
> In function: barrier_test_local_int
>
> Without the fences the piglit just fails.
>
>
> r600/lib/SOURCES | 1 +
> r600/lib/synchronization/barrier_impl.ll | 25 ++----------------
> r600/lib/synchronization/fence_impl.ll | 44 ++++++++++++++++++++++++++++++++
> 3 files changed, 47 insertions(+), 23 deletions(-)
> create mode 100644 r600/lib/synchronization/fence_impl.ll
>
> diff --git a/r600/lib/SOURCES b/r600/lib/SOURCES
> index d9fc897..d93ec0f 100644
> --- a/r600/lib/SOURCES
> +++ b/r600/lib/SOURCES
> @@ -7,4 +7,5 @@ workitem/get_local_id.ll
> workitem/get_global_size.ll
> synchronization/barrier.cl
> synchronization/barrier_impl.ll
> +synchronization/fence_impl.ll
> shared/vload.cl
> diff --git a/r600/lib/synchronization/barrier_impl.ll b/r600/lib/synchronization/barrier_impl.ll
> index 3d8ee66..2f2d865 100644
> --- a/r600/lib/synchronization/barrier_impl.ll
> +++ b/r600/lib/synchronization/barrier_impl.ll
> @@ -1,29 +1,8 @@
> -declare i32 @__clc_clk_local_mem_fence() nounwind alwaysinline
> -declare i32 @__clc_clk_global_mem_fence() nounwind alwaysinline
> +declare void @mem_fence(i32 %flags) nounwind alwaysinline
> declare void @llvm.AMDGPU.barrier.local() nounwind noduplicate
> -declare void @llvm.AMDGPU.barrier.global() nounwind noduplicate
>
> define void @barrier(i32 %flags) nounwind noduplicate alwaysinline {
> -barrier_local_test:
> - %CLK_LOCAL_MEM_FENCE = call i32 @__clc_clk_local_mem_fence()
> - %0 = and i32 %flags, %CLK_LOCAL_MEM_FENCE
> - %1 = icmp ne i32 %0, 0
> - br i1 %1, label %barrier_local, label %barrier_global_test
> -
> -barrier_local:
> + call void @mem_fence(i32 %flags)
> call void @llvm.AMDGPU.barrier.local() noduplicate
> - br label %barrier_global_test
> -
> -barrier_global_test:
> - %CLK_GLOBAL_MEM_FENCE = call i32 @__clc_clk_global_mem_fence()
> - %2 = and i32 %flags, %CLK_GLOBAL_MEM_FENCE
> - %3 = icmp ne i32 %2, 0
> - br i1 %3, label %barrier_global, label %done
> -
> -barrier_global:
> - call void @llvm.AMDGPU.barrier.global() noduplicate
> - br label %done
> -
> -done:
> ret void
> }
> diff --git a/r600/lib/synchronization/fence_impl.ll b/r600/lib/synchronization/fence_impl.ll
> new file mode 100644
> index 0000000..a72e70e
> --- /dev/null
> +++ b/r600/lib/synchronization/fence_impl.ll
> @@ -0,0 +1,44 @@
> +declare i32 @__clc_clk_local_mem_fence() nounwind alwaysinline
> +declare i32 @__clc_clk_global_mem_fence() nounwind alwaysinline
> +
> +define void @mem_fence(i32 %flags) nounwind noduplicate alwaysinline {
> + %CLK_GLOBAL_MEM_FENCE = call i32 @__clc_clk_global_mem_fence()
> + %1 = and i32 %flags, %CLK_GLOBAL_MEM_FENCE
> + %2 = icmp ne i32 %1, 0
> + br i1 %2, label %global_fence, label %exit_fence
> +
> +global_fence:
> + fence seq_cst
> + br label %exit_fence
> +
> +exit_fence:
> + ret void
> +}
> +
> +define void @read_mem_fence(i32 %flags) nounwind noduplicate alwaysinline {
> + %CLK_GLOBAL_MEM_FENCE = call i32 @__clc_clk_global_mem_fence()
> + %1 = and i32 %flags, %CLK_GLOBAL_MEM_FENCE
> + %2 = icmp ne i32 %1, 0
> + br i1 %2, label %global_fence, label %exit_fence
> +
> +global_fence:
> + fence acquire
> + br label %exit_fence
> +
> +exit_fence:
> + ret void
> +}
> +
> +define void @write_mem_fence(i32 %flags) nounwind noduplicate alwaysinline {
> + %CLK_GLOBAL_MEM_FENCE = call i32 @__clc_clk_global_mem_fence()
> + %1 = and i32 %flags, %CLK_GLOBAL_MEM_FENCE
> + %2 = icmp ne i32 %1, 0
> + br i1 %2, label %global_fence, label %exit_fence
> +
> +global_fence:
> + fence release
> + br label %exit_fence
> +
> +exit_fence:
> + ret void
> +}
> --
> 1.9.0
>
>
> _______________________________________________
> Libclc-dev mailing list
> Libclc-dev at pcc.me.uk
> http://www.pcc.me.uk/cgi-bin/mailman/listinfo/libclc-dev
More information about the Libclc-dev
mailing list