[Libclc-dev] [PATCH 1/1] r600: Add fence implementation, rework barrier

Aaron Watry awatry at gmail.com
Sun Apr 27 14:31:08 PDT 2014


Hi Jan,

Just a quick note.  I've been working on an alternative implementation
of this (reworking barriers/fences, and getting global barriers to not
crash on radeonsi), and I plan on reviewing your change...  Just need
to find time to do it.

--Aaron

On Sun, Apr 27, 2014 at 1:26 PM, Jan Vesely <jan.vesely at rutgers.edu> wrote:
> Hw flushes caches after each mem clause (Ch 8.2 EG manual),
> so we need to pass the information that indicates requested
> clause boundaries.
> Use existing llvm fence instruction for that.
>
> Signed-off-by: Jan Vesely <jan.vesely at rutgers.edu>
> ---
>
> I'm not sure whether using 'acquire' ordering for reads and 'release'
> for writes is the correct thing with respect to OpenCL specs.
> It seems to get read-write-relation correct, but I', not sure about
> read ordering, or write ordering. Making everything use the strongest mem_fence
> might be safer.
>
> This patch changes global-memory.cl piglit crash from:
> i32 = GlobalAddress<void ()* @llvm.AMDGPU.barrier.global> 0 [ORD=11]
> Undefined function
> to:
> LLVM ERROR: Cannot select: 0x24b9f58: i32,ch = load 0x2090108, 0x24b9718, 0x24b9c40<LD1[@barrier_test_local_int.test(addrspace=3)], zext from i1> [ORD=4] [ID=13]
>   0x24b9718: i32 = Constant<0> [ID=7]
>   0x24b9c40: i32 = undef [ID=2]
> In function: barrier_test_local_int
>
> Without the fences the piglit just fails.
>
>
>  r600/lib/SOURCES                         |  1 +
>  r600/lib/synchronization/barrier_impl.ll | 25 ++----------------
>  r600/lib/synchronization/fence_impl.ll   | 44 ++++++++++++++++++++++++++++++++
>  3 files changed, 47 insertions(+), 23 deletions(-)
>  create mode 100644 r600/lib/synchronization/fence_impl.ll
>
> diff --git a/r600/lib/SOURCES b/r600/lib/SOURCES
> index d9fc897..d93ec0f 100644
> --- a/r600/lib/SOURCES
> +++ b/r600/lib/SOURCES
> @@ -7,4 +7,5 @@ workitem/get_local_id.ll
>  workitem/get_global_size.ll
>  synchronization/barrier.cl
>  synchronization/barrier_impl.ll
> +synchronization/fence_impl.ll
>  shared/vload.cl
> diff --git a/r600/lib/synchronization/barrier_impl.ll b/r600/lib/synchronization/barrier_impl.ll
> index 3d8ee66..2f2d865 100644
> --- a/r600/lib/synchronization/barrier_impl.ll
> +++ b/r600/lib/synchronization/barrier_impl.ll
> @@ -1,29 +1,8 @@
> -declare i32 @__clc_clk_local_mem_fence() nounwind alwaysinline
> -declare i32 @__clc_clk_global_mem_fence() nounwind alwaysinline
> +declare void @mem_fence(i32 %flags) nounwind alwaysinline
>  declare void @llvm.AMDGPU.barrier.local() nounwind noduplicate
> -declare void @llvm.AMDGPU.barrier.global() nounwind noduplicate
>
>  define void @barrier(i32 %flags) nounwind noduplicate alwaysinline {
> -barrier_local_test:
> -  %CLK_LOCAL_MEM_FENCE = call i32 @__clc_clk_local_mem_fence()
> -  %0 = and i32 %flags, %CLK_LOCAL_MEM_FENCE
> -  %1 = icmp ne i32 %0, 0
> -  br i1 %1, label %barrier_local, label %barrier_global_test
> -
> -barrier_local:
> +  call void @mem_fence(i32 %flags)
>    call void @llvm.AMDGPU.barrier.local() noduplicate
> -  br label %barrier_global_test
> -
> -barrier_global_test:
> -  %CLK_GLOBAL_MEM_FENCE = call i32 @__clc_clk_global_mem_fence()
> -  %2 = and i32 %flags, %CLK_GLOBAL_MEM_FENCE
> -  %3 = icmp ne i32 %2, 0
> -  br i1 %3, label %barrier_global, label %done
> -
> -barrier_global:
> -  call void @llvm.AMDGPU.barrier.global() noduplicate
> -  br label %done
> -
> -done:
>    ret void
>  }
> diff --git a/r600/lib/synchronization/fence_impl.ll b/r600/lib/synchronization/fence_impl.ll
> new file mode 100644
> index 0000000..a72e70e
> --- /dev/null
> +++ b/r600/lib/synchronization/fence_impl.ll
> @@ -0,0 +1,44 @@
> +declare i32 @__clc_clk_local_mem_fence() nounwind alwaysinline
> +declare i32 @__clc_clk_global_mem_fence() nounwind alwaysinline
> +
> +define void @mem_fence(i32 %flags) nounwind noduplicate alwaysinline {
> +  %CLK_GLOBAL_MEM_FENCE = call i32 @__clc_clk_global_mem_fence()
> +  %1 = and i32 %flags, %CLK_GLOBAL_MEM_FENCE
> +  %2 = icmp ne i32 %1, 0
> +  br i1 %2, label %global_fence, label %exit_fence
> +
> +global_fence:
> +  fence seq_cst
> +  br label %exit_fence
> +
> +exit_fence:
> +  ret void
> +}
> +
> +define void @read_mem_fence(i32 %flags) nounwind noduplicate alwaysinline {
> +  %CLK_GLOBAL_MEM_FENCE = call i32 @__clc_clk_global_mem_fence()
> +  %1 = and i32 %flags, %CLK_GLOBAL_MEM_FENCE
> +  %2 = icmp ne i32 %1, 0
> +  br i1 %2, label %global_fence, label %exit_fence
> +
> +global_fence:
> +  fence acquire
> +  br label %exit_fence
> +
> +exit_fence:
> +  ret void
> +}
> +
> +define void @write_mem_fence(i32 %flags) nounwind noduplicate alwaysinline {
> +  %CLK_GLOBAL_MEM_FENCE = call i32 @__clc_clk_global_mem_fence()
> +  %1 = and i32 %flags, %CLK_GLOBAL_MEM_FENCE
> +  %2 = icmp ne i32 %1, 0
> +  br i1 %2, label %global_fence, label %exit_fence
> +
> +global_fence:
> +  fence release
> +  br label %exit_fence
> +
> +exit_fence:
> +  ret void
> +}
> --
> 1.9.0
>
>
> _______________________________________________
> Libclc-dev mailing list
> Libclc-dev at pcc.me.uk
> http://www.pcc.me.uk/cgi-bin/mailman/listinfo/libclc-dev




More information about the Libclc-dev mailing list