[Libclc-dev] [PATCH 1/1] r600: Add fence implementation, rework barrier

Jan Vesely jan.vesely at rutgers.edu
Sun Apr 27 11:26:36 PDT 2014


The hardware flushes caches after each mem clause (Ch 8.2 EG manual),
so we need to pass along the information that indicates the requested
clause boundaries.
Use the existing LLVM fence instruction for that.

Signed-off-by: Jan Vesely <jan.vesely at rutgers.edu>
---

I'm not sure whether using 'acquire' ordering for reads and 'release'
for writes is the correct thing with respect to the OpenCL specs.
It seems to get the read-write relation correct, but I'm not sure about
read ordering or write ordering. Making everything use the strongest mem_fence
might be safer.

This patch changes the global-memory.cl piglit crash from:
i32 = GlobalAddress<void ()* @llvm.AMDGPU.barrier.global> 0 [ORD=11]
Undefined function
to:
LLVM ERROR: Cannot select: 0x24b9f58: i32,ch = load 0x2090108, 0x24b9718, 0x24b9c40<LD1[@barrier_test_local_int.test(addrspace=3)], zext from i1> [ORD=4] [ID=13]
  0x24b9718: i32 = Constant<0> [ID=7]
  0x24b9c40: i32 = undef [ID=2]
In function: barrier_test_local_int

Without the fences the piglit test just fails.


 r600/lib/SOURCES                         |  1 +
 r600/lib/synchronization/barrier_impl.ll | 25 ++----------------
 r600/lib/synchronization/fence_impl.ll   | 44 ++++++++++++++++++++++++++++++++
 3 files changed, 47 insertions(+), 23 deletions(-)
 create mode 100644 r600/lib/synchronization/fence_impl.ll

diff --git a/r600/lib/SOURCES b/r600/lib/SOURCES
index d9fc897..d93ec0f 100644
--- a/r600/lib/SOURCES
+++ b/r600/lib/SOURCES
@@ -7,4 +7,5 @@ workitem/get_local_id.ll
 workitem/get_global_size.ll
 synchronization/barrier.cl
 synchronization/barrier_impl.ll
+synchronization/fence_impl.ll
 shared/vload.cl
diff --git a/r600/lib/synchronization/barrier_impl.ll b/r600/lib/synchronization/barrier_impl.ll
index 3d8ee66..2f2d865 100644
--- a/r600/lib/synchronization/barrier_impl.ll
+++ b/r600/lib/synchronization/barrier_impl.ll
@@ -1,29 +1,8 @@
-declare i32 @__clc_clk_local_mem_fence() nounwind alwaysinline
-declare i32 @__clc_clk_global_mem_fence() nounwind alwaysinline
+declare void @mem_fence(i32 %flags) nounwind alwaysinline
 declare void @llvm.AMDGPU.barrier.local() nounwind noduplicate
-declare void @llvm.AMDGPU.barrier.global() nounwind noduplicate
 
 define void @barrier(i32 %flags) nounwind noduplicate alwaysinline {
-barrier_local_test:
-  %CLK_LOCAL_MEM_FENCE = call i32 @__clc_clk_local_mem_fence()
-  %0 = and i32 %flags, %CLK_LOCAL_MEM_FENCE
-  %1 = icmp ne i32 %0, 0
-  br i1 %1, label %barrier_local, label %barrier_global_test
-
-barrier_local:
+  call void @mem_fence(i32 %flags)
   call void @llvm.AMDGPU.barrier.local() noduplicate
-  br label %barrier_global_test
-
-barrier_global_test:
-  %CLK_GLOBAL_MEM_FENCE = call i32 @__clc_clk_global_mem_fence()
-  %2 = and i32 %flags, %CLK_GLOBAL_MEM_FENCE
-  %3 = icmp ne i32 %2, 0
-  br i1 %3, label %barrier_global, label %done
-
-barrier_global:
-  call void @llvm.AMDGPU.barrier.global() noduplicate
-  br label %done
-
-done:
   ret void
 }
diff --git a/r600/lib/synchronization/fence_impl.ll b/r600/lib/synchronization/fence_impl.ll
new file mode 100644
index 0000000..a72e70e
--- /dev/null
+++ b/r600/lib/synchronization/fence_impl.ll
@@ -0,0 +1,44 @@
+declare i32 @__clc_clk_local_mem_fence() nounwind alwaysinline
+declare i32 @__clc_clk_global_mem_fence() nounwind alwaysinline
+
+define void @mem_fence(i32 %flags) nounwind noduplicate alwaysinline {
+  %CLK_GLOBAL_MEM_FENCE = call i32 @__clc_clk_global_mem_fence()
+  %1 = and i32 %flags, %CLK_GLOBAL_MEM_FENCE
+  %2 = icmp ne i32 %1, 0
+  br i1 %2, label %global_fence, label %exit_fence
+
+global_fence:
+  fence seq_cst
+  br label %exit_fence
+
+exit_fence:
+  ret void
+}
+
+define void @read_mem_fence(i32 %flags) nounwind noduplicate alwaysinline {
+  %CLK_GLOBAL_MEM_FENCE = call i32 @__clc_clk_global_mem_fence()
+  %1 = and i32 %flags, %CLK_GLOBAL_MEM_FENCE
+  %2 = icmp ne i32 %1, 0
+  br i1 %2, label %global_fence, label %exit_fence
+
+global_fence:
+  fence acquire
+  br label %exit_fence
+
+exit_fence:
+  ret void
+}
+
+define void @write_mem_fence(i32 %flags) nounwind noduplicate alwaysinline {
+  %CLK_GLOBAL_MEM_FENCE = call i32 @__clc_clk_global_mem_fence()
+  %1 = and i32 %flags, %CLK_GLOBAL_MEM_FENCE
+  %2 = icmp ne i32 %1, 0
+  br i1 %2, label %global_fence, label %exit_fence
+
+global_fence:
+  fence release
+  br label %exit_fence
+
+exit_fence:
+  ret void
+}
-- 
1.9.0





More information about the Libclc-dev mailing list