[Libclc-dev] [PATCH 1/4] Fix and improvements to barrier() for R600 targets
Hilloulin Damien
damien.hilloulin at supelec.fr
Thu Aug 21 17:13:24 PDT 2014
This patch introduces two new intrinsics and therefore must be used in
conjunction with the patches to the LLVM backend. It fixes the behaviour of
barrier(0): previously no barrier was generated; this is fixed by making a
call to the new intrinsic barrier.nofence(). The patch also changes the
behaviour of barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE): previously
two barriers were generated; now just one is generated, using the new
intrinsic barrier.localglobal().
Signed-off-by: Damien Hilloulin <damien.hilloulin at supelec.fr>
---
r600/lib/synchronization/barrier_impl.ll | 37 ++++++++++++++++++++++++--------
1 file changed, 28 insertions(+), 9 deletions(-)
diff --git a/r600/lib/synchronization/barrier_impl.ll b/r600/lib/synchronization/barrier_impl.ll
index 3d8ee66..3ef6341 100644
--- a/r600/lib/synchronization/barrier_impl.ll
+++ b/r600/lib/synchronization/barrier_impl.ll
@@ -1,29 +1,48 @@
declare i32 @__clc_clk_local_mem_fence() nounwind alwaysinline
declare i32 @__clc_clk_global_mem_fence() nounwind alwaysinline
+declare void @llvm.AMDGPU.barrier.nofence() nounwind noduplicate
declare void @llvm.AMDGPU.barrier.local() nounwind noduplicate
declare void @llvm.AMDGPU.barrier.global() nounwind noduplicate
+declare void @llvm.AMDGPU.barrier.localglobal() nounwind noduplicate
define void @barrier(i32 %flags) nounwind noduplicate alwaysinline {
-barrier_local_test:
+
+;flags_masking:
%CLK_LOCAL_MEM_FENCE = call i32 @__clc_clk_local_mem_fence()
- %0 = and i32 %flags, %CLK_LOCAL_MEM_FENCE
- %1 = icmp ne i32 %0, 0
- br i1 %1, label %barrier_local, label %barrier_global_test
+ %CLK_GLOBAL_MEM_FENCE = call i32 @__clc_clk_global_mem_fence()
+ %CLK_LOCAL_GLOBAL_MEM_FENCE = or i32 %CLK_LOCAL_MEM_FENCE, %CLK_GLOBAL_MEM_FENCE
+ %FLAGS_BARRIER_LOCAL_MASKED = and i32 %flags, %CLK_LOCAL_MEM_FENCE
+ %FLAGS_BARRIER_GLOBAL_MASKED = and i32 %flags, %CLK_GLOBAL_MEM_FENCE
+
+;barrier_local_and_global_test:
+ %BARRIER_LOCAL_AND_GLOBAL_TEST_RESULT = icmp eq i32 %flags, %CLK_LOCAL_GLOBAL_MEM_FENCE
+ br i1 %BARRIER_LOCAL_AND_GLOBAL_TEST_RESULT, label %barrier_local_and_global, label %barrier_local_test
+
+barrier_local_and_global:
+ call void @llvm.AMDGPU.barrier.localglobal() noduplicate
+ br label %done
+
+barrier_local_test:
+ %BARRIER_LOCAL_TEST_RESULT = icmp ne i32 %FLAGS_BARRIER_LOCAL_MASKED, 0
+ br i1 %BARRIER_LOCAL_TEST_RESULT, label %barrier_local, label %barrier_global_test
barrier_local:
call void @llvm.AMDGPU.barrier.local() noduplicate
- br label %barrier_global_test
+ br label %done
barrier_global_test:
- %CLK_GLOBAL_MEM_FENCE = call i32 @__clc_clk_global_mem_fence()
- %2 = and i32 %flags, %CLK_GLOBAL_MEM_FENCE
- %3 = icmp ne i32 %2, 0
- br i1 %3, label %barrier_global, label %done
+ %BARRIER_GLOBAL_TEST_RESULT = icmp ne i32 %FLAGS_BARRIER_GLOBAL_MASKED, 0
+ br i1 %BARRIER_GLOBAL_TEST_RESULT, label %barrier_global, label %barrier_nofence
barrier_global:
call void @llvm.AMDGPU.barrier.global() noduplicate
br label %done
+; default case: no memory fence queued
+barrier_nofence:
+ call void @llvm.AMDGPU.barrier.nofence() noduplicate
+ br label %done
+
done:
ret void
}
--
1.9.1
More information about the Libclc-dev
mailing list