[Libclc-dev] [PATCH 1/4] Fix and improvements to barrier() for R600 targets

Hilloulin Damien damien.hilloulin at supelec.fr
Thu Aug 21 17:13:24 PDT 2014


  This patch introduces two new intrinsics and therefore must be used in
  conjunction with the corresponding patches to the LLVM backend. It fixes the
  behaviour of barrier(0): previously no barrier was generated; this is fixed
  by making a call to the new intrinsic barrier.nofence(). The patch also
  changes the behaviour of barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE):
  previously two barriers were generated; now just one is generated, using the
  new intrinsic barrier.localglobal().

Signed-off-by: Damien Hilloulin <damien.hilloulin at supelec.fr>
---
  r600/lib/synchronization/barrier_impl.ll | 37 ++++++++++++++++++++++++--------
  1 file changed, 28 insertions(+), 9 deletions(-)

diff --git a/r600/lib/synchronization/barrier_impl.ll b/r600/lib/synchronization/barrier_impl.ll
index 3d8ee66..3ef6341 100644
--- a/r600/lib/synchronization/barrier_impl.ll
+++ b/r600/lib/synchronization/barrier_impl.ll
@@ -1,29 +1,48 @@
  declare i32 @__clc_clk_local_mem_fence() nounwind alwaysinline
  declare i32 @__clc_clk_global_mem_fence() nounwind alwaysinline
+declare void @llvm.AMDGPU.barrier.nofence() nounwind noduplicate
  declare void @llvm.AMDGPU.barrier.local() nounwind noduplicate
  declare void @llvm.AMDGPU.barrier.global() nounwind noduplicate
+declare void @llvm.AMDGPU.barrier.localglobal() nounwind noduplicate

  define void @barrier(i32 %flags) nounwind noduplicate alwaysinline {
-barrier_local_test:
+
+;flags_masking:
    %CLK_LOCAL_MEM_FENCE = call i32 @__clc_clk_local_mem_fence()
-  %0 = and i32 %flags, %CLK_LOCAL_MEM_FENCE
-  %1 = icmp ne i32 %0, 0
-  br i1 %1, label %barrier_local, label %barrier_global_test
+  %CLK_GLOBAL_MEM_FENCE = call i32 @__clc_clk_global_mem_fence()
+  %CLK_LOCAL_GLOBAL_MEM_FENCE = or i32 %CLK_LOCAL_MEM_FENCE, 
%CLK_GLOBAL_MEM_FENCE
+  %FLAGS_BARRIER_LOCAL_MASKED = and i32 %flags, %CLK_LOCAL_MEM_FENCE
+  %FLAGS_BARRIER_GLOBAL_MASKED = and i32 %flags, %CLK_GLOBAL_MEM_FENCE
+
+;barrier_local_and_global_test:
+  %BARRIER_LOCAL_AND_GLOBAL_TEST_RESULT = icmp eq i32 %flags, 
%CLK_LOCAL_GLOBAL_MEM_FENCE
+  br i1 %BARRIER_LOCAL_AND_GLOBAL_TEST_RESULT, label 
%barrier_local_and_global, label %barrier_local_test
+
+barrier_local_and_global:
+  call void @llvm.AMDGPU.barrier.localglobal() noduplicate
+  br label %done
+
+barrier_local_test:
+  %BARRIER_LOCAL_TEST_RESULT = icmp ne i32 %FLAGS_BARRIER_LOCAL_MASKED, 0
+  br i1 %BARRIER_LOCAL_TEST_RESULT, label %barrier_local, label 
%barrier_global_test

  barrier_local:
    call void @llvm.AMDGPU.barrier.local() noduplicate
-  br label %barrier_global_test
+  br label %done

  barrier_global_test:
-  %CLK_GLOBAL_MEM_FENCE = call i32 @__clc_clk_global_mem_fence()
-  %2 = and i32 %flags, %CLK_GLOBAL_MEM_FENCE
-  %3 = icmp ne i32 %2, 0
-  br i1 %3, label %barrier_global, label %done
+  %BARRIER_GLOBAL_TEST_RESULT = icmp ne i32 %FLAGS_BARRIER_GLOBAL_MASKED, 0
+  br i1 %BARRIER_GLOBAL_TEST_RESULT, label %barrier_global, label 
%barrier_nofence

  barrier_global:
    call void @llvm.AMDGPU.barrier.global() noduplicate
    br label %done

+; default case: no memory fence queued
+barrier_nofence:
+  call void @llvm.AMDGPU.barrier.nofence() noduplicate
+  br label %done
+
  done:
    ret void
  }
-- 
1.9.1





More information about the Libclc-dev mailing list