[PATCH] D19203: AMDGPU/SI: Add llvm.amdgcn.s.waitcnt.all intrinsic

Sun Apr 17 15:20:29 PDT 2016

nhaehnle created this revision.
nhaehnle added reviewers: arsenm, mareko, tstellarAMD.
nhaehnle added a subscriber: llvm-commits.
Herald added a subscriber: arsenm.

It appears that, to guarantee some of the ordering requirements of a GLSL
memoryBarrier() executed in a shader, we need to emit an s_waitcnt. This patch
adds the llvm.amdgcn.s.waitcnt.all intrinsic for that purpose.

(We can't use an s_barrier, because memoryBarrier() may appear anywhere in
the shader; in particular, it may appear in non-uniform control flow.)

http://reviews.llvm.org/D19203

Files:
  include/llvm/IR/IntrinsicsAMDGPU.td
  lib/Target/AMDGPU/SIInsertWaits.cpp
  lib/Target/AMDGPU/SIInstructions.td
  test/CodeGen/AMDGPU/llvm.amdgcn.s.waitcnt.ll

Index: test/CodeGen/AMDGPU/llvm.amdgcn.s.waitcnt.ll
===================================================================

--- /dev/null
+++ test/CodeGen/AMDGPU/llvm.amdgcn.s.waitcnt.ll
@@ -0,0 +1,20 @@
+; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=CHECK %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=CHECK %s
+
+; CHECK-LABEL: {{^}}test1:
+; CHECK: image_store
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0){{$}}
+; CHECK-NEXT: image_store
+; CHECK-NEXT: s_endpgm
+define amdgpu_ps void @test1(<8 x i32> inreg %rsrc, <4 x float> %d0, <4 x float> %d1, i32 %c0, i32 %c1) {
+  call void @llvm.amdgcn.image.store.i32(<4 x float> %d0, i32 %c0, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 1, i1 0)
+  call void @llvm.amdgcn.s.waitcnt.all()
+  call void @llvm.amdgcn.image.store.i32(<4 x float> %d1, i32 %c1, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 1, i1 0)
+  ret void
+}
+
+declare void @llvm.amdgcn.s.waitcnt.all() #0
+
+declare void @llvm.amdgcn.image.store.i32(<4 x float>, i32, <8 x i32>, i32, i1, i1, i1, i1) #0
+
+attributes #0 = { nounwind }
Index: lib/Target/AMDGPU/SIInstructions.td
===================================================================
--- lib/Target/AMDGPU/SIInstructions.td
+++ lib/Target/AMDGPU/SIInstructions.td
@@ -503,6 +503,7 @@
   let isConvergent = 1;
 }
 
+let mayLoad = 1, mayStore = 1, hasSideEffects = 1 in
 def S_WAITCNT : SOPP <0x0000000c, (ins WAIT_FLAG:$simm16), "s_waitcnt $simm16">;
 def S_SETHALT : SOPP <0x0000000d, (ins i16imm:$simm16), "s_sethalt $simm16">;
 
@@ -2460,6 +2461,11 @@
 // SOPP Patterns
 //===----------------------------------------------------------------------===//
 
+def : Pat <
+  (int_amdgcn_s_waitcnt_all),
+  (S_WAITCNT)
+>;
+
 // FIXME: These should be removed eventually
 def : Pat <
   (int_AMDGPU_barrier_global),
Index: lib/Target/AMDGPU/SIInsertWaits.cpp
===================================================================
--- lib/Target/AMDGPU/SIInsertWaits.cpp
+++ lib/Target/AMDGPU/SIInsertWaits.cpp
@@ -552,6 +552,8 @@
   memset(&UsedRegs, 0, sizeof(UsedRegs));
   memset(&DefinedRegs, 0, sizeof(DefinedRegs));
 
+  SmallVector<MachineInstr *, 4> RemoveMI;
+
   for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
        BI != BE; ++BI) {
 
@@ -610,7 +612,11 @@
       // Wait for everything before a barrier.
       if (I->getOpcode() == AMDGPU::S_BARRIER)
         Changes |= insertWait(MBB, I, LastIssued);
-      else
+      else if (I->getOpcode() == AMDGPU::S_WAITCNT) {
+        RemoveMI.push_back(I);
+        insertWait(MBB, I, LastIssued);
+        Changes = true;
+      } else
         Changes |= insertWait(MBB, I, handleOperands(*I));
 
       pushInstruction(MBB, I);
@@ -621,5 +627,8 @@
     Changes |= insertWait(MBB, MBB.getFirstTerminator(), LastIssued);
   }
 
+  for (MachineInstr *I : RemoveMI)
+    I->eraseFromParent();
+
   return Changes;
 }
Index: include/llvm/IR/IntrinsicsAMDGPU.td
===================================================================
--- include/llvm/IR/IntrinsicsAMDGPU.td
+++ include/llvm/IR/IntrinsicsAMDGPU.td
@@ -68,6 +68,8 @@
 def int_amdgcn_s_barrier : GCCBuiltin<"__builtin_amdgcn_s_barrier">,
   Intrinsic<[], [], [IntrConvergent]>;
 
+def int_amdgcn_s_waitcnt_all : Intrinsic<[], [], [IntrConvergent]>;
+
 def int_amdgcn_div_scale : Intrinsic<
   // 1st parameter: Numerator
   // 2nd parameter: Denominator


-------------- next part --------------
A non-text attachment was scrubbed...
Name: D19203.54014.patch
Type: text/x-patch
Size: 3431 bytes
Desc: not available
URL: <http://lists.llvm.org/pipermail/llvm-commits/attachments/20160417/90358efc/attachment.bin>