[PATCH] R600: Make sure OQAP defs and uses happen in the same clause

Fri Oct 11 11:10:15 PDT 2013

From: Tom Stellard <thomas.stellard at amd.com>

Reading the special OQAP register pops the top value off the LDS
input queue and returns it to the instruction.  This queue is
invalidated at the end of an ALU clause and leaving values in the queue
can lead to GPU hangs.  This means that if we load a value into the queue,
we must use it before the end of the clause.

This fixes some hangs in the OpenCV test suite.
---
 lib/Target/R600/R600MachineScheduler.cpp | 25 +++++++++++++------------
 lib/Target/R600/R600MachineScheduler.h   |  4 ++--
 test/CodeGen/R600/lds-input-queue.ll     | 26 ++++++++++++++++++++++++++
 3 files changed, 41 insertions(+), 14 deletions(-)
 create mode 100644 test/CodeGen/R600/lds-input-queue.ll

diff --git a/lib/Target/R600/R600MachineScheduler.cpp b/lib/Target/R600/R600MachineScheduler.cpp
index 6c26d9e..611b7f4 100644
--- a/lib/Target/R600/R600MachineScheduler.cpp
+++ b/lib/Target/R600/R600MachineScheduler.cpp
@@ -93,11 +93,12 @@ SUnit* R600SchedStrategy::pickNode(bool &IsTopNode) {
   }
 
 
-  // We want to scheduled AR defs as soon as possible to make sure they aren't
-  // put in a different ALU clause from their uses.
-  if (!SU && !UnscheduledARDefs.empty()) {
-      SU = UnscheduledARDefs[0];
-      UnscheduledARDefs.erase(UnscheduledARDefs.begin());
+  // We want to scheduled defs that cannot be live outside of this clause 
+  // as soon as possible to make sure they aren't put in a different
+  // ALU clause from their uses.
+  if (!SU && !UnscheduledNoLiveOutDefs.empty()) {
+      SU = UnscheduledNoLiveOutDefs[0];
+      UnscheduledNoLiveOutDefs.erase(UnscheduledNoLiveOutDefs.begin());
       NextInstKind = IDAlu;
   }
 
@@ -132,9 +133,9 @@ SUnit* R600SchedStrategy::pickNode(bool &IsTopNode) {
 
   // We want to schedule the AR uses as late as possible to make sure that
   // the AR defs have been released.
-  if (!SU && !UnscheduledARUses.empty()) {
-      SU = UnscheduledARUses[0];
-      UnscheduledARUses.erase(UnscheduledARUses.begin());
+  if (!SU && !UnscheduledNoLiveOutUses.empty()) {
+      SU = UnscheduledNoLiveOutUses[0];
+      UnscheduledNoLiveOutUses.erase(UnscheduledNoLiveOutUses.begin());
       NextInstKind = IDAlu;
   }
 
@@ -217,15 +218,15 @@ void R600SchedStrategy::releaseBottomNode(SUnit *SU) {
 
   int IK = getInstKind(SU);
 
-  // Check for AR register defines
+  // Check for registers that do not live across ALU clauses.
   for (MachineInstr::const_mop_iterator I = SU->getInstr()->operands_begin(),
                                         E = SU->getInstr()->operands_end();
                                         I != E; ++I) {
-    if (I->isReg() && I->getReg() == AMDGPU::AR_X) {
+    if (I->isReg() && (I->getReg() == AMDGPU::AR_X || I->getReg() == AMDGPU::OQAP)) {
       if (I->isDef()) {
-        UnscheduledARDefs.push_back(SU);
+        UnscheduledNoLiveOutDefs.push_back(SU);
       } else {
-        UnscheduledARUses.push_back(SU);
+        UnscheduledNoLiveOutUses.push_back(SU);
       }
       return;
     }
diff --git a/lib/Target/R600/R600MachineScheduler.h b/lib/Target/R600/R600MachineScheduler.h
index 0a6f120..db2e188 100644
--- a/lib/Target/R600/R600MachineScheduler.h
+++ b/lib/Target/R600/R600MachineScheduler.h
@@ -53,8 +53,8 @@ class R600SchedStrategy : public MachineSchedStrategy {
 
   std::vector<SUnit *> Available[IDLast], Pending[IDLast];
   std::vector<SUnit *> AvailableAlus[AluLast];
-  std::vector<SUnit *> UnscheduledARDefs;
-  std::vector<SUnit *> UnscheduledARUses;
+  std::vector<SUnit *> UnscheduledNoLiveOutDefs;
+  std::vector<SUnit *> UnscheduledNoLiveOutUses;
   std::vector<SUnit *> PhysicalRegCopy;
 
   InstKind CurInstKind;
diff --git a/test/CodeGen/R600/lds-input-queue.ll b/test/CodeGen/R600/lds-input-queue.ll
new file mode 100644
index 0000000..548b41c
--- /dev/null
+++ b/test/CodeGen/R600/lds-input-queue.ll
@@ -0,0 +1,26 @@
+; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+;
+; This test checks that the lds input queue will is empty at the end of
+; the ALU clause.
+
+; CHECK-LABEL: @lds_input_queue
+; CHECK: LDS_READ_RET * OQAP
+; CHECK-NOT: ALU clause
+; CHECK: MOV T{{[0-9]\.[XYZW]}}, OQAP
+
+ at local_mem = internal addrspace(3) unnamed_addr global [2 x i32] [i32 1, i32 2], align 4
+
+define void @lds_input_queue(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %index) {
+entry:
+  %0 = getelementptr inbounds [2 x i32] addrspace(3)* @local_mem, i32 0, i32 %index
+  %1 = load i32 addrspace(3)* %0
+  call void @llvm.AMDGPU.barrier.local()
+
+  ; This will start a new clause for the vertex fetch
+  %2 = load i32 addrspace(1)* %in
+  %3 = add i32 %1, %2
+  store i32 %3, i32 addrspace(1)* %out
+  ret void
+}
+
+declare void @llvm.AMDGPU.barrier.local()
-- 
1.7.11.4