PATCHES: R600: Implement work-around for CF stack HW bug

Wed Dec 11 10:07:40 PST 2013

Hi,

The attached patches implement a work-around for the CF stack HW bug
that is present on some Evergreen and NI GPUs.

Please Review.

-Tom
-------------- next part --------------
>From 7653598e8f3b111643348caa0aad5ff0f8859fe6 Mon Sep 17 00:00:00 2001
From: Tom Stellard <thomas.stellard at amd.com>
Date: Fri, 6 Dec 2013 16:19:40 -0800
Subject: [PATCH 1/6] R600: Add stack size to .AMDGPUcsdata section

---
 lib/Target/R600/AMDGPUAsmPrinter.cpp | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/lib/Target/R600/AMDGPUAsmPrinter.cpp b/lib/Target/R600/AMDGPUAsmPrinter.cpp
index 8aca57a..0d40364 100644
--- a/lib/Target/R600/AMDGPUAsmPrinter.cpp
+++ b/lib/Target/R600/AMDGPUAsmPrinter.cpp
@@ -89,10 +89,17 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
                               SectionKind::getReadOnly());
     OutStreamer.SwitchSection(CommentSection);
 
-    OutStreamer.EmitRawText(
-      Twine("; Kernel info:\n") +
-      "; NumSgprs: " + Twine(KernelInfo.NumSGPR) + "\n" +
-      "; NumVgprs: " + Twine(KernelInfo.NumVGPR) + "\n");
+    if (STM.getGeneration() > AMDGPUSubtarget::NORTHERN_ISLANDS) {
+
+      OutStreamer.EmitRawText(
+        Twine("; Kernel info:\n") +
+        "; NumSgprs: " + Twine(KernelInfo.NumSGPR) + "\n" +
+        "; NumVgprs: " + Twine(KernelInfo.NumVGPR) + "\n");
+    } else {
+      R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
+      OutStreamer.EmitRawText(
+        Twine("SQ_PGM_RESOURCES:STACK_SIZE = " + Twine(MFI->StackSize)));
+    }
   }
 
   if (STM.dumpCode()) {
-- 
1.8.1.4

-------------- next part --------------
>From ab3e7de200612b913c2c56f4902eb3671df2603d Mon Sep 17 00:00:00 2001
From: Tom Stellard <thomas.stellard at amd.com>
Date: Thu, 5 Dec 2013 18:59:51 -0800
Subject: [PATCH 2/6] R600: Add wavefront size property to the subtargets v2

v2:
  - Initialize wavefront size to 0
---
 lib/Target/R600/AMDGPU.td           | 12 +++++++++++-
 lib/Target/R600/AMDGPUSubtarget.cpp |  5 +++++
 lib/Target/R600/AMDGPUSubtarget.h   |  2 ++
 lib/Target/R600/Processors.td       | 26 +++++++++++++++-----------
 4 files changed, 33 insertions(+), 12 deletions(-)

diff --git a/lib/Target/R600/AMDGPU.td b/lib/Target/R600/AMDGPU.td
index 36c4156..c4e5efc 100644
--- a/lib/Target/R600/AMDGPU.td
+++ b/lib/Target/R600/AMDGPU.td
@@ -72,6 +72,16 @@ class SubtargetFeatureFetchLimit <string Value> :
 def FeatureFetchLimit8 : SubtargetFeatureFetchLimit <"8">;
 def FeatureFetchLimit16 : SubtargetFeatureFetchLimit <"16">;
 
+class SubtargetFeatureWavefrontSize <int Value> : SubtargetFeature<
+        "wavefrontsize"#Value,
+        "WavefrontSize",
+        !cast<string>(Value),
+        "The number of threads per wavefront">;
+
+def FeatureWavefrontSize16 : SubtargetFeatureWavefrontSize<16>;
+def FeatureWavefrontSize32 : SubtargetFeatureWavefrontSize<32>;
+def FeatureWavefrontSize64 : SubtargetFeatureWavefrontSize<64>;
+
 class SubtargetFeatureGeneration <string Value,
                                   list<SubtargetFeature> Implies> :
         SubtargetFeature <Value, "Gen", "AMDGPUSubtarget::"#Value,
@@ -87,7 +97,7 @@ def FeatureEvergreen : SubtargetFeatureGeneration<"EVERGREEN",
         [FeatureFetchLimit16]>;
 
 def FeatureNorthernIslands : SubtargetFeatureGeneration<"NORTHERN_ISLANDS",
-        [FeatureFetchLimit16]>;
+        [FeatureFetchLimit16, FeatureWavefrontSize64]>;
 
 def FeatureSouthernIslands : SubtargetFeatureGeneration<"SOUTHERN_ISLANDS",
         [Feature64BitPtr, FeatureFP64]>;
diff --git a/lib/Target/R600/AMDGPUSubtarget.cpp b/lib/Target/R600/AMDGPUSubtarget.cpp
index b892e7e..dc5b6ae 100644
--- a/lib/Target/R600/AMDGPUSubtarget.cpp
+++ b/lib/Target/R600/AMDGPUSubtarget.cpp
@@ -38,6 +38,7 @@ AMDGPUSubtarget::AMDGPUSubtarget(StringRef TT, StringRef CPU, StringRef FS) :
   CaymanISA = false;
   EnableIRStructurizer = true;
   EnableIfCvt = true;
+  WavefrontSize = 0;
   ParseSubtargetFeatures(GPU, FS);
   DevName = GPU;
 }
@@ -74,6 +75,10 @@ bool
 AMDGPUSubtarget::isIfCvtEnabled() const {
   return EnableIfCvt;
 }
+unsigned
+AMDGPUSubtarget::getWavefrontSize() const {
+  return WavefrontSize;
+}
 bool
 AMDGPUSubtarget::isTargetELF() const {
   return false;
diff --git a/lib/Target/R600/AMDGPUSubtarget.h b/lib/Target/R600/AMDGPUSubtarget.h
index 4288d27..7c122a8 100644
--- a/lib/Target/R600/AMDGPUSubtarget.h
+++ b/lib/Target/R600/AMDGPUSubtarget.h
@@ -51,6 +51,7 @@ private:
   bool CaymanISA;
   bool EnableIRStructurizer;
   bool EnableIfCvt;
+  unsigned WavefrontSize;
 
   InstrItineraryData InstrItins;
 
@@ -68,6 +69,7 @@ public:
   bool hasCaymanISA() const;
   bool IsIRStructurizerEnabled() const;
   bool isIfCvtEnabled() const;
+  unsigned getWavefrontSize() const;
 
   virtual bool enableMachineScheduler() const {
     return getGeneration() <= NORTHERN_ISLANDS;
diff --git a/lib/Target/R600/Processors.td b/lib/Target/R600/Processors.td
index 5499a20..e601f35 100644
--- a/lib/Target/R600/Processors.td
+++ b/lib/Target/R600/Processors.td
@@ -17,45 +17,49 @@ def : Proc<"",           R600_VLIW5_Itin,
     [FeatureR600, FeatureVertexCache]>;
 
 def : Proc<"r600",       R600_VLIW5_Itin,
-    [FeatureR600 , FeatureVertexCache]>;
+    [FeatureR600 , FeatureVertexCache, FeatureWavefrontSize64]>;
+
+def : Proc<"r630",       R600_VLIW5_Itin,
+    [FeatureR600, FeatureVertexCache, FeatureWavefrontSize32]>;
 
 def : Proc<"rs880",      R600_VLIW5_Itin,
-    [FeatureR600]>;
+    [FeatureR600, FeatureWavefrontSize16]>;
 
 def : Proc<"rv670",      R600_VLIW5_Itin,
-    [FeatureR600, FeatureFP64, FeatureVertexCache]>;
+    [FeatureR600, FeatureFP64, FeatureVertexCache, FeatureWavefrontSize64]>;
 
 //===----------------------------------------------------------------------===//
 // R700
 //===----------------------------------------------------------------------===//
 
 def : Proc<"rv710",      R600_VLIW5_Itin,
-    [FeatureR700, FeatureVertexCache]>;
+    [FeatureR700, FeatureVertexCache, FeatureWavefrontSize32]>;
 
 def : Proc<"rv730",      R600_VLIW5_Itin,
-    [FeatureR700, FeatureVertexCache]>;
+    [FeatureR700, FeatureVertexCache, FeatureWavefrontSize32]>;
 
 def : Proc<"rv770",      R600_VLIW5_Itin,
-    [FeatureR700, FeatureFP64, FeatureVertexCache]>;
+    [FeatureR700, FeatureFP64, FeatureVertexCache, FeatureWavefrontSize64]>;
 
 //===----------------------------------------------------------------------===//
 // Evergreen
 //===----------------------------------------------------------------------===//
 
 def : Proc<"cedar",      R600_VLIW5_Itin,
-    [FeatureEvergreen, FeatureVertexCache]>;
+    [FeatureEvergreen, FeatureVertexCache, FeatureWavefrontSize32]>;
 
 def : Proc<"redwood",    R600_VLIW5_Itin,
-    [FeatureEvergreen, FeatureVertexCache]>;
+    [FeatureEvergreen, FeatureVertexCache, FeatureWavefrontSize64]>;
 
 def : Proc<"sumo",       R600_VLIW5_Itin,
-    [FeatureEvergreen]>;
+    [FeatureEvergreen, FeatureWavefrontSize64]>;
 
 def : Proc<"juniper",    R600_VLIW5_Itin,
-    [FeatureEvergreen, FeatureVertexCache]>;
+    [FeatureEvergreen, FeatureVertexCache, FeatureWavefrontSize64]>;
 
 def : Proc<"cypress",    R600_VLIW5_Itin,
-    [FeatureEvergreen, FeatureFP64, FeatureVertexCache]>;
+    [FeatureEvergreen, FeatureFP64, FeatureVertexCache,
+     FeatureWavefrontSize64]>;
 
 //===----------------------------------------------------------------------===//
 // Northern Islands
-- 
1.8.1.4

-------------- next part --------------
>From e1f4c33348d0a4becdebed89fb7aa3ef9df8a760 Mon Sep 17 00:00:00 2001
From: Tom Stellard <thomas.stellard at amd.com>
Date: Fri, 6 Dec 2013 13:09:43 -0800
Subject: [PATCH 3/6] R600: CF_PUSH is the same on Evergreen and Cayman

---
 lib/Target/R600/R600ControlFlowFinalizer.cpp | 2 +-
 lib/Target/R600/R600Instructions.td          | 7 ++++---
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/lib/Target/R600/R600ControlFlowFinalizer.cpp b/lib/Target/R600/R600ControlFlowFinalizer.cpp
index bca243c..3876619 100644
--- a/lib/Target/R600/R600ControlFlowFinalizer.cpp
+++ b/lib/Target/R600/R600ControlFlowFinalizer.cpp
@@ -373,7 +373,7 @@ public:
           MaxStack = std::max(MaxStack, CurrentStack);
           HasPush = true;
           if (ST.hasCaymanISA() && CurrentLoopDepth > 1) {
-            BuildMI(MBB, MI, MBB.findDebugLoc(MI), TII->get(AMDGPU::CF_PUSH_CM))
+            BuildMI(MBB, MI, MBB.findDebugLoc(MI), TII->get(AMDGPU::CF_PUSH_EG))
                 .addImm(CfCount + 1)
                 .addImm(1);
             MI->setDesc(TII->get(AMDGPU::CF_ALU));
diff --git a/lib/Target/R600/R600Instructions.td b/lib/Target/R600/R600Instructions.td
index bd590e6..3f3fea6 100644
--- a/lib/Target/R600/R600Instructions.td
+++ b/lib/Target/R600/R600Instructions.td
@@ -1793,6 +1793,10 @@ def LDS_USHORT_READ_RET : R600_LDS_1A <0x39, "LDS_USHORT_READ_RET",
   "JUMP @$ADDR POP:$POP_COUNT"> {
     let COUNT = 0;
   }
+  def CF_PUSH_EG : CF_CLAUSE_EG<11, (ins i32imm:$ADDR, i32imm:$POP_COUNT),
+                                "PUSH @$ADDR POP:$POP_COUNT"> {
+    let COUNT = 0;
+  }
   def CF_ELSE_EG : CF_CLAUSE_EG<13, (ins i32imm:$ADDR, i32imm:$POP_COUNT),
   "ELSE @$ADDR POP:$POP_COUNT"> {
     let COUNT = 0;
@@ -1869,9 +1873,6 @@ def : Pat <
     let COUNT = 0;
   }
 
-  def CF_PUSH_CM : CF_CLAUSE_EG<11, (ins i32imm:$ADDR, i32imm:$POP_COUNT), "PUSH @$ADDR POP:$POP_COUNT"> {
-    let COUNT = 0;
-  }
 
 def : Pat<(fsqrt f32:$src), (MUL R600_Reg32:$src, (RECIPSQRT_CLAMPED_cm $src))>;
 
-- 
1.8.1.4

-------------- next part --------------
>From d208e2acb7a4c698ecf475d1ac17718774b94652 Mon Sep 17 00:00:00 2001
From: Tom Stellard <thomas.stellard at amd.com>
Date: Fri, 6 Dec 2013 13:10:46 -0800
Subject: [PATCH 4/6] R600: Refactor stack size calculation

---
 lib/Target/R600/AMDGPUSubtarget.cpp          |  17 +++
 lib/Target/R600/AMDGPUSubtarget.h            |   1 +
 lib/Target/R600/R600ControlFlowFinalizer.cpp | 174 +++++++++++++++++++++------
 test/CodeGen/R600/elf.r600.ll                |   2 +-
 4 files changed, 158 insertions(+), 36 deletions(-)

diff --git a/lib/Target/R600/AMDGPUSubtarget.cpp b/lib/Target/R600/AMDGPUSubtarget.cpp
index dc5b6ae..124f7c5 100644
--- a/lib/Target/R600/AMDGPUSubtarget.cpp
+++ b/lib/Target/R600/AMDGPUSubtarget.cpp
@@ -79,6 +79,23 @@ unsigned
 AMDGPUSubtarget::getWavefrontSize() const {
   return WavefrontSize;
 }
+unsigned
+AMDGPUSubtarget::getStackEntrySize() const {
+  assert(getGeneration() <= NORTHERN_ISLANDS);
+  switch(getWavefrontSize()) {
+  case 16:
+    return 8;
+  case 32:
+    if (hasCaymanISA())
+      return 4;
+    else
+      return 8;
+  case 64:
+    return 4;
+  default:
+    llvm_unreachable("Illegal wavefront size.");
+  }
+}
 bool
 AMDGPUSubtarget::isTargetELF() const {
   return false;
diff --git a/lib/Target/R600/AMDGPUSubtarget.h b/lib/Target/R600/AMDGPUSubtarget.h
index 7c122a8..789c161 100644
--- a/lib/Target/R600/AMDGPUSubtarget.h
+++ b/lib/Target/R600/AMDGPUSubtarget.h
@@ -70,6 +70,7 @@ public:
   bool IsIRStructurizerEnabled() const;
   bool isIfCvtEnabled() const;
   unsigned getWavefrontSize() const;
+  unsigned getStackEntrySize() const;
 
   virtual bool enableMachineScheduler() const {
     return getGeneration() <= NORTHERN_ISLANDS;
diff --git a/lib/Target/R600/R600ControlFlowFinalizer.cpp b/lib/Target/R600/R600ControlFlowFinalizer.cpp
index 3876619..2d5c4b4 100644
--- a/lib/Target/R600/R600ControlFlowFinalizer.cpp
+++ b/lib/Target/R600/R600ControlFlowFinalizer.cpp
@@ -28,6 +28,135 @@ using namespace llvm;
 
 namespace {
 
+struct CFStack {
+
+  enum StackItem {
+    ENTRY = 0,
+    SUB_ENTRY = 1,
+    FIRST_NON_WQM_PUSH = 2,
+    FIRST_NON_WQM_PUSH_W_FULL_ENTRY = 3
+  };
+
+  const AMDGPUSubtarget &ST;
+  std::vector<StackItem> BranchStack;
+  std::vector<StackItem> LoopStack;
+  unsigned MaxStackSize;
+  unsigned CurrentEntries;
+  unsigned CurrentSubEntries;
+
+  CFStack(const AMDGPUSubtarget &st, unsigned ShaderType) : ST(st),
+      // We need to reserve a stack entry for CALL_FS in vertex shaders.
+      MaxStackSize(ShaderType == ShaderType::VERTEX ? 1 : 0),
+      CurrentEntries(0), CurrentSubEntries(0) { }
+
+  unsigned getLoopDepth();
+  bool branchStackContains(CFStack::StackItem);
+  bool requiresWorkAroundForInst(unsigned Opcode);
+  unsigned getSubEntrySize(CFStack::StackItem Item);
+  void updateMaxStackSize();
+  void pushBranch(unsigned Opcode, bool isWQM = false);
+  void pushLoop();
+  void popBranch();
+  void popLoop();
+};
+
+unsigned CFStack::getLoopDepth() {
+  return LoopStack.size();
+}
+
+bool CFStack::branchStackContains(CFStack::StackItem Item) {
+  for (std::vector<CFStack::StackItem>::const_iterator I = BranchStack.begin(),
+       E = BranchStack.end(); I != E; ++I) {
+    if (*I == Item)
+      return true;
+  }
+  return false;
+}
+
+unsigned CFStack::getSubEntrySize(CFStack::StackItem Item) {
+  switch(Item) {
+  default:
+    return 0;
+  case CFStack::FIRST_NON_WQM_PUSH:
+  assert(!ST.hasCaymanISA());
+  if (ST.getGeneration() <= AMDGPUSubtarget::R700) {
+    // +1 For the push operation.
+    // +2 Extra space required.
+    return 3;
+  } else {
+    // Some documentation says that this is not necessary on Evergreen,
+    // but experimentation has show that we need to allocate 1 extra
+    // sub-entry for the first non-WQM push.
+    // +1 For the push operation.
+    // +1 Extra space required.
+    return 2;
+  }
+  case CFStack::FIRST_NON_WQM_PUSH_W_FULL_ENTRY:
+    assert(ST.getGeneration() >= AMDGPUSubtarget::EVERGREEN);
+    // +1 For the push operation.
+    // +1 Extra space required.
+    return 2;
+  case CFStack::SUB_ENTRY:
+    return 1;
+  }
+}
+
+void CFStack::updateMaxStackSize() {
+  unsigned CurrentStackSize = CurrentEntries +
+                              (RoundUpToAlignment(CurrentSubEntries, 4) / 4);
+  MaxStackSize = std::max(CurrentStackSize, MaxStackSize);
+}
+
+void CFStack::pushBranch(unsigned Opcode, bool isWQM) {
+  CFStack::StackItem Item = CFStack::ENTRY;
+  switch(Opcode) {
+  case AMDGPU::CF_PUSH_EG:
+  case AMDGPU::CF_ALU_PUSH_BEFORE:
+    if (!isWQM) {
+      if (!ST.hasCaymanISA() && !branchStackContains(CFStack::FIRST_NON_WQM_PUSH))
+        Item = CFStack::FIRST_NON_WQM_PUSH;  // May not be required on Evergreen/NI
+                                             // See comment in
+                                             // CFStack::getSubEntrySize()
+      else if (CurrentEntries > 0 &&
+               ST.getGeneration() > AMDGPUSubtarget::EVERGREEN &&
+               !ST.hasCaymanISA() &&
+               !branchStackContains(CFStack::FIRST_NON_WQM_PUSH_W_FULL_ENTRY))
+        Item = CFStack::FIRST_NON_WQM_PUSH_W_FULL_ENTRY;
+      else
+        Item = CFStack::SUB_ENTRY;
+    } else {
+      Item = CFStack::ENTRY;
+    }
+    break;
+  }
+  BranchStack.push_back(Item);
+  if (Item == CFStack::ENTRY)
+    CurrentEntries++;
+  else
+    CurrentSubEntries += getSubEntrySize(Item);
+  updateMaxStackSize();
+}
+
+void CFStack::pushLoop() {
+  LoopStack.push_back(CFStack::ENTRY);
+  CurrentEntries++;
+  updateMaxStackSize();
+}
+
+void CFStack::popBranch() {
+  CFStack::StackItem Top = BranchStack.back();
+  if (Top == CFStack::ENTRY)
+    CurrentEntries--;
+  else
+    CurrentSubEntries-= getSubEntrySize(Top);
+  BranchStack.pop_back();
+}
+
+void CFStack::popLoop() {
+  CurrentEntries--;
+  LoopStack.pop_back();
+}
+
 class R600ControlFlowFinalizer : public MachineFunctionPass {
 
 private:
@@ -300,24 +429,6 @@ private:
     }
   }
 
-  unsigned getHWStackSize(unsigned StackSubEntry, bool hasPush) const {
-    switch (ST.getGeneration()) {
-    case AMDGPUSubtarget::R600:
-    case AMDGPUSubtarget::R700:
-      if (hasPush)
-        StackSubEntry += 2;
-      break;
-    case AMDGPUSubtarget::EVERGREEN:
-      if (hasPush)
-        StackSubEntry ++;
-    case AMDGPUSubtarget::NORTHERN_ISLANDS:
-      StackSubEntry += 2;
-      break;
-    default: llvm_unreachable("Not a VLIW4/VLIW5 GPU");
-    }
-    return (StackSubEntry + 3)/4; // Need ceil value of StackSubEntry/4
-  }
-
 public:
   R600ControlFlowFinalizer(TargetMachine &tm) : MachineFunctionPass(ID),
     TII (0), TRI(0),
@@ -329,23 +440,19 @@ public:
   virtual bool runOnMachineFunction(MachineFunction &MF) {
     TII=static_cast<const R600InstrInfo *>(MF.getTarget().getInstrInfo());
     TRI=static_cast<const R600RegisterInfo *>(MF.getTarget().getRegisterInfo());
+    R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
 
-    unsigned MaxStack = 0;
-    unsigned CurrentStack = 0;
-    unsigned CurrentLoopDepth = 0;
-    bool HasPush = false;
+    CFStack CFStack(ST, MFI->ShaderType);
     for (MachineFunction::iterator MB = MF.begin(), ME = MF.end(); MB != ME;
         ++MB) {
       MachineBasicBlock &MBB = *MB;
       unsigned CfCount = 0;
       std::vector<std::pair<unsigned, std::set<MachineInstr *> > > LoopStack;
       std::vector<MachineInstr * > IfThenElseStack;
-      R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
       if (MFI->ShaderType == 1) {
         BuildMI(MBB, MBB.begin(), MBB.findDebugLoc(MBB.begin()),
             getHWInstrDesc(CF_CALL_FS));
         CfCount++;
-        MaxStack = 1;
       }
       std::vector<ClauseFile> FetchClauses, AluClauses;
       std::vector<MachineInstr *> LastAlu(1);
@@ -369,15 +476,15 @@ public:
         I++;
         switch (MI->getOpcode()) {
         case AMDGPU::CF_ALU_PUSH_BEFORE:
-          CurrentStack++;
-          MaxStack = std::max(MaxStack, CurrentStack);
-          HasPush = true;
-          if (ST.hasCaymanISA() && CurrentLoopDepth > 1) {
+          if (ST.hasCaymanISA() && CFStack.getLoopDepth() > 1) {
             BuildMI(MBB, MI, MBB.findDebugLoc(MI), TII->get(AMDGPU::CF_PUSH_EG))
                 .addImm(CfCount + 1)
                 .addImm(1);
             MI->setDesc(TII->get(AMDGPU::CF_ALU));
             CfCount++;
+            CFStack.pushBranch(AMDGPU::CF_PUSH_EG);
+          } else {
+            CFStack.pushBranch(AMDGPU::CF_ALU_PUSH_BEFORE);
           }
         case AMDGPU::CF_ALU:
           I = MI;
@@ -386,9 +493,7 @@ public:
           CfCount++;
           break;
         case AMDGPU::WHILELOOP: {
-          CurrentStack+=4;
-          CurrentLoopDepth++;
-          MaxStack = std::max(MaxStack, CurrentStack);
+          CFStack.pushLoop();
           MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
               getHWInstrDesc(CF_WHILE_LOOP))
               .addImm(1);
@@ -401,8 +506,7 @@ public:
           break;
         }
         case AMDGPU::ENDLOOP: {
-          CurrentStack-=4;
-          CurrentLoopDepth--;
+          CFStack.popLoop();
           std::pair<unsigned, std::set<MachineInstr *> > Pair =
               LoopStack.back();
           LoopStack.pop_back();
@@ -440,7 +544,7 @@ public:
           break;
         }
         case AMDGPU::ENDIF: {
-          CurrentStack--;
+          CFStack.popBranch();
           if (LastAlu.back()) {
             ToPopAfter.push_back(LastAlu.back());
           } else {
@@ -515,7 +619,7 @@ public:
             .addImm(Alu->getOperand(8).getImm());
         Alu->eraseFromParent();
       }
-      MFI->StackSize = getHWStackSize(MaxStack, HasPush);
+      MFI->StackSize = CFStack.MaxStackSize;
     }
 
     return false;
diff --git a/test/CodeGen/R600/elf.r600.ll b/test/CodeGen/R600/elf.r600.ll
index 0590efb..4436c07 100644
--- a/test/CodeGen/R600/elf.r600.ll
+++ b/test/CodeGen/R600/elf.r600.ll
@@ -6,7 +6,7 @@
 
 ; CONFIG-CHECK: .section .AMDGPU.config
 ; CONFIG-CHECK-NEXT: .long   166100
-; CONFIG-CHECK-NEXT: .long   258
+; CONFIG-CHECK-NEXT: .long   2
 ; CONFIG-CHECK-NEXT: .long   165900
 ; CONFIG-CHECK-NEXT: .long   0
 define void @test(float addrspace(1)* %out, i32 %p) {
-- 
1.8.1.4

-------------- next part --------------
>From 7c1bfad133c4e0465398839923bdfad6d0e38209 Mon Sep 17 00:00:00 2001
From: Tom Stellard <thomas.stellard at amd.com>
Date: Fri, 6 Dec 2013 16:02:37 -0800
Subject: [PATCH 5/6] R600: Add some missing CF instruction definitions to the
 .td files.

---
 lib/Target/R600/R600Instructions.td | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/lib/Target/R600/R600Instructions.td b/lib/Target/R600/R600Instructions.td
index 3f3fea6..26dfc28 100644
--- a/lib/Target/R600/R600Instructions.td
+++ b/lib/Target/R600/R600Instructions.td
@@ -642,6 +642,9 @@ ins, AsmPrint, [] >, CF_WORD0_EG, CF_WORD1_EG {
 def CF_ALU : ALU_CLAUSE<8, "ALU">;
 def CF_ALU_PUSH_BEFORE : ALU_CLAUSE<9, "ALU_PUSH_BEFORE">;
 def CF_ALU_POP_AFTER : ALU_CLAUSE<10, "ALU_POP_AFTER">;
+def CF_ALU_CONTINUE : ALU_CLAUSE<13, "ALU_CONTINUE">;
+def CF_ALU_BREAK : ALU_CLAUSE<14, "ALU_BREAK">;
+def CF_ALU_ELSE_AFTER : ALU_CLAUSE<15, "ALU_ELSE_AFTER">;
 
 def FETCH_CLAUSE : AMDGPUInst <(outs),
 (ins i32imm:$addr), "Fetch clause starting at $addr:", [] > {
@@ -1235,6 +1238,10 @@ let Predicates = [isR600] in {
   "JUMP @$ADDR POP:$POP_COUNT"> {
     let CNT = 0;
   }
+  def CF_PUSH_ELSE_R600 : CF_CLAUSE_R600<12, (ins i32imm:$ADDR),
+  "PUSH_ELSE @$ADDR"> {
+    let CNT = 0;
+  }
   def CF_ELSE_R600 : CF_CLAUSE_R600<13, (ins i32imm:$ADDR, i32imm:$POP_COUNT),
   "ELSE @$ADDR POP:$POP_COUNT"> {
     let CNT = 0;
-- 
1.8.1.4

-------------- next part --------------
>From 59147cba6d765857e9ab2783d5f17ec6d3cda4a8 Mon Sep 17 00:00:00 2001
From: Tom Stellard <thomas.stellard at amd.com>
Date: Fri, 6 Dec 2013 16:02:37 -0800
Subject: [PATCH 6/6] R600: Add work-around for the CF stack entry HW bug

The CF stack can be corrupted if you use CF_ALU_PUSH_BEFORE,
CF_ALU_ELSE_AFTER, CF_ALU_BREAK, or CF_ALU_CONTINUE when the number of
sub-entries on the stack is greater than or equal to the stack entry
size and sub-entries modulo 4 is either 0 or 3 (on cedar the bug is
present when number of sub-entries module 8 is either 7 or 0)

We choose to be conservative and always apply the work-around when the
number of sub-enries is greater than or equal to the stack entry size,
so that we can safely over-allocate the stack when we are unsure of the
stack allocation rules.
---
 lib/Target/R600/AMDGPU.td                    |   5 +
 lib/Target/R600/AMDGPUSubtarget.cpp          |   6 +
 lib/Target/R600/AMDGPUSubtarget.h            |   2 +
 lib/Target/R600/Processors.td                |  14 +-
 lib/Target/R600/R600ControlFlowFinalizer.cpp |  44 +++++-
 test/CodeGen/R600/cf-stack-bug.ll            | 225 +++++++++++++++++++++++++++
 6 files changed, 289 insertions(+), 7 deletions(-)
 create mode 100644 test/CodeGen/R600/cf-stack-bug.ll

diff --git a/lib/Target/R600/AMDGPU.td b/lib/Target/R600/AMDGPU.td
index c4e5efc..d1e2cf5 100644
--- a/lib/Target/R600/AMDGPU.td
+++ b/lib/Target/R600/AMDGPU.td
@@ -63,6 +63,11 @@ def FeatureCaymanISA : SubtargetFeature<"caymanISA",
         "true",
         "Use Cayman ISA">;
 
+def FeatureCFALUBug : SubtargetFeature<"cfalubug",
+        "CFALUBug",
+        "true",
+        "GPU has CF_ALU bug">;
+
 class SubtargetFeatureFetchLimit <string Value> :
                           SubtargetFeature <"fetch"#Value,
         "TexVTXClauseSize",
diff --git a/lib/Target/R600/AMDGPUSubtarget.cpp b/lib/Target/R600/AMDGPUSubtarget.cpp
index 124f7c5..1cfe550 100644
--- a/lib/Target/R600/AMDGPUSubtarget.cpp
+++ b/lib/Target/R600/AMDGPUSubtarget.cpp
@@ -39,6 +39,7 @@ AMDGPUSubtarget::AMDGPUSubtarget(StringRef TT, StringRef CPU, StringRef FS) :
   EnableIRStructurizer = true;
   EnableIfCvt = true;
   WavefrontSize = 0;
+  CFALUBug = false;
   ParseSubtargetFeatures(GPU, FS);
   DevName = GPU;
 }
@@ -97,6 +98,11 @@ AMDGPUSubtarget::getStackEntrySize() const {
   }
 }
 bool
+AMDGPUSubtarget::hasCFAluBug() const {
+  assert(getGeneration() <= NORTHERN_ISLANDS);
+  return CFALUBug;
+}
+bool
 AMDGPUSubtarget::isTargetELF() const {
   return false;
 }
diff --git a/lib/Target/R600/AMDGPUSubtarget.h b/lib/Target/R600/AMDGPUSubtarget.h
index 789c161..110d979 100644
--- a/lib/Target/R600/AMDGPUSubtarget.h
+++ b/lib/Target/R600/AMDGPUSubtarget.h
@@ -52,6 +52,7 @@ private:
   bool EnableIRStructurizer;
   bool EnableIfCvt;
   unsigned WavefrontSize;
+  bool CFALUBug;
 
   InstrItineraryData InstrItins;
 
@@ -71,6 +72,7 @@ public:
   bool isIfCvtEnabled() const;
   unsigned getWavefrontSize() const;
   unsigned getStackEntrySize() const;
+  bool hasCFAluBug() const;
 
   virtual bool enableMachineScheduler() const {
     return getGeneration() <= NORTHERN_ISLANDS;
diff --git a/lib/Target/R600/Processors.td b/lib/Target/R600/Processors.td
index e601f35..fde4481 100644
--- a/lib/Target/R600/Processors.td
+++ b/lib/Target/R600/Processors.td
@@ -46,13 +46,15 @@ def : Proc<"rv770",      R600_VLIW5_Itin,
 //===----------------------------------------------------------------------===//
 
 def : Proc<"cedar",      R600_VLIW5_Itin,
-    [FeatureEvergreen, FeatureVertexCache, FeatureWavefrontSize32]>;
+    [FeatureEvergreen, FeatureVertexCache, FeatureWavefrontSize32,
+     FeatureCFALUBug]>;
 
 def : Proc<"redwood",    R600_VLIW5_Itin,
-    [FeatureEvergreen, FeatureVertexCache, FeatureWavefrontSize64]>;
+    [FeatureEvergreen, FeatureVertexCache, FeatureWavefrontSize64,
+     FeatureCFALUBug]>;
 
 def : Proc<"sumo",       R600_VLIW5_Itin,
-    [FeatureEvergreen, FeatureWavefrontSize64]>;
+    [FeatureEvergreen, FeatureWavefrontSize64, FeatureCFALUBug]>;
 
 def : Proc<"juniper",    R600_VLIW5_Itin,
     [FeatureEvergreen, FeatureVertexCache, FeatureWavefrontSize64]>;
@@ -66,13 +68,13 @@ def : Proc<"cypress",    R600_VLIW5_Itin,
 //===----------------------------------------------------------------------===//
 
 def : Proc<"barts",      R600_VLIW5_Itin,
-    [FeatureNorthernIslands, FeatureVertexCache]>;
+    [FeatureNorthernIslands, FeatureVertexCache, FeatureCFALUBug]>;
 
 def : Proc<"turks",      R600_VLIW5_Itin,
-    [FeatureNorthernIslands, FeatureVertexCache]>;
+    [FeatureNorthernIslands, FeatureVertexCache, FeatureCFALUBug]>;
 
 def : Proc<"caicos",     R600_VLIW5_Itin,
-    [FeatureNorthernIslands]>;
+    [FeatureNorthernIslands, FeatureCFALUBug]>;
 
 def : Proc<"cayman",     R600_VLIW4_Itin,
     [FeatureNorthernIslands, FeatureFP64, FeatureCaymanISA]>;
diff --git a/lib/Target/R600/R600ControlFlowFinalizer.cpp b/lib/Target/R600/R600ControlFlowFinalizer.cpp
index 2d5c4b4..353fce0 100644
--- a/lib/Target/R600/R600ControlFlowFinalizer.cpp
+++ b/lib/Target/R600/R600ControlFlowFinalizer.cpp
@@ -73,6 +73,45 @@ bool CFStack::branchStackContains(CFStack::StackItem Item) {
   return false;
 }
 
+bool CFStack::requiresWorkAroundForInst(unsigned Opcode) {
+  if (Opcode == AMDGPU::CF_ALU_PUSH_BEFORE && ST.hasCaymanISA() &&
+      getLoopDepth() > 1) {
+    return true;
+  }
+
+  if (!ST.hasCFAluBug())
+    return false;
+
+  switch(Opcode) {
+  default: return false;
+  case AMDGPU::CF_ALU_PUSH_BEFORE:
+  case AMDGPU::CF_ALU_ELSE_AFTER:
+  case AMDGPU::CF_ALU_BREAK:
+  case AMDGPU::CF_ALU_CONTINUE:
+    if (CurrentSubEntries == 0)
+      return false;
+    if (ST.getWavefrontSize() == 64) {
+      // We are being conservative here.  We only require this work-around if
+      // CurrentSubEntries > 3 &&
+      // (CurrentSubEntries % 4 == 3 || CurrentSubEntries % 4 == 0)
+      //
+      // We have to be conservative, because we don't know for certain that
+      // our stack allocation algorithm for Evergreen/NI is correct.  Applying this
+      // work-around when CurrentSubEntries > 3 allows us to over-allocate stack
+      // resources without any problems.
+      return CurrentSubEntries > 3;
+    } else {
+      assert(ST.getWavefrontSize() == 32);
+      // We are being conservative here.  We only require the work-around if
+      // CurrentSubEntries > 7 &&
+      // (CurrentSubEntries % 8 == 7 || CurrentSubEntries % 8 == 0)
+      // See the comment on the wavefront size == 64 case for why we are
+      // being conservative.
+      return CurrentSubEntries > 7;
+    }
+  }
+}
+
 unsigned CFStack::getSubEntrySize(CFStack::StackItem Item) {
   switch(Item) {
   default:
@@ -474,9 +513,12 @@ public:
         if (MI->getOpcode() == AMDGPU::CF_ALU)
           LastAlu.back() = MI;
         I++;
+        bool RequiresWorkAround =
+            CFStack.requiresWorkAroundForInst(MI->getOpcode());
         switch (MI->getOpcode()) {
         case AMDGPU::CF_ALU_PUSH_BEFORE:
-          if (ST.hasCaymanISA() && CFStack.getLoopDepth() > 1) {
+          if (RequiresWorkAround) {
+            DEBUG(dbgs() << "Applying bug work-around for ALU_PUSH_BEFORE\n");
             BuildMI(MBB, MI, MBB.findDebugLoc(MI), TII->get(AMDGPU::CF_PUSH_EG))
                 .addImm(CfCount + 1)
                 .addImm(1);
diff --git a/test/CodeGen/R600/cf-stack-bug.ll b/test/CodeGen/R600/cf-stack-bug.ll
new file mode 100644
index 0000000..7fa07b1
--- /dev/null
+++ b/test/CodeGen/R600/cf-stack-bug.ll
@@ -0,0 +1,225 @@
+; RUN: llc -march=r600 -mcpu=redwood -debug-only=r600cf %s -o - 2>&1 | FileCheck %s --check-prefix=BUG64 --check-prefix=FUNC
+; RUN: llc -march=r600 -mcpu=sumo -debug-only=r600cf %s -o - 2>&1 | FileCheck %s --check-prefix=BUG64 --check-prefix=FUNC
+; RUN: llc -march=r600 -mcpu=barts -debug-only=r600cf %s -o - 2>&1 | FileCheck %s --check-prefix=BUG64 --check-prefix=FUNC
+; RUN: llc -march=r600 -mcpu=turks -debug-only=r600cf %s -o - 2>&1 | FileCheck %s --check-prefix=BUG64 --check-prefix=FUNC
+; RUN: llc -march=r600 -mcpu=caicos -debug-only=r600cf %s -o - 2>&1 | FileCheck %s --check-prefix=BUG64 --check-prefix=FUNC
+; RUN: llc -march=r600 -mcpu=cedar -debug-only=r600cf %s -o - 2>&1 | FileCheck %s --check-prefix=BUG32 --check-prefix=FUNC
+; RUN: llc -march=r600 -mcpu=juniper -debug-only=r600cf %s -o - 2>&1 | FileCheck %s --check-prefix=NOBUG --check-prefix=FUNC
+; RUN: llc -march=r600 -mcpu=cypress -debug-only=r600cf %s -o - 2>&1 | FileCheck %s --check-prefix=NOBUG --check-prefix=FUNC
+; RUN: llc -march=r600 -mcpu=cayman -debug-only=r600cf %s -o - 2>&1 | FileCheck %s --check-prefix=NOBUG --check-prefix=FUNC
+
+; We are currently allocating 2 extra sub-entries on Evergreen / NI for
+; non-WQM push instructions if we change this to 1, then we will need to
+; add one level of depth to each of these tests.
+
+; BUG64-NOT: Applying bug work-around
+; BUG32-NOT: Applying bug work-around
+; NOBUG-NOT: Applying bug work-around
+; FUNC-LABEL: @nested3
+define void @nested3(i32 addrspace(1)* %out, i32 %cond) {
+entry:
+  %0 = icmp sgt i32 %cond, 0
+  br i1 %0, label %if.1, label %end
+
+if.1:
+  %1 = icmp sgt i32 %cond, 10
+  br i1 %1, label %if.2, label %if.store.1
+
+if.store.1:
+  store i32 1, i32 addrspace(1)* %out
+  br label %end
+
+if.2:
+  %2 = icmp sgt i32 %cond, 20
+  br i1 %2, label %if.3, label %if.2.store
+
+if.2.store:
+  store i32 2, i32 addrspace(1)* %out
+  br label %end
+
+if.3:
+  store i32 3, i32 addrspace(1)* %out
+  br label %end
+
+end:
+  ret void
+}
+
+; BUG64: Applying bug work-around
+; BUG32-NOT: Applying bug work-around
+; NOBUG-NOT: Applying bug work-around
+; FUNC-LABEL: @nested4
+define void @nested4(i32 addrspace(1)* %out, i32 %cond) {
+entry:
+  %0 = icmp sgt i32 %cond, 0
+  br i1 %0, label %if.1, label %end
+
+if.1:
+  %1 = icmp sgt i32 %cond, 10
+  br i1 %1, label %if.2, label %if.1.store
+
+if.1.store:
+  store i32 1, i32 addrspace(1)* %out
+  br label %end
+
+if.2:
+  %2 = icmp sgt i32 %cond, 20
+  br i1 %2, label %if.3, label %if.2.store
+
+if.2.store:
+  store i32 2, i32 addrspace(1)* %out
+  br label %end
+
+if.3:
+  %3 = icmp sgt i32 %cond, 30
+  br i1 %3, label %if.4, label %if.3.store
+
+if.3.store:
+  store i32 3, i32 addrspace(1)* %out
+  br label %end
+
+if.4:
+  store i32 4, i32 addrspace(1)* %out
+  br label %end
+
+end:
+  ret void
+}
+
+; BUG64: Applying bug work-around
+; BUG32-NOT: Applying bug work-around
+; NOBUG-NOT: Applying bug work-around
+; FUNC-LABEL: @nested7
+define void @nested7(i32 addrspace(1)* %out, i32 %cond) {
+entry:
+  %0 = icmp sgt i32 %cond, 0
+  br i1 %0, label %if.1, label %end
+
+if.1:
+  %1 = icmp sgt i32 %cond, 10
+  br i1 %1, label %if.2, label %if.1.store
+
+if.1.store:
+  store i32 1, i32 addrspace(1)* %out
+  br label %end
+
+if.2:
+  %2 = icmp sgt i32 %cond, 20
+  br i1 %2, label %if.3, label %if.2.store
+
+if.2.store:
+  store i32 2, i32 addrspace(1)* %out
+  br label %end
+
+if.3:
+  %3 = icmp sgt i32 %cond, 30
+  br i1 %3, label %if.4, label %if.3.store
+
+if.3.store:
+  store i32 3, i32 addrspace(1)* %out
+  br label %end
+
+if.4:
+  %4 = icmp sgt i32 %cond, 40
+  br i1 %4, label %if.5, label %if.4.store
+
+if.4.store:
+  store i32 4, i32 addrspace(1)* %out
+  br label %end
+
+if.5:
+  %5 = icmp sgt i32 %cond, 50
+  br i1 %5, label %if.6, label %if.5.store
+
+if.5.store:
+  store i32 5, i32 addrspace(1)* %out
+  br label %end
+
+if.6:
+  %6 = icmp sgt i32 %cond, 60
+  br i1 %6, label %if.7, label %if.6.store
+
+if.6.store:
+  store i32 6, i32 addrspace(1)* %out
+  br label %end
+
+if.7:
+  store i32 7, i32 addrspace(1)* %out
+  br label %end
+
+end:
+  ret void
+}
+
+; BUG64: Applying bug work-around
+; BUG32: Applying bug work-around
+; NOBUG-NOT: Applying bug work-around
+; FUNC-LABEL: @nested8
+define void @nested8(i32 addrspace(1)* %out, i32 %cond) {
+entry:
+  %0 = icmp sgt i32 %cond, 0
+  br i1 %0, label %if.1, label %end
+
+if.1:
+  %1 = icmp sgt i32 %cond, 10
+  br i1 %1, label %if.2, label %if.1.store
+
+if.1.store:
+  store i32 1, i32 addrspace(1)* %out
+  br label %end
+
+if.2:
+  %2 = icmp sgt i32 %cond, 20
+  br i1 %2, label %if.3, label %if.2.store
+
+if.2.store:
+  store i32 2, i32 addrspace(1)* %out
+  br label %end
+
+if.3:
+  %3 = icmp sgt i32 %cond, 30
+  br i1 %3, label %if.4, label %if.3.store
+
+if.3.store:
+  store i32 3, i32 addrspace(1)* %out
+  br label %end
+
+if.4:
+  %4 = icmp sgt i32 %cond, 40
+  br i1 %4, label %if.5, label %if.4.store
+
+if.4.store:
+  store i32 4, i32 addrspace(1)* %out
+  br label %end
+
+if.5:
+  %5 = icmp sgt i32 %cond, 50
+  br i1 %5, label %if.6, label %if.5.store
+
+if.5.store:
+  store i32 5, i32 addrspace(1)* %out
+  br label %end
+
+if.6:
+  %6 = icmp sgt i32 %cond, 60
+  br i1 %6, label %if.7, label %if.6.store
+
+if.6.store:
+  store i32 6, i32 addrspace(1)* %out
+  br label %end
+
+if.7:
+  %7 = icmp sgt i32 %cond, 70
+  br i1 %7, label %if.8, label %if.7.store
+
+if.7.store:
+  store i32 7, i32 addrspace(1)* %out
+  br label %end
+
+if.8:
+  store i32 8, i32 addrspace(1)* %out
+  br label %end
+
+end:
+  ret void
+}
-- 
1.8.1.4