PATCHES: R600: Implement work-around for CF stack HW bug
Tom Stellard
tom at stellard.net
Wed Dec 11 10:07:40 PST 2013
Hi,
The attached patches implement a work-around for the CF stack HW bug
that is present on some Evergreen and NI GPUs.
Please Review.
-Tom
-------------- next part --------------
>From 7653598e8f3b111643348caa0aad5ff0f8859fe6 Mon Sep 17 00:00:00 2001
From: Tom Stellard <thomas.stellard at amd.com>
Date: Fri, 6 Dec 2013 16:19:40 -0800
Subject: [PATCH 1/6] R600: Add stack size to .AMDGPUcsdata section
---
lib/Target/R600/AMDGPUAsmPrinter.cpp | 15 +++++++++++----
1 file changed, 11 insertions(+), 4 deletions(-)
diff --git a/lib/Target/R600/AMDGPUAsmPrinter.cpp b/lib/Target/R600/AMDGPUAsmPrinter.cpp
index 8aca57a..0d40364 100644
--- a/lib/Target/R600/AMDGPUAsmPrinter.cpp
+++ b/lib/Target/R600/AMDGPUAsmPrinter.cpp
@@ -89,10 +89,17 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
SectionKind::getReadOnly());
OutStreamer.SwitchSection(CommentSection);
- OutStreamer.EmitRawText(
- Twine("; Kernel info:\n") +
- "; NumSgprs: " + Twine(KernelInfo.NumSGPR) + "\n" +
- "; NumVgprs: " + Twine(KernelInfo.NumVGPR) + "\n");
+ if (STM.getGeneration() > AMDGPUSubtarget::NORTHERN_ISLANDS) {
+
+ OutStreamer.EmitRawText(
+ Twine("; Kernel info:\n") +
+ "; NumSgprs: " + Twine(KernelInfo.NumSGPR) + "\n" +
+ "; NumVgprs: " + Twine(KernelInfo.NumVGPR) + "\n");
+ } else {
+ R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
+ OutStreamer.EmitRawText(
+ Twine("SQ_PGM_RESOURCES:STACK_SIZE = " + Twine(MFI->StackSize)));
+ }
}
if (STM.dumpCode()) {
--
1.8.1.4
-------------- next part --------------
>From ab3e7de200612b913c2c56f4902eb3671df2603d Mon Sep 17 00:00:00 2001
From: Tom Stellard <thomas.stellard at amd.com>
Date: Thu, 5 Dec 2013 18:59:51 -0800
Subject: [PATCH 2/6] R600: Add wavefront size property to the subtargets v2
v2:
- Initialize wavefront size to 0
---
lib/Target/R600/AMDGPU.td | 12 +++++++++++-
lib/Target/R600/AMDGPUSubtarget.cpp | 5 +++++
lib/Target/R600/AMDGPUSubtarget.h | 2 ++
lib/Target/R600/Processors.td | 26 +++++++++++++++-----------
4 files changed, 33 insertions(+), 12 deletions(-)
diff --git a/lib/Target/R600/AMDGPU.td b/lib/Target/R600/AMDGPU.td
index 36c4156..c4e5efc 100644
--- a/lib/Target/R600/AMDGPU.td
+++ b/lib/Target/R600/AMDGPU.td
@@ -72,6 +72,16 @@ class SubtargetFeatureFetchLimit <string Value> :
def FeatureFetchLimit8 : SubtargetFeatureFetchLimit <"8">;
def FeatureFetchLimit16 : SubtargetFeatureFetchLimit <"16">;
+class SubtargetFeatureWavefrontSize <int Value> : SubtargetFeature<
+ "wavefrontsize"#Value,
+ "WavefrontSize",
+ !cast<string>(Value),
+ "The number of threads per wavefront">;
+
+def FeatureWavefrontSize16 : SubtargetFeatureWavefrontSize<16>;
+def FeatureWavefrontSize32 : SubtargetFeatureWavefrontSize<32>;
+def FeatureWavefrontSize64 : SubtargetFeatureWavefrontSize<64>;
+
class SubtargetFeatureGeneration <string Value,
list<SubtargetFeature> Implies> :
SubtargetFeature <Value, "Gen", "AMDGPUSubtarget::"#Value,
@@ -87,7 +97,7 @@ def FeatureEvergreen : SubtargetFeatureGeneration<"EVERGREEN",
[FeatureFetchLimit16]>;
def FeatureNorthernIslands : SubtargetFeatureGeneration<"NORTHERN_ISLANDS",
- [FeatureFetchLimit16]>;
+ [FeatureFetchLimit16, FeatureWavefrontSize64]>;
def FeatureSouthernIslands : SubtargetFeatureGeneration<"SOUTHERN_ISLANDS",
[Feature64BitPtr, FeatureFP64]>;
diff --git a/lib/Target/R600/AMDGPUSubtarget.cpp b/lib/Target/R600/AMDGPUSubtarget.cpp
index b892e7e..dc5b6ae 100644
--- a/lib/Target/R600/AMDGPUSubtarget.cpp
+++ b/lib/Target/R600/AMDGPUSubtarget.cpp
@@ -38,6 +38,7 @@ AMDGPUSubtarget::AMDGPUSubtarget(StringRef TT, StringRef CPU, StringRef FS) :
CaymanISA = false;
EnableIRStructurizer = true;
EnableIfCvt = true;
+ WavefrontSize = 0;
ParseSubtargetFeatures(GPU, FS);
DevName = GPU;
}
@@ -74,6 +75,10 @@ bool
AMDGPUSubtarget::isIfCvtEnabled() const {
return EnableIfCvt;
}
+unsigned
+AMDGPUSubtarget::getWavefrontSize() const {
+ return WavefrontSize;
+}
bool
AMDGPUSubtarget::isTargetELF() const {
return false;
diff --git a/lib/Target/R600/AMDGPUSubtarget.h b/lib/Target/R600/AMDGPUSubtarget.h
index 4288d27..7c122a8 100644
--- a/lib/Target/R600/AMDGPUSubtarget.h
+++ b/lib/Target/R600/AMDGPUSubtarget.h
@@ -51,6 +51,7 @@ private:
bool CaymanISA;
bool EnableIRStructurizer;
bool EnableIfCvt;
+ unsigned WavefrontSize;
InstrItineraryData InstrItins;
@@ -68,6 +69,7 @@ public:
bool hasCaymanISA() const;
bool IsIRStructurizerEnabled() const;
bool isIfCvtEnabled() const;
+ unsigned getWavefrontSize() const;
virtual bool enableMachineScheduler() const {
return getGeneration() <= NORTHERN_ISLANDS;
diff --git a/lib/Target/R600/Processors.td b/lib/Target/R600/Processors.td
index 5499a20..e601f35 100644
--- a/lib/Target/R600/Processors.td
+++ b/lib/Target/R600/Processors.td
@@ -17,45 +17,49 @@ def : Proc<"", R600_VLIW5_Itin,
[FeatureR600, FeatureVertexCache]>;
def : Proc<"r600", R600_VLIW5_Itin,
- [FeatureR600 , FeatureVertexCache]>;
+ [FeatureR600 , FeatureVertexCache, FeatureWavefrontSize64]>;
+
+def : Proc<"r630", R600_VLIW5_Itin,
+ [FeatureR600, FeatureVertexCache, FeatureWavefrontSize32]>;
def : Proc<"rs880", R600_VLIW5_Itin,
- [FeatureR600]>;
+ [FeatureR600, FeatureWavefrontSize16]>;
def : Proc<"rv670", R600_VLIW5_Itin,
- [FeatureR600, FeatureFP64, FeatureVertexCache]>;
+ [FeatureR600, FeatureFP64, FeatureVertexCache, FeatureWavefrontSize64]>;
//===----------------------------------------------------------------------===//
// R700
//===----------------------------------------------------------------------===//
def : Proc<"rv710", R600_VLIW5_Itin,
- [FeatureR700, FeatureVertexCache]>;
+ [FeatureR700, FeatureVertexCache, FeatureWavefrontSize32]>;
def : Proc<"rv730", R600_VLIW5_Itin,
- [FeatureR700, FeatureVertexCache]>;
+ [FeatureR700, FeatureVertexCache, FeatureWavefrontSize32]>;
def : Proc<"rv770", R600_VLIW5_Itin,
- [FeatureR700, FeatureFP64, FeatureVertexCache]>;
+ [FeatureR700, FeatureFP64, FeatureVertexCache, FeatureWavefrontSize64]>;
//===----------------------------------------------------------------------===//
// Evergreen
//===----------------------------------------------------------------------===//
def : Proc<"cedar", R600_VLIW5_Itin,
- [FeatureEvergreen, FeatureVertexCache]>;
+ [FeatureEvergreen, FeatureVertexCache, FeatureWavefrontSize32]>;
def : Proc<"redwood", R600_VLIW5_Itin,
- [FeatureEvergreen, FeatureVertexCache]>;
+ [FeatureEvergreen, FeatureVertexCache, FeatureWavefrontSize64]>;
def : Proc<"sumo", R600_VLIW5_Itin,
- [FeatureEvergreen]>;
+ [FeatureEvergreen, FeatureWavefrontSize64]>;
def : Proc<"juniper", R600_VLIW5_Itin,
- [FeatureEvergreen, FeatureVertexCache]>;
+ [FeatureEvergreen, FeatureVertexCache, FeatureWavefrontSize64]>;
def : Proc<"cypress", R600_VLIW5_Itin,
- [FeatureEvergreen, FeatureFP64, FeatureVertexCache]>;
+ [FeatureEvergreen, FeatureFP64, FeatureVertexCache,
+ FeatureWavefrontSize64]>;
//===----------------------------------------------------------------------===//
// Northern Islands
--
1.8.1.4
-------------- next part --------------
>From e1f4c33348d0a4becdebed89fb7aa3ef9df8a760 Mon Sep 17 00:00:00 2001
From: Tom Stellard <thomas.stellard at amd.com>
Date: Fri, 6 Dec 2013 13:09:43 -0800
Subject: [PATCH 3/6] R600: CF_PUSH is the same on Evergreen and Cayman
---
lib/Target/R600/R600ControlFlowFinalizer.cpp | 2 +-
lib/Target/R600/R600Instructions.td | 7 ++++---
2 files changed, 5 insertions(+), 4 deletions(-)
diff --git a/lib/Target/R600/R600ControlFlowFinalizer.cpp b/lib/Target/R600/R600ControlFlowFinalizer.cpp
index bca243c..3876619 100644
--- a/lib/Target/R600/R600ControlFlowFinalizer.cpp
+++ b/lib/Target/R600/R600ControlFlowFinalizer.cpp
@@ -373,7 +373,7 @@ public:
MaxStack = std::max(MaxStack, CurrentStack);
HasPush = true;
if (ST.hasCaymanISA() && CurrentLoopDepth > 1) {
- BuildMI(MBB, MI, MBB.findDebugLoc(MI), TII->get(AMDGPU::CF_PUSH_CM))
+ BuildMI(MBB, MI, MBB.findDebugLoc(MI), TII->get(AMDGPU::CF_PUSH_EG))
.addImm(CfCount + 1)
.addImm(1);
MI->setDesc(TII->get(AMDGPU::CF_ALU));
diff --git a/lib/Target/R600/R600Instructions.td b/lib/Target/R600/R600Instructions.td
index bd590e6..3f3fea6 100644
--- a/lib/Target/R600/R600Instructions.td
+++ b/lib/Target/R600/R600Instructions.td
@@ -1793,6 +1793,10 @@ def LDS_USHORT_READ_RET : R600_LDS_1A <0x39, "LDS_USHORT_READ_RET",
"JUMP @$ADDR POP:$POP_COUNT"> {
let COUNT = 0;
}
+ def CF_PUSH_EG : CF_CLAUSE_EG<11, (ins i32imm:$ADDR, i32imm:$POP_COUNT),
+ "PUSH @$ADDR POP:$POP_COUNT"> {
+ let COUNT = 0;
+ }
def CF_ELSE_EG : CF_CLAUSE_EG<13, (ins i32imm:$ADDR, i32imm:$POP_COUNT),
"ELSE @$ADDR POP:$POP_COUNT"> {
let COUNT = 0;
@@ -1869,9 +1873,6 @@ def : Pat <
let COUNT = 0;
}
- def CF_PUSH_CM : CF_CLAUSE_EG<11, (ins i32imm:$ADDR, i32imm:$POP_COUNT), "PUSH @$ADDR POP:$POP_COUNT"> {
- let COUNT = 0;
- }
def : Pat<(fsqrt f32:$src), (MUL R600_Reg32:$src, (RECIPSQRT_CLAMPED_cm $src))>;
--
1.8.1.4
-------------- next part --------------
>From d208e2acb7a4c698ecf475d1ac17718774b94652 Mon Sep 17 00:00:00 2001
From: Tom Stellard <thomas.stellard at amd.com>
Date: Fri, 6 Dec 2013 13:10:46 -0800
Subject: [PATCH 4/6] R600: Refactor stack size calculation
---
lib/Target/R600/AMDGPUSubtarget.cpp | 17 +++
lib/Target/R600/AMDGPUSubtarget.h | 1 +
lib/Target/R600/R600ControlFlowFinalizer.cpp | 174 +++++++++++++++++++++------
test/CodeGen/R600/elf.r600.ll | 2 +-
4 files changed, 158 insertions(+), 36 deletions(-)
diff --git a/lib/Target/R600/AMDGPUSubtarget.cpp b/lib/Target/R600/AMDGPUSubtarget.cpp
index dc5b6ae..124f7c5 100644
--- a/lib/Target/R600/AMDGPUSubtarget.cpp
+++ b/lib/Target/R600/AMDGPUSubtarget.cpp
@@ -79,6 +79,23 @@ unsigned
AMDGPUSubtarget::getWavefrontSize() const {
return WavefrontSize;
}
+unsigned
+AMDGPUSubtarget::getStackEntrySize() const {
+ assert(getGeneration() <= NORTHERN_ISLANDS);
+ switch(getWavefrontSize()) {
+ case 16:
+ return 8;
+ case 32:
+ if (hasCaymanISA())
+ return 4;
+ else
+ return 8;
+ case 64:
+ return 4;
+ default:
+ llvm_unreachable("Illegal wavefront size.");
+ }
+}
bool
AMDGPUSubtarget::isTargetELF() const {
return false;
diff --git a/lib/Target/R600/AMDGPUSubtarget.h b/lib/Target/R600/AMDGPUSubtarget.h
index 7c122a8..789c161 100644
--- a/lib/Target/R600/AMDGPUSubtarget.h
+++ b/lib/Target/R600/AMDGPUSubtarget.h
@@ -70,6 +70,7 @@ public:
bool IsIRStructurizerEnabled() const;
bool isIfCvtEnabled() const;
unsigned getWavefrontSize() const;
+ unsigned getStackEntrySize() const;
virtual bool enableMachineScheduler() const {
return getGeneration() <= NORTHERN_ISLANDS;
diff --git a/lib/Target/R600/R600ControlFlowFinalizer.cpp b/lib/Target/R600/R600ControlFlowFinalizer.cpp
index 3876619..2d5c4b4 100644
--- a/lib/Target/R600/R600ControlFlowFinalizer.cpp
+++ b/lib/Target/R600/R600ControlFlowFinalizer.cpp
@@ -28,6 +28,135 @@ using namespace llvm;
namespace {
+struct CFStack {
+
+ enum StackItem {
+ ENTRY = 0,
+ SUB_ENTRY = 1,
+ FIRST_NON_WQM_PUSH = 2,
+ FIRST_NON_WQM_PUSH_W_FULL_ENTRY = 3
+ };
+
+ const AMDGPUSubtarget &ST;
+ std::vector<StackItem> BranchStack;
+ std::vector<StackItem> LoopStack;
+ unsigned MaxStackSize;
+ unsigned CurrentEntries;
+ unsigned CurrentSubEntries;
+
+ CFStack(const AMDGPUSubtarget &st, unsigned ShaderType) : ST(st),
+ // We need to reserve a stack entry for CALL_FS in vertex shaders.
+ MaxStackSize(ShaderType == ShaderType::VERTEX ? 1 : 0),
+ CurrentEntries(0), CurrentSubEntries(0) { }
+
+ unsigned getLoopDepth();
+ bool branchStackContains(CFStack::StackItem);
+ bool requiresWorkAroundForInst(unsigned Opcode);
+ unsigned getSubEntrySize(CFStack::StackItem Item);
+ void updateMaxStackSize();
+ void pushBranch(unsigned Opcode, bool isWQM = false);
+ void pushLoop();
+ void popBranch();
+ void popLoop();
+};
+
+unsigned CFStack::getLoopDepth() {
+ return LoopStack.size();
+}
+
+bool CFStack::branchStackContains(CFStack::StackItem Item) {
+ for (std::vector<CFStack::StackItem>::const_iterator I = BranchStack.begin(),
+ E = BranchStack.end(); I != E; ++I) {
+ if (*I == Item)
+ return true;
+ }
+ return false;
+}
+
+unsigned CFStack::getSubEntrySize(CFStack::StackItem Item) {
+ switch(Item) {
+ default:
+ return 0;
+ case CFStack::FIRST_NON_WQM_PUSH:
+ assert(!ST.hasCaymanISA());
+ if (ST.getGeneration() <= AMDGPUSubtarget::R700) {
+ // +1 For the push operation.
+ // +2 Extra space required.
+ return 3;
+ } else {
+ // Some documentation says that this is not necessary on Evergreen,
+ // but experimentation has show that we need to allocate 1 extra
+ // sub-entry for the first non-WQM push.
+ // +1 For the push operation.
+ // +1 Extra space required.
+ return 2;
+ }
+ case CFStack::FIRST_NON_WQM_PUSH_W_FULL_ENTRY:
+ assert(ST.getGeneration() >= AMDGPUSubtarget::EVERGREEN);
+ // +1 For the push operation.
+ // +1 Extra space required.
+ return 2;
+ case CFStack::SUB_ENTRY:
+ return 1;
+ }
+}
+
+void CFStack::updateMaxStackSize() {
+ unsigned CurrentStackSize = CurrentEntries +
+ (RoundUpToAlignment(CurrentSubEntries, 4) / 4);
+ MaxStackSize = std::max(CurrentStackSize, MaxStackSize);
+}
+
+void CFStack::pushBranch(unsigned Opcode, bool isWQM) {
+ CFStack::StackItem Item = CFStack::ENTRY;
+ switch(Opcode) {
+ case AMDGPU::CF_PUSH_EG:
+ case AMDGPU::CF_ALU_PUSH_BEFORE:
+ if (!isWQM) {
+ if (!ST.hasCaymanISA() && !branchStackContains(CFStack::FIRST_NON_WQM_PUSH))
+ Item = CFStack::FIRST_NON_WQM_PUSH; // May not be required on Evergreen/NI
+ // See comment in
+ // CFStack::getSubEntrySize()
+ else if (CurrentEntries > 0 &&
+ ST.getGeneration() > AMDGPUSubtarget::EVERGREEN &&
+ !ST.hasCaymanISA() &&
+ !branchStackContains(CFStack::FIRST_NON_WQM_PUSH_W_FULL_ENTRY))
+ Item = CFStack::FIRST_NON_WQM_PUSH_W_FULL_ENTRY;
+ else
+ Item = CFStack::SUB_ENTRY;
+ } else {
+ Item = CFStack::ENTRY;
+ }
+ break;
+ }
+ BranchStack.push_back(Item);
+ if (Item == CFStack::ENTRY)
+ CurrentEntries++;
+ else
+ CurrentSubEntries += getSubEntrySize(Item);
+ updateMaxStackSize();
+}
+
+void CFStack::pushLoop() {
+ LoopStack.push_back(CFStack::ENTRY);
+ CurrentEntries++;
+ updateMaxStackSize();
+}
+
+void CFStack::popBranch() {
+ CFStack::StackItem Top = BranchStack.back();
+ if (Top == CFStack::ENTRY)
+ CurrentEntries--;
+ else
+ CurrentSubEntries-= getSubEntrySize(Top);
+ BranchStack.pop_back();
+}
+
+void CFStack::popLoop() {
+ CurrentEntries--;
+ LoopStack.pop_back();
+}
+
class R600ControlFlowFinalizer : public MachineFunctionPass {
private:
@@ -300,24 +429,6 @@ private:
}
}
- unsigned getHWStackSize(unsigned StackSubEntry, bool hasPush) const {
- switch (ST.getGeneration()) {
- case AMDGPUSubtarget::R600:
- case AMDGPUSubtarget::R700:
- if (hasPush)
- StackSubEntry += 2;
- break;
- case AMDGPUSubtarget::EVERGREEN:
- if (hasPush)
- StackSubEntry ++;
- case AMDGPUSubtarget::NORTHERN_ISLANDS:
- StackSubEntry += 2;
- break;
- default: llvm_unreachable("Not a VLIW4/VLIW5 GPU");
- }
- return (StackSubEntry + 3)/4; // Need ceil value of StackSubEntry/4
- }
-
public:
R600ControlFlowFinalizer(TargetMachine &tm) : MachineFunctionPass(ID),
TII (0), TRI(0),
@@ -329,23 +440,19 @@ public:
virtual bool runOnMachineFunction(MachineFunction &MF) {
TII=static_cast<const R600InstrInfo *>(MF.getTarget().getInstrInfo());
TRI=static_cast<const R600RegisterInfo *>(MF.getTarget().getRegisterInfo());
+ R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
- unsigned MaxStack = 0;
- unsigned CurrentStack = 0;
- unsigned CurrentLoopDepth = 0;
- bool HasPush = false;
+ CFStack CFStack(ST, MFI->ShaderType);
for (MachineFunction::iterator MB = MF.begin(), ME = MF.end(); MB != ME;
++MB) {
MachineBasicBlock &MBB = *MB;
unsigned CfCount = 0;
std::vector<std::pair<unsigned, std::set<MachineInstr *> > > LoopStack;
std::vector<MachineInstr * > IfThenElseStack;
- R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
if (MFI->ShaderType == 1) {
BuildMI(MBB, MBB.begin(), MBB.findDebugLoc(MBB.begin()),
getHWInstrDesc(CF_CALL_FS));
CfCount++;
- MaxStack = 1;
}
std::vector<ClauseFile> FetchClauses, AluClauses;
std::vector<MachineInstr *> LastAlu(1);
@@ -369,15 +476,15 @@ public:
I++;
switch (MI->getOpcode()) {
case AMDGPU::CF_ALU_PUSH_BEFORE:
- CurrentStack++;
- MaxStack = std::max(MaxStack, CurrentStack);
- HasPush = true;
- if (ST.hasCaymanISA() && CurrentLoopDepth > 1) {
+ if (ST.hasCaymanISA() && CFStack.getLoopDepth() > 1) {
BuildMI(MBB, MI, MBB.findDebugLoc(MI), TII->get(AMDGPU::CF_PUSH_EG))
.addImm(CfCount + 1)
.addImm(1);
MI->setDesc(TII->get(AMDGPU::CF_ALU));
CfCount++;
+ CFStack.pushBranch(AMDGPU::CF_PUSH_EG);
+ } else {
+ CFStack.pushBranch(AMDGPU::CF_ALU_PUSH_BEFORE);
}
case AMDGPU::CF_ALU:
I = MI;
@@ -386,9 +493,7 @@ public:
CfCount++;
break;
case AMDGPU::WHILELOOP: {
- CurrentStack+=4;
- CurrentLoopDepth++;
- MaxStack = std::max(MaxStack, CurrentStack);
+ CFStack.pushLoop();
MachineInstr *MIb = BuildMI(MBB, MI, MBB.findDebugLoc(MI),
getHWInstrDesc(CF_WHILE_LOOP))
.addImm(1);
@@ -401,8 +506,7 @@ public:
break;
}
case AMDGPU::ENDLOOP: {
- CurrentStack-=4;
- CurrentLoopDepth--;
+ CFStack.popLoop();
std::pair<unsigned, std::set<MachineInstr *> > Pair =
LoopStack.back();
LoopStack.pop_back();
@@ -440,7 +544,7 @@ public:
break;
}
case AMDGPU::ENDIF: {
- CurrentStack--;
+ CFStack.popBranch();
if (LastAlu.back()) {
ToPopAfter.push_back(LastAlu.back());
} else {
@@ -515,7 +619,7 @@ public:
.addImm(Alu->getOperand(8).getImm());
Alu->eraseFromParent();
}
- MFI->StackSize = getHWStackSize(MaxStack, HasPush);
+ MFI->StackSize = CFStack.MaxStackSize;
}
return false;
diff --git a/test/CodeGen/R600/elf.r600.ll b/test/CodeGen/R600/elf.r600.ll
index 0590efb..4436c07 100644
--- a/test/CodeGen/R600/elf.r600.ll
+++ b/test/CodeGen/R600/elf.r600.ll
@@ -6,7 +6,7 @@
; CONFIG-CHECK: .section .AMDGPU.config
; CONFIG-CHECK-NEXT: .long 166100
-; CONFIG-CHECK-NEXT: .long 258
+; CONFIG-CHECK-NEXT: .long 2
; CONFIG-CHECK-NEXT: .long 165900
; CONFIG-CHECK-NEXT: .long 0
define void @test(float addrspace(1)* %out, i32 %p) {
--
1.8.1.4
-------------- next part --------------
>From 7c1bfad133c4e0465398839923bdfad6d0e38209 Mon Sep 17 00:00:00 2001
From: Tom Stellard <thomas.stellard at amd.com>
Date: Fri, 6 Dec 2013 16:02:37 -0800
Subject: [PATCH 5/6] R600: Add some missing CF instruction definitions to the
.td files.
---
lib/Target/R600/R600Instructions.td | 7 +++++++
1 file changed, 7 insertions(+)
diff --git a/lib/Target/R600/R600Instructions.td b/lib/Target/R600/R600Instructions.td
index 3f3fea6..26dfc28 100644
--- a/lib/Target/R600/R600Instructions.td
+++ b/lib/Target/R600/R600Instructions.td
@@ -642,6 +642,9 @@ ins, AsmPrint, [] >, CF_WORD0_EG, CF_WORD1_EG {
def CF_ALU : ALU_CLAUSE<8, "ALU">;
def CF_ALU_PUSH_BEFORE : ALU_CLAUSE<9, "ALU_PUSH_BEFORE">;
def CF_ALU_POP_AFTER : ALU_CLAUSE<10, "ALU_POP_AFTER">;
+def CF_ALU_CONTINUE : ALU_CLAUSE<13, "ALU_CONTINUE">;
+def CF_ALU_BREAK : ALU_CLAUSE<14, "ALU_BREAK">;
+def CF_ALU_ELSE_AFTER : ALU_CLAUSE<15, "ALU_ELSE_AFTER">;
def FETCH_CLAUSE : AMDGPUInst <(outs),
(ins i32imm:$addr), "Fetch clause starting at $addr:", [] > {
@@ -1235,6 +1238,10 @@ let Predicates = [isR600] in {
"JUMP @$ADDR POP:$POP_COUNT"> {
let CNT = 0;
}
+ def CF_PUSH_ELSE_R600 : CF_CLAUSE_R600<12, (ins i32imm:$ADDR),
+ "PUSH_ELSE @$ADDR"> {
+ let CNT = 0;
+ }
def CF_ELSE_R600 : CF_CLAUSE_R600<13, (ins i32imm:$ADDR, i32imm:$POP_COUNT),
"ELSE @$ADDR POP:$POP_COUNT"> {
let CNT = 0;
--
1.8.1.4
-------------- next part --------------
>From 59147cba6d765857e9ab2783d5f17ec6d3cda4a8 Mon Sep 17 00:00:00 2001
From: Tom Stellard <thomas.stellard at amd.com>
Date: Fri, 6 Dec 2013 16:02:37 -0800
Subject: [PATCH 6/6] R600: Add work-around for the CF stack entry HW bug
The CF stack can be corrupted if you use CF_ALU_PUSH_BEFORE,
CF_ALU_ELSE_AFTER, CF_ALU_BREAK, or CF_ALU_CONTINUE when the number of
sub-entries on the stack is greater than or equal to the stack entry
size and sub-entries modulo 4 is either 0 or 3 (on cedar the bug is
present when number of sub-entries module 8 is either 7 or 0)
We choose to be conservative and always apply the work-around when the
number of sub-enries is greater than or equal to the stack entry size,
so that we can safely over-allocate the stack when we are unsure of the
stack allocation rules.
---
lib/Target/R600/AMDGPU.td | 5 +
lib/Target/R600/AMDGPUSubtarget.cpp | 6 +
lib/Target/R600/AMDGPUSubtarget.h | 2 +
lib/Target/R600/Processors.td | 14 +-
lib/Target/R600/R600ControlFlowFinalizer.cpp | 44 +++++-
test/CodeGen/R600/cf-stack-bug.ll | 225 +++++++++++++++++++++++++++
6 files changed, 289 insertions(+), 7 deletions(-)
create mode 100644 test/CodeGen/R600/cf-stack-bug.ll
diff --git a/lib/Target/R600/AMDGPU.td b/lib/Target/R600/AMDGPU.td
index c4e5efc..d1e2cf5 100644
--- a/lib/Target/R600/AMDGPU.td
+++ b/lib/Target/R600/AMDGPU.td
@@ -63,6 +63,11 @@ def FeatureCaymanISA : SubtargetFeature<"caymanISA",
"true",
"Use Cayman ISA">;
+def FeatureCFALUBug : SubtargetFeature<"cfalubug",
+ "CFALUBug",
+ "true",
+ "GPU has CF_ALU bug">;
+
class SubtargetFeatureFetchLimit <string Value> :
SubtargetFeature <"fetch"#Value,
"TexVTXClauseSize",
diff --git a/lib/Target/R600/AMDGPUSubtarget.cpp b/lib/Target/R600/AMDGPUSubtarget.cpp
index 124f7c5..1cfe550 100644
--- a/lib/Target/R600/AMDGPUSubtarget.cpp
+++ b/lib/Target/R600/AMDGPUSubtarget.cpp
@@ -39,6 +39,7 @@ AMDGPUSubtarget::AMDGPUSubtarget(StringRef TT, StringRef CPU, StringRef FS) :
EnableIRStructurizer = true;
EnableIfCvt = true;
WavefrontSize = 0;
+ CFALUBug = false;
ParseSubtargetFeatures(GPU, FS);
DevName = GPU;
}
@@ -97,6 +98,11 @@ AMDGPUSubtarget::getStackEntrySize() const {
}
}
bool
+AMDGPUSubtarget::hasCFAluBug() const {
+ assert(getGeneration() <= NORTHERN_ISLANDS);
+ return CFALUBug;
+}
+bool
AMDGPUSubtarget::isTargetELF() const {
return false;
}
diff --git a/lib/Target/R600/AMDGPUSubtarget.h b/lib/Target/R600/AMDGPUSubtarget.h
index 789c161..110d979 100644
--- a/lib/Target/R600/AMDGPUSubtarget.h
+++ b/lib/Target/R600/AMDGPUSubtarget.h
@@ -52,6 +52,7 @@ private:
bool EnableIRStructurizer;
bool EnableIfCvt;
unsigned WavefrontSize;
+ bool CFALUBug;
InstrItineraryData InstrItins;
@@ -71,6 +72,7 @@ public:
bool isIfCvtEnabled() const;
unsigned getWavefrontSize() const;
unsigned getStackEntrySize() const;
+ bool hasCFAluBug() const;
virtual bool enableMachineScheduler() const {
return getGeneration() <= NORTHERN_ISLANDS;
diff --git a/lib/Target/R600/Processors.td b/lib/Target/R600/Processors.td
index e601f35..fde4481 100644
--- a/lib/Target/R600/Processors.td
+++ b/lib/Target/R600/Processors.td
@@ -46,13 +46,15 @@ def : Proc<"rv770", R600_VLIW5_Itin,
//===----------------------------------------------------------------------===//
def : Proc<"cedar", R600_VLIW5_Itin,
- [FeatureEvergreen, FeatureVertexCache, FeatureWavefrontSize32]>;
+ [FeatureEvergreen, FeatureVertexCache, FeatureWavefrontSize32,
+ FeatureCFALUBug]>;
def : Proc<"redwood", R600_VLIW5_Itin,
- [FeatureEvergreen, FeatureVertexCache, FeatureWavefrontSize64]>;
+ [FeatureEvergreen, FeatureVertexCache, FeatureWavefrontSize64,
+ FeatureCFALUBug]>;
def : Proc<"sumo", R600_VLIW5_Itin,
- [FeatureEvergreen, FeatureWavefrontSize64]>;
+ [FeatureEvergreen, FeatureWavefrontSize64, FeatureCFALUBug]>;
def : Proc<"juniper", R600_VLIW5_Itin,
[FeatureEvergreen, FeatureVertexCache, FeatureWavefrontSize64]>;
@@ -66,13 +68,13 @@ def : Proc<"cypress", R600_VLIW5_Itin,
//===----------------------------------------------------------------------===//
def : Proc<"barts", R600_VLIW5_Itin,
- [FeatureNorthernIslands, FeatureVertexCache]>;
+ [FeatureNorthernIslands, FeatureVertexCache, FeatureCFALUBug]>;
def : Proc<"turks", R600_VLIW5_Itin,
- [FeatureNorthernIslands, FeatureVertexCache]>;
+ [FeatureNorthernIslands, FeatureVertexCache, FeatureCFALUBug]>;
def : Proc<"caicos", R600_VLIW5_Itin,
- [FeatureNorthernIslands]>;
+ [FeatureNorthernIslands, FeatureCFALUBug]>;
def : Proc<"cayman", R600_VLIW4_Itin,
[FeatureNorthernIslands, FeatureFP64, FeatureCaymanISA]>;
diff --git a/lib/Target/R600/R600ControlFlowFinalizer.cpp b/lib/Target/R600/R600ControlFlowFinalizer.cpp
index 2d5c4b4..353fce0 100644
--- a/lib/Target/R600/R600ControlFlowFinalizer.cpp
+++ b/lib/Target/R600/R600ControlFlowFinalizer.cpp
@@ -73,6 +73,45 @@ bool CFStack::branchStackContains(CFStack::StackItem Item) {
return false;
}
+bool CFStack::requiresWorkAroundForInst(unsigned Opcode) {
+ if (Opcode == AMDGPU::CF_ALU_PUSH_BEFORE && ST.hasCaymanISA() &&
+ getLoopDepth() > 1) {
+ return true;
+ }
+
+ if (!ST.hasCFAluBug())
+ return false;
+
+ switch(Opcode) {
+ default: return false;
+ case AMDGPU::CF_ALU_PUSH_BEFORE:
+ case AMDGPU::CF_ALU_ELSE_AFTER:
+ case AMDGPU::CF_ALU_BREAK:
+ case AMDGPU::CF_ALU_CONTINUE:
+ if (CurrentSubEntries == 0)
+ return false;
+ if (ST.getWavefrontSize() == 64) {
+ // We are being conservative here. We only require this work-around if
+ // CurrentSubEntries > 3 &&
+ // (CurrentSubEntries % 4 == 3 || CurrentSubEntries % 4 == 0)
+ //
+ // We have to be conservative, because we don't know for certain that
+ // our stack allocation algorithm for Evergreen/NI is correct. Applying this
+ // work-around when CurrentSubEntries > 3 allows us to over-allocate stack
+ // resources without any problems.
+ return CurrentSubEntries > 3;
+ } else {
+ assert(ST.getWavefrontSize() == 32);
+ // We are being conservative here. We only require the work-around if
+ // CurrentSubEntries > 7 &&
+ // (CurrentSubEntries % 8 == 7 || CurrentSubEntries % 8 == 0)
+ // See the comment on the wavefront size == 64 case for why we are
+ // being conservative.
+ return CurrentSubEntries > 7;
+ }
+ }
+}
+
unsigned CFStack::getSubEntrySize(CFStack::StackItem Item) {
switch(Item) {
default:
@@ -474,9 +513,12 @@ public:
if (MI->getOpcode() == AMDGPU::CF_ALU)
LastAlu.back() = MI;
I++;
+ bool RequiresWorkAround =
+ CFStack.requiresWorkAroundForInst(MI->getOpcode());
switch (MI->getOpcode()) {
case AMDGPU::CF_ALU_PUSH_BEFORE:
- if (ST.hasCaymanISA() && CFStack.getLoopDepth() > 1) {
+ if (RequiresWorkAround) {
+ DEBUG(dbgs() << "Applying bug work-around for ALU_PUSH_BEFORE\n");
BuildMI(MBB, MI, MBB.findDebugLoc(MI), TII->get(AMDGPU::CF_PUSH_EG))
.addImm(CfCount + 1)
.addImm(1);
diff --git a/test/CodeGen/R600/cf-stack-bug.ll b/test/CodeGen/R600/cf-stack-bug.ll
new file mode 100644
index 0000000..7fa07b1
--- /dev/null
+++ b/test/CodeGen/R600/cf-stack-bug.ll
@@ -0,0 +1,225 @@
+; RUN: llc -march=r600 -mcpu=redwood -debug-only=r600cf %s -o - 2>&1 | FileCheck %s --check-prefix=BUG64 --check-prefix=FUNC
+; RUN: llc -march=r600 -mcpu=sumo -debug-only=r600cf %s -o - 2>&1 | FileCheck %s --check-prefix=BUG64 --check-prefix=FUNC
+; RUN: llc -march=r600 -mcpu=barts -debug-only=r600cf %s -o - 2>&1 | FileCheck %s --check-prefix=BUG64 --check-prefix=FUNC
+; RUN: llc -march=r600 -mcpu=turks -debug-only=r600cf %s -o - 2>&1 | FileCheck %s --check-prefix=BUG64 --check-prefix=FUNC
+; RUN: llc -march=r600 -mcpu=caicos -debug-only=r600cf %s -o - 2>&1 | FileCheck %s --check-prefix=BUG64 --check-prefix=FUNC
+; RUN: llc -march=r600 -mcpu=cedar -debug-only=r600cf %s -o - 2>&1 | FileCheck %s --check-prefix=BUG32 --check-prefix=FUNC
+; RUN: llc -march=r600 -mcpu=juniper -debug-only=r600cf %s -o - 2>&1 | FileCheck %s --check-prefix=NOBUG --check-prefix=FUNC
+; RUN: llc -march=r600 -mcpu=cypress -debug-only=r600cf %s -o - 2>&1 | FileCheck %s --check-prefix=NOBUG --check-prefix=FUNC
+; RUN: llc -march=r600 -mcpu=cayman -debug-only=r600cf %s -o - 2>&1 | FileCheck %s --check-prefix=NOBUG --check-prefix=FUNC
+
+; We are currently allocating 2 extra sub-entries on Evergreen / NI for
+; non-WQM push instructions if we change this to 1, then we will need to
+; add one level of depth to each of these tests.
+
+; BUG64-NOT: Applying bug work-around
+; BUG32-NOT: Applying bug work-around
+; NOBUG-NOT: Applying bug work-around
+; FUNC-LABEL: @nested3
+define void @nested3(i32 addrspace(1)* %out, i32 %cond) {
+entry:
+ %0 = icmp sgt i32 %cond, 0
+ br i1 %0, label %if.1, label %end
+
+if.1:
+ %1 = icmp sgt i32 %cond, 10
+ br i1 %1, label %if.2, label %if.store.1
+
+if.store.1:
+ store i32 1, i32 addrspace(1)* %out
+ br label %end
+
+if.2:
+ %2 = icmp sgt i32 %cond, 20
+ br i1 %2, label %if.3, label %if.2.store
+
+if.2.store:
+ store i32 2, i32 addrspace(1)* %out
+ br label %end
+
+if.3:
+ store i32 3, i32 addrspace(1)* %out
+ br label %end
+
+end:
+ ret void
+}
+
+; BUG64: Applying bug work-around
+; BUG32-NOT: Applying bug work-around
+; NOBUG-NOT: Applying bug work-around
+; FUNC-LABEL: @nested4
+define void @nested4(i32 addrspace(1)* %out, i32 %cond) {
+entry:
+ %0 = icmp sgt i32 %cond, 0
+ br i1 %0, label %if.1, label %end
+
+if.1:
+ %1 = icmp sgt i32 %cond, 10
+ br i1 %1, label %if.2, label %if.1.store
+
+if.1.store:
+ store i32 1, i32 addrspace(1)* %out
+ br label %end
+
+if.2:
+ %2 = icmp sgt i32 %cond, 20
+ br i1 %2, label %if.3, label %if.2.store
+
+if.2.store:
+ store i32 2, i32 addrspace(1)* %out
+ br label %end
+
+if.3:
+ %3 = icmp sgt i32 %cond, 30
+ br i1 %3, label %if.4, label %if.3.store
+
+if.3.store:
+ store i32 3, i32 addrspace(1)* %out
+ br label %end
+
+if.4:
+ store i32 4, i32 addrspace(1)* %out
+ br label %end
+
+end:
+ ret void
+}
+
+; BUG64: Applying bug work-around
+; BUG32-NOT: Applying bug work-around
+; NOBUG-NOT: Applying bug work-around
+; FUNC-LABEL: @nested7
+define void @nested7(i32 addrspace(1)* %out, i32 %cond) {
+entry:
+ %0 = icmp sgt i32 %cond, 0
+ br i1 %0, label %if.1, label %end
+
+if.1:
+ %1 = icmp sgt i32 %cond, 10
+ br i1 %1, label %if.2, label %if.1.store
+
+if.1.store:
+ store i32 1, i32 addrspace(1)* %out
+ br label %end
+
+if.2:
+ %2 = icmp sgt i32 %cond, 20
+ br i1 %2, label %if.3, label %if.2.store
+
+if.2.store:
+ store i32 2, i32 addrspace(1)* %out
+ br label %end
+
+if.3:
+ %3 = icmp sgt i32 %cond, 30
+ br i1 %3, label %if.4, label %if.3.store
+
+if.3.store:
+ store i32 3, i32 addrspace(1)* %out
+ br label %end
+
+if.4:
+ %4 = icmp sgt i32 %cond, 40
+ br i1 %4, label %if.5, label %if.4.store
+
+if.4.store:
+ store i32 4, i32 addrspace(1)* %out
+ br label %end
+
+if.5:
+ %5 = icmp sgt i32 %cond, 50
+ br i1 %5, label %if.6, label %if.5.store
+
+if.5.store:
+ store i32 5, i32 addrspace(1)* %out
+ br label %end
+
+if.6:
+ %6 = icmp sgt i32 %cond, 60
+ br i1 %6, label %if.7, label %if.6.store
+
+if.6.store:
+ store i32 6, i32 addrspace(1)* %out
+ br label %end
+
+if.7:
+ store i32 7, i32 addrspace(1)* %out
+ br label %end
+
+end:
+ ret void
+}
+
+; BUG64: Applying bug work-around
+; BUG32: Applying bug work-around
+; NOBUG-NOT: Applying bug work-around
+; FUNC-LABEL: @nested8
+define void @nested8(i32 addrspace(1)* %out, i32 %cond) {
+entry:
+ %0 = icmp sgt i32 %cond, 0
+ br i1 %0, label %if.1, label %end
+
+if.1:
+ %1 = icmp sgt i32 %cond, 10
+ br i1 %1, label %if.2, label %if.1.store
+
+if.1.store:
+ store i32 1, i32 addrspace(1)* %out
+ br label %end
+
+if.2:
+ %2 = icmp sgt i32 %cond, 20
+ br i1 %2, label %if.3, label %if.2.store
+
+if.2.store:
+ store i32 2, i32 addrspace(1)* %out
+ br label %end
+
+if.3:
+ %3 = icmp sgt i32 %cond, 30
+ br i1 %3, label %if.4, label %if.3.store
+
+if.3.store:
+ store i32 3, i32 addrspace(1)* %out
+ br label %end
+
+if.4:
+ %4 = icmp sgt i32 %cond, 40
+ br i1 %4, label %if.5, label %if.4.store
+
+if.4.store:
+ store i32 4, i32 addrspace(1)* %out
+ br label %end
+
+if.5:
+ %5 = icmp sgt i32 %cond, 50
+ br i1 %5, label %if.6, label %if.5.store
+
+if.5.store:
+ store i32 5, i32 addrspace(1)* %out
+ br label %end
+
+if.6:
+ %6 = icmp sgt i32 %cond, 60
+ br i1 %6, label %if.7, label %if.6.store
+
+if.6.store:
+ store i32 6, i32 addrspace(1)* %out
+ br label %end
+
+if.7:
+ %7 = icmp sgt i32 %cond, 70
+ br i1 %7, label %if.8, label %if.7.store
+
+if.7.store:
+ store i32 7, i32 addrspace(1)* %out
+ br label %end
+
+if.8:
+ store i32 8, i32 addrspace(1)* %out
+ br label %end
+
+end:
+ ret void
+}
--
1.8.1.4
More information about the llvm-commits
mailing list