R600 Patches: Private memory fixes
Tom Stellard
tom at stellard.net
Wed Jun 5 11:17:34 PDT 2013
Hi,
The attached patches fix a few bugs with OpenCL private memory in the
R600 backend.
-Tom
-------------- next part --------------
>From a7882a7cac41d13755b2630f4e32aa59dc7722c0 Mon Sep 17 00:00:00 2001
From: Tom Stellard <thomas.stellard at amd.com>
Date: Mon, 3 Jun 2013 20:34:13 -0700
Subject: [PATCH 1/2] R600: Fix calculation of stack offset in
AMDGPUFrameLowering
We weren't computing structure size correctly and we were relying on
the original alloca instruction to compute the offset, which isn't
always reliable.
---
lib/Target/R600/AMDGPUFrameLowering.cpp | 23 ++--------------------
test/CodeGen/R600/indirect-addressing.ll | 33 ++++++++++++++++++++++++++++++++
2 files changed, 35 insertions(+), 21 deletions(-)
diff --git a/lib/Target/R600/AMDGPUFrameLowering.cpp b/lib/Target/R600/AMDGPUFrameLowering.cpp
index 815d6f7..40f14d2 100644
--- a/lib/Target/R600/AMDGPUFrameLowering.cpp
+++ b/lib/Target/R600/AMDGPUFrameLowering.cpp
@@ -78,27 +78,8 @@ int AMDGPUFrameLowering::getFrameIndexOffset(const MachineFunction &MF,
int UpperBound = FI == -1 ? MFI->getNumObjects() : FI;
for (int i = MFI->getObjectIndexBegin(); i < UpperBound; ++i) {
- const AllocaInst *Alloca = MFI->getObjectAllocation(i);
- unsigned ArrayElements;
- const Type *AllocaType = Alloca->getAllocatedType();
- const Type *ElementType;
-
- if (AllocaType->isArrayTy()) {
- ArrayElements = AllocaType->getArrayNumElements();
- ElementType = AllocaType->getArrayElementType();
- } else {
- ArrayElements = 1;
- ElementType = AllocaType;
- }
-
- unsigned VectorElements;
- if (ElementType->isVectorTy()) {
- VectorElements = ElementType->getVectorNumElements();
- } else {
- VectorElements = 1;
- }
-
- Offset += (VectorElements / getStackWidth(MF)) * ArrayElements;
+ unsigned Size = MFI->getObjectSize(i);
+ Offset += (Size / (getStackWidth(MF) * 4));
}
return Offset;
}
diff --git a/test/CodeGen/R600/indirect-addressing.ll b/test/CodeGen/R600/indirect-addressing.ll
index 7291cb4..bd72cd9 100644
--- a/test/CodeGen/R600/indirect-addressing.ll
+++ b/test/CodeGen/R600/indirect-addressing.ll
@@ -30,3 +30,36 @@ entry:
store i32 %3, i32 addrspace(1)* %arrayidx13
ret void
}
+
+; This test checks that the stack offset is calculated correctly for structs.
+; All register loads/stores should be optimized away, so there shouldn't be
+; any MOVA instructions.
+;
+; XXX: This generated code has unnecessary MOVs, we should be able to optimize
+; this.
+
+; CHECK: @multiple_structs
+; CHECK-NOT: MOVA_INT
+
+%struct.point = type { i32, i32 }
+
+define void @multiple_structs(i32 addrspace(1)* %out) {
+entry:
+ %a = alloca %struct.point
+ %b = alloca %struct.point
+ %a.x.ptr = getelementptr %struct.point* %a, i32 0, i32 0
+ %a.y.ptr = getelementptr %struct.point* %a, i32 0, i32 1
+ %b.x.ptr = getelementptr %struct.point* %b, i32 0, i32 0
+ %b.y.ptr = getelementptr %struct.point* %b, i32 0, i32 1
+ store i32 0, i32* %a.x.ptr
+ store i32 1, i32* %a.y.ptr
+ store i32 2, i32* %b.x.ptr
+ store i32 3, i32* %b.y.ptr
+ %a.indirect.ptr = getelementptr %struct.point* %a, i32 0, i32 0
+ %b.indirect.ptr = getelementptr %struct.point* %b, i32 0, i32 0
+ %a.indirect = load i32* %a.indirect.ptr
+ %b.indirect = load i32* %b.indirect.ptr
+ %0 = add i32 %a.indirect, %b.indirect
+ store i32 %0, i32 addrspace(1)* %out
+ ret void
+}
--
1.7.11.4
-------------- next part --------------
>From d324ce7ab44abe30d7ea7d0aeaafa26aaaa39ef7 Mon Sep 17 00:00:00 2001
From: Tom Stellard <thomas.stellard at amd.com>
Date: Tue, 4 Jun 2013 11:11:18 -0700
Subject: [PATCH 2/2] R600: Fix indirect address calculation for loops
---
lib/Target/R600/AMDGPUIndirectAddressing.cpp | 183 ++++++++++++++++++---------
test/CodeGen/R600/indirect-addressing.ll | 39 ++++++
2 files changed, 160 insertions(+), 62 deletions(-)
diff --git a/lib/Target/R600/AMDGPUIndirectAddressing.cpp b/lib/Target/R600/AMDGPUIndirectAddressing.cpp
index ed6c8ec..988c87b 100644
--- a/lib/Target/R600/AMDGPUIndirectAddressing.cpp
+++ b/lib/Target/R600/AMDGPUIndirectAddressing.cpp
@@ -18,9 +18,11 @@
#include "AMDGPU.h"
#include "R600InstrInfo.h"
#include "R600MachineFunctionInfo.h"
+#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachinePostDominators.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/Support/Debug.h"
@@ -33,6 +35,13 @@ class AMDGPUIndirectAddressingPass : public MachineFunctionPass {
private:
static char ID;
const AMDGPUInstrInfo *TII;
+ MachineDominatorTree *DT;
+ MachinePostDominatorTree *PDT;
+
+ struct AddressInfo {
+ unsigned Address;
+ std::vector<unsigned> Defs;
+ };
bool regHasExplicitDef(MachineRegisterInfo &MRI, unsigned Reg) const;
@@ -44,6 +53,14 @@ public:
virtual bool runOnMachineFunction(MachineFunction &MF);
+ virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<MachineDominatorTree>();
+ AU.addPreserved<MachineDominatorTree>();
+ AU.addRequired<MachinePostDominatorTree>();
+ AU.addPreserved<MachinePostDominatorTree>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
const char *getPassName() const { return "R600 Handle indirect addressing"; }
};
@@ -58,6 +75,8 @@ FunctionPass *llvm::createAMDGPUIndirectAddressingPass(TargetMachine &tm) {
bool AMDGPUIndirectAddressingPass::runOnMachineFunction(MachineFunction &MF) {
MachineRegisterInfo &MRI = MF.getRegInfo();
+ DT = &getAnalysis<MachineDominatorTree>();
+ PDT = &getAnalysis<MachinePostDominatorTree>();
int IndirectBegin = TII->getIndirectIndexBegin(MF);
int IndirectEnd = TII->getIndirectIndexEnd(MF);
@@ -124,17 +143,6 @@ bool AMDGPUIndirectAddressingPass::runOnMachineFunction(MachineFunction &MF) {
}
MI.eraseFromParent();
}
-
- // Update the live-ins of the succesor blocks
- for (MachineBasicBlock::succ_iterator Succ = MBB.succ_begin(),
- SuccEnd = MBB.succ_end();
- SuccEnd != Succ; ++Succ) {
- std::map<unsigned, unsigned>::const_iterator Key, KeyEnd;
- for (Key = LiveAddressRegisterMap.begin(),
- KeyEnd = LiveAddressRegisterMap.end(); KeyEnd != Key; ++Key) {
- (*Succ)->addLiveIn(Key->second);
- }
- }
}
// Second pass - Lower the RegisterLoad instructions
@@ -142,67 +150,115 @@ bool AMDGPUIndirectAddressingPass::runOnMachineFunction(MachineFunction &MF) {
BB != BB_E; ++BB) {
// Key is the address and the value is the register
std::map<unsigned, unsigned> LiveAddressRegisterMap;
+
+ std::map<unsigned, struct AddressInfo> AddressInfoMap;
MachineBasicBlock &MBB = *BB;
- MachineBasicBlock::livein_iterator LI = MBB.livein_begin();
- while (LI != MBB.livein_end()) {
- std::vector<unsigned> PhiRegisters;
+ for (std::map<unsigned, unsigned>::iterator I = RegisterAddressMap.begin(),
+ E = RegisterAddressMap.end();
+ I != E; ++I) {
+ unsigned Reg = I->first;
+ unsigned Address = I->second;
+ MachineInstr *DefInstr = MRI.getVRegDef(Reg);
+ MachineBasicBlock *DefBlock = DefInstr->getParent();
+ assert(DefInstr);
+
+ // This def occurs 'after' the current basic block, so we can ignore it.
+ if (DT->dominates(&MBB, DefBlock)) {
+ continue;
+ }
- // Make sure this live in is used for indirect addressing
- if (RegisterAddressMap.find(*LI) == RegisterAddressMap.end()) {
- ++LI;
+ // Check if the instruction was defined in a block that is not reachable
+ // from the current block.
+ MachineBasicBlock *NCD = DT->findNearestCommonDominator(&MBB, DefBlock);
+ MachineBasicBlock *PNCD = PDT->findNearestCommonDominator(&MBB, DefBlock);
+ if (NCD != &MBB && NCD != DefBlock && PNCD != &MBB && PNCD != DefBlock) {
continue;
}
- unsigned Address = RegisterAddressMap[*LI];
- LiveAddressRegisterMap[Address] = *LI;
- PhiRegisters.push_back(*LI);
-
- // Check if there are other live in registers which map to the same
- // indirect address.
- for (MachineBasicBlock::livein_iterator LJ = llvm::next(LI),
- LE = MBB.livein_end();
- LJ != LE; ++LJ) {
- unsigned Reg = *LJ;
- if (RegisterAddressMap.find(Reg) == RegisterAddressMap.end()) {
- continue;
- }
+ if (AddressInfoMap.count(Address) == 0) {
+ struct AddressInfo Info;
+ Info.Address = Address;
+ Info.Defs.push_back(Reg);
+ AddressInfoMap[Address] = Info;
+ continue;
+ }
- if (RegisterAddressMap[Reg] == Address) {
- PhiRegisters.push_back(Reg);
+ struct AddressInfo &Info = AddressInfoMap[Address];
+ bool IsLive = true;
+
+ for (std::vector<unsigned>::iterator I = Info.Defs.begin(),
+ E = Info.Defs.end();
+ I != E; ++I) {
+ MachineInstr *LiveDefInstr = MRI.getVRegDef(*I);
+ if (DT->dominates(DefInstr->getParent(), LiveDefInstr->getParent())) {
+ IsLive = false;
+ break;
}
}
- if (PhiRegisters.size() == 1) {
+ if (!IsLive) {
+ continue;
+ }
+ Info.Defs.push_back(Reg);
+ }
+
+ for (std::map<unsigned, struct AddressInfo>::iterator
+ I = AddressInfoMap.begin(), E = AddressInfoMap.end();
+ I != E; ++I) {
+ const struct AddressInfo &Info = I->second;
+ if (Info.Defs.size() == 1) {
// We don't need to insert a Phi instruction, so we can just add the
// registers to the live list for the block.
- LiveAddressRegisterMap[Address] = *LI;
- MBB.removeLiveIn(*LI);
- } else {
- // We need to insert a PHI, because we have the same address being
- // written in multiple predecessor blocks.
- const TargetRegisterClass *PhiDstClass =
- TII->getIndirectAddrStoreRegClass(*(PhiRegisters.begin()));
- unsigned PhiDstReg = MRI.createVirtualRegister(PhiDstClass);
- MachineInstrBuilder Phi = BuildMI(MBB, MBB.begin(),
+ LiveAddressRegisterMap[Info.Address] = Info.Defs[0];
+ continue;
+ }
+
+ std::map<unsigned, MachineBasicBlock*> PhiOperands;
+ for (std::vector<unsigned>::const_iterator RI = Info.Defs.begin(),
+ RE = Info.Defs.end();
+ RI != RE; ++RI) {
+ unsigned DefReg = *RI;
+ MachineInstr *DefInstr = MRI.getVRegDef(DefReg);
+ assert(DefInstr);
+
+ if (MBB.isPredecessor(DefInstr->getParent())) {
+ PhiOperands[DefReg] = DefInstr->getParent();
+ continue;
+ }
+
+ bool FoundBB = false;
+ for (MachineBasicBlock::const_pred_iterator PI = MBB.pred_begin(),
+ PE = MBB.pred_end();
+ PI != PE; ++PI) {
+ if (DT->dominates(DefInstr->getParent(), *PI)) {
+ FoundBB = true;
+ PhiOperands[DefReg] = *PI;
+ break;
+ }
+ }
+
+ assert(FoundBB &&
+ "Unhandled CFG graph in AMDGPUIndirectAddressing Pass.");
+ }
+
+ const TargetRegisterClass *PhiDstClass =
+ TII->getIndirectAddrStoreRegClass(PhiOperands.begin()->first);
+ unsigned PhiDstReg = MRI.createVirtualRegister(PhiDstClass);
+
+ // We need to insert a PHI, because we have the same address being
+ // written in multiple predecessor blocks.
+ MachineInstrBuilder Phi = BuildMI(MBB, MBB.begin(),
MBB.findDebugLoc(MBB.begin()),
TII->get(AMDGPU::PHI), PhiDstReg);
- for (std::vector<unsigned>::const_iterator RI = PhiRegisters.begin(),
- RE = PhiRegisters.end();
- RI != RE; ++RI) {
- unsigned Reg = *RI;
- MachineInstr *DefInst = MRI.getVRegDef(Reg);
- assert(DefInst);
- MachineBasicBlock *RegBlock = DefInst->getParent();
- Phi.addReg(Reg);
- Phi.addMBB(RegBlock);
- MBB.removeLiveIn(Reg);
- }
- RegisterAddressMap[PhiDstReg] = Address;
- LiveAddressRegisterMap[Address] = PhiDstReg;
+ for (std::map<unsigned, MachineBasicBlock *>::iterator
+ PI = PhiOperands.begin(), PE = PhiOperands.end();
+ PI != PE; ++PI) {
+ Phi.addReg(PI->first);
+ Phi.addMBB(PI->second);
}
- LI = MBB.livein_begin();
+ LiveAddressRegisterMap[Info.Address] = PhiDstReg;
}
for (MachineBasicBlock::iterator I = MBB.begin(), Next = llvm::next(I);
@@ -224,13 +280,16 @@ bool AMDGPUIndirectAddressingPass::runOnMachineFunction(MachineFunction &MF) {
unsigned LiveAddress = RegisterAddressMap[Reg];
// Chain the live-ins
if (LiveAddressRegisterMap.find(LiveAddress) !=
- RegisterAddressMap.end()) {
+ RegisterAddressMap.end() &&
+ LiveAddressRegisterMap[LiveAddress] != 0 &&
+ LiveAddressRegisterMap[LiveAddress] != Reg) {
MI.addOperand(MachineOperand::CreateReg(
- LiveAddressRegisterMap[LiveAddress],
- false, // isDef
- true, // isImp
- true)); // isKill
+ LiveAddressRegisterMap[LiveAddress],
+ false, // isDef
+ true, // isImp
+ true)); // isKill
}
+ assert(Reg);
LiveAddressRegisterMap[LiveAddress] = Reg;
}
}
@@ -250,7 +309,6 @@ bool AMDGPUIndirectAddressingPass::runOnMachineFunction(MachineFunction &MF) {
if (MI.getOperand(1).getReg() == AMDGPU::INDIRECT_BASE_ADDR) {
// Direct register access
unsigned Reg = LiveAddressRegisterMap[Address];
- unsigned AddrReg = IndirectLoadRegClass->getRegister(Address);
if (regHasExplicitDef(MRI, Reg)) {
// If the register we are reading from has an explicit def, then that
@@ -265,6 +323,7 @@ bool AMDGPUIndirectAddressingPass::runOnMachineFunction(MachineFunction &MF) {
// If the register we are reading has an implicit def, then that
// means it was written by an indirect register access (i.e. An
// instruction that uses indirect addressing.
+ unsigned AddrReg = IndirectLoadRegClass->getRegister(Address);
BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::COPY),
MI.getOperand(0).getReg())
.addReg(AddrReg)
diff --git a/test/CodeGen/R600/indirect-addressing.ll b/test/CodeGen/R600/indirect-addressing.ll
index bd72cd9..1ef6c35 100644
--- a/test/CodeGen/R600/indirect-addressing.ll
+++ b/test/CodeGen/R600/indirect-addressing.ll
@@ -63,3 +63,42 @@ entry:
store i32 %0, i32 addrspace(1)* %out
ret void
}
+
+; Test direct access of a private array inside a loop. The private array
+; loads and stores should be lowered to copies, so there shouldn't be any
+; MOVA instructions.
+
+; CHECK: @direct_loop
+; CHECK-NOT: MOVA_INT
+
+define void @direct_loop(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+entry:
+ %prv_array_const = alloca [2 x i32]
+ %prv_array = alloca [2 x i32]
+ %a = load i32 addrspace(1)* %in
+ %b_src_ptr = getelementptr i32 addrspace(1)* %in, i32 1
+ %b = load i32 addrspace(1)* %b_src_ptr
+ %a_dst_ptr = getelementptr [2 x i32]* %prv_array_const, i32 0, i32 0
+ store i32 %a, i32* %a_dst_ptr
+ %b_dst_ptr = getelementptr [2 x i32]* %prv_array_const, i32 0, i32 1
+ store i32 %b, i32* %b_dst_ptr
+ br label %for.body
+
+for.body:
+ %inc = phi i32 [0, %entry], [%count, %for.body]
+ %x_ptr = getelementptr [2 x i32]* %prv_array_const, i32 0, i32 0
+ %x = load i32* %x_ptr
+ %y_ptr = getelementptr [2 x i32]* %prv_array, i32 0, i32 0
+ %y = load i32* %y_ptr
+ %xy = add i32 %x, %y
+ store i32 %xy, i32* %y_ptr
+ %count = add i32 %inc, 1
+ %done = icmp eq i32 %count, 4095
+ br i1 %done, label %for.end, label %for.body
+
+for.end:
+ %value_ptr = getelementptr [2 x i32]* %prv_array, i32 0, i32 0
+ %value = load i32* %value_ptr
+ store i32 %value, i32 addrspace(1)* %out
+ ret void
+}
--
1.7.11.4
More information about the llvm-commits
mailing list