R600 Patches: Private memory fixes

Wed Jun 5 11:17:34 PDT 2013

Hi,

The attached patches fix a few bugs with OpenCL private memory in the
R600 backend.

-Tom
-------------- next part --------------
>From a7882a7cac41d13755b2630f4e32aa59dc7722c0 Mon Sep 17 00:00:00 2001
From: Tom Stellard <thomas.stellard at amd.com>
Date: Mon, 3 Jun 2013 20:34:13 -0700
Subject: [PATCH 1/2] R600: Fix calculation of stack offset in
 AMDGPUFrameLowering

We weren't computing structure size correctly and we were relying on
the original alloca instruction to compute the offset, which isn't
always reliable.
---
 lib/Target/R600/AMDGPUFrameLowering.cpp  | 23 ++--------------------
 test/CodeGen/R600/indirect-addressing.ll | 33 ++++++++++++++++++++++++++++++++
 2 files changed, 35 insertions(+), 21 deletions(-)

diff --git a/lib/Target/R600/AMDGPUFrameLowering.cpp b/lib/Target/R600/AMDGPUFrameLowering.cpp
index 815d6f7..40f14d2 100644
--- a/lib/Target/R600/AMDGPUFrameLowering.cpp
+++ b/lib/Target/R600/AMDGPUFrameLowering.cpp
@@ -78,27 +78,8 @@ int AMDGPUFrameLowering::getFrameIndexOffset(const MachineFunction &MF,
   int UpperBound = FI == -1 ? MFI->getNumObjects() : FI;
 
   for (int i = MFI->getObjectIndexBegin(); i < UpperBound; ++i) {
-    const AllocaInst *Alloca = MFI->getObjectAllocation(i);
-    unsigned ArrayElements;
-    const Type *AllocaType = Alloca->getAllocatedType();
-    const Type *ElementType;
-
-    if (AllocaType->isArrayTy()) {
-      ArrayElements = AllocaType->getArrayNumElements();
-      ElementType = AllocaType->getArrayElementType();
-    } else {
-      ArrayElements = 1;
-      ElementType = AllocaType;
-    }
-
-    unsigned VectorElements;
-    if (ElementType->isVectorTy()) {
-      VectorElements = ElementType->getVectorNumElements();
-    } else {
-      VectorElements = 1;
-    }
-
-    Offset += (VectorElements / getStackWidth(MF)) * ArrayElements;
+    unsigned Size = MFI->getObjectSize(i);
+    Offset += (Size / (getStackWidth(MF) * 4));
   }
   return Offset;
 }
diff --git a/test/CodeGen/R600/indirect-addressing.ll b/test/CodeGen/R600/indirect-addressing.ll
index 7291cb4..bd72cd9 100644
--- a/test/CodeGen/R600/indirect-addressing.ll
+++ b/test/CodeGen/R600/indirect-addressing.ll
@@ -30,3 +30,36 @@ entry:
   store i32 %3, i32 addrspace(1)* %arrayidx13
   ret void
 }
+
+; This test checks that the stack offset is calculated correctly for structs.
+; All register loads/stores should be optimized away, so there shouldn't be
+; any MOVA instructions.
+;
+; XXX: This generated code has unnecessary MOVs, we should be able to optimize
+; this.
+
+; CHECK: @multiple_structs
+; CHECK-NOT: MOVA_INT
+
+%struct.point = type { i32, i32 }
+
+define void @multiple_structs(i32 addrspace(1)* %out) {
+entry:
+  %a = alloca %struct.point
+  %b = alloca %struct.point
+  %a.x.ptr = getelementptr %struct.point* %a, i32 0, i32 0
+  %a.y.ptr = getelementptr %struct.point* %a, i32 0, i32 1
+  %b.x.ptr = getelementptr %struct.point* %b, i32 0, i32 0
+  %b.y.ptr = getelementptr %struct.point* %b, i32 0, i32 1
+  store i32 0, i32* %a.x.ptr
+  store i32 1, i32* %a.y.ptr
+  store i32 2, i32* %b.x.ptr
+  store i32 3, i32* %b.y.ptr
+  %a.indirect.ptr = getelementptr %struct.point* %a, i32 0, i32 0
+  %b.indirect.ptr = getelementptr %struct.point* %b, i32 0, i32 0
+  %a.indirect = load i32* %a.indirect.ptr
+  %b.indirect = load i32* %b.indirect.ptr
+  %0 = add i32 %a.indirect, %b.indirect
+  store i32 %0, i32 addrspace(1)* %out
+  ret void
+}
-- 
1.7.11.4

-------------- next part --------------
>From d324ce7ab44abe30d7ea7d0aeaafa26aaaa39ef7 Mon Sep 17 00:00:00 2001
From: Tom Stellard <thomas.stellard at amd.com>
Date: Tue, 4 Jun 2013 11:11:18 -0700
Subject: [PATCH 2/2] R600: Fix indirect address calculation for loops

---
 lib/Target/R600/AMDGPUIndirectAddressing.cpp | 183 ++++++++++++++++++---------
 test/CodeGen/R600/indirect-addressing.ll     |  39 ++++++
 2 files changed, 160 insertions(+), 62 deletions(-)

diff --git a/lib/Target/R600/AMDGPUIndirectAddressing.cpp b/lib/Target/R600/AMDGPUIndirectAddressing.cpp
index ed6c8ec..988c87b 100644
--- a/lib/Target/R600/AMDGPUIndirectAddressing.cpp
+++ b/lib/Target/R600/AMDGPUIndirectAddressing.cpp
@@ -18,9 +18,11 @@
 #include "AMDGPU.h"
 #include "R600InstrInfo.h"
 #include "R600MachineFunctionInfo.h"
+#include "llvm/CodeGen/MachineDominators.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachinePostDominators.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/Support/Debug.h"
 
@@ -33,6 +35,13 @@ class AMDGPUIndirectAddressingPass : public MachineFunctionPass {
 private:
   static char ID;
   const AMDGPUInstrInfo *TII;
+  MachineDominatorTree *DT;
+  MachinePostDominatorTree *PDT;
+
+  struct AddressInfo {
+    unsigned Address;
+    std::vector<unsigned> Defs;
+  };
 
   bool regHasExplicitDef(MachineRegisterInfo &MRI, unsigned Reg) const;
 
@@ -44,6 +53,14 @@ public:
 
   virtual bool runOnMachineFunction(MachineFunction &MF);
 
+  virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+    AU.addRequired<MachineDominatorTree>();
+    AU.addPreserved<MachineDominatorTree>();
+    AU.addRequired<MachinePostDominatorTree>();
+    AU.addPreserved<MachinePostDominatorTree>();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+
   const char *getPassName() const { return "R600 Handle indirect addressing"; }
 
 };
@@ -58,6 +75,8 @@ FunctionPass *llvm::createAMDGPUIndirectAddressingPass(TargetMachine &tm) {
 
 bool AMDGPUIndirectAddressingPass::runOnMachineFunction(MachineFunction &MF) {
   MachineRegisterInfo &MRI = MF.getRegInfo();
+  DT = &getAnalysis<MachineDominatorTree>();
+  PDT = &getAnalysis<MachinePostDominatorTree>();
 
   int IndirectBegin = TII->getIndirectIndexBegin(MF);
   int IndirectEnd = TII->getIndirectIndexEnd(MF);
@@ -124,17 +143,6 @@ bool AMDGPUIndirectAddressingPass::runOnMachineFunction(MachineFunction &MF) {
       }
       MI.eraseFromParent();
     }
-
-    // Update the live-ins of the succesor blocks
-    for (MachineBasicBlock::succ_iterator Succ = MBB.succ_begin(),
-                                          SuccEnd = MBB.succ_end();
-                                          SuccEnd != Succ; ++Succ) {
-      std::map<unsigned, unsigned>::const_iterator Key, KeyEnd;
-      for (Key = LiveAddressRegisterMap.begin(),
-           KeyEnd = LiveAddressRegisterMap.end(); KeyEnd != Key; ++Key) {
-        (*Succ)->addLiveIn(Key->second);
-      }
-    }
   }
 
   // Second pass - Lower the RegisterLoad instructions
@@ -142,67 +150,115 @@ bool AMDGPUIndirectAddressingPass::runOnMachineFunction(MachineFunction &MF) {
                                                       BB != BB_E; ++BB) {
     // Key is the address and the value is the register
     std::map<unsigned, unsigned> LiveAddressRegisterMap;
+
+    std::map<unsigned, struct AddressInfo> AddressInfoMap;
     MachineBasicBlock &MBB = *BB;
 
-    MachineBasicBlock::livein_iterator LI = MBB.livein_begin();
-    while (LI != MBB.livein_end()) {
-      std::vector<unsigned> PhiRegisters;
+    for (std::map<unsigned, unsigned>::iterator I = RegisterAddressMap.begin(),
+                                                E = RegisterAddressMap.end();
+                                                I != E; ++I) {
+      unsigned Reg = I->first;
+      unsigned Address = I->second;
+      MachineInstr *DefInstr = MRI.getVRegDef(Reg);
+      MachineBasicBlock *DefBlock = DefInstr->getParent();
+      assert(DefInstr);
+
+      // This def occurs 'after' the current basic block, so we can ignore it.
+      if (DT->dominates(&MBB, DefBlock)) {
+        continue;
+      }
 
-      // Make sure this live in is used for indirect addressing
-      if (RegisterAddressMap.find(*LI) == RegisterAddressMap.end()) {
-        ++LI;
+      // Check if the instruction was defined in a block that is not reachable
+      // from the current block.
+      MachineBasicBlock *NCD = DT->findNearestCommonDominator(&MBB, DefBlock);
+      MachineBasicBlock *PNCD = PDT->findNearestCommonDominator(&MBB, DefBlock);
+      if (NCD != &MBB && NCD != DefBlock && PNCD != &MBB && PNCD != DefBlock) {
         continue;
       }
 
-      unsigned Address = RegisterAddressMap[*LI];
-      LiveAddressRegisterMap[Address] = *LI;
-      PhiRegisters.push_back(*LI);
-
-      // Check if there are other live in registers which map to the same
-      // indirect address.
-      for (MachineBasicBlock::livein_iterator LJ = llvm::next(LI),
-                                              LE = MBB.livein_end();
-                                              LJ != LE; ++LJ) {
-        unsigned Reg = *LJ;
-        if (RegisterAddressMap.find(Reg) == RegisterAddressMap.end()) {
-          continue;
-        }
+      if (AddressInfoMap.count(Address) == 0) {
+        struct AddressInfo Info;
+        Info.Address = Address;
+        Info.Defs.push_back(Reg);
+        AddressInfoMap[Address] = Info;
+        continue;
+      }
 
-        if (RegisterAddressMap[Reg] == Address) {
-          PhiRegisters.push_back(Reg);
+      struct AddressInfo &Info = AddressInfoMap[Address];
+      bool IsLive = true;
+
+      for (std::vector<unsigned>::iterator I = Info.Defs.begin(),
+                                           E = Info.Defs.end();
+                                           I != E; ++I) {
+        MachineInstr *LiveDefInstr = MRI.getVRegDef(*I);
+        if (DT->dominates(DefInstr->getParent(), LiveDefInstr->getParent())) {
+          IsLive = false;
+          break;
         }
       }
 
-      if (PhiRegisters.size() == 1) {
+      if (!IsLive) {
+        continue;
+      }
+      Info.Defs.push_back(Reg);
+    }
+
+    for (std::map<unsigned, struct AddressInfo>::iterator
+                            I = AddressInfoMap.begin(), E = AddressInfoMap.end();
+                            I != E; ++I) {
+      const struct AddressInfo &Info = I->second;
+      if (Info.Defs.size() == 1) {
         // We don't need to insert a Phi instruction, so we can just add the
         // registers to the live list for the block.
-        LiveAddressRegisterMap[Address] = *LI;
-        MBB.removeLiveIn(*LI);
-      } else {
-        // We need to insert a PHI, because we have the same address being
-        // written in multiple predecessor blocks.
-        const TargetRegisterClass *PhiDstClass =
-                   TII->getIndirectAddrStoreRegClass(*(PhiRegisters.begin()));
-        unsigned PhiDstReg = MRI.createVirtualRegister(PhiDstClass);
-        MachineInstrBuilder Phi = BuildMI(MBB, MBB.begin(),
+        LiveAddressRegisterMap[Info.Address] = Info.Defs[0];
+        continue;
+      }
+
+      std::map<unsigned, MachineBasicBlock*> PhiOperands;
+      for (std::vector<unsigned>::const_iterator RI = Info.Defs.begin(),
+                                                 RE = Info.Defs.end();
+                                                 RI != RE; ++RI) {
+        unsigned DefReg = *RI;
+        MachineInstr *DefInstr = MRI.getVRegDef(DefReg);
+        assert(DefInstr);
+
+        if (MBB.isPredecessor(DefInstr->getParent())) {
+          PhiOperands[DefReg] = DefInstr->getParent();
+          continue;
+        }
+
+        bool FoundBB = false;
+        for (MachineBasicBlock::const_pred_iterator PI = MBB.pred_begin(),
+                                                    PE = MBB.pred_end();
+                                                    PI != PE; ++PI) {
+          if (DT->dominates(DefInstr->getParent(), *PI)) {
+            FoundBB = true;
+            PhiOperands[DefReg] = *PI;
+            break;
+          }
+        }
+
+        assert(FoundBB &&
+              "Unhandled CFG graph in AMDGPUIndirectAddressing Pass.");
+      }
+
+      const TargetRegisterClass *PhiDstClass =
+                     TII->getIndirectAddrStoreRegClass(PhiOperands.begin()->first);
+      unsigned PhiDstReg = MRI.createVirtualRegister(PhiDstClass);
+
+      // We need to insert a PHI, because we have the same address being
+      // written in multiple predecessor blocks.
+      MachineInstrBuilder Phi = BuildMI(MBB, MBB.begin(),
                                           MBB.findDebugLoc(MBB.begin()),
                                           TII->get(AMDGPU::PHI), PhiDstReg);
 
-        for (std::vector<unsigned>::const_iterator RI = PhiRegisters.begin(),
-                                                   RE = PhiRegisters.end();
-                                                   RI != RE; ++RI) {
-          unsigned Reg = *RI;
-          MachineInstr *DefInst = MRI.getVRegDef(Reg);
-          assert(DefInst);
-          MachineBasicBlock *RegBlock = DefInst->getParent();
-          Phi.addReg(Reg);
-          Phi.addMBB(RegBlock);
-          MBB.removeLiveIn(Reg);
-        }
-        RegisterAddressMap[PhiDstReg] = Address;
-        LiveAddressRegisterMap[Address] = PhiDstReg;
+      for (std::map<unsigned, MachineBasicBlock *>::iterator
+                               PI = PhiOperands.begin(), PE = PhiOperands.end();
+                               PI != PE; ++PI) {
+          Phi.addReg(PI->first);
+          Phi.addMBB(PI->second);
       }
-      LI = MBB.livein_begin();
+      LiveAddressRegisterMap[Info.Address] = PhiDstReg;
     }
 
     for (MachineBasicBlock::iterator I = MBB.begin(), Next = llvm::next(I);
@@ -224,13 +280,16 @@ bool AMDGPUIndirectAddressingPass::runOnMachineFunction(MachineFunction &MF) {
             unsigned LiveAddress = RegisterAddressMap[Reg];
             // Chain the live-ins
             if (LiveAddressRegisterMap.find(LiveAddress) !=
-                                                     RegisterAddressMap.end()) {
+                                                     RegisterAddressMap.end() &&
+                LiveAddressRegisterMap[LiveAddress] != 0 &&
+                LiveAddressRegisterMap[LiveAddress] != Reg) {
               MI.addOperand(MachineOperand::CreateReg(
-                                  LiveAddressRegisterMap[LiveAddress],
-                                  false, // isDef
-                                  true,  // isImp
-                                  true));  // isKill
+                                LiveAddressRegisterMap[LiveAddress],
+                                false, // isDef
+                                true,  // isImp
+                                true));  // isKill
             }
+            assert(Reg);
             LiveAddressRegisterMap[LiveAddress] = Reg;
           }
         }
@@ -250,7 +309,6 @@ bool AMDGPUIndirectAddressingPass::runOnMachineFunction(MachineFunction &MF) {
       if (MI.getOperand(1).getReg() == AMDGPU::INDIRECT_BASE_ADDR) {
         // Direct register access
         unsigned Reg = LiveAddressRegisterMap[Address];
-        unsigned AddrReg = IndirectLoadRegClass->getRegister(Address);
 
         if (regHasExplicitDef(MRI, Reg)) {
           // If the register we are reading from has an explicit def, then that
@@ -265,6 +323,7 @@ bool AMDGPUIndirectAddressingPass::runOnMachineFunction(MachineFunction &MF) {
           // If the register we are reading has an implicit def, then that
           // means it was written by an indirect register access (i.e. An
           // instruction that uses indirect addressing. 
+          unsigned AddrReg = IndirectLoadRegClass->getRegister(Address);
           BuildMI(MBB, I, MBB.findDebugLoc(I), TII->get(AMDGPU::COPY),
                    MI.getOperand(0).getReg())
                    .addReg(AddrReg)
diff --git a/test/CodeGen/R600/indirect-addressing.ll b/test/CodeGen/R600/indirect-addressing.ll
index bd72cd9..1ef6c35 100644
--- a/test/CodeGen/R600/indirect-addressing.ll
+++ b/test/CodeGen/R600/indirect-addressing.ll
@@ -63,3 +63,42 @@ entry:
   store i32 %0, i32 addrspace(1)* %out
   ret void
 }
+
+; Test direct access of a private array inside a loop.  The private array
+; loads and stores should be lowered to copies, so there shouldn't be any
+; MOVA instructions.
+
+; CHECK: @direct_loop
+; CHECK-NOT: MOVA_INT
+
+define void @direct_loop(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+entry:
+  %prv_array_const = alloca [2 x i32]
+  %prv_array = alloca [2 x i32]
+  %a = load i32 addrspace(1)* %in
+  %b_src_ptr = getelementptr i32 addrspace(1)* %in, i32 1
+  %b = load i32 addrspace(1)* %b_src_ptr
+  %a_dst_ptr = getelementptr [2 x i32]* %prv_array_const, i32 0, i32 0
+  store i32 %a, i32* %a_dst_ptr
+  %b_dst_ptr = getelementptr [2 x i32]* %prv_array_const, i32 0, i32 1
+  store i32 %b, i32* %b_dst_ptr
+  br label %for.body
+
+for.body:
+  %inc = phi i32 [0, %entry], [%count, %for.body]
+  %x_ptr = getelementptr [2 x i32]* %prv_array_const, i32 0, i32 0
+  %x = load i32* %x_ptr
+  %y_ptr = getelementptr [2 x i32]* %prv_array, i32 0, i32 0
+  %y = load i32* %y_ptr
+  %xy = add i32 %x, %y
+  store i32 %xy, i32* %y_ptr
+  %count = add i32 %inc, 1
+  %done = icmp eq i32 %count, 4095
+  br i1 %done, label %for.end, label %for.body
+
+for.end:
+  %value_ptr = getelementptr [2 x i32]* %prv_array, i32 0, i32 0
+  %value = load i32* %value_ptr
+  store i32 %value, i32 addrspace(1)* %out
+  ret void
+}
-- 
1.7.11.4