[PATCH] R600: Use LDS and vectors for private memory

Tom Stellard tom at stellard.net
Tue May 13 15:35:56 PDT 2014


On Tue, May 13, 2014 at 10:25:38AM -0700, Matt Arsenault wrote:
> 
> On May 13, 2014, at 7:33 AM, Tom Stellard <thomas.stellard at amd.com> wrote:
> 
> > ---
> > lib/Target/R600/AMDGPU.h                   |   2 +
> > lib/Target/R600/AMDGPU.td                  |  24 +-
> > lib/Target/R600/AMDGPUISelDAGToDAG.cpp     |   8 +-
> > lib/Target/R600/AMDGPUISelLowering.cpp     |   1 +
> > lib/Target/R600/AMDGPUISelLowering.h       |   1 +
> > lib/Target/R600/AMDGPUPromoteAlloca.cpp    | 386 +++++++++++++++++++++++++++++
> > lib/Target/R600/AMDGPUSubtarget.cpp        |   5 +
> > lib/Target/R600/AMDGPUSubtarget.h          |   2 +
> > lib/Target/R600/AMDGPUTargetMachine.cpp    |   8 +
> > lib/Target/R600/CMakeLists.txt             |   1 +
> > lib/Target/R600/R600ISelLowering.cpp       |  62 +++++
> > lib/Target/R600/R600ISelLowering.h         |   3 +
> > lib/Target/R600/R600InstrInfo.cpp          |  69 +++++-
> > lib/Target/R600/R600InstrInfo.h            |  14 ++
> > lib/Target/R600/R600Instructions.td        |  54 ++++
> > lib/Target/R600/R600RegisterInfo.td        |  48 +++-
> > lib/Target/R600/SIInstructions.td          |   4 +-
> > test/CodeGen/R600/array-ptr-calc-i32.ll    |   7 +-
> > test/CodeGen/R600/indirect-private-64.ll   |  36 +--
> > test/CodeGen/R600/parallelandifcollapse.ll |   6 +
> > test/CodeGen/R600/parallelorifcollapse.ll  |   5 +
> > test/CodeGen/R600/private-memory.ll        |  38 +--
> > test/CodeGen/R600/vector-alloca.ll         |  74 ++++++
> > 23 files changed, 790 insertions(+), 68 deletions(-)
> > create mode 100644 lib/Target/R600/AMDGPUPromoteAlloca.cpp
> > create mode 100644 test/CodeGen/R600/vector-alloca.ll
> > 
> > diff --git a/lib/Target/R600/AMDGPU.h b/lib/Target/R600/AMDGPU.h
> > index 5d0cf81..826cc97 100644
> > --- a/lib/Target/R600/AMDGPU.h
> > +++ b/lib/Target/R600/AMDGPU.h
> > @@ -17,6 +17,7 @@
> > namespace llvm {
> > 
> > class AMDGPUInstrPrinter;
> > +class AMDGPUSubtarget;
> > class AMDGPUTargetMachine;
> > class FunctionPass;
> > class MCAsmInfo;
> > @@ -47,6 +48,7 @@ void initializeSILowerI1CopiesPass(PassRegistry &);
> > extern char &SILowerI1CopiesID;
> > 
> > // Passes common to R600 and SI
> > +FunctionPass *createAMDGPUPromoteAlloca(const AMDGPUSubtarget &ST);
> > Pass *createAMDGPUStructurizeCFGPass();
> > FunctionPass *createAMDGPUConvertToISAPass(TargetMachine &tm);
> > FunctionPass *createAMDGPUISelDag(TargetMachine &tm);
> > diff --git a/lib/Target/R600/AMDGPU.td b/lib/Target/R600/AMDGPU.td
> > index d1e2cf5..79a0aaa 100644
> > --- a/lib/Target/R600/AMDGPU.td
> > +++ b/lib/Target/R600/AMDGPU.td
> > @@ -87,28 +87,40 @@ def FeatureWavefrontSize16 : SubtargetFeatureWavefrontSize<16>;
> > def FeatureWavefrontSize32 : SubtargetFeatureWavefrontSize<32>;
> > def FeatureWavefrontSize64 : SubtargetFeatureWavefrontSize<64>;
> > 
> > +class SubtargetFeatureLocalMemorySize <int Value> : SubtargetFeature<
> > +        "localmemorysize"#Value,
> > +        "LocalMemorySize",
> > +        !cast<string>(Value),
> > +        "The size of local memory in bytes">;
> > +
> > class SubtargetFeatureGeneration <string Value,
> >                                   list<SubtargetFeature> Implies> :
> >         SubtargetFeature <Value, "Gen", "AMDGPUSubtarget::"#Value,
> >                           Value#" GPU generation", Implies>;
> > 
> > +def FeatureLocalMemorySize0 : SubtargetFeatureLocalMemorySize<0>;
> > +def FeatureLocalMemorySize32768 : SubtargetFeatureLocalMemorySize<32768>;
> > +def FeatureLocalMemorySize65536 : SubtargetFeatureLocalMemorySize<65536>;
> > +
> > def FeatureR600 : SubtargetFeatureGeneration<"R600",
> > -        [FeatureR600ALUInst, FeatureFetchLimit8]>;
> > +        [FeatureR600ALUInst, FeatureFetchLimit8, FeatureLocalMemorySize0]>;
> > 
> > def FeatureR700 : SubtargetFeatureGeneration<"R700",
> > -        [FeatureFetchLimit16]>;
> > +        [FeatureFetchLimit16, FeatureLocalMemorySize0]>;
> > 
> > def FeatureEvergreen : SubtargetFeatureGeneration<"EVERGREEN",
> > -        [FeatureFetchLimit16]>;
> > +        [FeatureFetchLimit16, FeatureLocalMemorySize32768]>;
> > 
> > def FeatureNorthernIslands : SubtargetFeatureGeneration<"NORTHERN_ISLANDS",
> > -        [FeatureFetchLimit16, FeatureWavefrontSize64]>;
> > +        [FeatureFetchLimit16, FeatureWavefrontSize64,
> > +         FeatureLocalMemorySize32768]
> > +>;
> > 
> > def FeatureSouthernIslands : SubtargetFeatureGeneration<"SOUTHERN_ISLANDS",
> > -        [Feature64BitPtr, FeatureFP64]>;
> > +        [Feature64BitPtr, FeatureFP64, FeatureLocalMemorySize32768]>;
> > 
> > def FeatureSeaIslands : SubtargetFeatureGeneration<"SEA_ISLANDS",
> > -        [Feature64BitPtr, FeatureFP64]>;
> > +        [Feature64BitPtr, FeatureFP64, FeatureLocalMemorySize65536]>;
> > //===----------------------------------------------------------------------===//
> > 
> > def AMDGPUInstrInfo : InstrInfo {
> > diff --git a/lib/Target/R600/AMDGPUISelDAGToDAG.cpp b/lib/Target/R600/AMDGPUISelDAGToDAG.cpp
> > index f1f0bfa..8bac89f 100644
> > --- a/lib/Target/R600/AMDGPUISelDAGToDAG.cpp
> > +++ b/lib/Target/R600/AMDGPUISelDAGToDAG.cpp
> > @@ -256,6 +256,7 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
> >     };
> >     return CurDAG->SelectNodeTo(N, AMDGPU::REG_SEQUENCE, MVT::i64, Args);
> >   }
> > +  case AMDGPUISD::BUILD_VERTICAL_VECTOR:
> >   case ISD::BUILD_VECTOR: {
> >     unsigned RegClassID;
> >     const AMDGPURegisterInfo *TRI =
> > @@ -305,7 +306,12 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
> >       // can't be bundled by our scheduler.
> >       switch(NumVectorElts) {
> >       case 2: RegClassID = AMDGPU::R600_Reg64RegClassID; break;
> > -      case 4: RegClassID = AMDGPU::R600_Reg128RegClassID; break;
> > +      case 4:
> > +        if (Opc == AMDGPUISD::BUILD_VERTICAL_VECTOR)
> > +          RegClassID = AMDGPU::R600_Reg128VerticalRegClassID;
> > +        else
> > +          RegClassID = AMDGPU::R600_Reg128RegClassID;
> > +        break;
> >       default: llvm_unreachable("Do not know how to lower this BUILD_VECTOR");
> >       }
> >     }
> > diff --git a/lib/Target/R600/AMDGPUISelLowering.cpp b/lib/Target/R600/AMDGPUISelLowering.cpp
> > index 2b33586..e602b9d 100644
> > --- a/lib/Target/R600/AMDGPUISelLowering.cpp
> > +++ b/lib/Target/R600/AMDGPUISelLowering.cpp
> > @@ -1405,6 +1405,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
> >   NODE_NAME_CASE(SAMPLEB)
> >   NODE_NAME_CASE(SAMPLED)
> >   NODE_NAME_CASE(SAMPLEL)
> > +  NODE_NAME_CASE(BUILD_VERTICAL_VECTOR)
> >   NODE_NAME_CASE(STORE_MSKOR)
> >   NODE_NAME_CASE(TBUFFER_STORE_FORMAT)
> >   }
> > diff --git a/lib/Target/R600/AMDGPUISelLowering.h b/lib/Target/R600/AMDGPUISelLowering.h
> > index 4a2dad3..4b9dbb5 100644
> > --- a/lib/Target/R600/AMDGPUISelLowering.h
> > +++ b/lib/Target/R600/AMDGPUISelLowering.h
> > @@ -197,6 +197,7 @@ enum {
> >   SAMPLEB,
> >   SAMPLED,
> >   SAMPLEL,
> > +  BUILD_VERTICAL_VECTOR,
> 
> Can you add a comment somewhere about what this is for? I’m not sure I get it
> 
> 
> >   FIRST_MEM_OPCODE_NUMBER = ISD::FIRST_TARGET_MEMORY_OPCODE,
> >   STORE_MSKOR,
> >   LOAD_CONSTANT,
> > diff --git a/lib/Target/R600/AMDGPUPromoteAlloca.cpp b/lib/Target/R600/AMDGPUPromoteAlloca.cpp
> > new file mode 100644
> > index 0000000..3838a0c
> > --- /dev/null
> > +++ b/lib/Target/R600/AMDGPUPromoteAlloca.cpp
> > @@ -0,0 +1,386 @@
> > +//===-- AMDGPUPromoteAlloca.cpp - Promote Allocas -------------------------===//
> > +//
> > +//                     The LLVM Compiler Infrastructure
> > +//
> > +// This file is distributed under the University of Illinois Open Source
> > +// License. See LICENSE.TXT for details.
> > +//
> > +//===----------------------------------------------------------------------===//
> > +//
> > +// This pass eliminates allocas by either converting them into vectors or
> > +// by migrating them to local address space.
> > +//
> > +//===----------------------------------------------------------------------===//
> > +
> > +#include "AMDGPU.h"
> > +#include "AMDGPUSubtarget.h"
> > +#include "llvm/Analysis/ValueTracking.h"
> > +#include "llvm/IR/IRBuilder.h"
> > +#include "llvm/IR/InstVisitor.h"
> > +#include "llvm/Support/Debug.h"
> > +
> > +#define DEBUG_TYPE "amdgpu-promote-alloca"
> I think DEBUG_TYPE is supposed to come before the includes
> 

All the DEBUG_TYPE macros in LLVM were recently moved to after the
includes.
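
i.e. the layout in this file already matches the new convention:

  #include "AMDGPU.h"
  ...
  #include "llvm/Support/Debug.h"

  #define DEBUG_TYPE "amdgpu-promote-alloca"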

> > +
> > +using namespace llvm;
> > +
> > +namespace {
> > +
> > +class AMDGPUPromoteAlloca : public FunctionPass,
> > +                       public InstVisitor<AMDGPUPromoteAlloca> {
> > +
> > +  static char ID;
> > +  Module *Mod;
> > +  std::map<AllocaInst*, bool> AllocasToPromote;
> > +  const AMDGPUSubtarget &ST;
> > +  int LocalMemAvailable;
> > +
> > +  Value *privatePtrToLocal(IRBuilder<> &B, Value* Ptr);
> > +
> > +public:
> > +  AMDGPUPromoteAlloca(const AMDGPUSubtarget &st) : FunctionPass(ID), ST(st),
> > +                                                   LocalMemAvailable(0) { }
> > +  virtual bool doInitialization(Module &M);
> > +  virtual bool runOnFunction(Function &F);
> > +  virtual const char *getPassName() const {
> > +    return "AMDGPU Promote Alloca";
> > +  }
> > +  void visitAlloca(AllocaInst &I);
> > +};
> > +
> > +} // End anonymous namespace
> > +
> > +char AMDGPUPromoteAlloca::ID = 0;
> > +
> > +bool AMDGPUPromoteAlloca::doInitialization(Module &M) {
> > +  Mod = &M;
> > +  return false;
> > +}
> > +
> > +bool AMDGPUPromoteAlloca::runOnFunction(Function &F) {
> > +
> > +  const FunctionType *FTy = F.getFunctionType();
> > +
> > +  LocalMemAvailable = ST.getLocalMemorySize();
> > +
> > +
> > +  // If the function has any arguments in the local address space, then it's
> > +  // possible these arguments require the entire local memory space, so
> > +  // we cannot use local memory in the pass.
> > +  for (unsigned i = 0, e = FTy->getNumParams(); i != e; ++i) {
> > +    const Type *ParamTy = FTy->getParamType(i);
> > +    if (ParamTy->isPointerTy() &&
> > +        ParamTy->getPointerAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
> > +      LocalMemAvailable = 0;
> > +      DEBUG(dbgs() << "Function has local memory argument.  Promoting to "
> > +                      "local memory disabled.\n");
> > +      break;
> > +    }
> > +  }
> > +
> > +  if (LocalMemAvailable > 0) {
> > +    // Check how much local memory is being used by global objects
> > +    for (Module::global_iterator I = Mod->global_begin(),
> > +                                 E = Mod->global_end(); I != E; ++I) {
> > +      GlobalVariable *GV = I;
> > +      PointerType *GVTy = GV->getType();
> > +      if (GVTy->getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS)
> > +        continue;
> > +      for (Value::use_iterator U = GV->use_begin(),
> > +                               UE = GV->use_end(); U != UE; ++U) {
> > +        Instruction *Use = dyn_cast<Instruction>(*U);
> > +        if (!Use)
> > +          continue;
> > +        if (Use->getParent()->getParent() == &F)
> > +          LocalMemAvailable -=
> > +              Mod->getDataLayout()->getTypeAllocSize(GVTy->getElementType());
> > +      }
> > +    }
> > +  }
> > +
> > +  LocalMemAvailable = std::max(0, LocalMemAvailable);
> > +  DEBUG(dbgs() << LocalMemAvailable << " bytes free in local memory.\n");
> > +
> > +  visit(F);
> > +
> > +  return false;
> > +}
> > +
> > +Value *AMDGPUPromoteAlloca::privatePtrToLocal(IRBuilder<> &B, Value* Ptr) {
> > +  Type *PtrTy = Ptr->getType();
> > +
> > +  if (PtrTy->getPointerAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS)
> > +    return NULL;
> > +
> > +  Type *NewPtrTy = PointerType::get(PtrTy->getPointerElementType(),
> > +                                    AMDGPUAS::LOCAL_ADDRESS);
> > +
> > +  if (GetUnderlyingObject(Ptr)->getType()->getPointerAddressSpace() !=
> > +      AMDGPUAS::LOCAL_ADDRESS)
> > +    return NULL;
> What if GetUnderlyingObject fails, like if the pointer is coming from a select or a phi? Also is it worth using GetUnderlyingObjects?
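> 
> Rough sketch of the GetUnderlyingObjects version (untested; Mod and Ptr
> as in this function):
> 
>   SmallVector<Value *, 4> Objects;
>   GetUnderlyingObjects(Ptr, Objects, Mod->getDataLayout());
>   for (Value *Obj : Objects)
>     if (Obj->getType()->getPointerAddressSpace() != AMDGPUAS::LOCAL_ADDRESS)
>       return NULL;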
> 
> > +
> > +  Value *V = B.CreateAddrSpaceCast(Ptr, NewPtrTy);
> > +  return V;
> 
> I don’t think you want an addrspacecast; it doesn’t make sense to cast between these. I think you want to rewrite the uses using the new type. I think this is what the ValueMapper utility class is supposed to help with.
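> 
> Rough sketch of the ValueMapper route (untested; assumes NewPtr was
> already created with the local-address-space type):
> 
>   #include "llvm/Transforms/Utils/ValueMapper.h"
> 
>   SmallVector<User *, 8> Users(OldPtr->user_begin(), OldPtr->user_end());
>   ValueToValueMapTy VM;
>   VM[OldPtr] = NewPtr;
>   for (User *U : Users)
>     if (Instruction *UI = dyn_cast<Instruction>(U))
>       RemapInstruction(UI, VM, RF_IgnoreMissingEntries);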
> 
> > +}
> > +
> > +static VectorType *arrayTypeToVecType(const Type *ArrayTy) {
> > +  return VectorType::get(ArrayTy->getArrayElementType(),
> > +                         ArrayTy->getArrayNumElements());
> > +}
> > +
> > +static Value* calculateVectorIndex(Value *Ptr,
> > +                                  std::map<GetElementPtrInst*, Value*> GEPIdx) {
> > +  if (isa<AllocaInst>(Ptr))
> > +    return ConstantInt::get(Type::getInt32Ty(Ptr->getContext()), 0);
> Constant::getNullValue
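> 
> i.e.:
> 
>   return Constant::getNullValue(Type::getInt32Ty(Ptr->getContext()));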
> 
> > +
> > +  GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
> > +  assert(GEP);
> Use cast<>
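> 
> i.e. let the cast assert for you:
> 
>   GetElementPtrInst *GEP = cast<GetElementPtrInst>(Ptr);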
> 
> > +
> > +  return GEPIdx[GEP];
> > +}
> > +
> > +static Value* GEPToVectorIndex(GetElementPtrInst *GEP) {
> > +  // FIXME we only support simple cases
> > +  if (GEP->getNumOperands() != 3)
> > +    return NULL;
> > +
> > +  ConstantInt *I0 = dyn_cast<ConstantInt>(GEP->getOperand(1));
> > +  if (!I0 || !I0->isZero())
> > +    return NULL;
> > +
> > +  return GEP->getOperand(2);
> > +}
> > +
> > +static bool tryPromoteAllocaToVector(AllocaInst *Alloca) {
> > +  Type *AllocaTy = Alloca->getAllocatedType();
> > +
> > +  DEBUG(dbgs() << "Alloca Candidate for vectorization \n");
> > +
> > +  // FIXME: There is no reason why we can't support larger arrays, we
> > +  // are just being conservative for now.
> > +  if (!AllocaTy->isArrayTy() ||
> > +      AllocaTy->getArrayElementType()->isVectorTy() ||
> > +      AllocaTy->getArrayNumElements() > 4) {
> > +
> > +    DEBUG(dbgs() << "  Cannot convert type to vector");
> > +    return false;
> > +  }
> > +
> > +  std::map<GetElementPtrInst*, Value*> GEPVectorIdx;
> > +  std::vector<Value*> WorkList;
> > +  for (User *AllocaUser : Alloca->users()) {
> > +    GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(AllocaUser);
> > +    if (!GEP) {
> > +      WorkList.push_back(AllocaUser);
> > +      continue;
> > +    }
> > +
> > +    Value *Index = GEPToVectorIndex(GEP);
> > +
> > +    // If we can't compute a vector index from this GEP, then we can't
> > +    // promote this alloca to vector.
> > +    if (!Index) {
> > +      DEBUG(dbgs() << "  Cannot compute vector index for GEP " << *GEP << "\n");
> > +      return false;
> > +    }
> > +
> > +    GEPVectorIdx[GEP] = Index;
> > +    for (User *GEPUser : AllocaUser->users()) {
> > +      WorkList.push_back(GEPUser);
> > +    }
> > +  }
> > +
> > +  VectorType *VectorTy = arrayTypeToVecType(AllocaTy);
> > +
> > +  DEBUG(dbgs() << "  Converting alloca to vector "; AllocaTy->dump();
> > +        dbgs() << " -> "; VectorTy->dump(); dbgs() << "\n");
> > +
> > +  for (std::vector<Value*>::iterator I = WorkList.begin(),
> > +                                     E = WorkList.end(); I != E; ++I) {
> > +    Instruction *Inst = dyn_cast<Instruction>(*I);
> > +    assert(Inst);
> > +    IRBuilder<> Builder(Inst);
> > +    switch (Inst->getOpcode()) {
> > +    case Instruction::Load: {
> > +      Value *Ptr = Inst->getOperand(0);
> > +      Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
> > +      Value *BitCast = Builder.CreateBitCast(Alloca, VectorTy->getPointerTo(0));
> > +      Value *VecValue = Builder.CreateLoad(BitCast);
> > +      Value *ExtractElement = Builder.CreateExtractElement(VecValue, Index);
> > +      Inst->replaceAllUsesWith(ExtractElement);
> > +      Inst->eraseFromParent();
> > +      break;
> > +    }
> > +    case Instruction::Store: {
> > +      Value *Ptr = Inst->getOperand(1);
> > +      Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
> > +      Value *BitCast = Builder.CreateBitCast(Alloca, VectorTy->getPointerTo(0));
> > +      Value *VecValue = Builder.CreateLoad(BitCast);
> > +      Value *NewVecValue = Builder.CreateInsertElement(VecValue,
> > +                                                       Inst->getOperand(0),
> > +                                                       Index);
> > +      Builder.CreateStore(NewVecValue, BitCast);
> > +      Inst->eraseFromParent();
> > +      break;
> > +    }
> > +    case Instruction::BitCast:
> > +      break;
> > +
> > +    default:
> > +      Inst->dump();
> > +      llvm_unreachable("Do not know how to replace this instruction "
> > +                              "with vector op");
> > +    }
> > +  }
> > +  return true;
> > +}
> > +
> > +static void collectUsesWithPtrTypes(Value *Val, std::vector<Value*> &WorkList) {
> > +  for (User *User : Val->users()) {
> > +    if(std::find(WorkList.begin(), WorkList.end(), User) != WorkList.end())
> > +      continue;
> > +    if (isa<CallInst>(User)) {
> > +      WorkList.push_back(User);
> > +      continue;
> > +    }
> > +    if (!User->getType()->isPointerTy())
> > +      continue;
> > +    WorkList.push_back(User);
> > +    collectUsesWithPtrTypes(User, WorkList);
> > +  }
> > +}
> > +
> > +void AMDGPUPromoteAlloca::visitAlloca(AllocaInst &I) {
> > +  IRBuilder<> Builder(&I);
> > +
> > +  // First try to replace the alloca with a vector
> > +  Type *AllocaTy = I.getAllocatedType();
> > +
> > +  DEBUG(dbgs() << "Trying to promote " << I);
> > +
> > +  if (tryPromoteAllocaToVector(&I))
> > +    return;
> > +
> > +  DEBUG(dbgs() << " alloca is not a candidate for vectorization.\n");
> > +
> > +  // FIXME: This is the maximum work group size.  We should try to get
> > +  // value from the reqd_work_group_size function attribute if it is
> > +  // available.
> > +  unsigned WorkGroupSize = 256;
> > +  int AllocaSize = WorkGroupSize *
> > +      Mod->getDataLayout()->getTypeAllocSize(AllocaTy);
> > +
> > +  if (AllocaSize > LocalMemAvailable) {
> > +    DEBUG(dbgs() << " Not enough local memory to promote alloca.\n");
> > +    return;
> > +  }
> > +
> > +  DEBUG(dbgs() << "Promoting alloca to local memory\n");
> > +  LocalMemAvailable -= AllocaSize;
> > +
> > +  GlobalVariable *GV = new GlobalVariable(
> > +      *Mod, ArrayType::get(I.getAllocatedType(), 256), false,
> > +      GlobalValue::ExternalLinkage, 0, I.getName(), 0,
> > +      GlobalVariable::NotThreadLocal, AMDGPUAS::LOCAL_ADDRESS);
> > +
> > +  FunctionType *FTy = FunctionType::get(
> > +      Type::getInt32Ty(Mod->getContext()), false);
> > +  AttributeSet AttrSet;
> > +  AttrSet.addAttribute(Mod->getContext(), 0, Attribute::ReadNone);
> > +
> > +  Value *ReadLocalSizeY = Mod->getOrInsertFunction(
> > +      "llvm.r600.read.local.size.y", FTy, AttrSet);
> > +  Value *ReadLocalSizeZ = Mod->getOrInsertFunction(
> > +      "llvm.r600.read.local.size.z", FTy, AttrSet);
> > +  Value *ReadTIDIGX = Mod->getOrInsertFunction(
> > +      "llvm.r600.read.tidig.x", FTy, AttrSet);
> > +  Value *ReadTIDIGY = Mod->getOrInsertFunction(
> > +      "llvm.r600.read.tidig.y", FTy, AttrSet);
> > +  Value *ReadTIDIGZ = Mod->getOrInsertFunction(
> > +      "llvm.r600.read.tidig.z", FTy, AttrSet);
> > +
> > +
> > +  Value *TCntY = Builder.CreateCall(ReadLocalSizeY);
> > +  Value *TCntZ = Builder.CreateCall(ReadLocalSizeZ);
> > +  Value *TIdX  = Builder.CreateCall(ReadTIDIGX);
> > +  Value *TIdY  = Builder.CreateCall(ReadTIDIGY);
> > +  Value *TIdZ  = Builder.CreateCall(ReadTIDIGZ);
> > +
> > +  Value *Tmp0 = Builder.CreateMul(TCntY, TCntZ);
> > +  Tmp0 = Builder.CreateMul(Tmp0, TIdX);
> > +  Value *Tmp1 = Builder.CreateMul(TIdY, TCntZ);
> > +  Value *TID = Builder.CreateAdd(Tmp0, Tmp1);
> > +  TID = Builder.CreateAdd(TID, TIdZ);
> > +
> > +  std::vector<Value*> Indices;
> > +  Indices.push_back(ConstantInt::get(Type::getInt32Ty(Mod->getContext()), 0));
> Constant::getNullValue
> 
> > +  Indices.push_back(TID);
> > +
> > +  Value *Offset = Builder.CreateGEP(GV, Indices);
> > +  I.mutateType(Offset->getType());
> > +  I.replaceAllUsesWith(Offset);
> > +  I.eraseFromParent();
> > +
> > +  std::vector<Value*> WorkList;
> > +
> > +  collectUsesWithPtrTypes(Offset, WorkList);
> > +
> > +  for (std::vector<Value*>::iterator i = WorkList.begin(),
> > +                                     e = WorkList.end(); i != e; ++i) {
> > +    Value *V = *i;
> > +    CallInst *Call = dyn_cast<CallInst>(V);
> > +    if (Call) {
> > +      IntrinsicInst *Intr = dyn_cast<IntrinsicInst>(Call);
> > +      if (Intr) {
> 
> It would probably help reduce indentation to convert these into early !X continues.
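> Rough sketch (untested; same logic as below, one level shallower):
> 
>   CallInst *Call = dyn_cast<CallInst>(V);
>   if (!Call) {
>     Type *EltTy = V->getType()->getPointerElementType();
>     V->mutateType(PointerType::get(EltTy, AMDGPUAS::LOCAL_ADDRESS));
>     continue;
>   }
>   IntrinsicInst *Intr = dyn_cast<IntrinsicInst>(Call);
>   if (!Intr) {
>     // ... non-intrinsic call rewriting from below ...
>     continue;
>   }
>   // ... intrinsic switch from below, now un-nested ...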
> 
> > +        Builder.SetInsertPoint(Intr);
> > +        switch (Intr->getIntrinsicID()) {
> > +        case Intrinsic::lifetime_start:
> > +        case Intrinsic::lifetime_end:
> > +          // These intrinsics are for address space 0 only
> > +          Intr->eraseFromParent();
> > +          continue;
> > +        case Intrinsic::memcpy: {
> > +          MemCpyInst *MemCpy = cast<MemCpyInst>(Intr);
> > +          Builder.CreateMemCpy(MemCpy->getRawDest(), MemCpy->getRawSource(),
> > +                               MemCpy->getLength(), MemCpy->getAlignment(),
> > +                               MemCpy->isVolatile());
> > +          Intr->eraseFromParent();
> > +          continue;
> > +        }
> > +        case Intrinsic::memset: {
> > +          MemSetInst *MemSet = cast<MemSetInst>(Intr);
> > +          Builder.CreateMemSet(MemSet->getRawDest(), MemSet->getValue(),
> > +                               MemSet->getLength(), MemSet->getAlignment(),
> > +                               MemSet->isVolatile());
> > +          Intr->eraseFromParent();
> > +          continue;
> > +        }
> > +        default:
> > +          Intr->dump();
> > +          llvm_unreachable("Don't know how to promote alloca intrinsic use.");
> > +        }
> > +      }
> > +      std::vector<Type*> ArgTypes;
> > +      for (unsigned ArgIdx = 0, ArgEnd = Call->getNumArgOperands();
> > +                                ArgIdx != ArgEnd; ++ArgIdx) {
> > +        ArgTypes.push_back(Call->getArgOperand(ArgIdx)->getType());
> > +      }
> > +      Function *F = Call->getCalledFunction();
> > +      FunctionType *NewType = FunctionType::get(Call->getType(), ArgTypes,
> > +                                                F->isVarArg());
> > +      Constant *C = Mod->getOrInsertFunction(StringRef(F->getName().str() + ".local"), NewType,
> > +                                                F->getAttributes());
> > +      Function *NewF = dyn_cast<Function>(C);
> > +      assert(NewF);
> cast<>
> 
> > +      Call->setCalledFunction(NewF);
> > +      continue;
> > +    }
> > +    Type *EltTy = V->getType()->getPointerElementType();
> > +    PointerType *NewTy = PointerType::get(EltTy, AMDGPUAS::LOCAL_ADDRESS);
> > +    V->mutateType(NewTy);
> > +
> > +  }
> > +}
> > +
> > +FunctionPass *llvm::createAMDGPUPromoteAlloca(const AMDGPUSubtarget &ST) {
> > +  return new AMDGPUPromoteAlloca(ST);
> > +}
> > diff --git a/lib/Target/R600/AMDGPUSubtarget.cpp b/lib/Target/R600/AMDGPUSubtarget.cpp
> > index f3b9932..29ab8f8 100644
> > --- a/lib/Target/R600/AMDGPUSubtarget.cpp
> > +++ b/lib/Target/R600/AMDGPUSubtarget.cpp
> > @@ -39,6 +39,7 @@ AMDGPUSubtarget::AMDGPUSubtarget(StringRef TT, StringRef CPU, StringRef FS) :
> >   EnableIfCvt = true;
> >   WavefrontSize = 0;
> >   CFALUBug = false;
> > +  LocalMemorySize = 0;
> >   ParseSubtargetFeatures(GPU, FS);
> >   DevName = GPU;
> > }
> > @@ -101,6 +102,10 @@ AMDGPUSubtarget::hasCFAluBug() const {
> >   assert(getGeneration() <= NORTHERN_ISLANDS);
> >   return CFALUBug;
> > }
> > +int
> > +AMDGPUSubtarget::getLocalMemorySize() const {
> > +  return LocalMemorySize;
> > +}
> 
> Return type on separate line
> 

The whole file is like this.  I can update all the functions in a
follow-on patch.

The attached patch should address the rest of your comments.
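
For reference, BUILD_VERTICAL_VECTOR exists for the VLIW register file:
it builds a vector whose elements all live in the same channel of
consecutive registers (e.g. T0.X, T1.X, T2.X, T3.X), which is the layout
the MOVA-based indirect addressing needs.  The attached patch adds a
comment along these lines to the enum:

  /// This node is for VLIW targets and it is used to represent a vector
  /// that is stored in consecutive registers with the same channel.
  /// For example:
  ///   |X  |Y|Z|W|
  /// T0|v.x| | | |
  /// T1|v.y| | | |
  /// T2|v.z| | | |
  /// T3|v.w| | | |
  BUILD_VERTICAL_VECTOR,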

-Tom

> > bool
> > AMDGPUSubtarget::isTargetELF() const {
> >   return false;
> > diff --git a/lib/Target/R600/AMDGPUSubtarget.h b/lib/Target/R600/AMDGPUSubtarget.h
> > index 1b041d6..f19af91 100644
> > --- a/lib/Target/R600/AMDGPUSubtarget.h
> > +++ b/lib/Target/R600/AMDGPUSubtarget.h
> > @@ -52,6 +52,7 @@ private:
> >   bool EnableIfCvt;
> >   unsigned WavefrontSize;
> >   bool CFALUBug;
> > +  unsigned LocalMemorySize;
> > 
> >   InstrItineraryData InstrItins;
> > 
> > @@ -90,6 +91,7 @@ public:
> >   unsigned getWavefrontSize() const;
> >   unsigned getStackEntrySize() const;
> >   bool hasCFAluBug() const;
> > +  int getLocalMemorySize() const;
> > 
> >   bool enableMachineScheduler() const override {
> >     return getGeneration() <= NORTHERN_ISLANDS;
> > diff --git a/lib/Target/R600/AMDGPUTargetMachine.cpp b/lib/Target/R600/AMDGPUTargetMachine.cpp
> > index 6b68c2a..7b4801f 100644
> > --- a/lib/Target/R600/AMDGPUTargetMachine.cpp
> > +++ b/lib/Target/R600/AMDGPUTargetMachine.cpp
> > @@ -111,6 +111,7 @@ public:
> >     return nullptr;
> >   }
> > 
> > +  virtual void addCodeGenPrepare();
> >   bool addPreISel() override;
> >   bool addInstSelector() override;
> >   bool addPreRegAlloc() override;
> > @@ -136,6 +137,13 @@ void AMDGPUTargetMachine::addAnalysisPasses(PassManagerBase &PM) {
> >   PM.add(createAMDGPUTargetTransformInfoPass(this));
> > }
> > 
> > +void AMDGPUPassConfig::addCodeGenPrepare() {
> > +  const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>();
> > +  addPass(createAMDGPUPromoteAlloca(ST));
> > +  addPass(createSROAPass());
> > +  TargetPassConfig::addCodeGenPrepare();
> > +}
> > +
> > bool
> > AMDGPUPassConfig::addPreISel() {
> >   const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>();
> > diff --git a/lib/Target/R600/CMakeLists.txt b/lib/Target/R600/CMakeLists.txt
> > index 3c6fa5a..7a2de0b 100644
> > --- a/lib/Target/R600/CMakeLists.txt
> > +++ b/lib/Target/R600/CMakeLists.txt
> > @@ -26,6 +26,7 @@ add_llvm_target(R600CodeGen
> >   AMDGPUISelLowering.cpp
> >   AMDGPUConvertToISA.cpp
> >   AMDGPUInstrInfo.cpp
> > +  AMDGPUPromoteAlloca.cpp
> >   AMDGPURegisterInfo.cpp
> >   R600ClauseMergePass.cpp
> >   R600ControlFlowFinalizer.cpp
> > diff --git a/lib/Target/R600/R600ISelLowering.cpp b/lib/Target/R600/R600ISelLowering.cpp
> > index e3bcab0..85fc03d 100644
> > --- a/lib/Target/R600/R600ISelLowering.cpp
> > +++ b/lib/Target/R600/R600ISelLowering.cpp
> > @@ -133,6 +133,16 @@ R600TargetLowering::R600TargetLowering(TargetMachine &TM) :
> >   setOperationAction(ISD::LOAD, MVT::v4i32, Custom);
> >   setOperationAction(ISD::FrameIndex, MVT::i32, Custom);
> > 
> > +  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i32, Custom);
> > +  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f32, Custom);
> > +  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4i32, Custom);
> > +  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
> > +
> > +  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2i32, Custom);
> > +  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v2f32, Custom);
> > +  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4i32, Custom);
> > +  setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v4f32, Custom);
> > +
> >   setTargetDAGCombine(ISD::FP_ROUND);
> >   setTargetDAGCombine(ISD::FP_TO_SINT);
> >   setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
> > @@ -537,6 +547,8 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const
> >   R600MachineFunctionInfo *MFI = MF.getInfo<R600MachineFunctionInfo>();
> >   switch (Op.getOpcode()) {
> >   default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
> > +  case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
> > +  case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
> >   case ISD::FCOS:
> >   case ISD::FSIN: return LowerTrig(Op, DAG);
> >   case ISD::SELECT_CC: return LowerSELECT_CC(Op, DAG);
> > @@ -809,6 +821,56 @@ void R600TargetLowering::ReplaceNodeResults(SDNode *N,
> >   }
> > }
> > 
> > +SDValue R600TargetLowering::vectorToVerticalVector(SelectionDAG &DAG,
> > +                                                   SDValue Vector) const {
> > +
> > +  SDLoc DL(Vector);
> > +  EVT VecVT = Vector.getValueType();
> > +  EVT EltVT = VecVT.getVectorElementType();
> > +  SmallVector<SDValue, 8> Args;
> > +
> > +  for (unsigned i = 0, e = VecVT.getVectorNumElements();
> > +                                                           i != e; ++i) {
> > +    Args.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT,
> > +                               Vector, DAG.getConstant(i, getVectorIdxTy())));
> > +  }
> > +
> > +  return DAG.getNode(AMDGPUISD::BUILD_VERTICAL_VECTOR, DL, VecVT, Args);
> > +}
> > +
> > +SDValue R600TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
> > +                                                    SelectionDAG &DAG) const {
> > +
> > +  SDLoc DL(Op);
> > +  SDValue Vector = Op.getOperand(0);
> > +  SDValue Index = Op.getOperand(1);
> > +
> > +  if (isa<ConstantSDNode>(Index) ||
> > +      Vector.getOpcode() == AMDGPUISD::BUILD_VERTICAL_VECTOR)
> > +    return Op;
> > +
> > +  Vector = vectorToVerticalVector(DAG, Vector);
> > +  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Op.getValueType(),
> > +                     Vector, Index);
> > +}
> > +
> > +SDValue R600TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
> > +                                                   SelectionDAG &DAG) const {
> > +  SDLoc DL(Op);
> > +  SDValue Vector = Op.getOperand(0);
> > +  SDValue Value = Op.getOperand(1);
> > +  SDValue Index = Op.getOperand(2);
> > +
> > +  if (isa<ConstantSDNode>(Index) ||
> > +      Vector.getOpcode() == AMDGPUISD::BUILD_VERTICAL_VECTOR)
> > +    return Op;
> > +
> > +  Vector = vectorToVerticalVector(DAG, Vector);
> > +  SDValue Insert = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, Op.getValueType(),
> > +                               Vector, Value, Index);
> > +  return vectorToVerticalVector(DAG, Insert);
> > +}
> > +
> > SDValue R600TargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
> >   // On hw >= R700, COS/SIN input must be between -1. and 1.
> >   // Thus we lower them to TRIG ( FRACT ( x / 2Pi + 0.5) - 0.5)
> > diff --git a/lib/Target/R600/R600ISelLowering.h b/lib/Target/R600/R600ISelLowering.h
> > index a8a464f..c568384 100644
> > --- a/lib/Target/R600/R600ISelLowering.h
> > +++ b/lib/Target/R600/R600ISelLowering.h
> > @@ -51,10 +51,13 @@ private:
> >   void lowerImplicitParameter(MachineInstr *MI, MachineBasicBlock &BB,
> >       MachineRegisterInfo & MRI, unsigned dword_offset) const;
> >   SDValue OptimizeSwizzle(SDValue BuildVector, SDValue Swz[], SelectionDAG &DAG) const;
> > +  SDValue vectorToVerticalVector(SelectionDAG &DAG, SDValue Vector) const;
> > 
> >   /// \brief Lower ROTL opcode to BITALIGN
> >   SDValue LowerROTL(SDValue Op, SelectionDAG &DAG) const;
> > 
> > +  SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
> > +  SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
> >   SDValue LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const;
> >   SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
> >   SDValue LowerFPTOUINT(SDValue Op, SelectionDAG &DAG) const;
> > diff --git a/lib/Target/R600/R600InstrInfo.cpp b/lib/Target/R600/R600InstrInfo.cpp
> > index b0d9ae3..cd89e59 100644
> > --- a/lib/Target/R600/R600InstrInfo.cpp
> > +++ b/lib/Target/R600/R600InstrInfo.cpp
> > @@ -52,11 +52,15 @@ R600InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
> >                            unsigned DestReg, unsigned SrcReg,
> >                            bool KillSrc) const {
> >   unsigned VectorComponents = 0;
> > -  if (AMDGPU::R600_Reg128RegClass.contains(DestReg) &&
> > -      AMDGPU::R600_Reg128RegClass.contains(SrcReg)) {
> > +  if ((AMDGPU::R600_Reg128RegClass.contains(DestReg) ||
> > +      AMDGPU::R600_Reg128VerticalRegClass.contains(DestReg)) &&
> > +      (AMDGPU::R600_Reg128RegClass.contains(SrcReg) ||
> > +       AMDGPU::R600_Reg128VerticalRegClass.contains(SrcReg))) {
> >     VectorComponents = 4;
> > -  } else if(AMDGPU::R600_Reg64RegClass.contains(DestReg) &&
> > -            AMDGPU::R600_Reg64RegClass.contains(SrcReg)) {
> > +  } else if((AMDGPU::R600_Reg64RegClass.contains(DestReg) ||
> > +            AMDGPU::R600_Reg64VerticalRegClass.contains(DestReg)) &&
> > +            (AMDGPU::R600_Reg64RegClass.contains(SrcReg) ||
> > +             AMDGPU::R600_Reg64VerticalRegClass.contains(SrcReg))) {
> >     VectorComponents = 2;
> >   }
> > 
> > @@ -1064,6 +1068,29 @@ unsigned int R600InstrInfo::getInstrLatency(const InstrItineraryData *ItinData,
> >   return 2;
> > }
> > 
> > +bool R600InstrInfo::expandPostRAPseudo(MachineBasicBlock::iterator MI) const {
> > +
> > +  switch(MI->getOpcode()) {
> > +  default: return AMDGPUInstrInfo::expandPostRAPseudo(MI);
> > +  case AMDGPU::R600_EXTRACT_ELT_V2:
> > +  case AMDGPU::R600_EXTRACT_ELT_V4:
> > +    buildIndirectRead(MI->getParent(), MI, MI->getOperand(0).getReg(),
> > +                      RI.getHWRegIndex(MI->getOperand(1).getReg()), //  Address
> > +                      MI->getOperand(2).getReg(),
> > +                      RI.getHWRegChan(MI->getOperand(1).getReg()));
> > +    break;
> > +  case AMDGPU::R600_INSERT_ELT_V2:
> > +  case AMDGPU::R600_INSERT_ELT_V4:
> > +    buildIndirectWrite(MI->getParent(), MI, MI->getOperand(2).getReg(), // Value
> > +                       RI.getHWRegIndex(MI->getOperand(1).getReg()),  // Address
> > +                       MI->getOperand(3).getReg(),                    // Offset
> > +                       RI.getHWRegChan(MI->getOperand(1).getReg()));  // Channel
> > +    break;
> > +  }
> > +  MI->eraseFromParent();
> > +  return true;
> > +}
> > +
> > void  R600InstrInfo::reserveIndirectRegisters(BitVector &Reserved,
> >                                              const MachineFunction &MF) const {
> >   const AMDGPUFrameLowering *TFL =
> > @@ -1100,7 +1127,22 @@ MachineInstrBuilder R600InstrInfo::buildIndirectWrite(MachineBasicBlock *MBB,
> >                                        MachineBasicBlock::iterator I,
> >                                        unsigned ValueReg, unsigned Address,
> >                                        unsigned OffsetReg) const {
> > -  unsigned AddrReg = AMDGPU::R600_AddrRegClass.getRegister(Address);
> > +  return buildIndirectWrite(MBB, I, ValueReg, Address, OffsetReg, 0);
> > +}
> > +
> > +MachineInstrBuilder R600InstrInfo::buildIndirectWrite(MachineBasicBlock *MBB,
> > +                                       MachineBasicBlock::iterator I,
> > +                                       unsigned ValueReg, unsigned Address,
> > +                                       unsigned OffsetReg,
> > +                                       unsigned AddrChan) const {
> > +  unsigned AddrReg;
> > +  switch (AddrChan) {
> > +    default: llvm_unreachable("Invalid Channel");
> > +    case 0: AddrReg = AMDGPU::R600_AddrRegClass.getRegister(Address); break;
> > +    case 1: AddrReg = AMDGPU::R600_Addr_YRegClass.getRegister(Address); break;
> > +    case 2: AddrReg = AMDGPU::R600_Addr_ZRegClass.getRegister(Address); break;
> > +    case 3: AddrReg = AMDGPU::R600_Addr_WRegClass.getRegister(Address); break;
> > +  }
> >   MachineInstr *MOVA = buildDefaultInstruction(*MBB, I, AMDGPU::MOVA_INT_eg,
> >                                                AMDGPU::AR_X, OffsetReg);
> >   setImmOperand(MOVA, AMDGPU::OpName::write, 0);
> > @@ -1117,7 +1159,22 @@ MachineInstrBuilder R600InstrInfo::buildIndirectRead(MachineBasicBlock *MBB,
> >                                        MachineBasicBlock::iterator I,
> >                                        unsigned ValueReg, unsigned Address,
> >                                        unsigned OffsetReg) const {
> > -  unsigned AddrReg = AMDGPU::R600_AddrRegClass.getRegister(Address);
> > +  return buildIndirectRead(MBB, I, ValueReg, Address, OffsetReg, 0);
> > +}
> > +
> > +MachineInstrBuilder R600InstrInfo::buildIndirectRead(MachineBasicBlock *MBB,
> > +                                       MachineBasicBlock::iterator I,
> > +                                       unsigned ValueReg, unsigned Address,
> > +                                       unsigned OffsetReg,
> > +                                       unsigned AddrChan) const {
> > +  unsigned AddrReg;
> > +  switch (AddrChan) {
> > +    default: llvm_unreachable("Invalid Channel");
> > +    case 0: AddrReg = AMDGPU::R600_AddrRegClass.getRegister(Address); break;
> > +    case 1: AddrReg = AMDGPU::R600_Addr_YRegClass.getRegister(Address); break;
> > +    case 2: AddrReg = AMDGPU::R600_Addr_ZRegClass.getRegister(Address); break;
> > +    case 3: AddrReg = AMDGPU::R600_Addr_WRegClass.getRegister(Address); break;
> > +  }
> >   MachineInstr *MOVA = buildDefaultInstruction(*MBB, I, AMDGPU::MOVA_INT_eg,
> >                                                        AMDGPU::AR_X,
> >                                                        OffsetReg);
> > diff --git a/lib/Target/R600/R600InstrInfo.h b/lib/Target/R600/R600InstrInfo.h
> > index b5304a0..f1e6fd3 100644
> > --- a/lib/Target/R600/R600InstrInfo.h
> > +++ b/lib/Target/R600/R600InstrInfo.h
> > @@ -38,6 +38,18 @@ namespace llvm {
> >   std::vector<std::pair<int, unsigned> >
> >   ExtractSrcs(MachineInstr *MI, const DenseMap<unsigned, unsigned> &PV, unsigned &ConstCount) const;
> > 
> > +
> > +  MachineInstrBuilder buildIndirectRead(MachineBasicBlock *MBB,
> > +                                        MachineBasicBlock::iterator I,
> > +                                        unsigned ValueReg, unsigned Address,
> > +                                        unsigned OffsetReg,
> > +                                        unsigned AddrChan) const;
> > +
> > +  MachineInstrBuilder buildIndirectWrite(MachineBasicBlock *MBB,
> > +                                        MachineBasicBlock::iterator I,
> > +                                        unsigned ValueReg, unsigned Address,
> > +                                        unsigned OffsetReg,
> > +                                        unsigned AddrChan) const;
> >   public:
> >   enum BankSwizzle {
> >     ALU_VEC_012_SCL_210 = 0,
> > @@ -197,6 +209,8 @@ namespace llvm {
> >   int getInstrLatency(const InstrItineraryData *ItinData,
> >                       SDNode *Node) const override { return 1;}
> > 
> > +  virtual bool expandPostRAPseudo(MachineBasicBlock::iterator MI) const;
> > +
> >   /// \brief Reserve the registers that may be accesed using indirect addressing.
> >   void reserveIndirectRegisters(BitVector &Reserved,
> >                                 const MachineFunction &MF) const;
> > diff --git a/lib/Target/R600/R600Instructions.td b/lib/Target/R600/R600Instructions.td
> > index d2075c0..8279a89 100644
> > --- a/lib/Target/R600/R600Instructions.td
> > +++ b/lib/Target/R600/R600Instructions.td
> > @@ -1548,6 +1548,60 @@ let isTerminator=1 in {
> > }
> > 
> > //===----------------------------------------------------------------------===//
> > +// Indirect addressing pseudo instructions
> > +//===----------------------------------------------------------------------===//
> > +
> > +let isPseudo = 1 in {
> > +
> > +class ExtractVertical <RegisterClass vec_rc> : InstR600 <
> > +  (outs R600_Reg32:$dst),
> > +  (ins vec_rc:$vec, R600_Reg32:$index), "",
> > +  [],
> > +  AnyALU
> > +>;
> > +
> > +let Constraints = "$dst = $vec" in {
> > +
> > +class InsertVertical <RegisterClass vec_rc> : InstR600 <
> > +  (outs vec_rc:$dst),
> > +  (ins vec_rc:$vec, R600_Reg32:$value, R600_Reg32:$index), "",
> > +  [],
> > +  AnyALU
> > +>;
> > +
> > +} // End Constraints = "$dst = $vec"
> > +
> > +} // End isPseudo = 1
> > +
> > +def R600_EXTRACT_ELT_V2 : ExtractVertical <R600_Reg64Vertical>;
> > +def R600_EXTRACT_ELT_V4 : ExtractVertical <R600_Reg128Vertical>;
> > +
> > +def R600_INSERT_ELT_V2 : InsertVertical <R600_Reg64Vertical>;
> > +def R600_INSERT_ELT_V4 : InsertVertical <R600_Reg128Vertical>;
> > +
> > +class ExtractVerticalPat <Instruction inst, ValueType vec_ty,
> > +                          ValueType scalar_ty> : Pat <
> > +  (scalar_ty (extractelt vec_ty:$vec, i32:$index)),
> > +  (inst $vec, $index)
> > +>;
> > +
> > +def : ExtractVerticalPat <R600_EXTRACT_ELT_V2, v2i32, i32>;
> > +def : ExtractVerticalPat <R600_EXTRACT_ELT_V2, v2f32, f32>;
> > +def : ExtractVerticalPat <R600_EXTRACT_ELT_V4, v4i32, i32>;
> > +def : ExtractVerticalPat <R600_EXTRACT_ELT_V4, v4f32, f32>;
> > +
> > +class InsertVerticalPat <Instruction inst, ValueType vec_ty,
> > +                         ValueType scalar_ty> : Pat <
> > +  (vec_ty (insertelt vec_ty:$vec, scalar_ty:$value, i32:$index)),
> > +  (inst $vec, $value, $index)
> > +>;
> > +
> > +def : InsertVerticalPat <R600_INSERT_ELT_V2, v2i32, i32>;
> > +def : InsertVerticalPat <R600_INSERT_ELT_V2, v2f32, f32>;
> > +def : InsertVerticalPat <R600_INSERT_ELT_V4, v4i32, i32>;
> > +def : InsertVerticalPat <R600_INSERT_ELT_V4, v4f32, f32>;
> > +
> > +//===----------------------------------------------------------------------===//
> > // ISel Patterns
> > //===----------------------------------------------------------------------===//
> > 
> > diff --git a/lib/Target/R600/R600RegisterInfo.td b/lib/Target/R600/R600RegisterInfo.td
> > index 68bcd20..cc667d9 100644
> > --- a/lib/Target/R600/R600RegisterInfo.td
> > +++ b/lib/Target/R600/R600RegisterInfo.td
> > @@ -18,18 +18,28 @@ class R600RegWithChan <string name, bits<9> sel, string chan> :
> > 
> > class R600Reg_128<string n, list<Register> subregs, bits<16> encoding> :
> >     RegisterWithSubRegs<n, subregs> {
> > +  field bits<2> chan_encoding = 0;
> >   let Namespace = "AMDGPU";
> >   let SubRegIndices = [sub0, sub1, sub2, sub3];
> > -  let HWEncoding = encoding;
> > +  let HWEncoding{8-0} = encoding{8-0};
> > +  let HWEncoding{10-9} = chan_encoding;
> > }
> > 
> > class R600Reg_64<string n, list<Register> subregs, bits<16> encoding> :
> >     RegisterWithSubRegs<n, subregs> {
> > +  field bits<2> chan_encoding = 0;
> >   let Namespace = "AMDGPU";
> >   let SubRegIndices = [sub0, sub1];
> >   let HWEncoding = encoding;
> > +  let HWEncoding{8-0} = encoding{8-0};
> > +  let HWEncoding{10-9} = chan_encoding;
> > }
> > 
> > +class R600Reg_64Vertical<int lo, int hi, string chan> : R600Reg_64 <
> > +  "V"#lo#hi#"_"#chan,
> > +  [!cast<Register>("T"#lo#"_"#chan), !cast<Register>("T"#hi#"_"#chan)],
> > +  lo
> > +>;
> > 
> > foreach Index = 0-127 in {
> >   foreach Chan = [ "X", "Y", "Z", "W" ] in {
> > @@ -54,6 +64,24 @@ foreach Index = 0-127 in {
> >                                    Index>;
> > }
> > 
> > +foreach Chan = [ "X", "Y", "Z", "W"] in {
> > +
> > +  let chan_encoding = !if(!eq(Chan, "X"), 0,
> > +                      !if(!eq(Chan, "Y"), 1,
> > +                      !if(!eq(Chan, "Z"), 2,
> > +                      !if(!eq(Chan, "W"), 3, 0)))) in {
> > +    def V0123_#Chan : R600Reg_128 <"V0123_"#Chan,
> > +                                   [!cast<Register>("T0_"#Chan),
> > +                                    !cast<Register>("T1_"#Chan),
> > +                                    !cast<Register>("T2_"#Chan),
> > +                                    !cast<Register>("T3_"#Chan)],
> > +                                    0>;
> > +    def V01_#Chan : R600Reg_64Vertical<0, 1, Chan>;
> > +    def V23_#Chan : R600Reg_64Vertical<2, 3, Chan>;
> > +  }
> > +}
> > +
> > +
> > // KCACHE_BANK0
> > foreach Index = 159-128 in {
> >   foreach Chan = [ "X", "Y", "Z", "W" ] in {
> > @@ -130,8 +158,14 @@ def ALU_PARAM : R600Reg<"Param", 0>;
> > 
> > let isAllocatable = 0 in {
> > 
> > -// XXX: Only use the X channel, until we support wider stack widths
> > -def R600_Addr : RegisterClass <"AMDGPU", [i32], 127, (add (sequence "Addr%u_X", 0, 127))>;
> > +def R600_Addr : RegisterClass <"AMDGPU", [i32], 32, (add (sequence "Addr%u_X", 0, 127))>;
> > +
> > +// We only use Addr_[YZW] for vertical vectors.
> > +// FIXME: If we add more vertical vector registers we will need to add more
> > +// registers to these classes.
> > +def R600_Addr_Y : RegisterClass <"AMDGPU", [i32], 32, (add Addr0_Y)>;
> > +def R600_Addr_Z : RegisterClass <"AMDGPU", [i32], 32, (add Addr0_Z)>;
> > +def R600_Addr_W : RegisterClass <"AMDGPU", [i32], 32, (add Addr0_W)>;
> > 
> > def R600_LDS_SRC_REG : RegisterClass<"AMDGPU", [i32], 32,
> >   (add OQA, OQB, OQAP, OQBP, LDS_DIRECT_A, LDS_DIRECT_B)>;
> > @@ -206,5 +240,13 @@ def R600_Reg128 : RegisterClass<"AMDGPU", [v4f32, v4i32], 128,
> >   let CopyCost = -1;
> > }
> > 
> > +def R600_Reg128Vertical : RegisterClass<"AMDGPU", [v4f32, v4i32], 128,
> > +  (add V0123_W, V0123_Z, V0123_Y, V0123_X)
> > +>;
> > +
> > def R600_Reg64 : RegisterClass<"AMDGPU", [v2f32, v2i32], 64,
> >                                 (add (sequence "T%u_XY", 0, 63))>;
> > +
> > +def R600_Reg64Vertical : RegisterClass<"AMDGPU", [v2f32, v2i32], 64,
> > +                                      (add V01_X, V01_Y, V01_Z, V01_W,
> > +                                           V23_X, V23_Y, V23_Z, V23_W)>;
> > diff --git a/lib/Target/R600/SIInstructions.td b/lib/Target/R600/SIInstructions.td
> > index acd4089..f96c8a8 100644
> > --- a/lib/Target/R600/SIInstructions.td
> > +++ b/lib/Target/R600/SIInstructions.td
> > @@ -2418,13 +2418,13 @@ multiclass SI_INDIRECT_Pattern <ValueType vt, ValueType eltvt, SI_INDIRECT_DST I
> >   // 1. Extract with offset
> >   def : Pat<
> >     (vector_extract vt:$vec, (add i32:$idx, imm:$off)),
> > -    (f32 (SI_INDIRECT_SRC (IMPLICIT_DEF), $vec, $idx, imm:$off))
> > +    (eltvt (SI_INDIRECT_SRC (IMPLICIT_DEF), $vec, $idx, imm:$off))
> >    >;
> > 
> >   // 2. Extract without offset
> >   def : Pat<
> >     (vector_extract vt:$vec, i32:$idx),
> > -    (f32 (SI_INDIRECT_SRC (IMPLICIT_DEF), $vec, $idx, 0))
> > +    (eltvt (SI_INDIRECT_SRC (IMPLICIT_DEF), $vec, $idx, 0))
> >    >;
> > 
> >   // 3. Insert with offset
> > diff --git a/test/CodeGen/R600/array-ptr-calc-i32.ll b/test/CodeGen/R600/array-ptr-calc-i32.ll
> > index c2362da..3230353 100644
> > --- a/test/CodeGen/R600/array-ptr-calc-i32.ll
> > +++ b/test/CodeGen/R600/array-ptr-calc-i32.ll
> > @@ -10,7 +10,12 @@ declare void @llvm.AMDGPU.barrier.local() nounwind noduplicate
> > 
> > ; SI-LABEL: @test_private_array_ptr_calc:
> > ; SI: V_ADD_I32_e32 [[PTRREG:v[0-9]+]]
> > -; SI: V_MOVRELD_B32_e32 {{v[0-9]+}}, [[PTRREG]]
> > +;
> > +; FIXME: The AMDGPUPromoteAlloca pass should be able to convert this
> > +; alloca to a vector.  It currently fails because it does not know how
> > +; to interpret:
> > +; getelementptr [4 x i32]* %alloca, i32 1, i32 %b
> > +; SI: DS_WRITE_B32 {{v[0-9]+}}, [[PTRREG]]
> > define void @test_private_array_ptr_calc(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %inA, i32 addrspace(1)* noalias %inB) {
> >   %alloca = alloca [4 x i32], i32 4, align 16
> >   %tid = call i32 @llvm.SI.tid() readnone
> > diff --git a/test/CodeGen/R600/indirect-private-64.ll b/test/CodeGen/R600/indirect-private-64.ll
> > index 4d1f734..b127b7e 100644
> > --- a/test/CodeGen/R600/indirect-private-64.ll
> > +++ b/test/CodeGen/R600/indirect-private-64.ll
> > @@ -3,10 +3,8 @@
> > declare void @llvm.AMDGPU.barrier.local() noduplicate nounwind
> > 
> > ; SI-LABEL: @private_access_f64_alloca:
> > -; SI: V_MOVRELD_B32_e32
> > -; SI: V_MOVRELD_B32_e32
> > -; SI: V_MOVRELS_B32_e32
> > -; SI: V_MOVRELS_B32_e32
> > +; SI: DS_WRITE_B64
> > +; SI: DS_READ_B64
> > define void @private_access_f64_alloca(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in, i32 %b) nounwind {
> >   %val = load double addrspace(1)* %in, align 8
> >   %array = alloca double, i32 16, align 8
> > @@ -19,14 +17,10 @@ define void @private_access_f64_alloca(double addrspace(1)* noalias %out, double
> > }
> > 
> > ; SI-LABEL: @private_access_v2f64_alloca:
> > -; SI: V_MOVRELD_B32_e32
> > -; SI: V_MOVRELD_B32_e32
> > -; SI: V_MOVRELD_B32_e32
> > -; SI: V_MOVRELD_B32_e32
> > -; SI: V_MOVRELS_B32_e32
> > -; SI: V_MOVRELS_B32_e32
> > -; SI: V_MOVRELS_B32_e32
> > -; SI: V_MOVRELS_B32_e32
> > +; SI: DS_WRITE_B64
> > +; SI: DS_WRITE_B64
> > +; SI: DS_READ_B64
> > +; SI: DS_READ_B64
> > define void @private_access_v2f64_alloca(<2 x double> addrspace(1)* noalias %out, <2 x double> addrspace(1)* noalias %in, i32 %b) nounwind {
> >   %val = load <2 x double> addrspace(1)* %in, align 16
> >   %array = alloca <2 x double>, i32 16, align 16
> > @@ -39,10 +33,8 @@ define void @private_access_v2f64_alloca(<2 x double> addrspace(1)* noalias %out
> > }
> > 
> > ; SI-LABEL: @private_access_i64_alloca:
> > -; SI: V_MOVRELD_B32_e32
> > -; SI: V_MOVRELD_B32_e32
> > -; SI: V_MOVRELS_B32_e32
> > -; SI: V_MOVRELS_B32_e32
> > +; SI: DS_WRITE_B64
> > +; SI: DS_READ_B64
> > define void @private_access_i64_alloca(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in, i32 %b) nounwind {
> >   %val = load i64 addrspace(1)* %in, align 8
> >   %array = alloca i64, i32 16, align 8
> > @@ -55,14 +47,10 @@ define void @private_access_i64_alloca(i64 addrspace(1)* noalias %out, i64 addrs
> > }
> > 
> > ; SI-LABEL: @private_access_v2i64_alloca:
> > -; SI: V_MOVRELD_B32_e32
> > -; SI: V_MOVRELD_B32_e32
> > -; SI: V_MOVRELD_B32_e32
> > -; SI: V_MOVRELD_B32_e32
> > -; SI: V_MOVRELS_B32_e32
> > -; SI: V_MOVRELS_B32_e32
> > -; SI: V_MOVRELS_B32_e32
> > -; SI: V_MOVRELS_B32_e32
> > +; SI: DS_WRITE_B64
> > +; SI: DS_WRITE_B64
> > +; SI: DS_READ_B64
> > +; SI: DS_READ_B64
> > define void @private_access_v2i64_alloca(<2 x i64> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %in, i32 %b) nounwind {
> >   %val = load <2 x i64> addrspace(1)* %in, align 16
> >   %array = alloca <2 x i64>, i32 16, align 16
> > diff --git a/test/CodeGen/R600/parallelandifcollapse.ll b/test/CodeGen/R600/parallelandifcollapse.ll
> > index 4afaf68..8a269e0 100644
> > --- a/test/CodeGen/R600/parallelandifcollapse.ll
> > +++ b/test/CodeGen/R600/parallelandifcollapse.ll
> > @@ -7,6 +7,12 @@
> > ; CHECK: AND_INT
> > ; CHECK-NEXT: AND_INT
> > ; CHECK-NEXT: OR_INT
> > +
> > +; FIXME: For some reason having the allocas here allowed the flatten cfg pass
> > +; to do its transformation; however, now that we are using local memory for
> > +; allocas, the transformation isn't happening.
> > +; XFAIL: *
> > +
> > define void @_Z9chk1D_512v() #0 {
> > entry:
> >   %a0 = alloca i32, align 4
> > diff --git a/test/CodeGen/R600/parallelorifcollapse.ll b/test/CodeGen/R600/parallelorifcollapse.ll
> > index b0db7cd..feca688 100644
> > --- a/test/CodeGen/R600/parallelorifcollapse.ll
> > +++ b/test/CodeGen/R600/parallelorifcollapse.ll
> > @@ -3,6 +3,11 @@
> > ;
> > ; CFG flattening should use parallel-or to generate branch conditions and
> > ; then merge if-regions with the same bodies.
> > +
> > +; FIXME: For some reason having the allocas here allowed the flatten cfg pass
> > +; to do its transformation; however, now that we are using local memory for
> > +; allocas, the transformation isn't happening.
> > +; XFAIL: *
> > ;
> > ; CHECK: OR_INT
> > ; CHECK-NEXT: OR_INT
> > diff --git a/test/CodeGen/R600/private-memory.ll b/test/CodeGen/R600/private-memory.ll
> > index d3453f2..c60c059 100644
> > --- a/test/CodeGen/R600/private-memory.ll
> > +++ b/test/CodeGen/R600/private-memory.ll
> > @@ -1,24 +1,17 @@
> > ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck %s --check-prefix=R600-CHECK --check-prefix=FUNC
> > ; RUN: llc -verify-machineinstrs -march=r600 -mcpu=SI < %s | FileCheck %s --check-prefix=SI-CHECK --check-prefix=FUNC
> > 
> > -; This test checks that uses and defs of the AR register happen in the same
> > -; instruction clause.
> > -
> > ; FUNC-LABEL: @mova_same_clause
> > 
> > -; R600-CHECK: MOVA_INT
> > -; R600-CHECK-NOT: ALU clause
> > -; R600-CHECK: 0 + AR.x
> > -; R600-CHECK: MOVA_INT
> > -; R600-CHECK-NOT: ALU clause
> > -; R600-CHECK: 0 + AR.x
> > -
> > -; SI-CHECK: V_READFIRSTLANE_B32 vcc_lo
> > -; SI-CHECK: V_MOVRELD
> > -; SI-CHECK: S_CBRANCH
> > -; SI-CHECK: V_READFIRSTLANE_B32 vcc_lo
> > -; SI-CHECK: V_MOVRELD
> > -; SI-CHECK: S_CBRANCH
> > +; R600-CHECK: LDS_WRITE
> > +; R600-CHECK: LDS_WRITE
> > +; R600-CHECK: LDS_READ
> > +; R600-CHECK: LDS_READ
> > +
> > +; SI-CHECK: DS_WRITE_B32
> > +; SI-CHECK: DS_WRITE_B32
> > +; SI-CHECK: DS_READ_B32
> > +; SI-CHECK: DS_READ_B32
> > define void @mova_same_clause(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) {
> > entry:
> >   %stack = alloca [5 x i32], align 4
> > @@ -114,12 +107,8 @@ for.end:
> > 
> > ; FUNC-LABEL: @short_array
> > 
> > -; R600-CHECK: MOV {{\** *}}T{{[0-9]\.[XYZW]}}, literal
> > -; R600-CHECK: 65536
> > -; R600-CHECK: *
> > ; R600-CHECK: MOVA_INT
> > 
> > -; SI-CHECK: V_MOV_B32_e32 v{{[0-9]}}, 0x10000
> > ; SI-CHECK: V_MOVRELS_B32_e32
> > define void @short_array(i32 addrspace(1)* %out, i32 %index) {
> > entry:
> > @@ -137,10 +126,7 @@ entry:
> > 
> > ; FUNC-LABEL: @char_array
> > 
> > -; R600-CHECK: OR_INT {{\** *}}T{{[0-9]\.[XYZW]}}, {{[PVT0-9]+\.[XYZW]}}, literal
> > -; R600-CHECK: 256
> > -; R600-CHECK: *
> > -; R600-CHECK-NEXT: MOVA_INT
> > +; R600-CHECK: MOVA_INT
> > 
> > ; SI-CHECK: V_OR_B32_e32 v{{[0-9]}}, 0x100
> > ; SI-CHECK: V_MOVRELS_B32_e32
> > @@ -185,7 +171,9 @@ entry:
> > ; Test that two stack objects are not stored in the same register
> > ; The second stack object should be in T3.X
> > ; FUNC-LABEL: @no_overlap
> > -; R600-CHECK: MOV {{\** *}}T3.X
> > +; R600-CHECK: MOV
> > +; R600-CHECK: [[CHAN:[XYZW]]]+
> > +; R600-CHECK-NOT: [[CHAN]]+
> > ; SI-CHECK: V_MOV_B32_e32 v3
> > define void @no_overlap(i32 addrspace(1)* %out, i32 %in) {
> > entry:
> > diff --git a/test/CodeGen/R600/vector-alloca.ll b/test/CodeGen/R600/vector-alloca.ll
> > new file mode 100644
> > index 0000000..6543f6d
> > --- /dev/null
> > +++ b/test/CodeGen/R600/vector-alloca.ll
> > @@ -0,0 +1,74 @@
> > +; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG --check-prefix=FUNC %s
> > +; RUN: llc < %s -march=r600 -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=FUNC %s
> > +
> > +; FUNC-LABEL: @vector_read
> > +; EG: MOV
> > +; EG: MOV
> > +; EG: MOV
> > +; EG: MOV
> > +; EG: MOVA_INT
> > +define void @vector_read(i32 addrspace(1)* %out, i32 %index) {
> > +entry:
> > +  %0 = alloca [4 x i32]
> > +  %x = getelementptr [4 x i32]* %0, i32 0, i32 0
> > +  %y = getelementptr [4 x i32]* %0, i32 0, i32 1
> > +  %z = getelementptr [4 x i32]* %0, i32 0, i32 2
> > +  %w = getelementptr [4 x i32]* %0, i32 0, i32 3
> > +  store i32 0, i32* %x
> > +  store i32 1, i32* %y
> > +  store i32 2, i32* %z
> > +  store i32 3, i32* %w
> > +  %1 = getelementptr [4 x i32]* %0, i32 0, i32 %index
> > +  %2 = load i32* %1
> > +  store i32 %2, i32 addrspace(1)* %out
> > +  ret void
> > +}
> > +
> > +; FUNC-LABEL: @vector_write
> > +; EG: MOV
> > +; EG: MOV
> > +; EG: MOV
> > +; EG: MOV
> > +; EG: MOVA_INT
> > +; EG: MOVA_INT
> > +define void @vector_write(i32 addrspace(1)* %out, i32 %w_index, i32 %r_index) {
> > +entry:
> > +  %0 = alloca [4 x i32]
> > +  %x = getelementptr [4 x i32]* %0, i32 0, i32 0
> > +  %y = getelementptr [4 x i32]* %0, i32 0, i32 1
> > +  %z = getelementptr [4 x i32]* %0, i32 0, i32 2
> > +  %w = getelementptr [4 x i32]* %0, i32 0, i32 3
> > +  store i32 0, i32* %x
> > +  store i32 0, i32* %y
> > +  store i32 0, i32* %z
> > +  store i32 0, i32* %w
> > +  %1 = getelementptr [4 x i32]* %0, i32 0, i32 %w_index
> > +  store i32 1, i32* %1
> > +  %2 = getelementptr [4 x i32]* %0, i32 0, i32 %r_index
> > +  %3 = load i32* %2
> > +  store i32 %3, i32 addrspace(1)* %out
> > +  ret void
> > +}
> > +
> > +; This test should be optimized to:
> > +; store i32 0, i32 addrspace(1)* %out
> > +; FUNC-LABEL: @bitcast_gep
> > +; EG: STORE_RAW
> > +define void @bitcast_gep(i32 addrspace(1)* %out, i32 %w_index, i32 %r_index) {
> > +entry:
> > +  %0 = alloca [4 x i32]
> > +  %x = getelementptr [4 x i32]* %0, i32 0, i32 0
> > +  %y = getelementptr [4 x i32]* %0, i32 0, i32 1
> > +  %z = getelementptr [4 x i32]* %0, i32 0, i32 2
> > +  %w = getelementptr [4 x i32]* %0, i32 0, i32 3
> > +  store i32 0, i32* %x
> > +  store i32 0, i32* %y
> > +  store i32 0, i32* %z
> > +  store i32 0, i32* %w
> > +  %1 = getelementptr [4 x i32]* %0, i32 0, i32 1
> > +  %2 = bitcast i32* %1 to [4 x i32]*
> > +  %3 = getelementptr [4 x i32]* %2, i32 0, i32 0
> > +  %4 = load i32* %3
> > +  store i32 %4, i32 addrspace(1)* %out
> > +  ret void
> > +}
> > -- 
> > 1.8.1.5
> > 
-------------- next part --------------
A non-text attachment was scrubbed...
Name: 0001-R600-Use-LDS-and-vectors-for-private-memory.patch
Type: text/x-diff
Size: 49970 bytes
Desc: not available
URL: <http://lists.llvm.org/pipermail/llvm-commits/attachments/20140513/15bc8b98/attachment.patch>

