[PATCH] R600: Factorize code handling Const Read Port limitation

Wed Mar 13 13:49:59 PDT 2013

On Wed, Mar 13, 2013 at 09:12:41PM +0100, Vincent Lejeune wrote:
> ---
>  lib/Target/R600/AMDILISelDAGToDAG.cpp    | 34 ++++++++++----
>  lib/Target/R600/R600InstrInfo.cpp        | 54 ++++++++++++++++++++++
>  lib/Target/R600/R600InstrInfo.h          |  3 ++
>  lib/Target/R600/R600MachineScheduler.cpp | 77 ++++----------------------------
>  lib/Target/R600/R600MachineScheduler.h   |  3 +-
>  test/CodeGen/R600/kcache-fold-2.ll       | 52 +++++++++++++++++++++
>  6 files changed, 144 insertions(+), 79 deletions(-)
>  create mode 100644 test/CodeGen/R600/kcache-fold-2.ll
> 
> diff --git a/lib/Target/R600/AMDILISelDAGToDAG.cpp b/lib/Target/R600/AMDILISelDAGToDAG.cpp
> index 0c7880d..05a1ea7 100644
> --- a/lib/Target/R600/AMDILISelDAGToDAG.cpp
> +++ b/lib/Target/R600/AMDILISelDAGToDAG.cpp
> @@ -336,6 +336,7 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
>    return Result;
>  }
>  
> +

Whitespace
>  bool AMDGPUDAGToDAGISel::FoldOperands(unsigned Opcode,
>      const R600InstrInfo *TII, std::vector<SDValue> &Ops) {
>    int OperandIdx[] = {
> @@ -365,17 +366,34 @@ bool AMDGPUDAGToDAGISel::FoldOperands(unsigned Opcode,
>      SDValue Operand = Ops[OperandIdx[i] - 1];
>      switch (Operand.getOpcode()) {
>      case AMDGPUISD::CONST_ADDRESS: {
> -      if (i == 2)
> -        break;
>        SDValue CstOffset;
> -      if (!Operand.getValueType().isVector() &&
> -          SelectGlobalValueConstantOffset(Operand.getOperand(0), CstOffset)) {
> -        Ops[OperandIdx[i] - 1] = CurDAG->getRegister(AMDGPU::ALU_CONST, MVT::f32);
> -        Ops[SelIdx[i] - 1] = CstOffset;
> -        return true;
> +      if (Operand.getValueType().isVector() ||
> +          !SelectGlobalValueConstantOffset(Operand.getOperand(0), CstOffset))
> +        break;
> +
> +      // Gather others constants values
> +      std::vector<unsigned> Consts;
> +      for (unsigned j = 0; j < 3; j++) {
> +        int SrcIdx = OperandIdx[j];
> +        if (SrcIdx < 0)
> +          break;
> +        if (RegisterSDNode *Reg = dyn_cast<RegisterSDNode>(Ops[SrcIdx - 1])) {
> +          if (Reg->getReg() == AMDGPU::ALU_CONST) {
> +            ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Ops[SelIdx[j] - 1]);
> +            Consts.push_back(Cst->getZExtValue());
> +          }
> +        }
>        }
> +
> +      ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(CstOffset);
> +      Consts.push_back(Cst->getZExtValue());
> +      if (!TII->fitsConstReadLimitations(Consts))
> +        break;
> +
> +      Ops[OperandIdx[i] - 1] = CurDAG->getRegister(AMDGPU::ALU_CONST, MVT::f32);
> +      Ops[SelIdx[i] - 1] = CstOffset;
> +      return true;
>        }
> -      break;
>      case ISD::FNEG:
>        if (NegIdx[i] < 0)
>          break;
> diff --git a/lib/Target/R600/R600InstrInfo.cpp b/lib/Target/R600/R600InstrInfo.cpp
> index be3318a..0865098 100644
> --- a/lib/Target/R600/R600InstrInfo.cpp
> +++ b/lib/Target/R600/R600InstrInfo.cpp
> @@ -139,6 +139,60 @@ bool R600InstrInfo::isALUInstr(unsigned Opcode) const {
>            (TargetFlags & R600_InstFlag::OP3));
>  }
>  
> +bool
> +R600InstrInfo::fitsConstReadLimitations(const std::vector<unsigned> &Consts)
> +    const {
> +  assert (Consts.size() <= 12 && "Too many operands in instructions group");
> +  unsigned Pair1 = 0, Pair2 = 0;
> +  for (unsigned i = 0, n = Consts.size(); i < n; ++i) {
> +    unsigned ReadConstHalf = Consts[i] & 2;
> +    unsigned ReadConstIndex = Consts[i] & (~3);
> +    unsigned ReadHalfConst = ReadConstIndex | ReadConstHalf;
> +    if (!Pair1) {
> +      Pair1 = ReadHalfConst;
> +      continue;
> +    }
> +    if (Pair1 == ReadHalfConst)
> +      continue;
> +    if (!Pair2) {
> +      Pair2 = ReadHalfConst;
> +      continue;
> +    }
> +    if (Pair2 != ReadHalfConst)
> +      return false;
> +  }
> +  return true;
> +}
> +
> +bool
> +R600InstrInfo::canBundle(const std::vector<MachineInstr *> &MIs) const {
> +  std::vector<unsigned> Consts;
> +  for (unsigned i = 0, n = MIs.size(); i < n; i++) {
> +    const MachineInstr *MI = MIs[i];
> +
> +    const R600Operands::Ops OpTable[3][2] = {
> +      {R600Operands::SRC0, R600Operands::SRC0_SEL},
> +      {R600Operands::SRC1, R600Operands::SRC1_SEL},
> +      {R600Operands::SRC2, R600Operands::SRC2_SEL},
> +    };
> +
> +    if (!isALUInstr(MI->getOpcode()))
> +      continue;
> +
> +    for (unsigned j = 0; j < 3; j++) {
> +      int SrcIdx = getOperandIdx(MI->getOpcode(), OpTable[j][0]);
> +      if (SrcIdx < 0)
> +        break;
> +      if (MI->getOperand(SrcIdx).getReg() == AMDGPU::ALU_CONST) {
> +        unsigned Const = MI->getOperand(
> +            getOperandIdx(MI->getOpcode(), OpTable[j][1])).getImm();
> +        Consts.push_back(Const);
> +      }
> +    }
> +  }
> +  return fitsConstReadLimitations(Consts);
> +}
> +
>  DFAPacketizer *R600InstrInfo::CreateTargetScheduleState(const TargetMachine *TM,
>      const ScheduleDAG *DAG) const {
>    const InstrItineraryData *II = TM->getInstrItineraryData();
> diff --git a/lib/Target/R600/R600InstrInfo.h b/lib/Target/R600/R600InstrInfo.h
> index efe721c..bf9569e 100644
> --- a/lib/Target/R600/R600InstrInfo.h
> +++ b/lib/Target/R600/R600InstrInfo.h
> @@ -53,6 +53,9 @@ namespace llvm {
>    /// \returns true if this \p Opcode represents an ALU instruction.
>    bool isALUInstr(unsigned Opcode) const;
>  
> +  bool fitsConstReadLimitations(const std::vector<unsigned>&) const;
> +  bool canBundle(const std::vector<MachineInstr *> &) const;
> +
>    /// \breif Vector instructions are instructions that must fill all
>    /// instruction slots within an instruction group.
>    bool isVector(const MachineInstr &MI) const;
> diff --git a/lib/Target/R600/R600MachineScheduler.cpp b/lib/Target/R600/R600MachineScheduler.cpp
> index ee8d672..b4d9779 100644
> --- a/lib/Target/R600/R600MachineScheduler.cpp
> +++ b/lib/Target/R600/R600MachineScheduler.cpp
> @@ -24,7 +24,7 @@
>  #include "llvm/PassManager.h"
>  #include "llvm/Support/raw_ostream.h"
>  #include <set>
> -
> +#include <iostream>

You should double check the LLVM coding style guide, but I think system
headers need to be sorted.

>  using namespace llvm;
>  
>  static
> @@ -222,7 +222,6 @@ void R600SchedStrategy::initialize(ScheduleDAGMI *dag) {
>    CurInstKind = IDOther;
>    CurEmitted = 0;
>    OccupedSlotsMask = 15;
> -  memset(InstructionsGroupCandidate, 0, sizeof(InstructionsGroupCandidate));
>    InstKindLimit[IDAlu] = 120; // 120 minus 8 for security
>    InstKindLimit[IDOther] = 32;
>  
> @@ -503,79 +502,19 @@ int R600SchedStrategy::getInstKind(SUnit* SU) {
>    }
>  }
>  
> -class ConstPairs {
> -private:
> -  unsigned XYPair;
> -  unsigned ZWPair;
> -public:
> -  ConstPairs(unsigned ReadConst[3]) : XYPair(0), ZWPair(0) {
> -    for (unsigned i = 0; i < 3; i++) {
> -      unsigned ReadConstChan = ReadConst[i] & 3;
> -      unsigned ReadConstIndex = ReadConst[i] & (~3);
> -      if (ReadConstChan < 2) {
> -        if (!XYPair) {
> -          XYPair = ReadConstIndex;
> -        }
> -      } else {
> -        if (!ZWPair) {
> -          ZWPair = ReadConstIndex;
> -        }
> -      }
> -    }
> -  }
> -
> -  bool isCompatibleWith(const ConstPairs& CP) const {
> -    return (!XYPair || !CP.XYPair || CP.XYPair == XYPair) &&
> -        (!ZWPair || !CP.ZWPair || CP.ZWPair == ZWPair);
> -  }
> -};
> -
> -static
> -const ConstPairs getPairs(const R600InstrInfo *TII, const MachineInstr& MI) {
> -  unsigned ReadConsts[3] = {0, 0, 0};
> -  R600Operands::Ops OpTable[3][2] = {
> -    {R600Operands::SRC0, R600Operands::SRC0_SEL},
> -    {R600Operands::SRC1, R600Operands::SRC1_SEL},
> -    {R600Operands::SRC2, R600Operands::SRC2_SEL},
> -  };
> -
> -  if (!TII->isALUInstr(MI.getOpcode()))
> -    return ConstPairs(ReadConsts);
> -
> -  for (unsigned i = 0; i < 3; i++) {
> -    int SrcIdx = TII->getOperandIdx(MI.getOpcode(), OpTable[i][0]);
> -    if (SrcIdx < 0)
> -      break;
> -    if (MI.getOperand(SrcIdx).getReg() == AMDGPU::ALU_CONST)
> -      ReadConsts[i] =MI.getOperand(
> -          TII->getOperandIdx(MI.getOpcode(), OpTable[i][1])).getImm();
> -  }
> -  return ConstPairs(ReadConsts);
> -}
> -
> -bool
> -R600SchedStrategy::isBundleable(const MachineInstr& MI) {
> -  const ConstPairs &MIPair = getPairs(TII, MI);
> -  for (unsigned i = 0; i < 4; i++) {
> -    if (!InstructionsGroupCandidate[i])
> -      continue;
> -    const ConstPairs &IGPair = getPairs(TII,
> -        *InstructionsGroupCandidate[i]->getInstr());
> -    if (!IGPair.isCompatibleWith(MIPair))
> -      return false;
> -  }
> -  return true;
> -}
> -
>  SUnit *R600SchedStrategy::PopInst(std::vector<SUnit *> &Q) {
>    if (Q.empty())
>      return NULL;
>    for (std::vector<SUnit *>::iterator It = Q.begin(), E = Q.end();
>        It != E; ++It) {
>      SUnit *SU = *It;
> -    if (isBundleable(*SU->getInstr())) {
> +    InstructionsGroupCandidate.push_back(SU->getInstr());
> +    if (TII->canBundle(InstructionsGroupCandidate)) {
> +      InstructionsGroupCandidate.pop_back();
>        Q.erase(It);
>        return SU;
> +    } else {
> +      InstructionsGroupCandidate.pop_back();
>      }
>    }
>    return NULL;
> @@ -596,7 +535,7 @@ void R600SchedStrategy::PrepareNextSlot() {
>    DEBUG(dbgs() << "New Slot\n");
>    assert (OccupedSlotsMask && "Slot wasn't filled");
>    OccupedSlotsMask = 0;
> -  memset(InstructionsGroupCandidate, 0, sizeof(InstructionsGroupCandidate));
> +  InstructionsGroupCandidate.clear();
>    LoadAlu();
>  }
>  
> @@ -666,7 +605,7 @@ SUnit* R600SchedStrategy::pickAlu() {
>          SUnit *SU = AttemptFillSlot(Chan);
>          if (SU) {
>            OccupedSlotsMask |= (1 << Chan);
> -          InstructionsGroupCandidate[Chan] = SU;
> +          InstructionsGroupCandidate.push_back(SU->getInstr());
>            return SU;
>          }
>        }
> diff --git a/lib/Target/R600/R600MachineScheduler.h b/lib/Target/R600/R600MachineScheduler.h
> index 4b1f6c3..47b1d56 100644
> --- a/lib/Target/R600/R600MachineScheduler.h
> +++ b/lib/Target/R600/R600MachineScheduler.h
> @@ -100,7 +100,7 @@ public:
>    virtual void releaseBottomNode(SUnit *SU);
>  
>  private:
> -  SUnit *InstructionsGroupCandidate[4];
> +  std::vector<MachineInstr *> InstructionsGroupCandidate;
>  
>    int getInstKind(SUnit *SU);
>    bool regBelongsToClass(unsigned Reg, const TargetRegisterClass *RC) const;
> @@ -114,7 +114,6 @@ private:
>    void AssignSlot(MachineInstr *MI, unsigned Slot);
>    SUnit* pickAlu();
>    SUnit* pickOther(int QID);
> -  bool isBundleable(const MachineInstr& MI);
>    void MoveUnits(ReadyQueue *QSrc, ReadyQueue *QDst);
>  };
>  
> diff --git a/test/CodeGen/R600/kcache-fold-2.ll b/test/CodeGen/R600/kcache-fold-2.ll
> new file mode 100644
> index 0000000..77d0052
> --- /dev/null
> +++ b/test/CodeGen/R600/kcache-fold-2.ll

For performance reasons (I think), it's prefered to stick similar tests
into the same file, so this test should be moved to kcache-fold.ll.
You will also need to add a CHECK: line for each function name in the
new file.  Take a look at some of the other tests to see how this is done.

-Tom

> @@ -0,0 +1,52 @@
> +;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
> +
> +; CHECK-NOT: MOV
> +
> +define void @main() {
> +main_body:
> +  %0 = load <4 x float> addrspace(8)* null
> +  %1 = extractelement <4 x float> %0, i32 0
> +  %2 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
> +  %3 = extractelement <4 x float> %2, i32 0
> +  %4 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
> +  %5 = extractelement <4 x float> %4, i32 1
> +  %6 = fcmp ult float %1, 0.000000e+00
> +  %7 = select i1 %6, float %3, float %5
> +  %8 = load <4 x float> addrspace(8)* null
> +  %9 = extractelement <4 x float> %8, i32 1
> +  %10 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
> +  %11 = extractelement <4 x float> %10, i32 0
> +  %12 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
> +  %13 = extractelement <4 x float> %12, i32 1
> +  %14 = fcmp ult float %9, 0.000000e+00
> +  %15 = select i1 %14, float %11, float %13
> +  %16 = load <4 x float> addrspace(8)* null
> +  %17 = extractelement <4 x float> %16, i32 2
> +  %18 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
> +  %19 = extractelement <4 x float> %18, i32 3
> +  %20 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
> +  %21 = extractelement <4 x float> %20, i32 2
> +  %22 = fcmp ult float %17, 0.000000e+00
> +  %23 = select i1 %22, float %19, float %21
> +  %24 = load <4 x float> addrspace(8)* null
> +  %25 = extractelement <4 x float> %24, i32 3
> +  %26 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
> +  %27 = extractelement <4 x float> %26, i32 3
> +  %28 = load <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
> +  %29 = extractelement <4 x float> %28, i32 2
> +  %30 = fcmp ult float %25, 0.000000e+00
> +  %31 = select i1 %30, float %27, float %29
> +  %32 = call float @llvm.AMDIL.clamp.(float %7, float 0.000000e+00, float 1.000000e+00)
> +  %33 = call float @llvm.AMDIL.clamp.(float %15, float 0.000000e+00, float 1.000000e+00)
> +  %34 = call float @llvm.AMDIL.clamp.(float %23, float 0.000000e+00, float 1.000000e+00)
> +  %35 = call float @llvm.AMDIL.clamp.(float %31, float 0.000000e+00, float 1.000000e+00)
> +  %36 = insertelement <4 x float> undef, float %32, i32 0
> +  %37 = insertelement <4 x float> %36, float %33, i32 1
> +  %38 = insertelement <4 x float> %37, float %34, i32 2
> +  %39 = insertelement <4 x float> %38, float %35, i32 3
> +  call void @llvm.R600.store.swizzle(<4 x float> %39, i32 0, i32 0)
> +  ret void
> +}
> +
> +declare float @llvm.AMDIL.clamp.(float, float, float) readnone
> +declare void @llvm.R600.store.swizzle(<4 x float>, i32, i32)
> -- 
> 1.8.1.4
> 
> _______________________________________________
> llvm-commits mailing list
> llvm-commits at cs.uiuc.edu
> http://lists.cs.uiuc.edu/mailman/listinfo/llvm-commits