R600: Generate native ALU instructions

Mon Apr 22 20:59:56 PDT 2013

On Mon, Apr 22, 2013 at 09:28:59AM -0700, Vincent Lejeune wrote:
> Hi,
> 
> this set of patches makes llvm generate native instructions for ALU.
> Now the r600 backend emits all instructions natively encoded;
> however, we still emit the INSTR_NATIVE data before each instruction atm.
> We can drop this behaviour, but I'd like to keep it for one or two weeks so that people can test it against mesa.
> 
> Vincent

Hi Vincent,

Nice work, this is a major milestone for the backend.

Just a few minor comments, other than that this series is:

Reviewed-by: Tom Stellard <thomas.stellard at amd.com>

Also, I think you could add a few test cases for the trans-only
instructions by testing that they are the last instruction in their group.

I would like to test these with compute before you push them, but I'm
having trouble applying them.  Could you push a branch somewhere with
all your outstanding patches?  I will try to review the other patches
you posted today, so you can push those to master.

Thanks,
Tom

> From 3a6137894564734037007241d928879706d70d8f Mon Sep 17 00:00:00 2001
> From: Vincent Lejeune <vljn at ovi.com>
> Date: Mon, 22 Apr 2013 17:23:01 +0200
> Subject: [PATCH 1/5] R600: add a istransonly function
> 
> ---
>  lib/Target/R600/R600InstrInfo.cpp | 47 +++++++++++++++++++++++++++++++++++++++
>  lib/Target/R600/R600InstrInfo.h   |  3 +++
>  2 files changed, 50 insertions(+)
> 
> diff --git a/lib/Target/R600/R600InstrInfo.cpp b/lib/Target/R600/R600InstrInfo.cpp
> index e96f563..bfdf4c9 100644
> --- a/lib/Target/R600/R600InstrInfo.cpp
> +++ b/lib/Target/R600/R600InstrInfo.cpp
> @@ -140,6 +140,53 @@ bool R600InstrInfo::isALUInstr(unsigned Opcode) const {
>            (TargetFlags & R600_InstFlag::OP3));
>  }
>  
> +bool R600InstrInfo::isTransOnly(unsigned Opcode) const {
> +  //TODO: use FuncUnit in R600Instructions.td
> +  switch (Opcode) {
> +  case AMDGPU::INT_TO_FLT_eg:
> +  case AMDGPU::INT_TO_FLT_r600:
> +  case AMDGPU::FLT_TO_INT_eg:
> +  case AMDGPU::FLT_TO_INT_r600:
> +  case AMDGPU::UINT_TO_FLT_eg:
> +  case AMDGPU::UINT_TO_FLT_r600:
> +  case AMDGPU::FLT_TO_UINT_eg:
> +  case AMDGPU::FLT_TO_UINT_r600:
> +  case AMDGPU::COS_eg:
> +  case AMDGPU::COS_r600:
> +  case AMDGPU::SIN_eg:
> +  case AMDGPU::SIN_r600:
> +  case AMDGPU::LOG_IEEE_eg:
> +  case AMDGPU::LOG_IEEE_r600:
> +  case AMDGPU::EXP_IEEE_eg:
> +  case AMDGPU::EXP_IEEE_r600:
> +  case AMDGPU::RECIP_IEEE_eg:
> +  case AMDGPU::RECIP_IEEE_r600:
> +  case AMDGPU::RECIP_UINT_eg:
> +  case AMDGPU::RECIP_UINT_r600:
> +  case AMDGPU::MULLO_INT_eg:
> +  case AMDGPU::MULLO_INT_r600:
> +  case AMDGPU::MULHI_INT_eg:
> +  case AMDGPU::MULHI_INT_r600:
> +  case AMDGPU::MULLO_UINT_eg:
> +  case AMDGPU::MULLO_UINT_r600:
> +  case AMDGPU::MULHI_UINT_eg:
> +  case AMDGPU::MULHI_UINT_r600:
> +  case AMDGPU::RECIPSQRT_CLAMPED_eg:
> +  case AMDGPU::RECIPSQRT_CLAMPED_r600:
> +  case AMDGPU::RECIP_CLAMPED_eg:
> +  case AMDGPU::RECIP_CLAMPED_r600:
> +  case AMDGPU::RECIPSQRT_IEEE_eg:
> +  case AMDGPU::RECIPSQRT_IEEE_r600:
> +    return true;
> +  default:
> +    return false;
> +  }
> +}
> +

I would rather see this function implemented using TSFlags, so that we
can just set the TransOnly bit in the .td file.  Or even better, is
there some way to query FuncUnit from the instruction (maybe this is
what you are talking about in your TODO comment)?

> +bool R600InstrInfo::isTransOnly(const MachineInstr *MI) const {
> +  return isTransOnly(MI->getOpcode());
> +}
> +
>  bool R600InstrInfo::isCayman() const {
>    return ST.device()->getGeneration() > AMDGPUDeviceInfo::HD5XXX;
>  }
> diff --git a/lib/Target/R600/R600InstrInfo.h b/lib/Target/R600/R600InstrInfo.h
> index 136023f..e0ba12b 100644
> --- a/lib/Target/R600/R600InstrInfo.h
> +++ b/lib/Target/R600/R600InstrInfo.h
> @@ -55,6 +55,9 @@ namespace llvm {
>    /// \returns true if this \p Opcode represents an ALU instruction.
>    bool isALUInstr(unsigned Opcode) const;
>  
> +  bool isTransOnly(unsigned Opcode) const;
> +  bool isTransOnly(const MachineInstr *MI) const;
> +
>    bool usesVertexCache(unsigned Opcode) const;
>    bool usesVertexCache(const MachineInstr *MI) const;
>    bool usesTextureCache(unsigned Opcode) const;
> -- 
> 1.8.1.4
> 
> From 428138a6c9ec4bb7d6483658acf5c2ab262bafb5 Mon Sep 17 00:00:00 2001
> From: Vincent Lejeune <vljn at ovi.com>
> Date: Sat, 20 Apr 2013 03:10:21 +0200
> Subject: [PATCH 5/5] R600: use native for alu
> 
> ---
>  lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp |   8 +-
>  lib/Target/R600/R600ControlFlowFinalizer.cpp       | 109 ++++++++++++++++++++-
>  lib/Target/R600/R600Instructions.td                |  17 ++++
>  lib/Target/R600/R600RegisterInfo.td                |   5 +-
>  4 files changed, 135 insertions(+), 4 deletions(-)
> 
> diff --git a/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp b/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp
> index 02ce566..caeab9a 100644
> --- a/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp
> +++ b/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp
> @@ -143,6 +143,7 @@ void R600MCCodeEmitter::EncodeInstruction(const MCInst &MI, raw_ostream &OS,
>      EmitFCInstr(MI, OS);
>    } else if (MI.getOpcode() == AMDGPU::RETURN ||
>      MI.getOpcode() == AMDGPU::FETCH_CLAUSE ||
> +    MI.getOpcode() == AMDGPU::ALU_CLAUSE ||
>      MI.getOpcode() == AMDGPU::BUNDLE ||
>      MI.getOpcode() == AMDGPU::KILL) {
>      return;
> @@ -254,7 +255,7 @@ void R600MCCodeEmitter::EncodeInstruction(const MCInst &MI, raw_ostream &OS,
>      case AMDGPU::CF_ALU:
>      case AMDGPU::CF_ALU_PUSH_BEFORE: {
>        uint64_t Inst = getBinaryCodeForInstr(MI, Fixups);
> -      EmitByte(INSTR_CFALU, OS);
> +      EmitByte(INSTR_NATIVE, OS);
>        Emit(Inst, OS);
>        break;
>      }
> @@ -293,7 +294,10 @@ void R600MCCodeEmitter::EncodeInstruction(const MCInst &MI, raw_ostream &OS,
>        break;
>      }
>      default:
> -      EmitALUInstr(MI, Fixups, OS);
> +      uint64_t Inst = getBinaryCodeForInstr(MI, Fixups);
> +      EmitByte(INSTR_NATIVE, OS);
> +      Emit(Inst, OS);
> +//      EmitALUInstr(MI, Fixups, OS);

Stray commented-out code here.

>        break;
>      }
>    }
> diff --git a/lib/Target/R600/R600ControlFlowFinalizer.cpp b/lib/Target/R600/R600ControlFlowFinalizer.cpp
> index def2fa5..09acc6c 100644
> --- a/lib/Target/R600/R600ControlFlowFinalizer.cpp
> +++ b/lib/Target/R600/R600ControlFlowFinalizer.cpp
> @@ -173,6 +173,96 @@ private:
>      return ClauseFile(MIb, ClauseContent);
>    }
>  
> +  void getLiteral(MachineInstr *MI, std::vector<unsigned> &Lits) const {
> +    for (unsigned i = 0, e = MI->getNumOperands(); i < e; ++i) {
> +      MachineOperand &MO = MI->getOperand(i);
> +      if (!MO.isReg())
> +        continue;
> +      if (MO.getReg() != AMDGPU::ALU_LITERAL_X)
> +        continue;
> +      switch (Lits.size()) {
> +      default: llvm_unreachable("Too many literals in Instruction Group");
> +      case 3:
> +        MO.setReg(AMDGPU::ALU_LITERAL_W);
> +        break;
> +      case 2:
> +        MO.setReg(AMDGPU::ALU_LITERAL_Z);
> +        break;
> +      case 1:
> +        MO.setReg(AMDGPU::ALU_LITERAL_Y);
> +        break;
> +      case 0:
> +        break;
> +      }
> +      unsigned ImmIdx = TII->getOperandIdx(MI->getOpcode(), R600Operands::IMM);
> +      Lits.push_back(MI->getOperand(ImmIdx).getImm());
> +      break;
> +    }
> +  }
> +
> +  MachineBasicBlock::iterator insertLiterals(
> +      MachineBasicBlock::iterator InsertPos,
> +      const std::vector<unsigned> &Literals) const {
> +    MachineBasicBlock *MBB = InsertPos->getParent();
> +    for (unsigned i = 0, e = Literals.size(); i < e; i+=2) {
> +      unsigned LiteralPair0 = Literals[i];
> +      unsigned LiteralPair1 = (i + 1 < e)?Literals[i + 1]:0;
> +      InsertPos = BuildMI(MBB, InsertPos->getDebugLoc(),
> +          TII->get(AMDGPU::LITERALS))
> +          .addImm(LiteralPair0)
> +          .addImm(LiteralPair1);
> +    }
> +    return InsertPos;
> +  }
> +
> +  ClauseFile
> +  MakeALUClause(MachineBasicBlock &MBB, MachineBasicBlock::iterator &I)
> +      const {
> +    MachineBasicBlock::iterator ClauseHead = I;
> +    std::vector<MachineInstr *> ClauseContent;
> +    I++;
> +    for (MachineBasicBlock::instr_iterator E = MBB.instr_end(); I != E;) {
> +      if (IsTrivialInst(I)) {
> +        ++I;
> +        continue;
> +      }
> +      if (!I->isBundle() && !TII->isALUInstr(I->getOpcode()))
> +        break;
> +      std::vector<unsigned> Literals;
> +      if (I->isBundle()) {
> +        MachineInstr *DeleteMI = I;
> +        MachineBasicBlock::instr_iterator BI = I.getInstrIterator();
> +        while (++BI != E && BI->isBundledWithPred()) {
> +          BI->unbundleFromPred();
> +          for (unsigned i = 0, e = BI->getNumOperands(); i != e; ++i) {
> +            MachineOperand &MO = BI->getOperand(i);
> +            if (MO.isReg() && MO.isInternalRead())
> +              MO.setIsInternalRead(false);
> +          }
> +          getLiteral(BI, Literals);
> +          ClauseContent.push_back(BI);
> +        }
> +        I = BI;
> +        DeleteMI->eraseFromParent();
> +      } else {
> +        getLiteral(I, Literals);
> +        ClauseContent.push_back(I);
> +        I++;
> +      }
> +      for (unsigned i = 0, e = Literals.size(); i < e; i+=2) {
> +        unsigned literal0 = Literals[i];
> +        unsigned literal2 = (i + 1 < e)?Literals[i + 1]:0;
> +        MachineInstr *MILit = BuildMI(MBB, I, I->getDebugLoc(),
> +            TII->get(AMDGPU::LITERALS))
> +            .addImm(literal0)
> +            .addImm(literal2);
> +        ClauseContent.push_back(MILit);
> +      }
> +    }
> +    ClauseHead->getOperand(7).setImm(ClauseContent.size() - 1);
> +    return ClauseFile(ClauseHead, ClauseContent);
> +  }
> +
>    void
>    EmitFetchClause(MachineBasicBlock::iterator InsertPos, ClauseFile &Clause,
>        unsigned &CfCount) {
> @@ -186,6 +276,19 @@ private:
>      CfCount += 2 * Clause.second.size();
>    }
>  
> +  void
> +  EmitALUClause(MachineBasicBlock::iterator InsertPos, ClauseFile &Clause,
> +      unsigned &CfCount) {
> +    CounterPropagateAddr(Clause.first, CfCount);
> +    MachineBasicBlock *BB = Clause.first->getParent();
> +    BuildMI(BB, InsertPos->getDebugLoc(), TII->get(AMDGPU::ALU_CLAUSE))
> +        .addImm(CfCount);
> +    for (unsigned i = 0, e = Clause.second.size(); i < e; ++i) {
> +      BB->splice(InsertPos, BB, Clause.second[i]);
> +    }
> +    CfCount += Clause.second.size();
> +  }
> +
>    void CounterPropagateAddr(MachineInstr *MI, unsigned Addr) const {
>      MI->getOperand(0).setImm(Addr + MI->getOperand(0).getImm());
>    }
> @@ -242,7 +345,7 @@ public:
>              getHWInstrDesc(CF_CALL_FS));
>          CfCount++;
>        }
> -      std::vector<ClauseFile> FetchClauses;
> +      std::vector<ClauseFile> FetchClauses, AluClauses;
>        for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
>            I != E;) {
>          if (TII->usesTextureCache(I) || TII->usesVertexCache(I)) {
> @@ -260,6 +363,8 @@ public:
>            MaxStack = std::max(MaxStack, CurrentStack);
>            hasPush = true;
>          case AMDGPU::CF_ALU:
> +          I = MI;
> +          AluClauses.push_back(MakeALUClause(MBB, I));
>          case AMDGPU::EG_ExportBuf:
>          case AMDGPU::EG_ExportSwz:
>          case AMDGPU::R600_ExportBuf:
> @@ -373,6 +478,8 @@ public:
>            }
>            for (unsigned i = 0, e = FetchClauses.size(); i < e; i++)
>              EmitFetchClause(I, FetchClauses[i], CfCount);
> +          for (unsigned i = 0, e = AluClauses.size(); i < e; i++)
> +            EmitALUClause(I, AluClauses[i], CfCount);
>          }
>          default:
>            break;
> diff --git a/lib/Target/R600/R600Instructions.td b/lib/Target/R600/R600Instructions.td
> index 0b52dd1..7ef28c4 100644
> --- a/lib/Target/R600/R600Instructions.td
> +++ b/lib/Target/R600/R600Instructions.td
> @@ -941,6 +941,23 @@ def FETCH_CLAUSE : AMDGPUInst <(outs),
>    let Inst = num;
>  }
>  
> +def ALU_CLAUSE : AMDGPUInst <(outs),
> +(ins i32imm:$addr), "ALU clause starting at $addr:", [] > {
> +  field bits<8> Inst;
> +  bits<8> num;
> +  let Inst = num;
> +}
> +
> +def LITERALS : AMDGPUInst <(outs),
> +(ins LITERAL:$literal1, LITERAL:$literal2), "$literal1, $literal2", [] > {
> +  field bits<64> Inst;
> +  bits<32> literal1;
> +  bits<32> literal2;
> +
> +  let Inst{31-0} = literal1;
> +  let Inst{63-32} = literal2;
> +}
> +
>  def PAD : AMDGPUInst <(outs), (ins), "PAD", [] > {
>    field bits<64> Inst;
>  }
> diff --git a/lib/Target/R600/R600RegisterInfo.td b/lib/Target/R600/R600RegisterInfo.td
> index 03f4976..ec5838b 100644
> --- a/lib/Target/R600/R600RegisterInfo.td
> +++ b/lib/Target/R600/R600RegisterInfo.td
> @@ -88,7 +88,10 @@ def NEG_ONE : R600Reg<"-1.0", 249>;
>  def ONE_INT : R600Reg<"1", 250>;
>  def HALF : R600Reg<"0.5", 252>;
>  def NEG_HALF : R600Reg<"-0.5", 252>;
> -def ALU_LITERAL_X : R600Reg<"literal.x", 253>;
> +def ALU_LITERAL_X : R600RegWithChan<"literal.x", 253, "X">;
> +def ALU_LITERAL_Y : R600RegWithChan<"literal.x", 253, "Y">;
> +def ALU_LITERAL_Z : R600RegWithChan<"literal.x", 253, "Z">;
> +def ALU_LITERAL_W : R600RegWithChan<"literal.x", 253, "W">;
>  def PV_X : R600Reg<"pv.x", 254>;
>  def PREDICATE_BIT : R600Reg<"PredicateBit", 0>;
>  def PRED_SEL_OFF: R600Reg<"Pred_sel_off", 0>;
> -- 
> 1.8.1.4
> 

> From 16e6d43a151e77c0507e363302ca7bb0a24f6aca Mon Sep 17 00:00:00 2001
> From: Vincent Lejeune <vljn at ovi.com>
> Date: Tue, 9 Apr 2013 15:10:06 +0200
> Subject: [PATCH 4/5] R600: Packetize instructions
> 
> ---
>  lib/Target/R600/AMDGPU.h                |   1 +
>  lib/Target/R600/AMDGPUTargetMachine.cpp |   3 +-
>  lib/Target/R600/CMakeLists.txt          |   1 +
>  lib/Target/R600/R600InstrInfo.cpp       |  11 +-
>  lib/Target/R600/R600Packetizer.cpp      | 365 ++++++++++++++++++++++++++++++++
>  5 files changed, 379 insertions(+), 2 deletions(-)
>  create mode 100644 lib/Target/R600/R600Packetizer.cpp
> 
> diff --git a/lib/Target/R600/AMDGPU.h b/lib/Target/R600/AMDGPU.h
> index 0b01433..9792bd8 100644
> --- a/lib/Target/R600/AMDGPU.h
> +++ b/lib/Target/R600/AMDGPU.h
> @@ -24,6 +24,7 @@ class AMDGPUTargetMachine;
>  FunctionPass* createR600KernelParametersPass(const DataLayout *TD);
>  FunctionPass *createR600ExpandSpecialInstrsPass(TargetMachine &tm);
>  FunctionPass *createR600EmitClauseMarkers(TargetMachine &tm);
> +FunctionPass *createR600Packetizer(TargetMachine &tm);
>  FunctionPass *createR600ControlFlowFinalizer(TargetMachine &tm);
>  
>  // SI Passes
> diff --git a/lib/Target/R600/AMDGPUTargetMachine.cpp b/lib/Target/R600/AMDGPUTargetMachine.cpp
> index e7ea876..0ec67ce 100644
> --- a/lib/Target/R600/AMDGPUTargetMachine.cpp
> +++ b/lib/Target/R600/AMDGPUTargetMachine.cpp
> @@ -153,8 +153,9 @@ bool AMDGPUPassConfig::addPreEmitPass() {
>      addPass(createAMDGPUCFGStructurizerPass(*TM));
>      addPass(createR600EmitClauseMarkers(*TM));
>      addPass(createR600ExpandSpecialInstrsPass(*TM));
> -    addPass(createR600ControlFlowFinalizer(*TM));
>      addPass(&FinalizeMachineBundlesID);
> +    addPass(createR600Packetizer(*TM));
> +    addPass(createR600ControlFlowFinalizer(*TM));
>    } else {
>      addPass(createSILowerControlFlowPass(*TM));
>    }
> diff --git a/lib/Target/R600/CMakeLists.txt b/lib/Target/R600/CMakeLists.txt
> index 8efba58..2ad2047 100644
> --- a/lib/Target/R600/CMakeLists.txt
> +++ b/lib/Target/R600/CMakeLists.txt
> @@ -42,6 +42,7 @@ add_llvm_target(R600CodeGen
>    R600ISelLowering.cpp
>    R600MachineFunctionInfo.cpp
>    R600MachineScheduler.cpp
> +  R600Packetizer.cpp
>    R600RegisterInfo.cpp
>    SIAnnotateControlFlow.cpp
>    SIInsertWaits.cpp
> diff --git a/lib/Target/R600/R600InstrInfo.cpp b/lib/Target/R600/R600InstrInfo.cpp
> index 8e5ee7c..ecb2476 100644
> --- a/lib/Target/R600/R600InstrInfo.cpp
> +++ b/lib/Target/R600/R600InstrInfo.cpp
> @@ -252,10 +252,19 @@ R600InstrInfo::canBundle(const std::vector<MachineInstr *> &MIs) const {
>        int SrcIdx = getOperandIdx(MI->getOpcode(), OpTable[j][0]);
>        if (SrcIdx < 0)
>          break;
> -      if (MI->getOperand(SrcIdx).getReg() == AMDGPU::ALU_CONST) {
> +      unsigned Reg = MI->getOperand(SrcIdx).getReg();
> +      if (Reg == AMDGPU::ALU_CONST) {
>          unsigned Const = MI->getOperand(
>              getOperandIdx(MI->getOpcode(), OpTable[j][1])).getImm();
>          Consts.push_back(Const);
> +        continue;
> +      }
> +      if (AMDGPU::R600_KC0RegClass.contains(Reg) ||
> +          AMDGPU::R600_KC1RegClass.contains(Reg)) {
> +        unsigned Index = RI.getEncodingValue(Reg) & 0xff;
> +        unsigned Chan = RI.getHWRegChan(Reg);
> +        Consts.push_back((Index << 2) | Chan);
> +        continue;
>        }
>      }
>    }
> diff --git a/lib/Target/R600/R600Packetizer.cpp b/lib/Target/R600/R600Packetizer.cpp
> new file mode 100644
> index 0000000..a66b3ea
> --- /dev/null
> +++ b/lib/Target/R600/R600Packetizer.cpp
> @@ -0,0 +1,365 @@
> +//===----- HexagonPacketizer.cpp - vliw packetizer ---------------------===//

Should be R600Packetizer.cpp

> +//
> +//                     The LLVM Compiler Infrastructure
> +//
> +// This file is distributed under the University of Illinois Open Source
> +// License. See LICENSE.TXT for details.
> +//
> +//===----------------------------------------------------------------------===//
> +//
> +
You need to add a /// \file comment here for doxygen.
> +//
> +//===----------------------------------------------------------------------===//
> +
> +#ifndef R600PACKETIZER_CPP
> +#define R600PACKETIZER_CPP
> +
> +#define DEBUG_TYPE "packets"
> +#include "llvm/Support/Debug.h"
> +#include "llvm/Support/raw_ostream.h"
> +#include "llvm/CodeGen/DFAPacketizer.h"
> +#include "llvm/CodeGen/Passes.h"
> +#include "llvm/CodeGen/MachineFunctionPass.h"
> +#include "llvm/CodeGen/MachineDominators.h"
> +#include "llvm/CodeGen/MachineLoopInfo.h"
> +#include "llvm/CodeGen/ScheduleDAG.h"
> +#include "AMDGPU.h"
> +#include "R600InstrInfo.h"
> +
> +namespace llvm {
> +
> +class R600Packetizer : public MachineFunctionPass {
> +
> +public:
> +  static char ID;
> +  R600Packetizer(const TargetMachine &TM) : MachineFunctionPass(ID) {}
> +
> +  void getAnalysisUsage(AnalysisUsage &AU) const {
> +    AU.setPreservesCFG();
> +    AU.addRequired<MachineDominatorTree>();
> +    AU.addPreserved<MachineDominatorTree>();
> +    AU.addRequired<MachineLoopInfo>();
> +    AU.addPreserved<MachineLoopInfo>();
> +    MachineFunctionPass::getAnalysisUsage(AU);
> +  }
> +
> +  const char *getPassName() const {
> +    return "R600 Packetizer";
> +  }
> +
> +  bool runOnMachineFunction(MachineFunction &Fn);
> +};
> +char R600Packetizer::ID = 0;
> +
> +class R600PacketizerList : public VLIWPacketizerList {
> +
> +private:
> +  const R600InstrInfo *TII;
> +  const R600RegisterInfo &TRI;
> +
> +  enum BankSwizzle {
> +    ALU_VEC_012 = 0,
> +    ALU_VEC_021,
> +    ALU_VEC_120,
> +    ALU_VEC_102,
> +    ALU_VEC_201,
> +    ALU_VEC_210
> +  };
> +
> +  unsigned getSlot(const MachineInstr *MI) const {
> +    return TRI.getHWRegChan(MI->getOperand(0).getReg());
> +  }
> +
> +public:
> +  // Ctor.
> +  R600PacketizerList(MachineFunction &MF, MachineLoopInfo &MLI,
> +                        MachineDominatorTree &MDT)
> +  : VLIWPacketizerList(MF, MLI, MDT, true),
> +    TII (static_cast<const R600InstrInfo *>(MF.getTarget().getInstrInfo())),
> +    TRI(TII->getRegisterInfo()) { }
> +
> +  // initPacketizerState - initialize some internal flags.
> +  void initPacketizerState() { }
> +
> +  // ignorePseudoInstruction - Ignore bundling of pseudo instructions.
> +  bool ignorePseudoInstruction(MachineInstr *MI, MachineBasicBlock *MBB) {
> +    return false;
> +  }
> +
> +  // isSoloInstruction - return true if instruction MI can not be packetized
> +  // with any other instruction, which means that MI itself is a packet.
> +  bool isSoloInstruction(MachineInstr *MI) {
> +    if (TII->isVector(*MI))
> +      return true;
> +    if (!TII->isALUInstr(MI->getOpcode()))
> +      return true;
> +    if (TII->get(MI->getOpcode()).TSFlags & R600_InstFlag::TRANS_ONLY)
> +      return true;
> +    if (TII->isTransOnly(MI))
> +      return true;
> +    return false;
> +  }
> +
> +  // isLegalToPacketizeTogether - Is it legal to packetize SUI and SUJ
> +  // together.
> +  bool isLegalToPacketizeTogether(SUnit *SUI, SUnit *SUJ) {
> +    if (getSlot(SUI->getInstr()) <= getSlot(SUJ->getInstr()))
> +      return false;
> +    if (SUJ->isSucc(SUI)) {
> +      for (unsigned i = 0, e = SUJ->Succs.size(); i < e; ++i) {
> +        if (SUJ->Succs[i].getSUnit() != SUI)
> +          continue;
> +        if (SUJ->Succs[i].getKind() != SDep::Anti)
> +          return false;
> +      }
> +    }
> +    return true;
> +  }
> +
> +  // isLegalToPruneDependencies - Is it legal to prune dependece between SUI
> +  // and SUJ.
> +  bool isLegalToPruneDependencies(SUnit *SUI, SUnit *SUJ) {return false;}
> +
> +  void setIsLastBit(MachineInstr *MI, unsigned Bit) const {
> +    unsigned LastOp = TII->getOperandIdx(MI->getOpcode(), R600Operands::LAST);
> +    MI->getOperand(LastOp).setImm(Bit);
> +  }
> +
> +  MachineBasicBlock::iterator addToPacket(MachineInstr *MI) {
> +    CurrentPacketMIs.push_back(MI);
> +    bool FitsConstLimits = TII->canBundle(CurrentPacketMIs);
> +    DEBUG(
> +      if (!FitsConstLimits) {
> +        dbgs() << "Couldn't pack :\n";
> +        MI->dump();
> +        dbgs() << "with the following packets :\n";
> +        for (unsigned i = 0, e = CurrentPacketMIs.size() - 1; i < e; i++) {
> +          CurrentPacketMIs[i]->dump();
> +          dbgs() << "\n";
> +        }
> +        dbgs() << "because of Consts read limitations\n";
> +      });
> +    bool FitsReadPortLimits = fitsReadPortLimitation(CurrentPacketMIs);
> +    DEBUG(
> +      if (!FitsReadPortLimits) {
> +        dbgs() << "Couldn't pack :\n";
> +        MI->dump();
> +        dbgs() << "with the following packets :\n";
> +        for (unsigned i = 0, e = CurrentPacketMIs.size() - 1; i < e; i++) {
> +          CurrentPacketMIs[i]->dump();
> +          dbgs() << "\n";
> +        }
> +        dbgs() << "because of Read port limitations\n";
> +      });
> +    bool isBundlable = FitsConstLimits && FitsReadPortLimits;
> +    CurrentPacketMIs.pop_back();
> +    if (!isBundlable) {
> +      endPacket(MI->getParent(), MI);
> +      return VLIWPacketizerList::addToPacket(MI);
> +    }
> +    if (!CurrentPacketMIs.empty())
> +      setIsLastBit(CurrentPacketMIs.back(), 0);
> +    return VLIWPacketizerList::addToPacket(MI);
> +  }
> +private:
> +  std::vector<std::pair<int, unsigned> >
> +  ExtractSrcs(const MachineInstr *MI) const {
> +    R600Operands::Ops Ops[] = {
> +      R600Operands::SRC0,
> +      R600Operands::SRC1,
> +      R600Operands::SRC2
> +    };
> +    std::vector<std::pair<int, unsigned> > Result;
> +    for (unsigned i = 0; i < 3; i++) {
> +      int OperandIdx = TII->getOperandIdx(MI->getOpcode(), Ops[i]);
> +      if (OperandIdx < 0){
> +        Result.push_back(std::pair<int, unsigned>(-1,0));
> +        continue;
> +      }
> +      unsigned Src = MI->getOperand(OperandIdx).getReg();
> +      unsigned Reg = TRI.getEncodingValue(Src) & 0xff;
> +      if (Reg > 127) {
> +        Result.push_back(std::pair<int, unsigned>(-1,0));
> +        continue;
> +      }
> +      unsigned Chan = TRI.getHWRegChan(Src);
> +      Result.push_back(std::pair<int, unsigned>(Reg, Chan));
> +    }
> +    return Result;
> +  }
> +
> +  std::vector<std::pair<int, unsigned> >
> +  Swizzle(std::vector<std::pair<int, unsigned> > Src,
> +  BankSwizzle Swz) const {
> +    switch (Swz) {
> +    case ALU_VEC_012:
> +      break;
> +    case ALU_VEC_021:
> +      std::swap(Src[1], Src[2]);
> +      break;
> +    case ALU_VEC_102:
> +      std::swap(Src[0], Src[1]);
> +      break;
> +    case ALU_VEC_120:
> +      std::swap(Src[0], Src[1]);
> +      std::swap(Src[0], Src[2]);
> +      break;
> +    case ALU_VEC_201:
> +      std::swap(Src[0], Src[2]);
> +      std::swap(Src[0], Src[1]);
> +      break;
> +    case ALU_VEC_210:
> +      std::swap(Src[0], Src[2]);
> +      break;
> +    }
> +    return Src;
> +  }
> +
> +  bool isLegal(const std::vector<MachineInstr *> &IG,
> +      const std::vector<BankSwizzle> &Swz) const {
> +    assert (Swz.size() == IG.size());
> +    int Vector[4][3];
> +    memset(Vector, -1, sizeof(Vector));
> +    for (unsigned i = 0, e = IG.size(); i < e; i++) {
> +      const std::vector<std::pair<int, unsigned> > &Srcs =
> +          Swizzle(ExtractSrcs(IG[i]), Swz[i]);
> +      for (unsigned j = 0; j < 3; j++) {
> +        const std::pair<int, unsigned> &Src = Srcs[j];
> +        if (Src.first < 0)
> +          continue;
> +        if (Vector[Src.second][j] < 0)
> +          Vector[Src.second][j] = Src.first;
> +        if (Vector[Src.second][j] != Src.first)
> +          return false;
> +      }
> +    }
> +    return true;
> +  }
> +
> +  bool recursiveFitsFPLimitation(
> +  std::vector<MachineInstr *> IG,
> +  std::vector<BankSwizzle> &SwzCandidate,
> +  std::vector<MachineInstr *> CurrentlyChecked)
> +      const {
> +    if (!isLegal(CurrentlyChecked, SwzCandidate))
> +      return false;
> +    if (IG.size() == CurrentlyChecked.size()) {
> +      return true;
> +    }
> +    BankSwizzle AvailableSwizzle[] = {
> +      ALU_VEC_012,
> +      ALU_VEC_021,
> +      ALU_VEC_120,
> +      ALU_VEC_102,
> +      ALU_VEC_201,
> +      ALU_VEC_210
> +    };
> +    CurrentlyChecked.push_back(IG[CurrentlyChecked.size()]);
> +    for (unsigned i = 0; i < 6; i++) {
> +      SwzCandidate.push_back(AvailableSwizzle[i]);
> +      if (recursiveFitsFPLimitation(IG, SwzCandidate, CurrentlyChecked))
> +        return true;
> +      SwzCandidate.pop_back();
> +    }
> +    return false;
> +  }
> +
> +  bool fitsReadPortLimitation(
> +  std::vector<MachineInstr *> IG)
> +      const {
> +    std::vector<BankSwizzle> SwzCandidate;
> +    bool Result = recursiveFitsFPLimitation(IG, SwzCandidate,
> +        std::vector<MachineInstr *>());
> +    if (!Result)
> +      return false;
> +    for (unsigned i = 0, e = IG.size(); i < e; i++) {
> +      MachineInstr *MI = IG[i];
> +      unsigned Op = TII->getOperandIdx(MI->getOpcode(),
> +          R600Operands::BANK_SWIZZLE);
> +      MI->getOperand(Op).setImm(SwzCandidate[i]);
> +    }
> +    return true;
> +  }
> +};
> +
> +bool R600Packetizer::runOnMachineFunction(MachineFunction &Fn) {
> +  const TargetInstrInfo *TII = Fn.getTarget().getInstrInfo();
> +  MachineLoopInfo &MLI = getAnalysis<MachineLoopInfo>();
> +  MachineDominatorTree &MDT = getAnalysis<MachineDominatorTree>();
> +
> +  // Instantiate the packetizer.
> +  R600PacketizerList Packetizer(Fn, MLI, MDT);
> +
> +  // DFA state table should not be empty.
> +  assert(Packetizer.getResourceTracker() && "Empty DFA table!");
> +
> +  //
> +  // Loop over all basic blocks and remove KILL pseudo-instructions
> +  // These instructions confuse the dependence analysis. Consider:
> +  // D0 = ...   (Insn 0)
> +  // R0 = KILL R0, D0 (Insn 1)
> +  // R0 = ... (Insn 2)
> +  // Here, Insn 1 will result in the dependence graph not emitting an output
> +  // dependence between Insn 0 and Insn 2. This can lead to incorrect
> +  // packetization
> +  //
> +  for (MachineFunction::iterator MBB = Fn.begin(), MBBe = Fn.end();
> +       MBB != MBBe; ++MBB) {
> +    MachineBasicBlock::iterator End = MBB->end();
> +    MachineBasicBlock::iterator MI = MBB->begin();
> +    while (MI != End) {
> +      if (MI->isKill()) {
> +        MachineBasicBlock::iterator DeleteMI = MI;
> +        ++MI;
> +        MBB->erase(DeleteMI);
> +        End = MBB->end();
> +        continue;
> +      }
> +      ++MI;
> +    }
> +  }
> +
> +  // Loop over all of the basic blocks.
> +  for (MachineFunction::iterator MBB = Fn.begin(), MBBe = Fn.end();
> +       MBB != MBBe; ++MBB) {
> +    // Find scheduling regions and schedule / packetize each region.
> +    unsigned RemainingCount = MBB->size();
> +    for(MachineBasicBlock::iterator RegionEnd = MBB->end();
> +        RegionEnd != MBB->begin();) {
> +      // The next region starts above the previous region. Look backward in the
> +      // instruction stream until we find the nearest boundary.
> +      MachineBasicBlock::iterator I = RegionEnd;
> +      for(;I != MBB->begin(); --I, --RemainingCount) {
> +        if (TII->isSchedulingBoundary(llvm::prior(I), MBB, Fn))
> +          break;
> +      }
> +      I = MBB->begin();
> +
> +      // Skip empty scheduling regions.
> +      if (I == RegionEnd) {
> +        RegionEnd = llvm::prior(RegionEnd);
> +        --RemainingCount;
> +        continue;
> +      }
> +      // Skip regions with one instruction.
> +      if (I == llvm::prior(RegionEnd)) {
> +        RegionEnd = llvm::prior(RegionEnd);
> +        continue;
> +      }
> +
> +      Packetizer.PacketizeMIs(MBB, I, RegionEnd);
> +      RegionEnd = I;
> +    }
> +  }
> +
> +  return true;
> +
> +}
> +
> +}
> +
> +llvm::FunctionPass *llvm::createR600Packetizer(TargetMachine &tm) {
> +  return new R600Packetizer(tm);
> +}
> +
> +#endif // R600PACKETIZER_CPP
> -- 
> 1.8.1.4
> 

> _______________________________________________
> llvm-commits mailing list
> llvm-commits at cs.uiuc.edu
> http://lists.cs.uiuc.edu/mailman/listinfo/llvm-commits