R600: Various Fixes

Tom Stellard tom at stellard.net
Tue Jul 9 07:19:24 PDT 2013


On Tue, Jul 09, 2013 at 06:34:42AM -0700, Vincent Lejeune wrote:
> Patch 2 can be a good candidate for the stable branch,

Ok, can you mark it as a candidate before you commit

> Patch 5 too (updated version attached) but I'm not sure it will apply cleanly.
> 
> 

Patch 5 is:
Reviewed-by: Tom Stellard <thomas.stellard at amd.com>

Go ahead and mark this as a stable candidate too, I will try to backport
it.

-Tom

> 
> 
> ----- Mail original -----
> > De : Tom Stellard <tom at stellard.net>
> > À : Vincent Lejeune <vljn at ovi.com>
> > Cc : "llvm-commits at cs.uiuc.edu" <llvm-commits at cs.uiuc.edu>
> > Envoyé le : Lundi 8 juillet 2013 18h11
> > Objet : Re: R600: Various Fixes
> > 
> > On Sun, Jul 07, 2013 at 07:34:27AM -0700, Vincent Lejeune wrote:
> >>  Hi,
> >> 
> >>  This patches serie fixes some bugs uncovered by running some webgl samples.
> >> 
> >>  Patches 1, 3, 4 fix 2 bugs in the VectorMerger pass.
> >>  Patch 2 makes fcos/fsin works for all values, and not only the one between 
> > -Pi and Pi.
> >>  Last patch rework the predication we're doing by taking clause marker 
> > into account.
> >>  Currently we loose some predicate opportunities (all machine block with 
> > Kcache constant
> >>  becomes unpredicable atm, this can be solved later but need the 
> > isPredicableFunction to accept
> >>  an extra argument) but the pass output should be correct in most 
> > situations.
> >> 
> >> 
> > 
> > Patches 1-4 are:
> > 
> > Reviewed-by: Tom Stellard <thomas.stellard at amd.com>
> > 
> > Are any of these candidates for stable?
> > 
> > -Tom
> > 
> >>  From cc427492b38d672f33ffd13ac3762d8064476663 Mon Sep 17 00:00:00 2001
> >>  From: Vincent Lejeune <vljn at ovi.com>
> >>  Date: Wed, 3 Jul 2013 21:25:38 +0200
> >>  Subject: [PATCH 5/5] R600: Do not predicated basic block with multiple alu
> >>   clause
> >> 
> >>  Test is not included as it is several 1000 lines long.
> >>  To test this functionnality, a test case must generate at least 2 ALU 
> > clauses,
> >>  where an ALU clause is ~110 instructions long.
> >>  ---
> >>   lib/Target/R600/AMDGPUTargetMachine.cpp      |  3 +-
> >>   lib/Target/R600/R600ControlFlowFinalizer.cpp |  2 ++
> >>   lib/Target/R600/R600EmitClauseMarkers.cpp    | 10 ++++---
> >>   lib/Target/R600/R600InstrInfo.cpp            | 45 
> > ++++++++++++++++++++++++++++
> >>   lib/Target/R600/R600Instructions.td          |  2 +-
> >>   lib/Target/R600/R600Packetizer.cpp           |  3 +-
> >>   test/CodeGen/R600/jump-address.ll            |  2 +-
> >>   7 files changed, 58 insertions(+), 9 deletions(-)
> >> 
> >>  diff --git a/lib/Target/R600/AMDGPUTargetMachine.cpp 
> > b/lib/Target/R600/AMDGPUTargetMachine.cpp
> >>  index 90f72de..59b1fd6 100644
> >>  --- a/lib/Target/R600/AMDGPUTargetMachine.cpp
> >>  +++ b/lib/Target/R600/AMDGPUTargetMachine.cpp
> >>  @@ -148,7 +148,7 @@ bool AMDGPUPassConfig::addPostRegAlloc() {
> >>   }
> >>   
> >>   bool AMDGPUPassConfig::addPreSched2() {
> >>  -
> >>  +  addPass(createR600EmitClauseMarkers(*TM));
> > 
> > Does adding this pass here mean it will be run for SI too?
> > 
> >>     addPass(&IfConverterID);
> >>     return false;
> >>   }
> >>  @@ -158,7 +158,6 @@ bool AMDGPUPassConfig::addPreEmitPass() {
> >>     if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
> >>       addPass(createAMDGPUCFGPreparationPass(*TM));
> >>       addPass(createAMDGPUCFGStructurizerPass(*TM));
> >>  -    addPass(createR600EmitClauseMarkers(*TM));
> >>       addPass(createR600ExpandSpecialInstrsPass(*TM));
> >>       addPass(&FinalizeMachineBundlesID);
> >>       addPass(createR600Packetizer(*TM));
> >>  diff --git a/lib/Target/R600/R600ControlFlowFinalizer.cpp 
> > b/lib/Target/R600/R600ControlFlowFinalizer.cpp
> >>  index 887c808..932a6a7 100644
> >>  --- a/lib/Target/R600/R600ControlFlowFinalizer.cpp
> >>  +++ b/lib/Target/R600/R600ControlFlowFinalizer.cpp
> >>  @@ -256,6 +256,7 @@ private:
> >>           ClauseContent.push_back(MILit);
> >>         }
> >>       }
> >>  +    assert(ClauseContent.size() < 128 && "ALU clause is 
> > too big");
> >>       ClauseHead->getOperand(7).setImm(ClauseContent.size() - 1);
> >>       return ClauseFile(ClauseHead, ClauseContent);
> >>     }
> >>  @@ -276,6 +277,7 @@ private:
> >>     void
> >>     EmitALUClause(MachineBasicBlock::iterator InsertPos, ClauseFile 
> > &Clause,
> >>         unsigned &CfCount) {
> >>  +    Clause.first->getOperand(0).setImm(0);
> >>       CounterPropagateAddr(Clause.first, CfCount);
> >>       MachineBasicBlock *BB = Clause.first->getParent();
> >>       BuildMI(BB, InsertPos->getDebugLoc(), 
> > TII->get(AMDGPU::ALU_CLAUSE))
> >>  diff --git a/lib/Target/R600/R600EmitClauseMarkers.cpp 
> > b/lib/Target/R600/R600EmitClauseMarkers.cpp
> >>  index 0aea2d7..8056634 100644
> >>  --- a/lib/Target/R600/R600EmitClauseMarkers.cpp
> >>  +++ b/lib/Target/R600/R600EmitClauseMarkers.cpp
> >>  @@ -32,6 +32,7 @@ class R600EmitClauseMarkersPass : public 
> > MachineFunctionPass {
> >>   private:
> >>     static char ID;
> >>     const R600InstrInfo *TII;
> >>  +  int address;
> > 
> > Coding Style address should be capitalized.
> >>   
> >>     unsigned OccupiedDwords(MachineInstr *MI) const {
> >>       switch (MI->getOpcode()) {
> >>  @@ -159,7 +160,7 @@ private:
> >>     }
> >>   
> >>     MachineBasicBlock::iterator
> >>  -  MakeALUClause(MachineBasicBlock &MBB, MachineBasicBlock::iterator I) 
> > const {
> >>  +  MakeALUClause(MachineBasicBlock &MBB, MachineBasicBlock::iterator I) 
> > {
> >>       MachineBasicBlock::iterator ClauseHead = I;
> >>       std::vector<std::pair<unsigned, unsigned> > KCacheBanks;
> >>       bool PushBeforeModifier = false;
> >>  @@ -199,20 +200,21 @@ private:
> >>       unsigned Opcode = PushBeforeModifier ?
> >>           AMDGPU::CF_ALU_PUSH_BEFORE : AMDGPU::CF_ALU;
> >>       BuildMI(MBB, ClauseHead, MBB.findDebugLoc(ClauseHead), 
> > TII->get(Opcode))
> >>  -        .addImm(0) // ADDR
> >>  +        .addImm(address++) // ADDR
> >>           .addImm(KCacheBanks.empty()?0:KCacheBanks[0].first) // KB0
> >>           .addImm((KCacheBanks.size() < 2)?0:KCacheBanks[1].first) // KB1
> >>           .addImm(KCacheBanks.empty()?0:2) // KM0
> >>           .addImm((KCacheBanks.size() < 2)?0:2) // KM1
> >>           .addImm(KCacheBanks.empty()?0:KCacheBanks[0].second) // KLINE0
> >>           .addImm((KCacheBanks.size() < 2)?0:KCacheBanks[1].second) // 
> > KLINE1
> >>  -        .addImm(AluInstCount); // COUNT
> >>  +        .addImm(AluInstCount) // COUNT
> >>  +        .addImm(1); // Enabled
> >>       return I;
> >>     }
> >>   
> >>   public:
> >>     R600EmitClauseMarkersPass(TargetMachine &tm) : 
> > MachineFunctionPass(ID),
> >>  -    TII(0) { }
> >>  +    TII(0), address(0) { }
> >>   
> >>     virtual bool runOnMachineFunction(MachineFunction &MF) {
> >>       TII = static_cast<const R600InstrInfo 
> > *>(MF.getTarget().getInstrInfo());
> >>  diff --git a/lib/Target/R600/R600InstrInfo.cpp 
> > b/lib/Target/R600/R600InstrInfo.cpp
> >>  index 974a579..8d622d6 100644
> >>  --- a/lib/Target/R600/R600InstrInfo.cpp
> >>  +++ b/lib/Target/R600/R600InstrInfo.cpp
> >>  @@ -650,6 +650,17 @@ int R600InstrInfo::getBranchInstr(const MachineOperand 
> > &op) const {
> >>     };
> >>   }
> >>   
> >>  +static
> >>  +MachineBasicBlock::iterator FindLastAluClause(MachineBasicBlock &MBB) 
> > {
> >>  +  for (MachineBasicBlock::reverse_iterator It = MBB.rbegin(), E = 
> > MBB.rend();
> >>  +      It != E; ++It) {
> >>  +    if (It->getOpcode() == AMDGPU::CF_ALU ||
> >>  +        It->getOpcode() == AMDGPU::CF_ALU_PUSH_BEFORE)
> >>  +      return llvm::prior(It.base());
> >>  +  }
> >>  +  return MBB.end();
> >>  +}
> >>  +
> >>   unsigned
> >>   R600InstrInfo::InsertBranch(MachineBasicBlock &MBB,
> >>                               MachineBasicBlock *TBB,
> >>  @@ -671,6 +682,11 @@ R600InstrInfo::InsertBranch(MachineBasicBlock 
> > &MBB,
> >>         BuildMI(&MBB, DL, get(AMDGPU::JUMP_COND))
> >>                .addMBB(TBB)
> >>                .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
> >>  +      MachineBasicBlock::iterator CfAlu = FindLastAluClause(MBB);
> >>  +      if (CfAlu == MBB.end())
> >>  +        return 1;
> >>  +      assert (CfAlu->getOpcode() == AMDGPU::CF_ALU);
> >>  +      CfAlu->setDesc(get(AMDGPU::CF_ALU_PUSH_BEFORE));
> >>         return 1;
> >>       }
> >>     } else {
> >>  @@ -682,6 +698,11 @@ R600InstrInfo::InsertBranch(MachineBasicBlock 
> > &MBB,
> >>               .addMBB(TBB)
> >>               .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
> >>       BuildMI(&MBB, DL, get(AMDGPU::JUMP)).addMBB(FBB);
> >>  +    MachineBasicBlock::iterator CfAlu = FindLastAluClause(MBB);
> >>  +    if (CfAlu == MBB.end())
> >>  +      return 2;
> >>  +    assert (CfAlu->getOpcode() == AMDGPU::CF_ALU);
> >>  +    CfAlu->setDesc(get(AMDGPU::CF_ALU_PUSH_BEFORE));
> >>       return 2;
> >>     }
> >>   }
> >>  @@ -705,6 +726,11 @@ R600InstrInfo::RemoveBranch(MachineBasicBlock 
> > &MBB) const {
> >>       MachineInstr *predSet = findFirstPredicateSetterFrom(MBB, I);
> >>       clearFlag(predSet, 0, MO_FLAG_PUSH);
> >>       I->eraseFromParent();
> >>  +    MachineBasicBlock::iterator CfAlu = FindLastAluClause(MBB);
> >>  +    if (CfAlu == MBB.end())
> >>  +      break;
> >>  +    assert (CfAlu->getOpcode() == AMDGPU::CF_ALU_PUSH_BEFORE);
> >>  +    CfAlu->setDesc(get(AMDGPU::CF_ALU));
> >>       break;
> >>     }
> >>     case AMDGPU::JUMP:
> >>  @@ -725,6 +751,11 @@ R600InstrInfo::RemoveBranch(MachineBasicBlock 
> > &MBB) const {
> >>       MachineInstr *predSet = findFirstPredicateSetterFrom(MBB, I);
> >>       clearFlag(predSet, 0, MO_FLAG_PUSH);
> >>       I->eraseFromParent();
> >>  +    MachineBasicBlock::iterator CfAlu = FindLastAluClause(MBB);
> >>  +    if (CfAlu == MBB.end())
> >>  +      break;
> >>  +    assert (CfAlu->getOpcode() == AMDGPU::CF_ALU_PUSH_BEFORE);
> >>  +    CfAlu->setDesc(get(AMDGPU::CF_ALU));
> >>       break;
> >>     }
> >>     case AMDGPU::JUMP:
> >>  @@ -759,6 +790,15 @@ R600InstrInfo::isPredicable(MachineInstr *MI) const {
> >>   
> >>     if (MI->getOpcode() == AMDGPU::KILLGT) {
> >>       return false;
> >>  +  } else if (MI->getOpcode() == AMDGPU::CF_ALU) {
> >>  +    // If the clause start in the middle of MBB then the MBB has more
> >>  +    // than a single clause, unable to predicate several clauses.
> >>  +    if (MI->getParent()->begin() != MachineBasicBlock::iterator(MI))
> >>  +      return false;
> >>  +    // TODO: We don't support KC merging atm
> >>  +    if (MI->getOperand(3).getImm() != 0 || 
> > MI->getOperand(4).getImm() != 0)
> >>  +      return false;
> >>  +    return true;
> >>     } else if (isVector(*MI)) {
> >>       return false;
> >>     } else {
> >>  @@ -854,6 +894,11 @@ R600InstrInfo::PredicateInstruction(MachineInstr *MI,
> >>                         const SmallVectorImpl<MachineOperand> 
> > &Pred) const {
> >>     int PIdx = MI->findFirstPredOperandIdx();
> >>   
> >>  +  if (MI->getOpcode() == AMDGPU::CF_ALU) {
> >>  +    MI->getOperand(8).setImm(0);
> >>  +    return true;
> >>  +  }
> >>  +
> >>     if (PIdx != -1) {
> >>       MachineOperand &PMO = MI->getOperand(PIdx);
> >>       PMO.setReg(Pred[2].getReg());
> >>  diff --git a/lib/Target/R600/R600Instructions.td 
> > b/lib/Target/R600/R600Instructions.td
> >>  index 735dcfc..df5c438 100644
> >>  --- a/lib/Target/R600/R600Instructions.td
> >>  +++ b/lib/Target/R600/R600Instructions.td
> >>  @@ -563,7 +563,7 @@ class ALU_CLAUSE<bits<4> inst, string 
> > OpName> : AMDGPUInst <(outs),
> >>   (ins i32imm:$ADDR, i32imm:$KCACHE_BANK0, i32imm:$KCACHE_BANK1,
> >>   KCACHE:$KCACHE_MODE0, KCACHE:$KCACHE_MODE1,
> >>   i32imm:$KCACHE_ADDR0, i32imm:$KCACHE_ADDR1,
> >>  -i32imm:$COUNT),
> >>  +i32imm:$COUNT, i32imm:$Enabled),
> >>   !strconcat(OpName, " $COUNT, @$ADDR, "
> >>   "KC0[$KCACHE_MODE0], KC1[$KCACHE_MODE1]"),
> >>   [] >, CF_ALU_WORD0, CF_ALU_WORD1 {
> >>  diff --git a/lib/Target/R600/R600Packetizer.cpp 
> > b/lib/Target/R600/R600Packetizer.cpp
> >>  index 5ee51fa..f4219bd 100644
> >>  --- a/lib/Target/R600/R600Packetizer.cpp
> >>  +++ b/lib/Target/R600/R600Packetizer.cpp
> >>  @@ -304,7 +304,8 @@ bool 
> > R600Packetizer::runOnMachineFunction(MachineFunction &Fn) {
> >>       MachineBasicBlock::iterator End = MBB->end();
> >>       MachineBasicBlock::iterator MI = MBB->begin();
> >>       while (MI != End) {
> >>  -      if (MI->isKill()) {
> >>  +      if (MI->isKill() ||
> >>  +          (MI->getOpcode() == AMDGPU::CF_ALU && 
> > !MI->getOperand(8).getImm())) {
> >>           MachineBasicBlock::iterator DeleteMI = MI;
> >>           ++MI;
> >>           MBB->erase(DeleteMI);
> >>  diff --git a/test/CodeGen/R600/jump-address.ll 
> > b/test/CodeGen/R600/jump-address.ll
> >>  index ae9c8bb..9a5f1bc 100644
> >>  --- a/test/CodeGen/R600/jump-address.ll
> >>  +++ b/test/CodeGen/R600/jump-address.ll
> >>  @@ -1,6 +1,6 @@
> >>   ;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
> >>   
> >>  -; CHECK: JUMP @3
> >>  +; CHECK: JUMP @7
> >>   ; CHECK: EXPORT
> >>   ; CHECK-NOT: EXPORT
> >>   
> >>  -- 
> >>  1.8.3.1
> >> 
> > 
> >>  _______________________________________________
> >>  llvm-commits mailing list
> >>  llvm-commits at cs.uiuc.edu
> >>  http://lists.cs.uiuc.edu/mailman/listinfo/llvm-commits
> > 

> From 8cf2e33a07816f650585212e2158fb17b70c1ca9 Mon Sep 17 00:00:00 2001
> From: Vincent Lejeune <vljn at ovi.com>
> Date: Wed, 3 Jul 2013 21:25:38 +0200
> Subject: [PATCH] R600: Do not predicated basic block with multiple alu clause
> 
> Test is not included as it is several 1000 lines long.
> To test this functionnality, a test case must generate at least 2 ALU clauses,
> where an ALU clause is ~110 instructions long.
> ---
>  lib/Target/R600/AMDGPUTargetMachine.cpp      |  5 +++-
>  lib/Target/R600/R600ControlFlowFinalizer.cpp |  2 ++
>  lib/Target/R600/R600EmitClauseMarkers.cpp    | 14 ++++++---
>  lib/Target/R600/R600InstrInfo.cpp            | 45 ++++++++++++++++++++++++++++
>  lib/Target/R600/R600Instructions.td          |  2 +-
>  lib/Target/R600/R600Packetizer.cpp           |  3 +-
>  test/CodeGen/R600/jump-address.ll            |  2 +-
>  7 files changed, 65 insertions(+), 8 deletions(-)
> 
> diff --git a/lib/Target/R600/AMDGPUTargetMachine.cpp b/lib/Target/R600/AMDGPUTargetMachine.cpp
> index 90f72de..7a14e50 100644
> --- a/lib/Target/R600/AMDGPUTargetMachine.cpp
> +++ b/lib/Target/R600/AMDGPUTargetMachine.cpp
> @@ -148,7 +148,11 @@ bool AMDGPUPassConfig::addPostRegAlloc() {
>  }
>  
>  bool AMDGPUPassConfig::addPreSched2() {
> +  const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>();
>  
> +  if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
> +    addPass(createR600EmitClauseMarkers(*TM));
> +  }
>    addPass(&IfConverterID);
>    return false;
>  }
> @@ -158,7 +162,6 @@ bool AMDGPUPassConfig::addPreEmitPass() {
>    if (ST.getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
>      addPass(createAMDGPUCFGPreparationPass(*TM));
>      addPass(createAMDGPUCFGStructurizerPass(*TM));
> -    addPass(createR600EmitClauseMarkers(*TM));
>      addPass(createR600ExpandSpecialInstrsPass(*TM));
>      addPass(&FinalizeMachineBundlesID);
>      addPass(createR600Packetizer(*TM));
> diff --git a/lib/Target/R600/R600ControlFlowFinalizer.cpp b/lib/Target/R600/R600ControlFlowFinalizer.cpp
> index 887c808..932a6a7 100644
> --- a/lib/Target/R600/R600ControlFlowFinalizer.cpp
> +++ b/lib/Target/R600/R600ControlFlowFinalizer.cpp
> @@ -256,6 +256,7 @@ private:
>          ClauseContent.push_back(MILit);
>        }
>      }
> +    assert(ClauseContent.size() < 128 && "ALU clause is too big");
>      ClauseHead->getOperand(7).setImm(ClauseContent.size() - 1);
>      return ClauseFile(ClauseHead, ClauseContent);
>    }
> @@ -276,6 +277,7 @@ private:
>    void
>    EmitALUClause(MachineBasicBlock::iterator InsertPos, ClauseFile &Clause,
>        unsigned &CfCount) {
> +    Clause.first->getOperand(0).setImm(0);
>      CounterPropagateAddr(Clause.first, CfCount);
>      MachineBasicBlock *BB = Clause.first->getParent();
>      BuildMI(BB, InsertPos->getDebugLoc(), TII->get(AMDGPU::ALU_CLAUSE))
> diff --git a/lib/Target/R600/R600EmitClauseMarkers.cpp b/lib/Target/R600/R600EmitClauseMarkers.cpp
> index 0aea2d7..c1da64c 100644
> --- a/lib/Target/R600/R600EmitClauseMarkers.cpp
> +++ b/lib/Target/R600/R600EmitClauseMarkers.cpp
> @@ -32,6 +32,7 @@ class R600EmitClauseMarkersPass : public MachineFunctionPass {
>  private:
>    static char ID;
>    const R600InstrInfo *TII;
> +  int Address;
>  
>    unsigned OccupiedDwords(MachineInstr *MI) const {
>      switch (MI->getOpcode()) {
> @@ -159,7 +160,7 @@ private:
>    }
>  
>    MachineBasicBlock::iterator
> -  MakeALUClause(MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const {
> +  MakeALUClause(MachineBasicBlock &MBB, MachineBasicBlock::iterator I) {
>      MachineBasicBlock::iterator ClauseHead = I;
>      std::vector<std::pair<unsigned, unsigned> > KCacheBanks;
>      bool PushBeforeModifier = false;
> @@ -199,20 +200,25 @@ private:
>      unsigned Opcode = PushBeforeModifier ?
>          AMDGPU::CF_ALU_PUSH_BEFORE : AMDGPU::CF_ALU;
>      BuildMI(MBB, ClauseHead, MBB.findDebugLoc(ClauseHead), TII->get(Opcode))
> -        .addImm(0) // ADDR
> +    // We don't use the ADDR field until R600ControlFlowFinalizer pass, where
> +    // it is safe to assume it is 0. However if we always put 0 here, the ifcvt
> +    // pass may assume that identical ALU clause starter at the beginning of a 
> +    // true and false branch can be factorized which is not the case.
> +        .addImm(Address++) // ADDR
>          .addImm(KCacheBanks.empty()?0:KCacheBanks[0].first) // KB0
>          .addImm((KCacheBanks.size() < 2)?0:KCacheBanks[1].first) // KB1
>          .addImm(KCacheBanks.empty()?0:2) // KM0
>          .addImm((KCacheBanks.size() < 2)?0:2) // KM1
>          .addImm(KCacheBanks.empty()?0:KCacheBanks[0].second) // KLINE0
>          .addImm((KCacheBanks.size() < 2)?0:KCacheBanks[1].second) // KLINE1
> -        .addImm(AluInstCount); // COUNT
> +        .addImm(AluInstCount) // COUNT
> +        .addImm(1); // Enabled
>      return I;
>    }
>  
>  public:
>    R600EmitClauseMarkersPass(TargetMachine &tm) : MachineFunctionPass(ID),
> -    TII(0) { }
> +    TII(0), Address(0) { }
>  
>    virtual bool runOnMachineFunction(MachineFunction &MF) {
>      TII = static_cast<const R600InstrInfo *>(MF.getTarget().getInstrInfo());
> diff --git a/lib/Target/R600/R600InstrInfo.cpp b/lib/Target/R600/R600InstrInfo.cpp
> index 969a7ce..d0935fa 100644
> --- a/lib/Target/R600/R600InstrInfo.cpp
> +++ b/lib/Target/R600/R600InstrInfo.cpp
> @@ -651,6 +651,17 @@ int R600InstrInfo::getBranchInstr(const MachineOperand &op) const {
>    };
>  }
>  
> +static
> +MachineBasicBlock::iterator FindLastAluClause(MachineBasicBlock &MBB) {
> +  for (MachineBasicBlock::reverse_iterator It = MBB.rbegin(), E = MBB.rend();
> +      It != E; ++It) {
> +    if (It->getOpcode() == AMDGPU::CF_ALU ||
> +        It->getOpcode() == AMDGPU::CF_ALU_PUSH_BEFORE)
> +      return llvm::prior(It.base());
> +  }
> +  return MBB.end();
> +}
> +
>  unsigned
>  R600InstrInfo::InsertBranch(MachineBasicBlock &MBB,
>                              MachineBasicBlock *TBB,
> @@ -672,6 +683,11 @@ R600InstrInfo::InsertBranch(MachineBasicBlock &MBB,
>        BuildMI(&MBB, DL, get(AMDGPU::JUMP_COND))
>               .addMBB(TBB)
>               .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
> +      MachineBasicBlock::iterator CfAlu = FindLastAluClause(MBB);
> +      if (CfAlu == MBB.end())
> +        return 1;
> +      assert (CfAlu->getOpcode() == AMDGPU::CF_ALU);
> +      CfAlu->setDesc(get(AMDGPU::CF_ALU_PUSH_BEFORE));
>        return 1;
>      }
>    } else {
> @@ -683,6 +699,11 @@ R600InstrInfo::InsertBranch(MachineBasicBlock &MBB,
>              .addMBB(TBB)
>              .addReg(AMDGPU::PREDICATE_BIT, RegState::Kill);
>      BuildMI(&MBB, DL, get(AMDGPU::JUMP)).addMBB(FBB);
> +    MachineBasicBlock::iterator CfAlu = FindLastAluClause(MBB);
> +    if (CfAlu == MBB.end())
> +      return 2;
> +    assert (CfAlu->getOpcode() == AMDGPU::CF_ALU);
> +    CfAlu->setDesc(get(AMDGPU::CF_ALU_PUSH_BEFORE));
>      return 2;
>    }
>  }
> @@ -706,6 +727,11 @@ R600InstrInfo::RemoveBranch(MachineBasicBlock &MBB) const {
>      MachineInstr *predSet = findFirstPredicateSetterFrom(MBB, I);
>      clearFlag(predSet, 0, MO_FLAG_PUSH);
>      I->eraseFromParent();
> +    MachineBasicBlock::iterator CfAlu = FindLastAluClause(MBB);
> +    if (CfAlu == MBB.end())
> +      break;
> +    assert (CfAlu->getOpcode() == AMDGPU::CF_ALU_PUSH_BEFORE);
> +    CfAlu->setDesc(get(AMDGPU::CF_ALU));
>      break;
>    }
>    case AMDGPU::JUMP:
> @@ -726,6 +752,11 @@ R600InstrInfo::RemoveBranch(MachineBasicBlock &MBB) const {
>      MachineInstr *predSet = findFirstPredicateSetterFrom(MBB, I);
>      clearFlag(predSet, 0, MO_FLAG_PUSH);
>      I->eraseFromParent();
> +    MachineBasicBlock::iterator CfAlu = FindLastAluClause(MBB);
> +    if (CfAlu == MBB.end())
> +      break;
> +    assert (CfAlu->getOpcode() == AMDGPU::CF_ALU_PUSH_BEFORE);
> +    CfAlu->setDesc(get(AMDGPU::CF_ALU));
>      break;
>    }
>    case AMDGPU::JUMP:
> @@ -760,6 +791,15 @@ R600InstrInfo::isPredicable(MachineInstr *MI) const {
>  
>    if (MI->getOpcode() == AMDGPU::KILLGT) {
>      return false;
> +  } else if (MI->getOpcode() == AMDGPU::CF_ALU) {
> +    // If the clause start in the middle of MBB then the MBB has more
> +    // than a single clause, unable to predicate several clauses.
> +    if (MI->getParent()->begin() != MachineBasicBlock::iterator(MI))
> +      return false;
> +    // TODO: We don't support KC merging atm
> +    if (MI->getOperand(3).getImm() != 0 || MI->getOperand(4).getImm() != 0)
> +      return false;
> +    return true;
>    } else if (isVector(*MI)) {
>      return false;
>    } else {
> @@ -855,6 +895,11 @@ R600InstrInfo::PredicateInstruction(MachineInstr *MI,
>                        const SmallVectorImpl<MachineOperand> &Pred) const {
>    int PIdx = MI->findFirstPredOperandIdx();
>  
> +  if (MI->getOpcode() == AMDGPU::CF_ALU) {
> +    MI->getOperand(8).setImm(0);
> +    return true;
> +  }
> +
>    if (PIdx != -1) {
>      MachineOperand &PMO = MI->getOperand(PIdx);
>      PMO.setReg(Pred[2].getReg());
> diff --git a/lib/Target/R600/R600Instructions.td b/lib/Target/R600/R600Instructions.td
> index 735dcfc..df5c438 100644
> --- a/lib/Target/R600/R600Instructions.td
> +++ b/lib/Target/R600/R600Instructions.td
> @@ -563,7 +563,7 @@ class ALU_CLAUSE<bits<4> inst, string OpName> : AMDGPUInst <(outs),
>  (ins i32imm:$ADDR, i32imm:$KCACHE_BANK0, i32imm:$KCACHE_BANK1,
>  KCACHE:$KCACHE_MODE0, KCACHE:$KCACHE_MODE1,
>  i32imm:$KCACHE_ADDR0, i32imm:$KCACHE_ADDR1,
> -i32imm:$COUNT),
> +i32imm:$COUNT, i32imm:$Enabled),
>  !strconcat(OpName, " $COUNT, @$ADDR, "
>  "KC0[$KCACHE_MODE0], KC1[$KCACHE_MODE1]"),
>  [] >, CF_ALU_WORD0, CF_ALU_WORD1 {
> diff --git a/lib/Target/R600/R600Packetizer.cpp b/lib/Target/R600/R600Packetizer.cpp
> index 5ee51fa..f4219bd 100644
> --- a/lib/Target/R600/R600Packetizer.cpp
> +++ b/lib/Target/R600/R600Packetizer.cpp
> @@ -304,7 +304,8 @@ bool R600Packetizer::runOnMachineFunction(MachineFunction &Fn) {
>      MachineBasicBlock::iterator End = MBB->end();
>      MachineBasicBlock::iterator MI = MBB->begin();
>      while (MI != End) {
> -      if (MI->isKill()) {
> +      if (MI->isKill() ||
> +          (MI->getOpcode() == AMDGPU::CF_ALU && !MI->getOperand(8).getImm())) {
>          MachineBasicBlock::iterator DeleteMI = MI;
>          ++MI;
>          MBB->erase(DeleteMI);
> diff --git a/test/CodeGen/R600/jump-address.ll b/test/CodeGen/R600/jump-address.ll
> index ae9c8bb..9a5f1bc 100644
> --- a/test/CodeGen/R600/jump-address.ll
> +++ b/test/CodeGen/R600/jump-address.ll
> @@ -1,6 +1,6 @@
>  ;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
>  
> -; CHECK: JUMP @3
> +; CHECK: JUMP @7
>  ; CHECK: EXPORT
>  ; CHECK-NOT: EXPORT
>  
> -- 
> 1.8.3.1
> 





More information about the llvm-commits mailing list