[PATCH] R600: Clean if/then/else handling code in AMDILCFGStructurizer

Tue Nov 19 20:08:48 PST 2013

On Tue, Nov 19, 2013 at 05:36:42AM -0800, Vincent Lejeune wrote:
> ----- Mail original -----
> 
> > De : Tom Stellard <tom at stellard.net>
> > À : reviews+D2188+public+3f8d5bc02a1045dd at llvm-reviews.chandlerc.com
> > Cc : vljn at ovi.com; llvm-commits at cs.uiuc.edu
> > Envoyé le : Lundi 18 novembre 2013 21h09
> > Objet : Re: [PATCH] R600: Clean if/then/else handling code in AMDILCFGStructurizer
> > 
> > Hi Vincent,
> > 
> > I've rebased your patch on top of the current master branch (which
> > includes my patches to enable the IR Structurizer), and attached it to
> > this mail.
> > 
> > Your patch regresses a test case I added in:
> > 
> > r195030
> > R600: Fix a crash in the AMDILCFGStrucurizer
> 
> structurize1.ll works fine on my computer; however, structurize.ll does not (but it is older than r195030).
> structurize.ll actually checks that branches-into-if don't crash LLVM, but it runs with the IR structurizer disabled,
> whereas I assume in my patch that it is enabled. (It will probably require some changes in the CHECK: sections too.)
>

OK, so the LLVM test case is working fine, but the program I derived the
test from is crashing.  I will try to get the full dump for you.  In the
meantime, here is a dump from another shader that is crashing with your
patch.

-Tom

> > 
> > I think the problem may be that not all of the if statements nested
> > inside the loop are being matched by the ifPatternMatch() function.
> > 
> > The other question I have about this patch is: Why is it safe to remove
> > the code for handling branch into IF?
> 
> That's why my patch relies on the IR structurizer: the pass replaces branches into if with predicates
> and (several) triangle CFG patterns, pretty much like you described in the comment of commit
> a4f468f245d6e6869317007c548ee4d33ad97343rev at 192813.
> 
> If I understand correctly what StructurizeCFG does, it will convert a pattern like this:
> 
> //                        entry
> //                       /     |
> //           diamond_head       branch_from
> //             /      \           |
> // diamond_false        diamond_true
> //             \      /
> //               done
> //
> 
> into a pattern similar to this:
> 
> //                 entry
> //                  |        \
> //                  |         diamond_head
> //                  |            |      \
> //                  |            |       diamond_false
> //                  |            |      /
> //                 extra block
> //                  |         \
> //                  |        branch_from
> //                  |       /
> //                 extra block
> //                  |     \
> //                  |    diamond_true
> //                  |    /
> //                 done
> 
> with an additional predicate in diamond_true that is set to true in diamond_head's true path and at the end of branch_from.
> 
> > 
> > -Tom
> > 
> > On Fri, Nov 15, 2013 at 09:16:13AM -0800, Vincent Lejeune wrote:
> >>  Further simplify the pass and fix some crashes in Shadertoy samples.
> >>  The pass is no longer able to copy blocks in the jump-into-if situation and 
> > relies on the StructurizeCFG pass.
> >> 
> >>  http://llvm-reviews.chandlerc.com/D2188
> >> 
> >>  Files:
> >> ?  lib/Target/R600/AMDILCFGStructurizer.cpp
> > 
> >>  Index: lib/Target/R600/AMDILCFGStructurizer.cpp
> >>  ===================================================================
> >>  --- lib/Target/R600/AMDILCFGStructurizer.cpp
> >>  +++ lib/Target/R600/AMDILCFGStructurizer.cpp
> >>  @@ -133,18 +133,15 @@
> >> ? 
> >> ? ? AMDGPUCFGStructurizer(TargetMachine &tm) :
> >> ? ? ? ? MachineFunctionPass(ID), TM(tm),
> >>  -? ? ? TII(static_cast<const R600InstrInfo *>(tm.getInstrInfo())),
> >>  -? ? ? TRI(&TII->getRegisterInfo()) { }
> >>  +? ? ? MLI(0), TII(0), TRI(0), FuncRep(0) { }
> >> ? 
> >> ? ?  const char *getPassName() const {
> >> ? ? ? return "AMD IL Control Flow Graph structurizer Pass";
> >> ? ? }
> >> ? 
> >> ? ? void getAnalysisUsage(AnalysisUsage &AU) const {
> >> ? ? ? AU.addPreserved<MachineFunctionAnalysis>();
> >> ? ? ? AU.addRequired<MachineFunctionAnalysis>();
> >>  -? ? AU.addRequired<MachineDominatorTree>();
> >>  -? ? AU.addRequired<MachinePostDominatorTree>();
> >> ? ? ? AU.addRequired<MachineLoopInfo>();
> >> ? ? }
> >> ? 
> >>  @@ -161,21 +158,17 @@
> >> ? ? ? OrderedBlks.clear();
> >> ? ? ? FuncRep = &MF;
> >> ? ? ? MLI = &getAnalysis<MachineLoopInfo>();
> >>  +? ? TII = static_cast<const R600InstrInfo *>(TM.getInstrInfo());
> >>  +? ? TRI = &TII->getRegisterInfo();
> >> ? ? ? DEBUG(dbgs() << "LoopInfo:\n"; 
> > PrintLoopinfo(*MLI););
> >>  -? ? MDT = &getAnalysis<MachineDominatorTree>();
> >>  -? ? DEBUG(MDT->print(dbgs(), (const llvm::Module*)0););
> >>  -? ? PDT = &getAnalysis<MachinePostDominatorTree>();
> >>  -? ? DEBUG(PDT->print(dbgs()););
> >> ? ? ? prepare();
> >> ? ? ? run();
> >> ? ? ? DEBUG(MF.dump(););
> >> ? ? ? return true;
> >> ? ? }
> >> ? 
> >> ? protected:
> >> ? ? TargetMachine &TM;
> >>  -? MachineDominatorTree *MDT;
> >>  -? MachinePostDominatorTree *PDT;
> >> ? ? MachineLoopInfo *MLI;
> >> ? ? const R600InstrInfo *TII;
> >> ? ? const AMDGPURegisterInfo *TRI;
> >>  @@ -208,12 +201,8 @@
> >> ? ? bool hasBackEdge(MachineBasicBlock *MBB) const;
> >> ? ? static unsigned getLoopDepth(MachineLoop *LoopRep);
> >> ? ? bool isRetiredBlock(MachineBasicBlock *MBB) const;
> >>  -? bool isActiveLoophead(MachineBasicBlock *MBB) const;
> >>  -? PathToKind singlePathTo(MachineBasicBlock *SrcMBB, MachineBasicBlock 
> > *DstMBB,
> >>  -? ? ? bool AllowSideEntry = true) const;
> >> ? ? int countActiveBlock(MBBVector::const_iterator It,
> >> ? ? ? ? MBBVector::const_iterator E) const;
> >>  -? bool needMigrateBlock(MachineBasicBlock *MBB) const;
> >> ? 
> >> ? ? // Utility Functions
> >> ? ? void reversePredicateSetter(MachineBasicBlock::iterator I);
> >>  @@ -264,33 +253,16 @@
> >> ? 
> >> ? 
> >> ? ? int patternMatch(MachineBasicBlock *MBB);
> >>  -? int patternMatchGroup(MachineBasicBlock *MBB);
> >>  -? int serialPatternMatch(MachineBasicBlock *MBB);
> >>  -? int ifPatternMatch(MachineBasicBlock *MBB);
> >>  +? bool patternMatchGroup(MachineBasicBlock *MBB);
> >>  +? bool serialPatternMatch(MachineBasicBlock *MBB);
> >>  +? bool ifPatternMatch(MachineBasicBlock *MBB);
> >> ? ? int loopendPatternMatch();
> >> ? ? int mergeLoop(MachineLoop *LoopRep);
> >> ? ? int loopcontPatternMatch(MachineLoop *LoopRep, MachineBasicBlock 
> > *LoopHeader);
> >> ? 
> >> ? ? void handleLoopcontBlock(MachineBasicBlock *ContingMBB,
> >> ? ? ? ? MachineLoop *ContingLoop, MachineBasicBlock *ContMBB,
> >> ? ? ? ? MachineLoop *ContLoop);
> >>  -? /// return true iff src1Blk->succ_size() == 0 && src1Blk and 
> > src2Blk are in
> >>  -? /// the same loop with LoopLandInfo without explicitly keeping track of
> >>  -? /// loopContBlks and loopBreakBlks, this is a method to get the 
> > information.
> >>  -? bool isSameloopDetachedContbreak(MachineBasicBlock *Src1MBB,
> >>  -? ? ? MachineBasicBlock *Src2MBB);
> >>  -? int handleJumpintoIf(MachineBasicBlock *HeadMBB,
> >>  -? ? ? MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB);
> >>  -? int handleJumpintoIfImp(MachineBasicBlock *HeadMBB,
> >>  -? ? ? MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB);
> >>  -? int improveSimpleJumpintoIf(MachineBasicBlock *HeadMBB,
> >>  -? ? ? MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB,
> >>  -? ? ? MachineBasicBlock **LandMBBPtr);
> >>  -? void showImproveSimpleJumpintoIf(MachineBasicBlock *HeadMBB,
> >>  -? ? ? MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB,
> >>  -? ? ? MachineBasicBlock *LandMBB, bool Detail = false);
> >>  -? int cloneOnSideEntryTo(MachineBasicBlock *PreMBB,
> >>  -? ? ? MachineBasicBlock *SrcMBB, MachineBasicBlock *DstMBB);
> >> ? ? void mergeSerialBlock(MachineBasicBlock *DstMBB,
> >> ? ? ? ? MachineBasicBlock *SrcMBB);
> >> ? 
> >>  @@ -326,18 +298,10 @@
> >> ? ? void removeSuccessor(MachineBasicBlock *MBB);
> >> ? ? MachineBasicBlock *cloneBlockForPredecessor(MachineBasicBlock *MBB,
> >> ? ? ? ? MachineBasicBlock *PredMBB);
> >>  -? void migrateInstruction(MachineBasicBlock *SrcMBB,
> >>  -? ? ? MachineBasicBlock *DstMBB, MachineBasicBlock::iterator I);
> >> ? ? void recordSccnum(MachineBasicBlock *MBB, int SCCNum);
> >> ? ? void retireBlock(MachineBasicBlock *MBB);
> >> ? ? void setLoopLandBlock(MachineLoop *LoopRep, MachineBasicBlock *MBB = 
> > NULL);
> >> ? 
> >>  -? MachineBasicBlock 
> > *findNearestCommonPostDom(std::set<MachineBasicBlock *>&);
> >>  -? /// This is work around solution for findNearestCommonDominator not 
> > avaiable
> >>  -? /// to post dom a proper fix should go to Dominators.h.
> >>  -? MachineBasicBlock *findNearestCommonPostDom(MachineBasicBlock *MBB1,
> >>  -? ? ? MachineBasicBlock *MBB2);
> >>  -
> >> ? private:
> >> ? ? MBBInfoMap BlockInfoMap;
> >> ? ? LoopLandInfoMap LLInfoMap;
> >>  @@ -380,36 +344,6 @@
> >> ? ? return (*It).second->IsRetired;
> >> ? }
> >> ? 
> >>  -bool AMDGPUCFGStructurizer::isActiveLoophead(MachineBasicBlock *MBB) const 
> > {
> >>  -? MachineLoop *LoopRep = MLI->getLoopFor(MBB);
> >>  -? while (LoopRep && LoopRep->getHeader() == MBB) {
> >>  -? ? MachineBasicBlock *LoopLand = getLoopLandInfo(LoopRep);
> >>  -? ? if(!LoopLand)
> >>  -? ? ? return true;
> >>  -? ? if (!isRetiredBlock(LoopLand))
> >>  -? ? ? return true;
> >>  -? ? LoopRep = LoopRep->getParentLoop();
> >>  -? }
> >>  -? return false;
> >>  -}
> >>  -AMDGPUCFGStructurizer::PathToKind AMDGPUCFGStructurizer::singlePathTo(
> >>  -? ? MachineBasicBlock *SrcMBB, MachineBasicBlock *DstMBB,
> >>  -? ? bool AllowSideEntry) const {
> >>  -? assert(DstMBB);
> >>  -? if (SrcMBB == DstMBB)
> >>  -? ? return SinglePath_InPath;
> >>  -? while (SrcMBB && SrcMBB->succ_size() == 1) {
> >>  -? ? SrcMBB = *SrcMBB->succ_begin();
> >>  -? ? if (SrcMBB == DstMBB)
> >>  -? ? ? return SinglePath_InPath;
> >>  -? ? if (!AllowSideEntry && SrcMBB->pred_size() > 1)
> >>  -? ? ? return Not_SinglePath;
> >>  -? }
> >>  -? if (SrcMBB && SrcMBB->succ_size()==0)
> >>  -? ? return SinglePath_NotInPath;
> >>  -? return Not_SinglePath;
> >>  -}
> >>  -
> >> ? int AMDGPUCFGStructurizer::countActiveBlock(MBBVector::const_iterator It,
> >> ? ? ? MBBVector::const_iterator E) const {
> >> ? ? int Count = 0;
> >>  @@ -421,18 +355,6 @@
> >> ? ? return Count;
> >> ? }
> >> ? 
> >>  -bool AMDGPUCFGStructurizer::needMigrateBlock(MachineBasicBlock *MBB) const 
> > {
> >>  -? unsigned BlockSizeThreshold = 30;
> >>  -? unsigned CloneInstrThreshold = 100;
> >>  -? bool MultiplePreds = MBB && (MBB->pred_size() > 1);
> >>  -
> >>  -? if(!MultiplePreds)
> >>  -? ? return false;
> >>  -? unsigned BlkSize = MBB->size();
> >>  -? return ((BlkSize > BlockSizeThreshold) &&
> >>  -? ? ? (BlkSize * (MBB->pred_size() - 1) > CloneInstrThreshold));
> >>  -}
> >>  -
> >> ? void AMDGPUCFGStructurizer::reversePredicateSetter(
> >> ? ? ? MachineBasicBlock::iterator I) {
> >> ? ? while (I--) {
> >>  @@ -800,6 +722,7 @@
> >> ? ? bool MakeProgress = false;
> >> ? ? int NumRemainedBlk = countActiveBlock(OrderedBlks.begin(),
> >> ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? OrderedBlks.end());
> >>  +? loopendPatternMatch();
> >> ? 
> >> ? ? do {
> >> ? ? ? ++NumIter;
> >>  @@ -972,103 +895,96 @@
> >> ? ? return NumMatch;
> >> ? }
> >> ? 
> >>  -int AMDGPUCFGStructurizer::patternMatchGroup(MachineBasicBlock *MBB) 
> > {
> >>  -? int NumMatch = 0;
> >>  -? NumMatch += loopendPatternMatch();
> >>  -? NumMatch += serialPatternMatch(MBB);
> >>  -? NumMatch += ifPatternMatch(MBB);
> >>  -? return NumMatch;
> >>  +bool AMDGPUCFGStructurizer::patternMatchGroup(MachineBasicBlock *MBB) 
> > {
> >>  +? bool ChangedInSingleIteration, Changed = false;
> >>  +? do {
> >>  +? ? ? ChangedInSingleIteration = false;
> >>  +? ? ? DEBUG(dbgs() << "Pattern matching starting from BB#" 
> > << MBB->getNumber()
> >>  +? ? ? ? ? << "\n";);
> >>  +? ? ? ChangedInSingleIteration |= serialPatternMatch(MBB);
> >>  +? ? ? ChangedInSingleIteration |= ifPatternMatch(MBB);
> >>  +? ? ? Changed |= ChangedInSingleIteration;
> >>  +? } while (ChangedInSingleIteration);
> >>  +
> >>  +? return Changed;
> >> ? }
> >> ? 
> >> ? 
> >>  -int AMDGPUCFGStructurizer::serialPatternMatch(MachineBasicBlock *MBB) 
> > {
> >>  +bool AMDGPUCFGStructurizer::serialPatternMatch(MachineBasicBlock *MBB) 
> > {
> >> ? ? if (MBB->succ_size() != 1)
> >>  -? ? return 0;
> >>  +? ? return false;
> >> ? 
> >> ? ? MachineBasicBlock *childBlk = *MBB->succ_begin();
> >>  -? if (childBlk->pred_size() != 1 || isActiveLoophead(childBlk))
> >>  -? ? return 0;
> >>  +? if (childBlk->pred_size() != 1)
> >>  +? ? return false;
> >> ? 
> >> ? ? mergeSerialBlock(MBB, childBlk);
> >> ? ? ++numSerialPatternMatch;
> >>  -? return 1;
> >>  +? return true;
> >> ? }
> >> ? 
> >>  -int AMDGPUCFGStructurizer::ifPatternMatch(MachineBasicBlock *MBB) {
> >>  -? //two edges
> >>  +bool AMDGPUCFGStructurizer::ifPatternMatch(MachineBasicBlock *MBB) {
> >> ? ? if (MBB->succ_size() != 2)
> >>  -? ? return 0;
> >>  +? ? return false;
> >> ? ? if (hasBackEdge(MBB))
> >>  -? ? return 0;
> >>  +? ? return false;
> >> ? ? MachineInstr *BranchMI = getNormalBlockBranchInstr(MBB);
> >> ? ? if (!BranchMI)
> >>  -? ? return 0;
> >>  +? ? return false;
> >> ? 
> >> ? ? assert(isCondBranch(BranchMI));
> >> ? 
> >>  +? bool Changed = false;
> >> ? ? MachineBasicBlock *TrueMBB = getTrueBranch(BranchMI);
> >>  -? serialPatternMatch(TrueMBB);
> >>  -? ifPatternMatch(TrueMBB);
> >> ? ? MachineBasicBlock *FalseMBB = getFalseBranch(MBB, BranchMI);
> >>  -? serialPatternMatch(FalseMBB);
> >>  -? ifPatternMatch(FalseMBB);
> >>  +? DEBUG(dbgs() << "if pattern starting at BB#" << 
> > MBB->getNumber() <<
> >>  +? ? ? ", True branch to BB#" << TrueMBB->getNumber() 
> > <<
> >>  +? ? ? ", False branch to BB#" << FalseMBB->getNumber() 
> > << "\n");
> >>  +
> >>  +? Changed |= patternMatchGroup(TrueMBB);
> >>  +? Changed |= patternMatchGroup(FalseMBB);
> >>  +? DEBUG(
> >>  +? ? dbgs() << "BB#" << TrueMBB->getNumber() 
> > << "'successors :";
> >>  +? ? for (MachineBasicBlock::succ_iterator I = TrueMBB->succ_begin(),
> >>  +? ? ? ? E = TrueMBB->succ_end(); I != E; ++I)
> >>  +? ? ? dbgs() << "BB#" << (*I)->getNumber() 
> > <<", ";
> >>  +? ? dbgs() << "\n";
> >>  +? ? dbgs() << "BB#" << FalseMBB->getNumber() 
> > << "'successors :";
> >>  +? ? for (MachineBasicBlock::succ_iterator I = FalseMBB->succ_begin(),
> >>  +? ? ? ? E = FalseMBB->succ_end(); I != E; ++I)
> >>  +? ? ? dbgs() << "BB#" << (*I)->getNumber() 
> > << ", ";
> >>  +? ? dbgs() << "\n";
> >>  +);
> >> ? ? MachineBasicBlock *LandBlk;
> >>  -? int Cloned = 0;
> >> ? 
> >> ? ? assert (!TrueMBB->succ_empty() || !FalseMBB->succ_empty());
> >>  -? // TODO: Simplify
> >> ? ? if (TrueMBB->succ_size() == 1 && FalseMBB->succ_size() == 
> > 1
> >>  -? ? && *TrueMBB->succ_begin() == *FalseMBB->succ_begin()) 
> > {
> >>  +? ? && *TrueMBB->succ_begin() == *FalseMBB->succ_begin()
> >>  +? ? && TrueMBB->pred_size() == 1 && 
> > FalseMBB->pred_size() == 1) {
> >> ? ? ? // Diamond pattern
> >> ? ? ? LandBlk = *TrueMBB->succ_begin();
> >>  -? } else if (TrueMBB->succ_size() == 1 && 
> > *TrueMBB->succ_begin() == FalseMBB) {
> >>  +? } else if (TrueMBB->succ_size() == 1 && 
> > *TrueMBB->succ_begin() == FalseMBB &&
> >>  +? ? ? TrueMBB->pred_size() == 1) {
> >> ? ? ? // Triangle pattern, false is empty
> >> ? ? ? LandBlk = FalseMBB;
> >> ? ? ? FalseMBB = NULL;
> >>  -? } else if (FalseMBB->succ_size() == 1
> >>  +? } else if (FalseMBB->succ_size() == 1 && 
> > FalseMBB->pred_size() == 1
> >> ? ? ? ? ? ? ?  && *FalseMBB->succ_begin() == TrueMBB) {
> >> ? ? ? // Triangle pattern, true is empty
> >> ? ? ? // We reverse the predicate to make a triangle, empty false pattern;
> >> ? ? ? std::swap(TrueMBB, FalseMBB);
> >> ? ? ? reversePredicateSetter(MBB->end());
> >> ? ? ? LandBlk = FalseMBB;
> >> ? ? ? FalseMBB = NULL;
> >>  -? } else if (FalseMBB->succ_size() == 1
> >>  -? ? ? ? ? ?  && isSameloopDetachedContbreak(TrueMBB, FalseMBB)) 
> > {
> >>  -? ? LandBlk = *FalseMBB->succ_begin();
> >>  -? } else if (TrueMBB->succ_size() == 1
> >>  -? ? && isSameloopDetachedContbreak(FalseMBB, TrueMBB)) {
> >>  -? ? LandBlk = *TrueMBB->succ_begin();
> >>  -? } else {
> >>  -? ? return handleJumpintoIf(MBB, TrueMBB, FalseMBB);
> >>  -? }
> >>  -
> >>  -? // improveSimpleJumpinfoIf can handle the case where landBlk == NULL but 
> > the
> >>  -? // new BB created for landBlk==NULL may introduce new challenge to the
> >>  -? // reduction process.
> >>  -? if (LandBlk &&
> >>  -? ? ? ((TrueMBB && TrueMBB->pred_size() > 1)
> >>  -? ? ? || (FalseMBB && FalseMBB->pred_size() > 1))) {
> >>  -? ?  Cloned += improveSimpleJumpintoIf(MBB, TrueMBB, FalseMBB, 
> > &LandBlk);
> >>  -? }
> >>  +? } else
> >>  +? ? return Changed;
> >> ? 
> >>  -? if (TrueMBB && TrueMBB->pred_size() > 1) {
> >>  +? if (TrueMBB && TrueMBB->pred_size() > 1)
> >> ? ? ? TrueMBB = cloneBlockForPredecessor(TrueMBB, MBB);
> >>  -? ? ++Cloned;
> >>  -? }
> >>  -
> >>  -? if (FalseMBB && FalseMBB->pred_size() > 1) {
> >>  +? if (FalseMBB && FalseMBB->pred_size() > 1)
> >> ? ? ? FalseMBB = cloneBlockForPredecessor(FalseMBB, MBB);
> >>  -? ? ++Cloned;
> >>  -? }
> >>  -
> >> ? ? mergeIfthenelseBlock(BranchMI, MBB, TrueMBB, FalseMBB, LandBlk);
> >>  -
> >>  -? ++numIfPatternMatch;
> >>  -
> >>  -? numClonedBlock += Cloned;
> >>  -
> >>  -? return 1 + Cloned;
> >>  +? return true;
> >> ? }
> >> ? 
> >> ? int AMDGPUCFGStructurizer::loopendPatternMatch() {
> >>  @@ -1129,11 +1045,7 @@
> >> ? ? for (unsigned i = 0, e = LatchBlks.size(); i < e; ++i)
> >> ? ? ? settleLoopcontBlock(LatchBlks[i], LoopHeader);
> >> ? ? int Match = 0;
> >>  -? do {
> >>  -? ? Match = 0;
> >>  -? ? Match += serialPatternMatch(LoopHeader);
> >>  -? ? Match += ifPatternMatch(LoopHeader);
> >>  -? } while (Match > 0);
> >>  +? patternMatchGroup(LoopHeader);
> >> ? ? mergeLooplandBlock(LoopHeader, ExitBlk);
> >> ? ? MachineLoop *ParentLoop = LoopRep->getParentLoop();
> >> ? ? if (ParentLoop)
> >>  @@ -1171,302 +1083,6 @@
> >> ? ? return NumCont;
> >> ? }
> >> ? 
> >>  -
> >>  -bool AMDGPUCFGStructurizer::isSameloopDetachedContbreak(
> >>  -? ? MachineBasicBlock *Src1MBB, MachineBasicBlock *Src2MBB) {
> >>  -? if (Src1MBB->succ_size() == 0) {
> >>  -? ? MachineLoop *LoopRep = MLI->getLoopFor(Src1MBB);
> >>  -? ? if (LoopRep&& LoopRep == MLI->getLoopFor(Src2MBB)) {
> >>  -? ? ? MachineBasicBlock *&TheEntry = LLInfoMap[LoopRep];
> >>  -? ? ? if (TheEntry) {
> >>  -? ? ? ? DEBUG(
> >>  -? ? ? ? ? dbgs() << "isLoopContBreakBlock yes src1 = BB"
> >>  -? ? ? ? ? ? ? ?  << Src1MBB->getNumber()
> >>  -? ? ? ? ? ? ? ?  << " src2 = BB" << 
> > Src2MBB->getNumber() << "\n";
> >>  -? ? ? ? );
> >>  -? ? ? ? return true;
> >>  -? ? ? }
> >>  -? ? }
> >>  -? }
> >>  -? return false;
> >>  -}
> >>  -
> >>  -int AMDGPUCFGStructurizer::handleJumpintoIf(MachineBasicBlock *HeadMBB,
> >>  -? ? MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB) {
> >>  -? int Num = handleJumpintoIfImp(HeadMBB, TrueMBB, FalseMBB);
> >>  -? if (Num == 0) {
> >>  -? ? DEBUG(
> >>  -? ? ? dbgs() << "handleJumpintoIf swap trueBlk and 
> > FalseBlk" << "\n";
> >>  -? ? );
> >>  -? ? Num = handleJumpintoIfImp(HeadMBB, FalseMBB, TrueMBB);
> >>  -? }
> >>  -? return Num;
> >>  -}
> >>  -
> >>  -int AMDGPUCFGStructurizer::handleJumpintoIfImp(MachineBasicBlock *HeadMBB,
> >>  -? ? MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB) {
> >>  -? int Num = 0;
> >>  -? MachineBasicBlock *DownBlk;
> >>  -
> >>  -? //trueBlk could be the common post dominator
> >>  -? DownBlk = TrueMBB;
> >>  -
> >>  -? DEBUG(
> >>  -? ? dbgs() << "handleJumpintoIfImp head = BB" << 
> > HeadMBB->getNumber()
> >>  -? ? ? ? ?  << " true = BB" << 
> > TrueMBB->getNumber()
> >>  -? ? ? ? ?  << ", numSucc=" << 
> > TrueMBB->succ_size()
> >>  -? ? ? ? ?  << " false = BB" << 
> > FalseMBB->getNumber() << "\n";
> >>  -? );
> >>  -
> >>  -? while (DownBlk) {
> >>  -? ? DEBUG(
> >>  -? ? ? dbgs() << "check down = BB" << 
> > DownBlk->getNumber();
> >>  -? ? );
> >>  -
> >>  -? ? if (singlePathTo(FalseMBB, DownBlk) == SinglePath_InPath) {
> >>  -? ? ? DEBUG(
> >>  -? ? ? ? dbgs() << " working\n";
> >>  -? ? ? );
> >>  -
> >>  -? ? ? Num += cloneOnSideEntryTo(HeadMBB, TrueMBB, DownBlk);
> >>  -? ? ? Num += cloneOnSideEntryTo(HeadMBB, FalseMBB, DownBlk);
> >>  -
> >>  -? ? ? numClonedBlock += Num;
> >>  -? ? ? Num += serialPatternMatch(*HeadMBB->succ_begin());
> >>  -? ? ? Num += serialPatternMatch(*llvm::next(HeadMBB->succ_begin()));
> >>  -? ? ? Num += ifPatternMatch(HeadMBB);
> >>  -? ? ? assert(Num > 0);
> >>  -
> >>  -? ? ? break;
> >>  -? ? }
> >>  -? ? DEBUG(
> >>  -? ? ? dbgs() << " not working\n";
> >>  -? ? );
> >>  -? ? DownBlk = (DownBlk->succ_size() == 1) ? (*DownBlk->succ_begin()) 
> > : NULL;
> >>  -? } // walk down the postDomTree
> >>  -
> >>  -? return Num;
> >>  -}
> >>  -
> >>  -void AMDGPUCFGStructurizer::showImproveSimpleJumpintoIf(
> >>  -? ? MachineBasicBlock *HeadMBB, MachineBasicBlock *TrueMBB,
> >>  -? ? MachineBasicBlock *FalseMBB, MachineBasicBlock *LandMBB, bool Detail) 
> > {
> >>  -? dbgs() << "head = BB" << HeadMBB->getNumber()
> >>  -? ? ? ?  << " size = " << HeadMBB->size();
> >>  -? if (Detail) {
> >>  -? ? dbgs() << "\n";
> >>  -? ? HeadMBB->print(dbgs());
> >>  -? ? dbgs() << "\n";
> >>  -? }
> >>  -
> >>  -? if (TrueMBB) {
> >>  -? ? dbgs() << ", true = BB" << 
> > TrueMBB->getNumber() << " size = "
> >>  -? ? ? ? ?  << TrueMBB->size() << " numPred = " 
> > << TrueMBB->pred_size();
> >>  -? ? if (Detail) {
> >>  -? ? ? dbgs() << "\n";
> >>  -? ? ? TrueMBB->print(dbgs());
> >>  -? ? ? dbgs() << "\n";
> >>  -? ? }
> >>  -? }
> >>  -? if (FalseMBB) {
> >>  -? ? dbgs() << ", false = BB" << 
> > FalseMBB->getNumber() << " size = "
> >>  -? ? ? ? ?  << FalseMBB->size() << " numPred = " 
> > << FalseMBB->pred_size();
> >>  -? ? if (Detail) {
> >>  -? ? ? dbgs() << "\n";
> >>  -? ? ? FalseMBB->print(dbgs());
> >>  -? ? ? dbgs() << "\n";
> >>  -? ? }
> >>  -? }
> >>  -? if (LandMBB) {
> >>  -? ? dbgs() << ", land = BB" << 
> > LandMBB->getNumber() << " size = "
> >>  -? ? ? ? ?  << LandMBB->size() << " numPred = " 
> > << LandMBB->pred_size();
> >>  -? ? if (Detail) {
> >>  -? ? ? dbgs() << "\n";
> >>  -? ? ? LandMBB->print(dbgs());
> >>  -? ? ? dbgs() << "\n";
> >>  -? ? }
> >>  -? }
> >>  -
> >>  -? ? dbgs() << "\n";
> >>  -}
> >>  -
> >>  -int AMDGPUCFGStructurizer::improveSimpleJumpintoIf(MachineBasicBlock 
> > *HeadMBB,
> >>  -? ? MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB,
> >>  -? ? MachineBasicBlock **LandMBBPtr) {
> >>  -? bool MigrateTrue = false;
> >>  -? bool MigrateFalse = false;
> >>  -
> >>  -? MachineBasicBlock *LandBlk = *LandMBBPtr;
> >>  -
> >>  -? assert((!TrueMBB || TrueMBB->succ_size() <= 1)
> >>  -? ? ? ?  && (!FalseMBB || FalseMBB->succ_size() <= 1));
> >>  -
> >>  -? if (TrueMBB == FalseMBB)
> >>  -? ? return 0;
> >>  -
> >>  -? MigrateTrue = needMigrateBlock(TrueMBB);
> >>  -? MigrateFalse = needMigrateBlock(FalseMBB);
> >>  -
> >>  -? if (!MigrateTrue && !MigrateFalse)
> >>  -? ? return 0;
> >>  -
> >>  -? // If we need to migrate either trueBlk and falseBlk, migrate the rest 
> > that
> >>  -? // have more than one predecessors.? without doing this, its predecessor
> >>  -? // rather than headBlk will have undefined value in initReg.
> >>  -? if (!MigrateTrue && TrueMBB && TrueMBB->pred_size() 
> >>  1)
> >>  -? ? MigrateTrue = true;
> >>  -? if (!MigrateFalse && FalseMBB && 
> > FalseMBB->pred_size() > 1)
> >>  -? ? MigrateFalse = true;
> >>  -
> >>  -? DEBUG(
> >>  -? ? dbgs() << "before improveSimpleJumpintoIf: ";
> >>  -? ? showImproveSimpleJumpintoIf(HeadMBB, TrueMBB, FalseMBB, LandBlk, 0);
> >>  -? );
> >>  -
> >>  -? // org: headBlk => if () {trueBlk} else {falseBlk} => 
> > landBlk
> >>  -? //
> >>  -? // new: headBlk => if () {initReg = 1; org trueBlk branch} else
> >>  -? //? ? ? {initReg = 0; org falseBlk branch }
> >>  -? //? ? ? => landBlk => if (initReg) {org trueBlk} else 
> > {org falseBlk}
> >>  -? //? ? ? => org landBlk
> >>  -? //? ? ? if landBlk->pred_size() > 2, put the about if-else inside
> >>  -? //? ? ? if (initReg !=2) {...}
> >>  -? //
> >>  -? // add initReg = initVal to headBlk
> >>  -
> >>  -? const TargetRegisterClass * I32RC = 
> > TRI->getCFGStructurizerRegClass(MVT::i32);
> >>  -? if (!MigrateTrue || !MigrateFalse) {
> >>  -? ? // XXX: We have an opportunity here to optimize the "branch into 
> > if" case
> >>  -? ? // here.? Branch into if looks like this:
> >>  -? ? //? ? ? ? ? ? ? ? ? ? ? ? entry
> >>  -? ? //? ? ? ? ? ? ? ? ? ? ?  /? ?  |
> >>  -? ? //? ? ? ? ?  diamond_head? ? ?  branch_from
> >>  -? ? //? ? ? ? ? ?  /? ? ? \? ? ? ? ?  |
> >>  -? ? // diamond_false? ? ? ? diamond_true
> >>  -? ? //? ? ? ? ? ?  \? ? ? /
> >>  -? ? //? ? ? ? ? ? ?  done
> >>  -? ? //
> >>  -? ? // The diamond_head block begins the "if" and the 
> > diamond_true block
> >>  -? ? // is the block being "branched into".
> >>  -? ? //
> >>  -? ? // If MigrateTrue is true, then TrueBB is the block being 
> > "branched into"
> >>  -? ? // and if MigrateFalse is true, then FalseBB is the block being
> >>  -? ? // "branched into"
> >>  -? ? // 
> >>  -? ? // Here is the pseudo code for how I think the optimization should 
> > work:
> >>  -? ? // 1. Insert MOV GPR0, 0 before the branch instruction in 
> > diamond_head.
> >>  -? ? // 2. Insert MOV GPR0, 1 before the branch instruction in branch_from.
> >>  -? ? // 3. Move the branch instruction from diamond_head into its own basic
> >>  -? ? //? ? block (new_block).
> >>  -? ? // 4. Add an unconditional branch from diamond_head to new_block
> >>  -? ? // 5. Replace the branch instruction in branch_from with an 
> > unconditional
> >>  -? ? //? ? branch to new_block.? If branch_from has multiple predecessors, 
> > then
> >>  -? ? //? ? we need to replace the True/False block in the branch
> >>  -? ? //? ? instruction instead of replacing it.
> >>  -? ? // 6. Change the condition of the branch instruction in new_block from
> >>  -? ? //? ? COND to (COND || GPR0)
> >>  -? ? //
> >>  -? ? // In order insert these MOV instruction, we will need to use the
> >>  -? ? // RegisterScavenger.? Usually liveness stops being tracked during
> >>  -? ? // the late machine optimization passes, however if we implement
> >>  -? ? // bool TargetRegisterInfo::requiresRegisterScavenging(
> >>  -? ? //? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? const 
> > MachineFunction &MF)
> >>  -? ? // and have it return true, liveness will be tracked correctly 
> >>  -? ? // by generic optimization passes.? We will also need to make sure 
> > that
> >>  -? ? // all of our target-specific passes that run after regalloc and 
> > before
> >>  -? ? // the CFGStructurizer track liveness and we will need to modify this 
> > pass
> >>  -? ? // to correctly track liveness.
> >>  -? ? //
> >>  -? ? // After the above changes, the new CFG should look like this:
> >>  -? ? //? ? ? ? ? ? ? ? ? ? ? ? entry
> >>  -? ? //? ? ? ? ? ? ? ? ? ? ?  /? ?  |
> >>  -? ? //? ? ? ? ?  diamond_head? ? ?  branch_from
> >>  -? ? //? ? ? ? ? ? ? ? ? ? ?  \? ?  /
> >>  -? ? //? ? ? ? ? ? ? ? ? ? ? new_block
> >>  -? ? //? ? ? ? ? ? ? ? ? ? ? /? ? ? |
> >>  -? ? //? ? ? ?  diamond_false? ? ? ? diamond_true
> >>  -? ? //? ? ? ? ? ? ? ? ? ? ? \? ? ? /
> >>  -? ? //? ? ? ? ? ? ? ? ? ? ? ? done
> >>  -? ? //
> >>  -? ? // Without this optimization, we are forced to duplicate the 
> > diamond_true
> >>  -? ? // block and we will end up with a CFG like this:
> >>  -? ? //
> >>  -? ? //? ? ? ? ? ? ? ? ? ? ? ? entry
> >>  -? ? //? ? ? ? ? ? ? ? ? ? ?  /? ?  |
> >>  -? ? //? ? ? ? ?  diamond_head? ? ?  branch_from
> >>  -? ? //? ? ? ? ? ?  /? ? ? \? ? ? ? ? ? ? ? ?  |
> >>  -? ? // diamond_false? ? ? ? diamond_true? ? ? diamond_true (duplicate)
> >>  -? ? //? ? ? ? ? ?  \? ? ? /? ? ? ? ? ? ? ? ?  |
> >>  -? ? //? ? ? ? ? ? ?  done --------------------|
> >>  -? ? //
> >>  -? ? // Duplicating diamond_true can be very costly especially if it has a
> >>  -? ? // lot of instructions.
> >>  -? ? return 0;
> >>  -? }
> >>  -
> >>  -? int NumNewBlk = 0;
> >>  -
> >>  -? bool LandBlkHasOtherPred = (LandBlk->pred_size() > 2);
> >>  -
> >>  -? //insert AMDGPU::ENDIF to avoid special case "input landBlk == 
> > NULL"
> >>  -? MachineBasicBlock::iterator I = insertInstrBefore(LandBlk, 
> > AMDGPU::ENDIF);
> >>  -
> >>  -? if (LandBlkHasOtherPred) {
> >>  -? ? llvm_unreachable("Extra register needed to handle CFG");
> >>  -? ? unsigned CmpResReg =
> >>  -? ? ? 
> > HeadMBB->getParent()->getRegInfo().createVirtualRegister(I32RC);
> >>  -? ? llvm_unreachable("Extra compare instruction needed to handle 
> > CFG");
> >>  -? ? insertCondBranchBefore(LandBlk, I, AMDGPU::IF_PREDICATE_SET,
> >>  -? ? ? ? CmpResReg, DebugLoc());
> >>  -? }
> >>  -
> >>  -? // XXX: We are running this after RA, so creating virtual registers will
> >>  -? // cause an assertion failure in the PostRA scheduling pass.
> >>  -? unsigned InitReg =
> >>  -? ? HeadMBB->getParent()->getRegInfo().createVirtualRegister(I32RC);
> >>  -? insertCondBranchBefore(LandBlk, I, AMDGPU::IF_PREDICATE_SET, InitReg,
> >>  -? ? ? DebugLoc());
> >>  -
> >>  -? if (MigrateTrue) {
> >>  -? ? migrateInstruction(TrueMBB, LandBlk, I);
> >>  -? ? // need to uncondionally insert the assignment to ensure a path from 
> > its
> >>  -? ? // predecessor rather than headBlk has valid value in initReg if
> >>  -? ? // (initVal != 1).
> >>  -? ? llvm_unreachable("Extra register needed to handle CFG");
> >>  -? }
> >>  -? insertInstrBefore(I, AMDGPU::ELSE);
> >>  -
> >>  -? if (MigrateFalse) {
> >>  -? ? migrateInstruction(FalseMBB, LandBlk, I);
> >>  -? ? // need to uncondionally insert the assignment to ensure a path from 
> > its
> >>  -? ? // predecessor rather than headBlk has valid value in initReg if
> >>  -? ? // (initVal != 0)
> >>  -? ? llvm_unreachable("Extra register needed to handle CFG");
> >>  -? }
> >>  -
> >>  -? if (LandBlkHasOtherPred) {
> >>  -? ? // add endif
> >>  -? ? insertInstrBefore(I, AMDGPU::ENDIF);
> >>  -
> >>  -? ? // put initReg = 2 to other predecessors of landBlk
> >>  -? ? for (MachineBasicBlock::pred_iterator PI = LandBlk->pred_begin(),
> >>  -? ? ? ?  PE = LandBlk->pred_end(); PI != PE; ++PI) {
> >>  -? ? ? MachineBasicBlock *MBB = *PI;
> >>  -? ? ? if (MBB != TrueMBB && MBB != FalseMBB)
> >>  -? ? ? ? llvm_unreachable("Extra register needed to handle CFG");
> >>  -? ? }
> >>  -? }
> >>  -? DEBUG(
> >>  -? ? dbgs() << "result from improveSimpleJumpintoIf: ";
> >>  -? ? showImproveSimpleJumpintoIf(HeadMBB, TrueMBB, FalseMBB, LandBlk, 0);
> >>  -? );
> >>  -
> >>  -? // update landBlk
> >>  -? *LandMBBPtr = LandBlk;
> >>  -
> >>  -? return NumNewBlk;
> >>  -}
> >>  -
> >> ? void AMDGPUCFGStructurizer::handleLoopcontBlock(MachineBasicBlock 
> > *ContingMBB,
> >> ? ? ? MachineLoop *ContingLoop, MachineBasicBlock *ContMBB,
> >> ? ? ? MachineLoop *ContLoop) {
> >>  @@ -1637,24 +1253,6 @@
> >> ? ? }
> >> ? }
> >> ? 
> >>  -int AMDGPUCFGStructurizer::cloneOnSideEntryTo(MachineBasicBlock *PreMBB,
> >>  -? ? MachineBasicBlock *SrcMBB, MachineBasicBlock *DstMBB) {
> >>  -? int Cloned = 0;
> >>  -? assert(PreMBB->isSuccessor(SrcMBB));
> >>  -? while (SrcMBB && SrcMBB != DstMBB) {
> >>  -? ? assert(SrcMBB->succ_size() == 1);
> >>  -? ? if (SrcMBB->pred_size() > 1) {
> >>  -? ? ? SrcMBB = cloneBlockForPredecessor(SrcMBB, PreMBB);
> >>  -? ? ? ++Cloned;
> >>  -? ? }
> >>  -
> >>  -? ? PreMBB = SrcMBB;
> >>  -? ? SrcMBB = *SrcMBB->succ_begin();
> >>  -? }
> >>  -
> >>  -? return Cloned;
> >>  -}
> >>  -
> >> ? MachineBasicBlock *
> >> ? AMDGPUCFGStructurizer::cloneBlockForPredecessor(MachineBasicBlock *MBB,
> >> ? ? ? MachineBasicBlock *PredMBB) {
> >>  @@ -1683,37 +1281,6 @@
> >> ? ? return CloneMBB;
> >> ? }
> >> ? 
> >>  -void AMDGPUCFGStructurizer::migrateInstruction(MachineBasicBlock *SrcMBB,
> >>  -? ? MachineBasicBlock *DstMBB, MachineBasicBlock::iterator I) {
> >>  -? MachineBasicBlock::iterator SpliceEnd;
> >>  -? //look for the input branchinstr, not the AMDGPU branchinstr
> >>  -? MachineInstr *BranchMI = getNormalBlockBranchInstr(SrcMBB);
> >>  -? if (!BranchMI) {
> >>  -? ? DEBUG(
> >>  -? ? ? dbgs() << "migrateInstruction don't see branch instr\n" ;
> >>  -? ? );
> >>  -? ? SpliceEnd = SrcMBB->end();
> >>  -? } else {
> >>  -? ? DEBUG(
> >>  -? ? ? dbgs() << "migrateInstruction see branch instr\n" ;
> >>  -? ? ? BranchMI->dump();
> >>  -? ? );
> >>  -? ? SpliceEnd = BranchMI;
> >>  -? }
> >>  -? DEBUG(
> >>  -? ? dbgs() << "migrateInstruction before splice dstSize = " << DstMBB->size()
> >>  -? ? ? << "srcSize = " << SrcMBB->size() << "\n";
> >>  -? );
> >>  -
> >>  -? //splice insert before insertPos
> >>  -? DstMBB->splice(I, SrcMBB, SrcMBB->begin(), SpliceEnd);
> >>  -
> >>  -? DEBUG(
> >>  -? ? dbgs() << "migrateInstruction after splice dstSize = " << DstMBB->size()
> >>  -? ? ? << "srcSize = " << SrcMBB->size() << "\n";
> >>  -? );
> >>  -}
> >>  -
> >> ? MachineBasicBlock *
> >> ? AMDGPUCFGStructurizer::normalizeInfiniteLoopExit(MachineLoop* LoopRep) {
> >> ? ? MachineBasicBlock *LoopHeader = LoopRep->getHeader();
> >>  @@ -1839,60 +1406,6 @@
> >> ? ? );
> >> ? }
> >> ? 
> >>  -MachineBasicBlock *
> >>  -AMDGPUCFGStructurizer::findNearestCommonPostDom(MachineBasicBlock *MBB1,
> >>  -? ? MachineBasicBlock *MBB2) {
> >>  -
> >>  -? if (PDT->dominates(MBB1, MBB2))
> >>  -? ? return MBB1;
> >>  -? if (PDT->dominates(MBB2, MBB1))
> >>  -? ? return MBB2;
> >>  -
> >>  -? MachineDomTreeNode *Node1 = PDT->getNode(MBB1);
> >>  -? MachineDomTreeNode *Node2 = PDT->getNode(MBB2);
> >>  -
> >>  -? // Handle newly cloned node.
> >>  -? if (!Node1 && MBB1->succ_size() == 1)
> >>  -? ? return findNearestCommonPostDom(*MBB1->succ_begin(), MBB2);
> >>  -? if (!Node2 && MBB2->succ_size() == 1)
> >>  -? ? return findNearestCommonPostDom(MBB1, *MBB2->succ_begin());
> >>  -
> >>  -? if (!Node1 || !Node2)
> >>  -? ? return NULL;
> >>  -
> >>  -? Node1 = Node1->getIDom();
> >>  -? while (Node1) {
> >>  -? ? if (PDT->dominates(Node1, Node2))
> >>  -? ? ? return Node1->getBlock();
> >>  -? ? Node1 = Node1->getIDom();
> >>  -? }
> >>  -
> >>  -? return NULL;
> >>  -}
> >>  -
> >>  -MachineBasicBlock *
> >>  -AMDGPUCFGStructurizer::findNearestCommonPostDom(
> >>  -? ? std::set<MachineBasicBlock *> &MBBs) {
> >>  -? MachineBasicBlock *CommonDom;
> >>  -? std::set<MachineBasicBlock *>::const_iterator It = MBBs.begin();
> >>  -? std::set<MachineBasicBlock *>::const_iterator E = MBBs.end();
> >>  -? for (CommonDom = *It; It != E && CommonDom; ++It) {
> >>  -? ? MachineBasicBlock *MBB = *It;
> >>  -? ? if (MBB != CommonDom)
> >>  -? ? ? CommonDom = findNearestCommonPostDom(MBB, CommonDom);
> >>  -? }
> >>  -
> >>  -? DEBUG(
> >>  -? ? dbgs() << "Common post dominator for exit blocks is ";
> >>  -? ? if (CommonDom)
> >>  -? ? ? ? ? dbgs() << "BB" << CommonDom->getNumber() << "\n";
> >>  -? ? else
> >>  -? ? ? dbgs() << "NULL\n";
> >>  -? );
> >>  -
> >>  -? return CommonDom;
> >>  -}
> >>  -
> >> ? char AMDGPUCFGStructurizer::ID = 0;
> >> ? 
> >> ? } // end anonymous namespace
> > 
> >>  _______________________________________________
> >>  llvm-commits mailing list
> >>  llvm-commits at cs.uiuc.edu
> >>  http://lists.cs.uiuc.edu/mailman/listinfo/llvm-commits
> >
-------------- next part --------------
; Module header of the attached LLVM IR dump: module identifier, data layout
; string (32-bit pointers, native integer widths 32 and 64) and target triple.
; ModuleID = 'radeon'
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024-v2048:2048:2048-n32:64"
target triple = "r600--"

; Kernel @resizeLN_C1_D0 (it is the entry listed in !opencl.kernels).
;
; Data flow: for four consecutive x positions (%sub .. %sub+3) and one row
; (%add.i454) it gathers 4 x 4 bytes from %src, combines them with integer
; weights scaled by 2048, rounds, clamps to [0, 255] and stores up to four
; bytes to %dst.
; NOTE(review): the name suggests a bilinear resize of a 1-channel 8-bit
; image; that is inferred from the name, the IR itself does not say so.
;
; Control flow (the part relevant to CFG structurization):
;   entry        -> cond.false | cond.end          (simple triangle)
;   cond.end     -> land.lhs.true | if.end
;   land.lhs.true-> if.then | land.lhs.true178
;   if.then      -> if.end237                      (jumps straight to the exit,
;                                                   skipping all later ifs)
;   land.lhs.true178 -> if.then187 | if.end
;   if.end, if.end204, if.end220 each start a two-level guarded store whose
;   three paths re-join at the next if.end* block.
; Function Attrs: nounwind
define void @resizeLN_C1_D0(i8 addrspace(1)* nocapture %dst, i8 addrspace(1)* noalias nocapture readonly %src, i32 %dstoffset_in_pixel, i32 %srcoffset_in_pixel, i32 %dststep_in_pixel, i32 %srcstep_in_pixel, i32 %src_cols, i32 %src_rows, i32 %dst_cols, i32 %dst_rows, float %ifx, float %ify) #0 {
entry:
  ; %add.i = tgid.x * local.size.x + tidig.x, %add.i454 is the same for y
  ; (presumably the global work-item id, going by the intrinsic names).
  %x.i.i = tail call i32 @llvm.r600.read.tgid.x() #1
  %x.i12.i = tail call i32 @llvm.r600.read.local.size.x() #1
  %mul26.i = mul i32 %x.i12.i, %x.i.i
  %x.i4.i = tail call i32 @llvm.r600.read.tidig.x() #1
  %add.i = add i32 %x.i4.i, %mul26.i
  %y.i.i = tail call i32 @llvm.r600.read.tgid.y() #1
  %y.i14.i = tail call i32 @llvm.r600.read.local.size.y() #1
  %mul30.i = mul i32 %y.i14.i, %y.i.i
  %y.i6.i = tail call i32 @llvm.r600.read.tidig.y() #1
  %add.i454 = add i32 %y.i6.i, %mul30.i
  ; %sub = (%add.i << 2) - (%dstoffset_in_pixel & 3); the four x positions
  ; handled here are %sub, %add, %add3 and %add5 (= %sub + 0..3).
  %shl = shl i32 %add.i, 2
  %and = and i32 %dstoffset_in_pixel, 3
  %sub = sub nsw i32 %shl, %and
  %add = add nsw i32 %sub, 1
  %add3 = add nsw i32 %sub, 2
  %add5 = add nsw i32 %sub, 3
  ; Build <4 x float> of the x positions, then %4 = (x + 0.5) * %ifx - 0.5.
  %conv.i.i.i449 = sitofp i32 %sub to float
  %0 = insertelement <4 x float> undef, float %conv.i.i.i449, i32 0
  %conv.i4.i.i450 = sitofp i32 %add to float
  %1 = insertelement <4 x float> %0, float %conv.i4.i.i450, i32 1
  %conv.i.i5.i451 = sitofp i32 %add3 to float
  %2 = insertelement <4 x float> undef, float %conv.i.i5.i451, i32 0
  %conv.i4.i7.i452 = sitofp i32 %add5 to float
  %3 = insertelement <4 x float> %2, float %conv.i4.i7.i452, i32 1
  %vecinit3.i453 = shufflevector <4 x float> %1, <4 x float> %3, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  %add8 = fadd <4 x float> %vecinit3.i453, <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
  %splat.splatinsert = insertelement <4 x float> undef, float %ifx, i32 0
  %splat.splat = shufflevector <4 x float> %splat.splatinsert, <4 x float> undef, <4 x i32> zeroinitializer
  %4 = tail call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %add8, <4 x float> %splat.splat, <4 x float> <float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01>)
  ; floor(%4) converted lane by lane to i32 (%vecinit3.i448); %sub11 is the
  ; fractional part %4 - floor(%4).
  %call9 = tail call <4 x float> @llvm.floor.v4f32(<4 x float> %4)
  %5 = extractelement <4 x float> %call9, i32 0
  %conv.i.i.i444 = fptosi float %5 to i32
  %6 = insertelement <4 x i32> undef, i32 %conv.i.i.i444, i32 0
  %7 = extractelement <4 x float> %call9, i32 1
  %conv.i4.i.i445 = fptosi float %7 to i32
  %8 = insertelement <4 x i32> %6, i32 %conv.i4.i.i445, i32 1
  %9 = extractelement <4 x float> %call9, i32 2
  %conv.i.i5.i446 = fptosi float %9 to i32
  %10 = insertelement <4 x i32> undef, i32 %conv.i.i5.i446, i32 0
  %11 = extractelement <4 x float> %call9, i32 3
  %conv.i4.i7.i447 = fptosi float %11 to i32
  %12 = insertelement <4 x i32> %10, i32 %conv.i4.i7.i447, i32 1
  %vecinit3.i448 = shufflevector <4 x i32> %8, <4 x i32> %12, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  %sub11 = fsub <4 x float> %4, %call9
  ; Same mapping for the row: %13 = (y + 0.5) * %ify - 0.5, %conv14 = floor.
  %conv = sitofp i32 %add.i454 to float
  %add12 = fadd float %conv, 5.000000e-01
  %13 = tail call float @llvm.fmuladd.f32(float %add12, float %ify, float -5.000000e-01)
  %call13 = tail call float @llvm.floor.f32(float %13)
  %conv14 = fptosi float %call13 to i32
  ; %14 is an all-ones mask for lanes whose column index is non-negative;
  ; and-ing with it zeroes both the index (%18) and the fraction (%16/%17)
  ; of negative lanes.
  %call10.lobit = ashr <4 x i32> %vecinit3.i448, <i32 31, i32 31, i32 31, i32 31>
  %14 = xor <4 x i32> %call10.lobit, <i32 -1, i32 -1, i32 -1, i32 -1>
  %15 = bitcast <4 x float> %sub11 to <4 x i32>
  %16 = and <4 x i32> %15, %14
  %splat.splatinsert18 = insertelement <4 x i32> undef, i32 %src_cols, i32 0
  %splat.splat19 = shufflevector <4 x i32> %splat.splatinsert18, <4 x i32> undef, <4 x i32> zeroinitializer
  %cmp20 = icmp sge <4 x i32> %vecinit3.i448, %splat.splat19
  %17 = bitcast <4 x i32> %16 to <4 x float>
  %18 = and <4 x i32> %vecinit3.i448, %14
  ; %cond36: column index clamped to %src_cols - 1.
  %cmp30 = icmp sge <4 x i32> %18, %splat.splat19
  %sub32 = add nsw i32 %src_cols, -1
  %splat.splatinsert33 = insertelement <4 x i32> undef, i32 %sub32, i32 0
  %splat.splat34 = shufflevector <4 x i32> %splat.splatinsert33, <4 x i32> undef, <4 x i32> zeroinitializer
  %cond36 = select <4 x i1> %cmp30, <4 x i32> %splat.splat34, <4 x i32> %18
  ; Negative row index skips cond.false; the phis in cond.end then use 0.
  %cmp37 = icmp slt i32 %conv14, 0
  br i1 %cmp37, label %cond.end, label %cond.false

  ; Row index is >= 0: compute the row fraction scaled by 2048.
cond.false:                                       ; preds = %entry
  %conv15 = sitofp i32 %conv14 to float
  %sub16 = fsub float %13, %conv15
  %phitmp = fmul float %sub16, 2.048000e+03
  br label %cond.end

  ; Join point of the triangle above. Everything up to the first conditional
  ; store is straight-line code.
cond.end:                                         ; preds = %cond.false, %entry
  %v.0 = phi float [ %phitmp, %cond.false ], [ 0.000000e+00, %entry ]
  %y.0 = phi i32 [ %conv14, %cond.false ], [ 0, %entry ]
  ; %y.1: row index clamped to %src_rows - 1.
  %cmp41 = icmp slt i32 %y.0, %src_rows
  %sub44 = add nsw i32 %src_rows, -1
  %y.1 = select i1 %cmp41, i32 %y.0, i32 %sub44
  ; Horizontal weights: %mul = fraction * 2048 (0 for lanes flagged by
  ; %cmp20), %sub49 = 2048 - %mul; both are rounded with rint and converted
  ; lane by lane to i32 (%vecinit3.i443 and %vecinit3.i).
  %.op = fmul <4 x float> %17, <float 2.048000e+03, float 2.048000e+03, float 2.048000e+03, float 2.048000e+03>
  %mul = select <4 x i1> %cmp20, <4 x float> zeroinitializer, <4 x float> %.op
  %sub49 = fsub <4 x float> <float 2.048000e+03, float 2.048000e+03, float 2.048000e+03, float 2.048000e+03>, %mul
  %call50 = tail call <4 x float> @llvm.rint.v4f32(<4 x float> %mul)
  %19 = extractelement <4 x float> %call50, i32 0
  %conv.i.i.i439 = fptosi float %19 to i32
  %20 = insertelement <4 x i32> undef, i32 %conv.i.i.i439, i32 0
  %21 = extractelement <4 x float> %call50, i32 1
  %conv.i4.i.i440 = fptosi float %21 to i32
  %22 = insertelement <4 x i32> %20, i32 %conv.i4.i.i440, i32 1
  %23 = extractelement <4 x float> %call50, i32 2
  %conv.i.i5.i441 = fptosi float %23 to i32
  %24 = insertelement <4 x i32> undef, i32 %conv.i.i5.i441, i32 0
  %25 = extractelement <4 x float> %call50, i32 3
  %conv.i4.i7.i442 = fptosi float %25 to i32
  %26 = insertelement <4 x i32> %24, i32 %conv.i4.i7.i442, i32 1
  %vecinit3.i443 = shufflevector <4 x i32> %22, <4 x i32> %26, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  %call52 = tail call <4 x float> @llvm.rint.v4f32(<4 x float> %sub49)
  %27 = extractelement <4 x float> %call52, i32 0
  %conv.i.i.i = fptosi float %27 to i32
  %28 = insertelement <4 x i32> undef, i32 %conv.i.i.i, i32 0
  %29 = extractelement <4 x float> %call52, i32 1
  %conv.i4.i.i = fptosi float %29 to i32
  %30 = insertelement <4 x i32> %28, i32 %conv.i4.i.i, i32 1
  %31 = extractelement <4 x float> %call52, i32 2
  %conv.i.i5.i = fptosi float %31 to i32
  %32 = insertelement <4 x i32> undef, i32 %conv.i.i5.i, i32 0
  %33 = extractelement <4 x float> %call52, i32 3
  %conv.i4.i7.i = fptosi float %33 to i32
  %34 = insertelement <4 x i32> %32, i32 %conv.i4.i7.i, i32 1
  %vecinit3.i = shufflevector <4 x i32> %30, <4 x i32> %34, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  ; Vertical weights: %conv56 = rint(%mul54), %conv59 = rint(2048 - %mul54),
  ; where %mul54 is %v.0 or 0 when the row was clamped.
  %mul54 = select i1 %cmp41, float %v.0, float 0.000000e+00
  %call55 = tail call float @llvm.rint.f32(float %mul54)
  %conv56 = fptosi float %call55 to i32
  %sub57 = fsub float 2.048000e+03, %mul54
  %call58 = tail call float @llvm.rint.f32(float %sub57)
  %conv59 = fptosi float %call58 to i32
  ; Neighbour indices: %cond67 is row %y.1 + 1 and %cond77 is column
  ; %cond36 + 1, each falling back to the unincremented index when the
  ; increment would reach %src_rows / %src_cols.
  %add60 = add nsw i32 %y.1, 1
  %cmp61 = icmp sge i32 %add60, %src_rows
  %cond67 = select i1 %cmp61, i32 %y.1, i32 %add60
  %add68 = add <4 x i32> %cond36, <i32 1, i32 1, i32 1, i32 1>
  %cmp71 = icmp sge <4 x i32> %add68, %splat.splat19
  %sext72 = sext <4 x i1> %cmp71 to <4 x i32>
  %cmp73 = icmp ne <4 x i32> %sext72, zeroinitializer
  %cond77 = select <4 x i1> %cmp73, <4 x i32> %cond36, <4 x i32> %add68
  ; Source offsets = row * %srcstep_in_pixel + column + %srcoffset_in_pixel.
  ; Each shl 8 / ashr 8 pair sign-extends the low 24 bits of its operand
  ; before the multiply.
  %splat.splatinsert78 = insertelement <4 x i32> undef, i32 %y.1, i32 0
  %splat.splat79 = shufflevector <4 x i32> %splat.splatinsert78, <4 x i32> undef, <4 x i32> zeroinitializer
  %splat.splatinsert80 = insertelement <4 x i32> undef, i32 %srcstep_in_pixel, i32 0
  %splat.splat81 = shufflevector <4 x i32> %splat.splatinsert80, <4 x i32> undef, <4 x i32> zeroinitializer
  %splat.splatinsert82 = insertelement <4 x i32> undef, i32 %srcoffset_in_pixel, i32 0
  %splat.splat83 = shufflevector <4 x i32> %splat.splatinsert82, <4 x i32> undef, <4 x i32> zeroinitializer
  %add84 = add <4 x i32> %cond36, %splat.splat83
  %shl.i.i433 = shl <4 x i32> %splat.splat79, <i32 8, i32 8, i32 8, i32 8>
  %shr.i.i434 = ashr <4 x i32> %shl.i.i433, <i32 8, i32 8, i32 8, i32 8>
  %shl1.i.i435 = shl <4 x i32> %splat.splat81, <i32 8, i32 8, i32 8, i32 8>
  %shr2.i.i436 = ashr <4 x i32> %shl1.i.i435, <i32 8, i32 8, i32 8, i32 8>
  %mul.i.i437 = mul <4 x i32> %shr.i.i434, %shr2.i.i436
  %add.i438 = add <4 x i32> %mul.i.i437, %add84
  %add92 = add <4 x i32> %cond77, %splat.splat83
  %add.i432 = add <4 x i32> %mul.i.i437, %add92
  %splat.splatinsert94 = insertelement <4 x i32> undef, i32 %cond67, i32 0
  %splat.splat95 = shufflevector <4 x i32> %splat.splatinsert94, <4 x i32> undef, <4 x i32> zeroinitializer
  %shl.i.i421 = shl <4 x i32> %splat.splat95, <i32 8, i32 8, i32 8, i32 8>
  %shr.i.i422 = ashr <4 x i32> %shl.i.i421, <i32 8, i32 8, i32 8, i32 8>
  %mul.i.i425 = mul <4 x i32> %shr.i.i422, %shr2.i.i436
  %add.i426 = add <4 x i32> %mul.i.i425, %add84
  %add.i420 = add <4 x i32> %mul.i.i425, %add92
  ; Gather: one i8 load from %src per lane of %add.i438, %add.i432,
  ; %add.i426 and %add.i420, zero-extended into <4 x i32> %46, %58, %70, %82.
  %35 = extractelement <4 x i32> %add.i438, i32 0
  %arrayidx = getelementptr inbounds i8 addrspace(1)* %src, i32 %35
  %36 = load i8 addrspace(1)* %arrayidx, align 1, !tbaa !10
  %conv110 = zext i8 %36 to i32
  %37 = insertelement <4 x i32> undef, i32 %conv110, i32 0
  %38 = extractelement <4 x i32> %add.i438, i32 1
  %arrayidx111 = getelementptr inbounds i8 addrspace(1)* %src, i32 %38
  %39 = load i8 addrspace(1)* %arrayidx111, align 1, !tbaa !10
  %conv112 = zext i8 %39 to i32
  %40 = insertelement <4 x i32> %37, i32 %conv112, i32 1
  %41 = extractelement <4 x i32> %add.i438, i32 2
  %arrayidx113 = getelementptr inbounds i8 addrspace(1)* %src, i32 %41
  %42 = load i8 addrspace(1)* %arrayidx113, align 1, !tbaa !10
  %conv114 = zext i8 %42 to i32
  %43 = insertelement <4 x i32> %40, i32 %conv114, i32 2
  %44 = extractelement <4 x i32> %add.i438, i32 3
  %arrayidx115 = getelementptr inbounds i8 addrspace(1)* %src, i32 %44
  %45 = load i8 addrspace(1)* %arrayidx115, align 1, !tbaa !10
  %conv116 = zext i8 %45 to i32
  %46 = insertelement <4 x i32> %43, i32 %conv116, i32 3
  %47 = extractelement <4 x i32> %add.i432, i32 0
  %arrayidx117 = getelementptr inbounds i8 addrspace(1)* %src, i32 %47
  %48 = load i8 addrspace(1)* %arrayidx117, align 1, !tbaa !10
  %conv118 = zext i8 %48 to i32
  %49 = insertelement <4 x i32> undef, i32 %conv118, i32 0
  %50 = extractelement <4 x i32> %add.i432, i32 1
  %arrayidx119 = getelementptr inbounds i8 addrspace(1)* %src, i32 %50
  %51 = load i8 addrspace(1)* %arrayidx119, align 1, !tbaa !10
  %conv120 = zext i8 %51 to i32
  %52 = insertelement <4 x i32> %49, i32 %conv120, i32 1
  %53 = extractelement <4 x i32> %add.i432, i32 2
  %arrayidx121 = getelementptr inbounds i8 addrspace(1)* %src, i32 %53
  %54 = load i8 addrspace(1)* %arrayidx121, align 1, !tbaa !10
  %conv122 = zext i8 %54 to i32
  %55 = insertelement <4 x i32> %52, i32 %conv122, i32 2
  %56 = extractelement <4 x i32> %add.i432, i32 3
  %arrayidx123 = getelementptr inbounds i8 addrspace(1)* %src, i32 %56
  %57 = load i8 addrspace(1)* %arrayidx123, align 1, !tbaa !10
  %conv124 = zext i8 %57 to i32
  %58 = insertelement <4 x i32> %55, i32 %conv124, i32 3
  %59 = extractelement <4 x i32> %add.i426, i32 0
  %arrayidx125 = getelementptr inbounds i8 addrspace(1)* %src, i32 %59
  %60 = load i8 addrspace(1)* %arrayidx125, align 1, !tbaa !10
  %conv126 = zext i8 %60 to i32
  %61 = insertelement <4 x i32> undef, i32 %conv126, i32 0
  %62 = extractelement <4 x i32> %add.i426, i32 1
  %arrayidx127 = getelementptr inbounds i8 addrspace(1)* %src, i32 %62
  %63 = load i8 addrspace(1)* %arrayidx127, align 1, !tbaa !10
  %conv128 = zext i8 %63 to i32
  %64 = insertelement <4 x i32> %61, i32 %conv128, i32 1
  %65 = extractelement <4 x i32> %add.i426, i32 2
  %arrayidx129 = getelementptr inbounds i8 addrspace(1)* %src, i32 %65
  %66 = load i8 addrspace(1)* %arrayidx129, align 1, !tbaa !10
  %conv130 = zext i8 %66 to i32
  %67 = insertelement <4 x i32> %64, i32 %conv130, i32 2
  %68 = extractelement <4 x i32> %add.i426, i32 3
  %arrayidx131 = getelementptr inbounds i8 addrspace(1)* %src, i32 %68
  %69 = load i8 addrspace(1)* %arrayidx131, align 1, !tbaa !10
  %conv132 = zext i8 %69 to i32
  %70 = insertelement <4 x i32> %67, i32 %conv132, i32 3
  %71 = extractelement <4 x i32> %add.i420, i32 0
  %arrayidx133 = getelementptr inbounds i8 addrspace(1)* %src, i32 %71
  %72 = load i8 addrspace(1)* %arrayidx133, align 1, !tbaa !10
  %conv134 = zext i8 %72 to i32
  %73 = insertelement <4 x i32> undef, i32 %conv134, i32 0
  %74 = extractelement <4 x i32> %add.i420, i32 1
  %arrayidx135 = getelementptr inbounds i8 addrspace(1)* %src, i32 %74
  %75 = load i8 addrspace(1)* %arrayidx135, align 1, !tbaa !10
  %conv136 = zext i8 %75 to i32
  %76 = insertelement <4 x i32> %73, i32 %conv136, i32 1
  %77 = extractelement <4 x i32> %add.i420, i32 2
  %arrayidx137 = getelementptr inbounds i8 addrspace(1)* %src, i32 %77
  %78 = load i8 addrspace(1)* %arrayidx137, align 1, !tbaa !10
  %conv138 = zext i8 %78 to i32
  %79 = insertelement <4 x i32> %76, i32 %conv138, i32 2
  %80 = extractelement <4 x i32> %add.i420, i32 3
  %arrayidx139 = getelementptr inbounds i8 addrspace(1)* %src, i32 %80
  %81 = load i8 addrspace(1)* %arrayidx139, align 1, !tbaa !10
  %conv140 = zext i8 %81 to i32
  %82 = insertelement <4 x i32> %79, i32 %conv140, i32 3
  ; Horizontal blend per row: %add143 = w1 * %46 + w0 * %58 for the upper row
  ; and %add146 = w1 * %70 + w0 * %82 for the lower row, with
  ; w1 = %vecinit3.i and w0 = %vecinit3.i443 (24-bit sign-extended operands).
  %shl.i410 = shl <4 x i32> %vecinit3.i, <i32 8, i32 8, i32 8, i32 8>
  %shr.i411 = ashr <4 x i32> %shl.i410, <i32 8, i32 8, i32 8, i32 8>
  %shl1.i412 = shl <4 x i32> %46, <i32 8, i32 8, i32 8, i32 8>
  %shr2.i413 = ashr <4 x i32> %shl1.i412, <i32 8, i32 8, i32 8, i32 8>
  %mul.i414 = mul <4 x i32> %shr.i411, %shr2.i413
  %shl.i405 = shl <4 x i32> %vecinit3.i443, <i32 8, i32 8, i32 8, i32 8>
  %shr.i406 = ashr <4 x i32> %shl.i405, <i32 8, i32 8, i32 8, i32 8>
  %shl1.i407 = shl <4 x i32> %58, <i32 8, i32 8, i32 8, i32 8>
  %shr2.i408 = ashr <4 x i32> %shl1.i407, <i32 8, i32 8, i32 8, i32 8>
  %mul.i409 = mul <4 x i32> %shr.i406, %shr2.i408
  %add143 = add <4 x i32> %mul.i414, %mul.i409
  %shl1.i402 = shl <4 x i32> %70, <i32 8, i32 8, i32 8, i32 8>
  %shr2.i403 = ashr <4 x i32> %shl1.i402, <i32 8, i32 8, i32 8, i32 8>
  %mul.i404 = mul <4 x i32> %shr.i411, %shr2.i403
  %shl1.i397 = shl <4 x i32> %82, <i32 8, i32 8, i32 8, i32 8>
  %shr2.i398 = ashr <4 x i32> %shl1.i397, <i32 8, i32 8, i32 8, i32 8>
  %mul.i399 = mul <4 x i32> %shr.i406, %shr2.i398
  %add146 = add <4 x i32> %mul.i404, %mul.i399
  ; Vertical blend: %add153 = %conv59 * %add143 + %conv56 * %add146.
  %splat.splatinsert147 = insertelement <4 x i32> undef, i32 %conv59, i32 0
  %splat.splat148 = shufflevector <4 x i32> %splat.splatinsert147, <4 x i32> undef, <4 x i32> zeroinitializer
  %shl.i390 = shl <4 x i32> %splat.splat148, <i32 8, i32 8, i32 8, i32 8>
  %shr.i391 = ashr <4 x i32> %shl.i390, <i32 8, i32 8, i32 8, i32 8>
  %shl1.i392 = shl <4 x i32> %add143, <i32 8, i32 8, i32 8, i32 8>
  %shr2.i393 = ashr <4 x i32> %shl1.i392, <i32 8, i32 8, i32 8, i32 8>
  %mul.i394 = mul <4 x i32> %shr.i391, %shr2.i393
  %splat.splatinsert150 = insertelement <4 x i32> undef, i32 %conv56, i32 0
  %splat.splat151 = shufflevector <4 x i32> %splat.splatinsert150, <4 x i32> undef, <4 x i32> zeroinitializer
  %shl.i = shl <4 x i32> %splat.splat151, <i32 8, i32 8, i32 8, i32 8>
  %shr.i = ashr <4 x i32> %shl.i, <i32 8, i32 8, i32 8, i32 8>
  %shl1.i = shl <4 x i32> %add146, <i32 8, i32 8, i32 8, i32 8>
  %shr2.i = ashr <4 x i32> %shl1.i, <i32 8, i32 8, i32 8, i32 8>
  %mul.i = mul <4 x i32> %shr.i, %shr2.i
  %add153 = add <4 x i32> %mul.i394, %mul.i
  ; Round and rescale: add 2097152 (= 1 << 21), then arithmetic shift by 22.
  %add154 = add <4 x i32> %add153, <i32 2097152, i32 2097152, i32 2097152, i32 2097152>
  %shr = ashr <4 x i32> %add154, <i32 22, i32 22, i32 22, i32 22>
  ; Destination index of lane 0:
  ; %add.i389 = %sub + %dstoffset_in_pixel + row * %dststep_in_pixel;
  ; %inc, %add159 and %add160 are the indices of lanes 1..3.
  %add155 = add nsw i32 %sub, %dstoffset_in_pixel
  %shl.i.i = shl i32 %add.i454, 8
  %shr.i.i = ashr exact i32 %shl.i.i, 8
  %shl1.i.i = shl i32 %dststep_in_pixel, 8
  %shr2.i.i = ashr exact i32 %shl1.i.i, 8
  %mul.i.i = mul nsw i32 %shr.i.i, %shr2.i.i
  %add.i389 = add nsw i32 %add155, %mul.i.i
  %inc = add nsw i32 %add.i389, 1
  %add159 = add nsw i32 %add.i389, 2
  %add160 = add nsw i32 %add.i389, 3
  ; Clamp each lane to [0, 255] and truncate to i8; %vecinit3.i.i holds the
  ; four result bytes, the %conv.*.i.i scalars hold them individually.
  %cmp.i.i = icmp sgt <4 x i32> %shr, <i32 255, i32 255, i32 255, i32 255>
  %cmp5.i.i = icmp slt <4 x i32> %shr, zeroinitializer
  %cond.i.i = select <4 x i1> %cmp5.i.i, <4 x i32> zeroinitializer, <4 x i32> %shr
  %cond11.i.i = select <4 x i1> %cmp.i.i, <4 x i32> <i32 255, i32 255, i32 255, i32 255>, <4 x i32> %cond.i.i
  %83 = extractelement <4 x i32> %cond11.i.i, i32 0
  %conv.i.i.i.i = trunc i32 %83 to i8
  %84 = insertelement <4 x i8> undef, i8 %conv.i.i.i.i, i32 0
  %85 = extractelement <4 x i32> %cond11.i.i, i32 1
  %conv.i4.i.i.i = trunc i32 %85 to i8
  %86 = insertelement <4 x i8> %84, i8 %conv.i4.i.i.i, i32 1
  %87 = extractelement <4 x i32> %cond11.i.i, i32 2
  %conv.i.i5.i.i = trunc i32 %87 to i8
  %88 = insertelement <4 x i8> undef, i8 %conv.i.i5.i.i, i32 0
  %89 = extractelement <4 x i32> %cond11.i.i, i32 3
  %conv.i4.i7.i.i = trunc i32 %89 to i8
  %90 = insertelement <4 x i8> %88, i8 %conv.i4.i7.i.i, i32 1
  %vecinit3.i.i = shufflevector <4 x i8> %86, <4 x i8> %90, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  ; First guard: lane 0 (%sub) must be >= 0, otherwise go to the per-lane
  ; stores starting at if.end.
  %cmp162 = icmp sgt i32 %sub, -1
  br i1 %cmp162, label %land.lhs.true, label %if.end

  ; Fast-path test: last lane (%add5) inside %dst_cols, row inside
  ; [0, %dst_rows) and (%dstoffset_in_pixel & 3) == 0.
land.lhs.true:                                    ; preds = %cond.end
  %cmp165 = icmp slt i32 %add5, %dst_cols
  %cmp168 = icmp sgt i32 %add.i454, -1
  %or.cond = and i1 %cmp165, %cmp168
  %cmp171 = icmp slt i32 %add.i454, %dst_rows
  %or.cond382 = and i1 %or.cond, %cmp171
  %cmp174 = icmp eq i32 %and, 0
  %or.cond388 = and i1 %or.cond382, %cmp174
  br i1 %or.cond388, label %if.then, label %land.lhs.true178

  ; Fast path: store all four bytes with one <4 x i8> store, then branch
  ; directly to the exit block, bypassing if.end / if.end204 / if.end220.
if.then:                                          ; preds = %land.lhs.true
  %add.ptr = getelementptr inbounds i8 addrspace(1)* %dst, i32 %add.i389
  %91 = bitcast i8 addrspace(1)* %add.ptr to <4 x i8> addrspace(1)*
  store <4 x i8> %vecinit3.i.i, <4 x i8> addrspace(1)* %91, align 4, !tbaa !10
  br label %if.end237

  ; Slow path, lane 0: bounds-check %sub against %dst_cols and the row
  ; against [0, %dst_rows) (reusing %cmp168 / %cmp171 from land.lhs.true).
land.lhs.true178:                                 ; preds = %land.lhs.true
  %cmp179 = icmp slt i32 %sub, %dst_cols
  %or.cond238 = and i1 %cmp179, %cmp168
  %or.cond384 = and i1 %or.cond238, %cmp171
  br i1 %or.cond384, label %if.then187, label %if.end

  ; Store the lane-0 byte.
if.then187:                                       ; preds = %land.lhs.true178
  %arrayidx188 = getelementptr inbounds i8 addrspace(1)* %dst, i32 %add.i389
  store i8 %conv.i.i.i.i, i8 addrspace(1)* %arrayidx188, align 1, !tbaa !10
  br label %if.end

  ; Lane 1: %sub > -2 is the same condition as %add (= %sub + 1) > -1.
if.end:                                           ; preds = %if.then187, %land.lhs.true178, %cond.end
  %cmp190 = icmp sgt i32 %sub, -2
  br i1 %cmp190, label %land.lhs.true192, label %if.end204

land.lhs.true192:                                 ; preds = %if.end
  %cmp194 = icmp slt i32 %add, %dst_cols
  %cmp197 = icmp sgt i32 %add.i454, -1
  %or.cond239 = and i1 %cmp194, %cmp197
  %cmp200 = icmp slt i32 %add.i454, %dst_rows
  %or.cond385 = and i1 %or.cond239, %cmp200
  br i1 %or.cond385, label %if.then202, label %if.end204

  ; Store the lane-1 byte.
if.then202:                                       ; preds = %land.lhs.true192
  %arrayidx203 = getelementptr inbounds i8 addrspace(1)* %dst, i32 %inc
  store i8 %conv.i4.i.i.i, i8 addrspace(1)* %arrayidx203, align 1, !tbaa !10
  br label %if.end204

  ; Lane 2: guarded by %add3 >= 0, then the same column/row bounds checks.
if.end204:                                        ; preds = %if.then202, %land.lhs.true192, %if.end
  %cmp206 = icmp sgt i32 %add3, -1
  br i1 %cmp206, label %land.lhs.true208, label %if.end220

land.lhs.true208:                                 ; preds = %if.end204
  %cmp210 = icmp slt i32 %add3, %dst_cols
  %cmp213 = icmp sgt i32 %add.i454, -1
  %or.cond240 = and i1 %cmp210, %cmp213
  %cmp216 = icmp slt i32 %add.i454, %dst_rows
  %or.cond386 = and i1 %or.cond240, %cmp216
  br i1 %or.cond386, label %if.then218, label %if.end220

  ; Store the lane-2 byte.
if.then218:                                       ; preds = %land.lhs.true208
  %arrayidx219 = getelementptr inbounds i8 addrspace(1)* %dst, i32 %add159
  store i8 %conv.i.i5.i.i, i8 addrspace(1)* %arrayidx219, align 1, !tbaa !10
  br label %if.end220

  ; Lane 3: guarded by %add5 >= 0, then the same column/row bounds checks.
if.end220:                                        ; preds = %if.then218, %land.lhs.true208, %if.end204
  %cmp222 = icmp sgt i32 %add5, -1
  br i1 %cmp222, label %land.lhs.true224, label %if.end237

land.lhs.true224:                                 ; preds = %if.end220
  %cmp226 = icmp slt i32 %add5, %dst_cols
  %cmp229 = icmp sgt i32 %add.i454, -1
  %or.cond241 = and i1 %cmp226, %cmp229
  %cmp232 = icmp slt i32 %add.i454, %dst_rows
  %or.cond387 = and i1 %or.cond241, %cmp232
  br i1 %or.cond387, label %if.then234, label %if.end237

  ; Store the lane-3 byte.
if.then234:                                       ; preds = %land.lhs.true224
  %arrayidx235 = getelementptr inbounds i8 addrspace(1)* %dst, i32 %add160
  store i8 %conv.i4.i7.i.i, i8 addrspace(1)* %arrayidx235, align 1, !tbaa !10
  br label %if.end237

  ; Single exit block; reached from the lane-3 region and from the fast path
  ; in if.then.
if.end237:                                        ; preds = %if.then234, %land.lhs.true224, %if.end220, %if.then
  ret void
}

; Declarations of the intrinsics called by @resizeLN_C1_D0: fused
; multiply-add, floor and rint (scalar and <4 x float> forms) plus the r600
; work-item id / size readers. Each one references attribute group #1 or #2.
; Function Attrs: nounwind readnone
declare <4 x float> @llvm.fmuladd.v4f32(<4 x float>, <4 x float>, <4 x float>) #1

; Function Attrs: nounwind readonly
declare <4 x float> @llvm.floor.v4f32(<4 x float>) #2

; Function Attrs: nounwind readnone
declare float @llvm.fmuladd.f32(float, float, float) #1

; Function Attrs: nounwind readonly
declare float @llvm.floor.f32(float) #2

; Function Attrs: nounwind readonly
declare <4 x float> @llvm.rint.v4f32(<4 x float>) #2

; Function Attrs: nounwind readonly
declare float @llvm.rint.f32(float) #2

; Function Attrs: nounwind readnone
declare i32 @llvm.r600.read.tgid.x() #1

; Function Attrs: nounwind readnone
declare i32 @llvm.r600.read.tgid.y() #1

; Function Attrs: nounwind readnone
declare i32 @llvm.r600.read.local.size.x() #1

; Function Attrs: nounwind readnone
declare i32 @llvm.r600.read.local.size.y() #1

; Function Attrs: nounwind readnone
declare i32 @llvm.r600.read.tidig.x() #1

; Function Attrs: nounwind readnone
declare i32 @llvm.r600.read.tidig.y() #1

; Attribute groups: #0 is attached to the kernel definition, #1 (readnone)
; and #2 (readonly) to the intrinsic declarations and call sites.
attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #1 = { nounwind readnone }
attributes #2 = { nounwind readonly }

!opencl.kernels = !{!0, !1, !2, !3, !4, !5, !6, !7}
!llvm.ident = !{!8, !9, !9, !9, !9, !9, !9, !9, !9, !9, !9, !9, !9, !9, !9, !9, !9, !9, !9, !9, !9, !9, !9, !9, !9, !9, !9, !9, !9, !9, !9, !9, !9, !9, !9, !9, !9, !9, !9}

; Numbered metadata nodes.
; !0 names the kernel function; !1..!7 are the remaining !opencl.kernels
; entries and contain only null.
!0 = metadata !{void (i8 addrspace(1)*, i8 addrspace(1)*, i32, i32, i32, i32, i32, i32, i32, i32, float, float)* @resizeLN_C1_D0}
!1 = metadata !{null}
!2 = metadata !{null}
!3 = metadata !{null}
!4 = metadata !{null}
!5 = metadata !{null}
!6 = metadata !{null}
!7 = metadata !{null}
; !8 and !9 are the compiler identification strings referenced by !llvm.ident.
!8 = metadata !{metadata !"clang version 3.4 (trunk 194633) (llvm/trunk 195029)"}
!9 = metadata !{metadata !"clang version 3.4 (trunk 194830) (llvm/trunk 194831)"}
; TBAA chain used by every load/store in the kernel via !tbaa !10:
; access tag !10 -> type "omnipotent char" (!11) -> root (!12).
!10 = metadata !{metadata !11, metadata !11, i64 0}
!11 = metadata !{metadata !"omnipotent char", metadata !12, i64 0}
!12 = metadata !{metadata !"Simple C/C++ TBAA"}