[PATCH] R600: Clean if/then/else handling code in AMDILCFGStructurizer
Tom Stellard
tom at stellard.net
Tue Nov 19 20:08:48 PST 2013
On Tue, Nov 19, 2013 at 05:36:42AM -0800, Vincent Lejeune wrote:
> ----- Mail original -----
>
> > De : Tom Stellard <tom at stellard.net>
> > À : reviews+D2188+public+3f8d5bc02a1045dd at llvm-reviews.chandlerc.com
> > Cc : vljn at ovi.com; llvm-commits at cs.uiuc.edu
> > Envoyé le : Lundi 18 novembre 2013 21h09
> > Objet : Re: [PATCH] R600: Clean if/then/else handling code in AMDILCFGStructurizer
> >
> > Hi Vincent,
> >
> > I've rebased your patch on top of the current master branch (which
> > includes my patches to enable the IR Structurizer), and attached it to
> > this mail.
> >
> > Your patch regresses a test case I added in:
> >
> > r195030
> > R600: Fix a crash in the AMDILCFGStrucurizer
>
> structurize1.ll works fine on my computer, however structurize.ll does not (but it's older than r195030).
> structurize.ll actually checks that branches-into-if don't crash llvm, but it runs with the IR structurizer disabled,
> whereas I assume in my patch that it is enabled (it will probably require some changes in the CHECK: sections too).
>
OK, so the LLVM test case is working fine, but the program I derived the
test from is crashing. I will try to get the full dump for you. In the
mean time, here is a dump from another shader that is crashing with your
patch.
-Tom
> >
> > I think the problem may be that not all of the if statements nested
> > inside the loop are being matched by the ifPatternMatch() function.
> >
> > The other question I have about this patch is: Why is it safe to remove
> > the code for handling branch into IF?
>
> That's why my patch relies on the IR structurizer: the pass replaces branches into if with predicates
> and (several) triangle CFG patterns, pretty much like you described in the comment of commit
> a4f468f245d6e6869317007c548ee4d33ad97343rev at 192813.
>
> If I understand correctly what StructurizeCFG does, it will convert a pattern like this:
>
> //                            entry
> //                      /                |
> //          diamond_head      branch_from
> //            /                \          |
> // diamond_false        diamond_true
> //            \                  /
> //                  done
> //
>
> into a pattern similar to this:
>
> //                   entry
> //                  |              \
> //                  |          diamond_head
> //                  |                |      \
> //                  |                |       diamond_false
> //                  |                |     /
> //                   extra block
> //                  |         \
> //                  |        branch_from
> //                  |       /
> //                 extra block
> //                  |     \
> //                  |    diamond_true
> //                  |    /
> //                  done
>
> with an additional predicate in diamond_true that is set to true in diamond_head's true path and at the end of branch_from.
>
> >
> > -Tom
> >
> > On Fri, Nov 15, 2013 at 09:16:13AM -0800, Vincent Lejeune wrote:
> >> Further simplify the pass and fix some shadertoy sample crashes.
> >> The pass is no longer able to copy blocks in the jump-into-if situation and
> > relies on the StructurizeCFG pass.
> >>
> >> http://llvm-reviews.chandlerc.com/D2188
> >>
> >> Files:
> >> ? lib/Target/R600/AMDILCFGStructurizer.cpp
> >
> >> Index: lib/Target/R600/AMDILCFGStructurizer.cpp
> >> ===================================================================
> >> --- lib/Target/R600/AMDILCFGStructurizer.cpp
> >> +++ lib/Target/R600/AMDILCFGStructurizer.cpp
> >> @@ -133,18 +133,15 @@
> >> ?
> >> ? ? AMDGPUCFGStructurizer(TargetMachine &tm) :
> >> ? ? ? ? MachineFunctionPass(ID), TM(tm),
> >> -? ? ? TII(static_cast<const R600InstrInfo *>(tm.getInstrInfo())),
> >> -? ? ? TRI(&TII->getRegisterInfo()) { }
> >> +? ? ? MLI(0), TII(0), TRI(0), FuncRep(0) { }
> >> ?
> >> ? ? const char *getPassName() const {
> >> ? ? ? return "AMD IL Control Flow Graph structurizer Pass";
> >> ? ? }
> >> ?
> >> ? ? void getAnalysisUsage(AnalysisUsage &AU) const {
> >> ? ? ? AU.addPreserved<MachineFunctionAnalysis>();
> >> ? ? ? AU.addRequired<MachineFunctionAnalysis>();
> >> -? ? AU.addRequired<MachineDominatorTree>();
> >> -? ? AU.addRequired<MachinePostDominatorTree>();
> >> ? ? ? AU.addRequired<MachineLoopInfo>();
> >> ? ? }
> >> ?
> >> @@ -161,21 +158,17 @@
> >> ? ? ? OrderedBlks.clear();
> >> ? ? ? FuncRep = &MF;
> >> ? ? ? MLI = &getAnalysis<MachineLoopInfo>();
> >> +? ? TII = static_cast<const R600InstrInfo *>(TM.getInstrInfo());
> >> +? ? TRI = &TII->getRegisterInfo();
> >> ? ? ? DEBUG(dbgs() << "LoopInfo:\n";
> > PrintLoopinfo(*MLI););
> >> -? ? MDT = &getAnalysis<MachineDominatorTree>();
> >> -? ? DEBUG(MDT->print(dbgs(), (const llvm::Module*)0););
> >> -? ? PDT = &getAnalysis<MachinePostDominatorTree>();
> >> -? ? DEBUG(PDT->print(dbgs()););
> >> ? ? ? prepare();
> >> ? ? ? run();
> >> ? ? ? DEBUG(MF.dump(););
> >> ? ? ? return true;
> >> ? ? }
> >> ?
> >> ? protected:
> >> ? ? TargetMachine &TM;
> >> -? MachineDominatorTree *MDT;
> >> -? MachinePostDominatorTree *PDT;
> >> ? ? MachineLoopInfo *MLI;
> >> ? ? const R600InstrInfo *TII;
> >> ? ? const AMDGPURegisterInfo *TRI;
> >> @@ -208,12 +201,8 @@
> >> ? ? bool hasBackEdge(MachineBasicBlock *MBB) const;
> >> ? ? static unsigned getLoopDepth(MachineLoop *LoopRep);
> >> ? ? bool isRetiredBlock(MachineBasicBlock *MBB) const;
> >> -? bool isActiveLoophead(MachineBasicBlock *MBB) const;
> >> -? PathToKind singlePathTo(MachineBasicBlock *SrcMBB, MachineBasicBlock
> > *DstMBB,
> >> -? ? ? bool AllowSideEntry = true) const;
> >> ? ? int countActiveBlock(MBBVector::const_iterator It,
> >> ? ? ? ? MBBVector::const_iterator E) const;
> >> -? bool needMigrateBlock(MachineBasicBlock *MBB) const;
> >> ?
> >> ? ? // Utility Functions
> >> ? ? void reversePredicateSetter(MachineBasicBlock::iterator I);
> >> @@ -264,33 +253,16 @@
> >> ?
> >> ?
> >> ? ? int patternMatch(MachineBasicBlock *MBB);
> >> -? int patternMatchGroup(MachineBasicBlock *MBB);
> >> -? int serialPatternMatch(MachineBasicBlock *MBB);
> >> -? int ifPatternMatch(MachineBasicBlock *MBB);
> >> +? bool patternMatchGroup(MachineBasicBlock *MBB);
> >> +? bool serialPatternMatch(MachineBasicBlock *MBB);
> >> +? bool ifPatternMatch(MachineBasicBlock *MBB);
> >> ? ? int loopendPatternMatch();
> >> ? ? int mergeLoop(MachineLoop *LoopRep);
> >> ? ? int loopcontPatternMatch(MachineLoop *LoopRep, MachineBasicBlock
> > *LoopHeader);
> >> ?
> >> ? ? void handleLoopcontBlock(MachineBasicBlock *ContingMBB,
> >> ? ? ? ? MachineLoop *ContingLoop, MachineBasicBlock *ContMBB,
> >> ? ? ? ? MachineLoop *ContLoop);
> >> -? /// return true iff src1Blk->succ_size() == 0 && src1Blk and
> > src2Blk are in
> >> -? /// the same loop with LoopLandInfo without explicitly keeping track of
> >> -? /// loopContBlks and loopBreakBlks, this is a method to get the
> > information.
> >> -? bool isSameloopDetachedContbreak(MachineBasicBlock *Src1MBB,
> >> -? ? ? MachineBasicBlock *Src2MBB);
> >> -? int handleJumpintoIf(MachineBasicBlock *HeadMBB,
> >> -? ? ? MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB);
> >> -? int handleJumpintoIfImp(MachineBasicBlock *HeadMBB,
> >> -? ? ? MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB);
> >> -? int improveSimpleJumpintoIf(MachineBasicBlock *HeadMBB,
> >> -? ? ? MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB,
> >> -? ? ? MachineBasicBlock **LandMBBPtr);
> >> -? void showImproveSimpleJumpintoIf(MachineBasicBlock *HeadMBB,
> >> -? ? ? MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB,
> >> -? ? ? MachineBasicBlock *LandMBB, bool Detail = false);
> >> -? int cloneOnSideEntryTo(MachineBasicBlock *PreMBB,
> >> -? ? ? MachineBasicBlock *SrcMBB, MachineBasicBlock *DstMBB);
> >> ? ? void mergeSerialBlock(MachineBasicBlock *DstMBB,
> >> ? ? ? ? MachineBasicBlock *SrcMBB);
> >> ?
> >> @@ -326,18 +298,10 @@
> >> ? ? void removeSuccessor(MachineBasicBlock *MBB);
> >> ? ? MachineBasicBlock *cloneBlockForPredecessor(MachineBasicBlock *MBB,
> >> ? ? ? ? MachineBasicBlock *PredMBB);
> >> -? void migrateInstruction(MachineBasicBlock *SrcMBB,
> >> -? ? ? MachineBasicBlock *DstMBB, MachineBasicBlock::iterator I);
> >> ? ? void recordSccnum(MachineBasicBlock *MBB, int SCCNum);
> >> ? ? void retireBlock(MachineBasicBlock *MBB);
> >> ? ? void setLoopLandBlock(MachineLoop *LoopRep, MachineBasicBlock *MBB =
> > NULL);
> >> ?
> >> -? MachineBasicBlock
> > *findNearestCommonPostDom(std::set<MachineBasicBlock *>&);
> >> -? /// This is work around solution for findNearestCommonDominator not
> > avaiable
> >> -? /// to post dom a proper fix should go to Dominators.h.
> >> -? MachineBasicBlock *findNearestCommonPostDom(MachineBasicBlock *MBB1,
> >> -? ? ? MachineBasicBlock *MBB2);
> >> -
> >> ? private:
> >> ? ? MBBInfoMap BlockInfoMap;
> >> ? ? LoopLandInfoMap LLInfoMap;
> >> @@ -380,36 +344,6 @@
> >> ? ? return (*It).second->IsRetired;
> >> ? }
> >> ?
> >> -bool AMDGPUCFGStructurizer::isActiveLoophead(MachineBasicBlock *MBB) const
> > {
> >> -? MachineLoop *LoopRep = MLI->getLoopFor(MBB);
> >> -? while (LoopRep && LoopRep->getHeader() == MBB) {
> >> -? ? MachineBasicBlock *LoopLand = getLoopLandInfo(LoopRep);
> >> -? ? if(!LoopLand)
> >> -? ? ? return true;
> >> -? ? if (!isRetiredBlock(LoopLand))
> >> -? ? ? return true;
> >> -? ? LoopRep = LoopRep->getParentLoop();
> >> -? }
> >> -? return false;
> >> -}
> >> -AMDGPUCFGStructurizer::PathToKind AMDGPUCFGStructurizer::singlePathTo(
> >> -? ? MachineBasicBlock *SrcMBB, MachineBasicBlock *DstMBB,
> >> -? ? bool AllowSideEntry) const {
> >> -? assert(DstMBB);
> >> -? if (SrcMBB == DstMBB)
> >> -? ? return SinglePath_InPath;
> >> -? while (SrcMBB && SrcMBB->succ_size() == 1) {
> >> -? ? SrcMBB = *SrcMBB->succ_begin();
> >> -? ? if (SrcMBB == DstMBB)
> >> -? ? ? return SinglePath_InPath;
> >> -? ? if (!AllowSideEntry && SrcMBB->pred_size() > 1)
> >> -? ? ? return Not_SinglePath;
> >> -? }
> >> -? if (SrcMBB && SrcMBB->succ_size()==0)
> >> -? ? return SinglePath_NotInPath;
> >> -? return Not_SinglePath;
> >> -}
> >> -
> >> ? int AMDGPUCFGStructurizer::countActiveBlock(MBBVector::const_iterator It,
> >> ? ? ? MBBVector::const_iterator E) const {
> >> ? ? int Count = 0;
> >> @@ -421,18 +355,6 @@
> >> ? ? return Count;
> >> ? }
> >> ?
> >> -bool AMDGPUCFGStructurizer::needMigrateBlock(MachineBasicBlock *MBB) const
> > {
> >> -? unsigned BlockSizeThreshold = 30;
> >> -? unsigned CloneInstrThreshold = 100;
> >> -? bool MultiplePreds = MBB && (MBB->pred_size() > 1);
> >> -
> >> -? if(!MultiplePreds)
> >> -? ? return false;
> >> -? unsigned BlkSize = MBB->size();
> >> -? return ((BlkSize > BlockSizeThreshold) &&
> >> -? ? ? (BlkSize * (MBB->pred_size() - 1) > CloneInstrThreshold));
> >> -}
> >> -
> >> ? void AMDGPUCFGStructurizer::reversePredicateSetter(
> >> ? ? ? MachineBasicBlock::iterator I) {
> >> ? ? while (I--) {
> >> @@ -800,6 +722,7 @@
> >> ? ? bool MakeProgress = false;
> >> ? ? int NumRemainedBlk = countActiveBlock(OrderedBlks.begin(),
> >> ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? OrderedBlks.end());
> >> +? loopendPatternMatch();
> >> ?
> >> ? ? do {
> >> ? ? ? ++NumIter;
> >> @@ -972,103 +895,96 @@
> >> ? ? return NumMatch;
> >> ? }
> >> ?
> >> -int AMDGPUCFGStructurizer::patternMatchGroup(MachineBasicBlock *MBB)
> > {
> >> -? int NumMatch = 0;
> >> -? NumMatch += loopendPatternMatch();
> >> -? NumMatch += serialPatternMatch(MBB);
> >> -? NumMatch += ifPatternMatch(MBB);
> >> -? return NumMatch;
> >> +bool AMDGPUCFGStructurizer::patternMatchGroup(MachineBasicBlock *MBB)
> > {
> >> +? bool ChangedInSingleIteration, Changed = false;
> >> +? do {
> >> +? ? ? ChangedInSingleIteration = false;
> >> +? ? ? DEBUG(dbgs() << "Pattern matching starting from BB#"
> > << MBB->getNumber()
> >> +? ? ? ? ? << "\n";);
> >> +? ? ? ChangedInSingleIteration |= serialPatternMatch(MBB);
> >> +? ? ? ChangedInSingleIteration |= ifPatternMatch(MBB);
> >> +? ? ? Changed |= ChangedInSingleIteration;
> >> +? } while (ChangedInSingleIteration);
> >> +
> >> +? return Changed;
> >> ? }
> >> ?
> >> ?
> >> -int AMDGPUCFGStructurizer::serialPatternMatch(MachineBasicBlock *MBB)
> > {
> >> +bool AMDGPUCFGStructurizer::serialPatternMatch(MachineBasicBlock *MBB)
> > {
> >> ? ? if (MBB->succ_size() != 1)
> >> -? ? return 0;
> >> +? ? return false;
> >> ?
> >> ? ? MachineBasicBlock *childBlk = *MBB->succ_begin();
> >> -? if (childBlk->pred_size() != 1 || isActiveLoophead(childBlk))
> >> -? ? return 0;
> >> +? if (childBlk->pred_size() != 1)
> >> +? ? return false;
> >> ?
> >> ? ? mergeSerialBlock(MBB, childBlk);
> >> ? ? ++numSerialPatternMatch;
> >> -? return 1;
> >> +? return true;
> >> ? }
> >> ?
> >> -int AMDGPUCFGStructurizer::ifPatternMatch(MachineBasicBlock *MBB) {
> >> -? //two edges
> >> +bool AMDGPUCFGStructurizer::ifPatternMatch(MachineBasicBlock *MBB) {
> >> ? ? if (MBB->succ_size() != 2)
> >> -? ? return 0;
> >> +? ? return false;
> >> ? ? if (hasBackEdge(MBB))
> >> -? ? return 0;
> >> +? ? return false;
> >> ? ? MachineInstr *BranchMI = getNormalBlockBranchInstr(MBB);
> >> ? ? if (!BranchMI)
> >> -? ? return 0;
> >> +? ? return false;
> >> ?
> >> ? ? assert(isCondBranch(BranchMI));
> >> ?
> >> +? bool Changed = false;
> >> ? ? MachineBasicBlock *TrueMBB = getTrueBranch(BranchMI);
> >> -? serialPatternMatch(TrueMBB);
> >> -? ifPatternMatch(TrueMBB);
> >> ? ? MachineBasicBlock *FalseMBB = getFalseBranch(MBB, BranchMI);
> >> -? serialPatternMatch(FalseMBB);
> >> -? ifPatternMatch(FalseMBB);
> >> +? DEBUG(dbgs() << "if pattern starting at BB#" <<
> > MBB->getNumber() <<
> >> +? ? ? ", True branch to BB#" << TrueMBB->getNumber()
> > <<
> >> +? ? ? ", False branch to BB#" << FalseMBB->getNumber()
> > << "\n");
> >> +
> >> +? Changed |= patternMatchGroup(TrueMBB);
> >> +? Changed |= patternMatchGroup(FalseMBB);
> >> +? DEBUG(
> >> +? ? dbgs() << "BB#" << TrueMBB->getNumber()
> > << "'successors :";
> >> +? ? for (MachineBasicBlock::succ_iterator I = TrueMBB->succ_begin(),
> >> +? ? ? ? E = TrueMBB->succ_end(); I != E; ++I)
> >> +? ? ? dbgs() << "BB#" << (*I)->getNumber()
> > <<", ";
> >> +? ? dbgs() << "\n";
> >> +? ? dbgs() << "BB#" << FalseMBB->getNumber()
> > << "'successors :";
> >> +? ? for (MachineBasicBlock::succ_iterator I = FalseMBB->succ_begin(),
> >> +? ? ? ? E = FalseMBB->succ_end(); I != E; ++I)
> >> +? ? ? dbgs() << "BB#" << (*I)->getNumber()
> > << ", ";
> >> +? ? dbgs() << "\n";
> >> +);
> >> ? ? MachineBasicBlock *LandBlk;
> >> -? int Cloned = 0;
> >> ?
> >> ? ? assert (!TrueMBB->succ_empty() || !FalseMBB->succ_empty());
> >> -? // TODO: Simplify
> >> ? ? if (TrueMBB->succ_size() == 1 && FalseMBB->succ_size() ==
> > 1
> >> -? ? && *TrueMBB->succ_begin() == *FalseMBB->succ_begin())
> > {
> >> +? ? && *TrueMBB->succ_begin() == *FalseMBB->succ_begin()
> >> +? ? && TrueMBB->pred_size() == 1 &&
> > FalseMBB->pred_size() == 1) {
> >> ? ? ? // Diamond pattern
> >> ? ? ? LandBlk = *TrueMBB->succ_begin();
> >> -? } else if (TrueMBB->succ_size() == 1 &&
> > *TrueMBB->succ_begin() == FalseMBB) {
> >> +? } else if (TrueMBB->succ_size() == 1 &&
> > *TrueMBB->succ_begin() == FalseMBB &&
> >> +? ? ? TrueMBB->pred_size() == 1) {
> >> ? ? ? // Triangle pattern, false is empty
> >> ? ? ? LandBlk = FalseMBB;
> >> ? ? ? FalseMBB = NULL;
> >> -? } else if (FalseMBB->succ_size() == 1
> >> +? } else if (FalseMBB->succ_size() == 1 &&
> > FalseMBB->pred_size() == 1
> >> ? ? ? ? ? ? ? && *FalseMBB->succ_begin() == TrueMBB) {
> >> ? ? ? // Triangle pattern, true is empty
> >> ? ? ? // We reverse the predicate to make a triangle, empty false pattern;
> >> ? ? ? std::swap(TrueMBB, FalseMBB);
> >> ? ? ? reversePredicateSetter(MBB->end());
> >> ? ? ? LandBlk = FalseMBB;
> >> ? ? ? FalseMBB = NULL;
> >> -? } else if (FalseMBB->succ_size() == 1
> >> -? ? ? ? ? ? && isSameloopDetachedContbreak(TrueMBB, FalseMBB))
> > {
> >> -? ? LandBlk = *FalseMBB->succ_begin();
> >> -? } else if (TrueMBB->succ_size() == 1
> >> -? ? && isSameloopDetachedContbreak(FalseMBB, TrueMBB)) {
> >> -? ? LandBlk = *TrueMBB->succ_begin();
> >> -? } else {
> >> -? ? return handleJumpintoIf(MBB, TrueMBB, FalseMBB);
> >> -? }
> >> -
> >> -? // improveSimpleJumpinfoIf can handle the case where landBlk == NULL but
> > the
> >> -? // new BB created for landBlk==NULL may introduce new challenge to the
> >> -? // reduction process.
> >> -? if (LandBlk &&
> >> -? ? ? ((TrueMBB && TrueMBB->pred_size() > 1)
> >> -? ? ? || (FalseMBB && FalseMBB->pred_size() > 1))) {
> >> -? ? Cloned += improveSimpleJumpintoIf(MBB, TrueMBB, FalseMBB,
> > &LandBlk);
> >> -? }
> >> +? } else
> >> +? ? return Changed;
> >> ?
> >> -? if (TrueMBB && TrueMBB->pred_size() > 1) {
> >> +? if (TrueMBB && TrueMBB->pred_size() > 1)
> >> ? ? ? TrueMBB = cloneBlockForPredecessor(TrueMBB, MBB);
> >> -? ? ++Cloned;
> >> -? }
> >> -
> >> -? if (FalseMBB && FalseMBB->pred_size() > 1) {
> >> +? if (FalseMBB && FalseMBB->pred_size() > 1)
> >> ? ? ? FalseMBB = cloneBlockForPredecessor(FalseMBB, MBB);
> >> -? ? ++Cloned;
> >> -? }
> >> -
> >> ? ? mergeIfthenelseBlock(BranchMI, MBB, TrueMBB, FalseMBB, LandBlk);
> >> -
> >> -? ++numIfPatternMatch;
> >> -
> >> -? numClonedBlock += Cloned;
> >> -
> >> -? return 1 + Cloned;
> >> +? return true;
> >> ? }
> >> ?
> >> ? int AMDGPUCFGStructurizer::loopendPatternMatch() {
> >> @@ -1129,11 +1045,7 @@
> >> ? ? for (unsigned i = 0, e = LatchBlks.size(); i < e; ++i)
> >> ? ? ? settleLoopcontBlock(LatchBlks[i], LoopHeader);
> >> ? ? int Match = 0;
> >> -? do {
> >> -? ? Match = 0;
> >> -? ? Match += serialPatternMatch(LoopHeader);
> >> -? ? Match += ifPatternMatch(LoopHeader);
> >> -? } while (Match > 0);
> >> +? patternMatchGroup(LoopHeader);
> >> ? ? mergeLooplandBlock(LoopHeader, ExitBlk);
> >> ? ? MachineLoop *ParentLoop = LoopRep->getParentLoop();
> >> ? ? if (ParentLoop)
> >> @@ -1171,302 +1083,6 @@
> >> ? ? return NumCont;
> >> ? }
> >> ?
> >> -
> >> -bool AMDGPUCFGStructurizer::isSameloopDetachedContbreak(
> >> -? ? MachineBasicBlock *Src1MBB, MachineBasicBlock *Src2MBB) {
> >> -? if (Src1MBB->succ_size() == 0) {
> >> -? ? MachineLoop *LoopRep = MLI->getLoopFor(Src1MBB);
> >> -? ? if (LoopRep&& LoopRep == MLI->getLoopFor(Src2MBB)) {
> >> -? ? ? MachineBasicBlock *&TheEntry = LLInfoMap[LoopRep];
> >> -? ? ? if (TheEntry) {
> >> -? ? ? ? DEBUG(
> >> -? ? ? ? ? dbgs() << "isLoopContBreakBlock yes src1 = BB"
> >> -? ? ? ? ? ? ? ? << Src1MBB->getNumber()
> >> -? ? ? ? ? ? ? ? << " src2 = BB" <<
> > Src2MBB->getNumber() << "\n";
> >> -? ? ? ? );
> >> -? ? ? ? return true;
> >> -? ? ? }
> >> -? ? }
> >> -? }
> >> -? return false;
> >> -}
> >> -
> >> -int AMDGPUCFGStructurizer::handleJumpintoIf(MachineBasicBlock *HeadMBB,
> >> -? ? MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB) {
> >> -? int Num = handleJumpintoIfImp(HeadMBB, TrueMBB, FalseMBB);
> >> -? if (Num == 0) {
> >> -? ? DEBUG(
> >> -? ? ? dbgs() << "handleJumpintoIf swap trueBlk and
> > FalseBlk" << "\n";
> >> -? ? );
> >> -? ? Num = handleJumpintoIfImp(HeadMBB, FalseMBB, TrueMBB);
> >> -? }
> >> -? return Num;
> >> -}
> >> -
> >> -int AMDGPUCFGStructurizer::handleJumpintoIfImp(MachineBasicBlock *HeadMBB,
> >> -? ? MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB) {
> >> -? int Num = 0;
> >> -? MachineBasicBlock *DownBlk;
> >> -
> >> -? //trueBlk could be the common post dominator
> >> -? DownBlk = TrueMBB;
> >> -
> >> -? DEBUG(
> >> -? ? dbgs() << "handleJumpintoIfImp head = BB" <<
> > HeadMBB->getNumber()
> >> -? ? ? ? ? << " true = BB" <<
> > TrueMBB->getNumber()
> >> -? ? ? ? ? << ", numSucc=" <<
> > TrueMBB->succ_size()
> >> -? ? ? ? ? << " false = BB" <<
> > FalseMBB->getNumber() << "\n";
> >> -? );
> >> -
> >> -? while (DownBlk) {
> >> -? ? DEBUG(
> >> -? ? ? dbgs() << "check down = BB" <<
> > DownBlk->getNumber();
> >> -? ? );
> >> -
> >> -? ? if (singlePathTo(FalseMBB, DownBlk) == SinglePath_InPath) {
> >> -? ? ? DEBUG(
> >> -? ? ? ? dbgs() << " working\n";
> >> -? ? ? );
> >> -
> >> -? ? ? Num += cloneOnSideEntryTo(HeadMBB, TrueMBB, DownBlk);
> >> -? ? ? Num += cloneOnSideEntryTo(HeadMBB, FalseMBB, DownBlk);
> >> -
> >> -? ? ? numClonedBlock += Num;
> >> -? ? ? Num += serialPatternMatch(*HeadMBB->succ_begin());
> >> -? ? ? Num += serialPatternMatch(*llvm::next(HeadMBB->succ_begin()));
> >> -? ? ? Num += ifPatternMatch(HeadMBB);
> >> -? ? ? assert(Num > 0);
> >> -
> >> -? ? ? break;
> >> -? ? }
> >> -? ? DEBUG(
> >> -? ? ? dbgs() << " not working\n";
> >> -? ? );
> >> -? ? DownBlk = (DownBlk->succ_size() == 1) ? (*DownBlk->succ_begin())
> > : NULL;
> >> -? } // walk down the postDomTree
> >> -
> >> -? return Num;
> >> -}
> >> -
> >> -void AMDGPUCFGStructurizer::showImproveSimpleJumpintoIf(
> >> -? ? MachineBasicBlock *HeadMBB, MachineBasicBlock *TrueMBB,
> >> -? ? MachineBasicBlock *FalseMBB, MachineBasicBlock *LandMBB, bool Detail)
> > {
> >> -? dbgs() << "head = BB" << HeadMBB->getNumber()
> >> -? ? ? ? << " size = " << HeadMBB->size();
> >> -? if (Detail) {
> >> -? ? dbgs() << "\n";
> >> -? ? HeadMBB->print(dbgs());
> >> -? ? dbgs() << "\n";
> >> -? }
> >> -
> >> -? if (TrueMBB) {
> >> -? ? dbgs() << ", true = BB" <<
> > TrueMBB->getNumber() << " size = "
> >> -? ? ? ? ? << TrueMBB->size() << " numPred = "
> > << TrueMBB->pred_size();
> >> -? ? if (Detail) {
> >> -? ? ? dbgs() << "\n";
> >> -? ? ? TrueMBB->print(dbgs());
> >> -? ? ? dbgs() << "\n";
> >> -? ? }
> >> -? }
> >> -? if (FalseMBB) {
> >> -? ? dbgs() << ", false = BB" <<
> > FalseMBB->getNumber() << " size = "
> >> -? ? ? ? ? << FalseMBB->size() << " numPred = "
> > << FalseMBB->pred_size();
> >> -? ? if (Detail) {
> >> -? ? ? dbgs() << "\n";
> >> -? ? ? FalseMBB->print(dbgs());
> >> -? ? ? dbgs() << "\n";
> >> -? ? }
> >> -? }
> >> -? if (LandMBB) {
> >> -? ? dbgs() << ", land = BB" <<
> > LandMBB->getNumber() << " size = "
> >> -? ? ? ? ? << LandMBB->size() << " numPred = "
> > << LandMBB->pred_size();
> >> -? ? if (Detail) {
> >> -? ? ? dbgs() << "\n";
> >> -? ? ? LandMBB->print(dbgs());
> >> -? ? ? dbgs() << "\n";
> >> -? ? }
> >> -? }
> >> -
> >> -? ? dbgs() << "\n";
> >> -}
> >> -
> >> -int AMDGPUCFGStructurizer::improveSimpleJumpintoIf(MachineBasicBlock
> > *HeadMBB,
> >> -? ? MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB,
> >> -? ? MachineBasicBlock **LandMBBPtr) {
> >> -? bool MigrateTrue = false;
> >> -? bool MigrateFalse = false;
> >> -
> >> -? MachineBasicBlock *LandBlk = *LandMBBPtr;
> >> -
> >> -? assert((!TrueMBB || TrueMBB->succ_size() <= 1)
> >> -? ? ? ? && (!FalseMBB || FalseMBB->succ_size() <= 1));
> >> -
> >> -? if (TrueMBB == FalseMBB)
> >> -? ? return 0;
> >> -
> >> -? MigrateTrue = needMigrateBlock(TrueMBB);
> >> -? MigrateFalse = needMigrateBlock(FalseMBB);
> >> -
> >> -? if (!MigrateTrue && !MigrateFalse)
> >> -? ? return 0;
> >> -
> >> -? // If we need to migrate either trueBlk and falseBlk, migrate the rest
> > that
> >> -? // have more than one predecessors.? without doing this, its predecessor
> >> -? // rather than headBlk will have undefined value in initReg.
> >> -? if (!MigrateTrue && TrueMBB && TrueMBB->pred_size()
> >> 1)
> >> -? ? MigrateTrue = true;
> >> -? if (!MigrateFalse && FalseMBB &&
> > FalseMBB->pred_size() > 1)
> >> -? ? MigrateFalse = true;
> >> -
> >> -? DEBUG(
> >> -? ? dbgs() << "before improveSimpleJumpintoIf: ";
> >> -? ? showImproveSimpleJumpintoIf(HeadMBB, TrueMBB, FalseMBB, LandBlk, 0);
> >> -? );
> >> -
> >> -? // org: headBlk => if () {trueBlk} else {falseBlk} =>
> > landBlk
> >> -? //
> >> -? // new: headBlk => if () {initReg = 1; org trueBlk branch} else
> >> -? //? ? ? {initReg = 0; org falseBlk branch }
> >> -? //? ? ? => landBlk => if (initReg) {org trueBlk} else
> > {org falseBlk}
> >> -? //? ? ? => org landBlk
> >> -? //? ? ? if landBlk->pred_size() > 2, put the about if-else inside
> >> -? //? ? ? if (initReg !=2) {...}
> >> -? //
> >> -? // add initReg = initVal to headBlk
> >> -
> >> -? const TargetRegisterClass * I32RC =
> > TRI->getCFGStructurizerRegClass(MVT::i32);
> >> -? if (!MigrateTrue || !MigrateFalse) {
> >> -? ? // XXX: We have an opportunity here to optimize the "branch into
> > if" case
> >> -? ? // here.? Branch into if looks like this:
> >> -? ? //? ? ? ? ? ? ? ? ? ? ? ? entry
> >> -? ? //? ? ? ? ? ? ? ? ? ? ? /? ? |
> >> -? ? //? ? ? ? ? diamond_head? ? ? branch_from
> >> -? ? //? ? ? ? ? ? /? ? ? \? ? ? ? ? |
> >> -? ? // diamond_false? ? ? ? diamond_true
> >> -? ? //? ? ? ? ? ? \? ? ? /
> >> -? ? //? ? ? ? ? ? ? done
> >> -? ? //
> >> -? ? // The diamond_head block begins the "if" and the
> > diamond_true block
> >> -? ? // is the block being "branched into".
> >> -? ? //
> >> -? ? // If MigrateTrue is true, then TrueBB is the block being
> > "branched into"
> >> -? ? // and if MigrateFalse is true, then FalseBB is the block being
> >> -? ? // "branched into"
> >> -? ? //
> >> -? ? // Here is the pseudo code for how I think the optimization should
> > work:
> >> -? ? // 1. Insert MOV GPR0, 0 before the branch instruction in
> > diamond_head.
> >> -? ? // 2. Insert MOV GPR0, 1 before the branch instruction in branch_from.
> >> -? ? // 3. Move the branch instruction from diamond_head into its own basic
> >> -? ? //? ? block (new_block).
> >> -? ? // 4. Add an unconditional branch from diamond_head to new_block
> >> -? ? // 5. Replace the branch instruction in branch_from with an
> > unconditional
> >> -? ? //? ? branch to new_block.? If branch_from has multiple predecessors,
> > then
> >> -? ? //? ? we need to replace the True/False block in the branch
> >> -? ? //? ? instruction instead of replacing it.
> >> -? ? // 6. Change the condition of the branch instruction in new_block from
> >> -? ? //? ? COND to (COND || GPR0)
> >> -? ? //
> >> -? ? // In order insert these MOV instruction, we will need to use the
> >> -? ? // RegisterScavenger.? Usually liveness stops being tracked during
> >> -? ? // the late machine optimization passes, however if we implement
> >> -? ? // bool TargetRegisterInfo::requiresRegisterScavenging(
> >> -? ? //? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? ? const
> > MachineFunction &MF)
> >> -? ? // and have it return true, liveness will be tracked correctly
> >> -? ? // by generic optimization passes.? We will also need to make sure
> > that
> >> -? ? // all of our target-specific passes that run after regalloc and
> > before
> >> -? ? // the CFGStructurizer track liveness and we will need to modify this
> > pass
> >> -? ? // to correctly track liveness.
> >> -? ? //
> >> -? ? // After the above changes, the new CFG should look like this:
> >> -? ? //? ? ? ? ? ? ? ? ? ? ? ? entry
> >> -? ? //? ? ? ? ? ? ? ? ? ? ? /? ? |
> >> -? ? //? ? ? ? ? diamond_head? ? ? branch_from
> >> -? ? //? ? ? ? ? ? ? ? ? ? ? \? ? /
> >> -? ? //? ? ? ? ? ? ? ? ? ? ? new_block
> >> -? ? //? ? ? ? ? ? ? ? ? ? ? /? ? ? |
> >> -? ? //? ? ? ? diamond_false? ? ? ? diamond_true
> >> -? ? //? ? ? ? ? ? ? ? ? ? ? \? ? ? /
> >> -? ? //? ? ? ? ? ? ? ? ? ? ? ? done
> >> -? ? //
> >> -? ? // Without this optimization, we are forced to duplicate the
> > diamond_true
> >> -? ? // block and we will end up with a CFG like this:
> >> -? ? //
> >> -? ? //? ? ? ? ? ? ? ? ? ? ? ? entry
> >> -? ? //? ? ? ? ? ? ? ? ? ? ? /? ? |
> >> -? ? //? ? ? ? ? diamond_head? ? ? branch_from
> >> -? ? //? ? ? ? ? ? /? ? ? \? ? ? ? ? ? ? ? ? |
> >> -? ? // diamond_false? ? ? ? diamond_true? ? ? diamond_true (duplicate)
> >> -? ? //? ? ? ? ? ? \? ? ? /? ? ? ? ? ? ? ? ? |
> >> -? ? //? ? ? ? ? ? ? done --------------------|
> >> -? ? //
> >> -? ? // Duplicating diamond_true can be very costly especially if it has a
> >> -? ? // lot of instructions.
> >> -? ? return 0;
> >> -? }
> >> -
> >> -? int NumNewBlk = 0;
> >> -
> >> -? bool LandBlkHasOtherPred = (LandBlk->pred_size() > 2);
> >> -
> >> -? //insert AMDGPU::ENDIF to avoid special case "input landBlk ==
> > NULL"
> >> -? MachineBasicBlock::iterator I = insertInstrBefore(LandBlk,
> > AMDGPU::ENDIF);
> >> -
> >> -? if (LandBlkHasOtherPred) {
> >> -? ? llvm_unreachable("Extra register needed to handle CFG");
> >> -? ? unsigned CmpResReg =
> >> -? ? ?
> > HeadMBB->getParent()->getRegInfo().createVirtualRegister(I32RC);
> >> -? ? llvm_unreachable("Extra compare instruction needed to handle
> > CFG");
> >> -? ? insertCondBranchBefore(LandBlk, I, AMDGPU::IF_PREDICATE_SET,
> >> -? ? ? ? CmpResReg, DebugLoc());
> >> -? }
> >> -
> >> -? // XXX: We are running this after RA, so creating virtual registers will
> >> -? // cause an assertion failure in the PostRA scheduling pass.
> >> -? unsigned InitReg =
> >> -? ? HeadMBB->getParent()->getRegInfo().createVirtualRegister(I32RC);
> >> -? insertCondBranchBefore(LandBlk, I, AMDGPU::IF_PREDICATE_SET, InitReg,
> >> -? ? ? DebugLoc());
> >> -
> >> -? if (MigrateTrue) {
> >> -? ? migrateInstruction(TrueMBB, LandBlk, I);
> >> -? ? // need to uncondionally insert the assignment to ensure a path from
> > its
> >> -? ? // predecessor rather than headBlk has valid value in initReg if
> >> -? ? // (initVal != 1).
> >> -? ? llvm_unreachable("Extra register needed to handle CFG");
> >> -? }
> >> -? insertInstrBefore(I, AMDGPU::ELSE);
> >> -
> >> -? if (MigrateFalse) {
> >> -? ? migrateInstruction(FalseMBB, LandBlk, I);
> >> -? ? // need to uncondionally insert the assignment to ensure a path from
> > its
> >> -? ? // predecessor rather than headBlk has valid value in initReg if
> >> -? ? // (initVal != 0)
> >> -? ? llvm_unreachable("Extra register needed to handle CFG");
> >> -? }
> >> -
> >> -? if (LandBlkHasOtherPred) {
> >> -? ? // add endif
> >> -? ? insertInstrBefore(I, AMDGPU::ENDIF);
> >> -
> >> -? ? // put initReg = 2 to other predecessors of landBlk
> >> -? ? for (MachineBasicBlock::pred_iterator PI = LandBlk->pred_begin(),
> >> -? ? ? ? PE = LandBlk->pred_end(); PI != PE; ++PI) {
> >> -? ? ? MachineBasicBlock *MBB = *PI;
> >> -? ? ? if (MBB != TrueMBB && MBB != FalseMBB)
> >> -? ? ? ? llvm_unreachable("Extra register needed to handle CFG");
> >> -? ? }
> >> -? }
> >> -? DEBUG(
> >> -? ? dbgs() << "result from improveSimpleJumpintoIf: ";
> >> -? ? showImproveSimpleJumpintoIf(HeadMBB, TrueMBB, FalseMBB, LandBlk, 0);
> >> -? );
> >> -
> >> -? // update landBlk
> >> -? *LandMBBPtr = LandBlk;
> >> -
> >> -? return NumNewBlk;
> >> -}
> >> -
> >> ? void AMDGPUCFGStructurizer::handleLoopcontBlock(MachineBasicBlock
> > *ContingMBB,
> >> ? ? ? MachineLoop *ContingLoop, MachineBasicBlock *ContMBB,
> >> ? ? ? MachineLoop *ContLoop) {
> >> @@ -1637,24 +1253,6 @@
> >> ? ? }
> >> ? }
> >> ?
> >> -int AMDGPUCFGStructurizer::cloneOnSideEntryTo(MachineBasicBlock *PreMBB,
> >> -? ? MachineBasicBlock *SrcMBB, MachineBasicBlock *DstMBB) {
> >> -? int Cloned = 0;
> >> -? assert(PreMBB->isSuccessor(SrcMBB));
> >> -? while (SrcMBB && SrcMBB != DstMBB) {
> >> -? ? assert(SrcMBB->succ_size() == 1);
> >> -? ? if (SrcMBB->pred_size() > 1) {
> >> -? ? ? SrcMBB = cloneBlockForPredecessor(SrcMBB, PreMBB);
> >> -? ? ? ++Cloned;
> >> -? ? }
> >> -
> >> -? ? PreMBB = SrcMBB;
> >> -? ? SrcMBB = *SrcMBB->succ_begin();
> >> -? }
> >> -
> >> -? return Cloned;
> >> -}
> >> -
> >> ? MachineBasicBlock *
> >> ? AMDGPUCFGStructurizer::cloneBlockForPredecessor(MachineBasicBlock *MBB,
> >> ? ? ? MachineBasicBlock *PredMBB) {
> >> @@ -1683,37 +1281,6 @@
> >> ? ? return CloneMBB;
> >> ? }
> >> ?
> >> -void AMDGPUCFGStructurizer::migrateInstruction(MachineBasicBlock *SrcMBB,
> >> -? ? MachineBasicBlock *DstMBB, MachineBasicBlock::iterator I) {
> >> -? MachineBasicBlock::iterator SpliceEnd;
> >> -? //look for the input branchinstr, not the AMDGPU branchinstr
> >> -? MachineInstr *BranchMI = getNormalBlockBranchInstr(SrcMBB);
> >> -? if (!BranchMI) {
> >> -? ? DEBUG(
> >> -? ? ? dbgs() << "migrateInstruction don't see branch
> > instr\n" ;
> >> -? ? );
> >> -? ? SpliceEnd = SrcMBB->end();
> >> -? } else {
> >> -? ? DEBUG(
> >> -? ? ? dbgs() << "migrateInstruction see branch
> > instr\n" ;
> >> -? ? ? BranchMI->dump();
> >> -? ? );
> >> -? ? SpliceEnd = BranchMI;
> >> -? }
> >> -? DEBUG(
> >> -? ? dbgs() << "migrateInstruction before splice dstSize = " << DstMBB->size()
> >> -? ? ? << "srcSize = " << SrcMBB->size() << "\n";
> >> -? );
> >> -
> >> -? //splice insert before insertPos
> >> -? DstMBB->splice(I, SrcMBB, SrcMBB->begin(), SpliceEnd);
> >> -
> >> -? DEBUG(
> >> -? ? dbgs() << "migrateInstruction after splice dstSize = " << DstMBB->size()
> >> -? ? ? << "srcSize = " << SrcMBB->size() << "\n";
> >> -? );
> >> -}
> >> -
> >> ? MachineBasicBlock *
> >> ? AMDGPUCFGStructurizer::normalizeInfiniteLoopExit(MachineLoop* LoopRep) {
> >> ? ? MachineBasicBlock *LoopHeader = LoopRep->getHeader();
> >> @@ -1839,60 +1406,6 @@
> >> ? ? );
> >> ? }
> >> ?
> >> -MachineBasicBlock *
> >> -AMDGPUCFGStructurizer::findNearestCommonPostDom(MachineBasicBlock *MBB1,
> >> -? ? MachineBasicBlock *MBB2) {
> >> -
> >> -? if (PDT->dominates(MBB1, MBB2))
> >> -? ? return MBB1;
> >> -? if (PDT->dominates(MBB2, MBB1))
> >> -? ? return MBB2;
> >> -
> >> -? MachineDomTreeNode *Node1 = PDT->getNode(MBB1);
> >> -? MachineDomTreeNode *Node2 = PDT->getNode(MBB2);
> >> -
> >> -? // Handle newly cloned node.
> >> -? if (!Node1 && MBB1->succ_size() == 1)
> >> -? ? return findNearestCommonPostDom(*MBB1->succ_begin(), MBB2);
> >> -? if (!Node2 && MBB2->succ_size() == 1)
> >> -? ? return findNearestCommonPostDom(MBB1, *MBB2->succ_begin());
> >> -
> >> -? if (!Node1 || !Node2)
> >> -? ? return NULL;
> >> -
> >> -? Node1 = Node1->getIDom();
> >> -? while (Node1) {
> >> -? ? if (PDT->dominates(Node1, Node2))
> >> -? ? ? return Node1->getBlock();
> >> -? ? Node1 = Node1->getIDom();
> >> -? }
> >> -
> >> -? return NULL;
> >> -}
> >> -
> >> -MachineBasicBlock *
> >> -AMDGPUCFGStructurizer::findNearestCommonPostDom(
> >> -? ? std::set<MachineBasicBlock *> &MBBs) {
> >> -? MachineBasicBlock *CommonDom;
> >> -? std::set<MachineBasicBlock *>::const_iterator It = MBBs.begin();
> >> -? std::set<MachineBasicBlock *>::const_iterator E = MBBs.end();
> >> -? for (CommonDom = *It; It != E && CommonDom; ++It) {
> >> -? ? MachineBasicBlock *MBB = *It;
> >> -? ? if (MBB != CommonDom)
> >> -? ? ? CommonDom = findNearestCommonPostDom(MBB, CommonDom);
> >> -? }
> >> -
> >> -? DEBUG(
> >> -? ? dbgs() << "Common post dominator for exit blocks is ";
> >> -? ? if (CommonDom)
> >> -? ? ? ? ? dbgs() << "BB" << CommonDom->getNumber() << "\n";
> >> -? ? else
> >> -? ? ? dbgs() << "NULL\n";
> >> -? );
> >> -
> >> -? return CommonDom;
> >> -}
> >> -
> >> ? char AMDGPUCFGStructurizer::ID = 0;
> >> ?
> >> ? } // end anonymous namespace
> >
> >> _______________________________________________
> >> llvm-commits mailing list
> >> llvm-commits at cs.uiuc.edu
> >> http://lists.cs.uiuc.edu/mailman/listinfo/llvm-commits
> >
-------------- next part --------------
; ModuleID = 'radeon'
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-v16:16:16-v24:32:32-v32:32:32-v48:64:64-v64:64:64-v96:128:128-v128:128:128-v192:256:256-v256:256:256-v512:512:512-v1024:1024:1024-v2048:2048:2048-n32:64"
target triple = "r600--"
; Function Attrs: nounwind
; @resizeLN_C1_D0 (listed in !opencl.kernels below).
; Each work-item handles four consecutive destination x positions on one
; destination row. For every lane it derives a source (x, y) position from
; %ifx / %ify, loads four neighbouring bytes from %src, blends them with
; integer weights scaled by 2048 per axis, clamps the result to [0, 255] and
; stores up to four bytes to %dst.
; Control flow of interest: %if.then stores all four bytes at once and jumps
; straight to %if.end237, bypassing the per-lane store chain
; %if.end -> %if.end204 -> %if.end220.
define void @resizeLN_C1_D0(i8 addrspace(1)* nocapture %dst, i8 addrspace(1)* noalias nocapture readonly %src, i32 %dstoffset_in_pixel, i32 %srcoffset_in_pixel, i32 %dststep_in_pixel, i32 %srcstep_in_pixel, i32 %src_cols, i32 %src_rows, i32 %dst_cols, i32 %dst_rows, float %ifx, float %ify) #0 {
entry:
; Global work-item ids: %add.i = tgid.x * local_size.x + tidig.x,
; %add.i454 is the same computation for y.
%x.i.i = tail call i32 @llvm.r600.read.tgid.x() #1
%x.i12.i = tail call i32 @llvm.r600.read.local.size.x() #1
%mul26.i = mul i32 %x.i12.i, %x.i.i
%x.i4.i = tail call i32 @llvm.r600.read.tidig.x() #1
%add.i = add i32 %x.i4.i, %mul26.i
%y.i.i = tail call i32 @llvm.r600.read.tgid.y() #1
%y.i14.i = tail call i32 @llvm.r600.read.local.size.y() #1
%mul30.i = mul i32 %y.i14.i, %y.i.i
%y.i6.i = tail call i32 @llvm.r600.read.tidig.y() #1
%add.i454 = add i32 %y.i6.i, %mul30.i
; Four destination x coordinates: %sub = 4 * x - (dstoffset_in_pixel & 3),
; followed by %sub + 1, + 2 and + 3.
%shl = shl i32 %add.i, 2
%and = and i32 %dstoffset_in_pixel, 3
%sub = sub nsw i32 %shl, %and
%add = add nsw i32 %sub, 1
%add3 = add nsw i32 %sub, 2
%add5 = add nsw i32 %sub, 3
; Convert the four x coordinates to float and pack them into one <4 x float>.
%conv.i.i.i449 = sitofp i32 %sub to float
%0 = insertelement <4 x float> undef, float %conv.i.i.i449, i32 0
%conv.i4.i.i450 = sitofp i32 %add to float
%1 = insertelement <4 x float> %0, float %conv.i4.i.i450, i32 1
%conv.i.i5.i451 = sitofp i32 %add3 to float
%2 = insertelement <4 x float> undef, float %conv.i.i5.i451, i32 0
%conv.i4.i7.i452 = sitofp i32 %add5 to float
%3 = insertelement <4 x float> %2, float %conv.i4.i7.i452, i32 1
%vecinit3.i453 = shufflevector <4 x float> %1, <4 x float> %3, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
; Source x position per lane: %4 = (dx + 0.5) * ifx - 0.5; %call9 = floor(%4).
%add8 = fadd <4 x float> %vecinit3.i453, <float 5.000000e-01, float 5.000000e-01, float 5.000000e-01, float 5.000000e-01>
%splat.splatinsert = insertelement <4 x float> undef, float %ifx, i32 0
%splat.splat = shufflevector <4 x float> %splat.splatinsert, <4 x float> undef, <4 x i32> zeroinitializer
%4 = tail call <4 x float> @llvm.fmuladd.v4f32(<4 x float> %add8, <4 x float> %splat.splat, <4 x float> <float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01>)
%call9 = tail call <4 x float> @llvm.floor.v4f32(<4 x float> %4)
; Lane-by-lane fptosi of the floored x positions, repacked as <4 x i32>.
%5 = extractelement <4 x float> %call9, i32 0
%conv.i.i.i444 = fptosi float %5 to i32
%6 = insertelement <4 x i32> undef, i32 %conv.i.i.i444, i32 0
%7 = extractelement <4 x float> %call9, i32 1
%conv.i4.i.i445 = fptosi float %7 to i32
%8 = insertelement <4 x i32> %6, i32 %conv.i4.i.i445, i32 1
%9 = extractelement <4 x float> %call9, i32 2
%conv.i.i5.i446 = fptosi float %9 to i32
%10 = insertelement <4 x i32> undef, i32 %conv.i.i5.i446, i32 0
%11 = extractelement <4 x float> %call9, i32 3
%conv.i4.i7.i447 = fptosi float %11 to i32
%12 = insertelement <4 x i32> %10, i32 %conv.i4.i7.i447, i32 1
%vecinit3.i448 = shufflevector <4 x i32> %8, <4 x i32> %12, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
; Fractional part of the source x position.
%sub11 = fsub <4 x float> %4, %call9
; Source y position: %13 = (dy + 0.5) * ify - 0.5; %conv14 = (i32) floor(%13).
%conv = sitofp i32 %add.i454 to float
%add12 = fadd float %conv, 5.000000e-01
%13 = tail call float @llvm.fmuladd.f32(float %add12, float %ify, float -5.000000e-01)
%call13 = tail call float @llvm.floor.f32(float %13)
%conv14 = fptosi float %call13 to i32
; %14 is all-ones in lanes whose source x is non-negative and zero otherwise;
; and-ing with it zeroes both the fraction (%16) and the index (%18) of
; negative lanes.
%call10.lobit = ashr <4 x i32> %vecinit3.i448, <i32 31, i32 31, i32 31, i32 31>
%14 = xor <4 x i32> %call10.lobit, <i32 -1, i32 -1, i32 -1, i32 -1>
%15 = bitcast <4 x float> %sub11 to <4 x i32>
%16 = and <4 x i32> %15, %14
; %cmp20: unclamped source x >= src_cols (used later to zero the x weight).
%splat.splatinsert18 = insertelement <4 x i32> undef, i32 %src_cols, i32 0
%splat.splat19 = shufflevector <4 x i32> %splat.splatinsert18, <4 x i32> undef, <4 x i32> zeroinitializer
%cmp20 = icmp sge <4 x i32> %vecinit3.i448, %splat.splat19
%17 = bitcast <4 x i32> %16 to <4 x float>
%18 = and <4 x i32> %vecinit3.i448, %14
; %cond36: source x with lanes >= src_cols replaced by src_cols - 1.
%cmp30 = icmp sge <4 x i32> %18, %splat.splat19
%sub32 = add nsw i32 %src_cols, -1
%splat.splatinsert33 = insertelement <4 x i32> undef, i32 %sub32, i32 0
%splat.splat34 = shufflevector <4 x i32> %splat.splatinsert33, <4 x i32> undef, <4 x i32> zeroinitializer
%cond36 = select <4 x i1> %cmp30, <4 x i32> %splat.splat34, <4 x i32> %18
; A negative source y skips %cond.false; the phis in %cond.end then pick
; v = 0.0 and y = 0.
%cmp37 = icmp slt i32 %conv14, 0
br i1 %cmp37, label %cond.end, label %cond.false
cond.false: ; preds = %entry
; Fractional part of the source y position, scaled by 2048.
%conv15 = sitofp i32 %conv14 to float
%sub16 = fsub float %13, %conv15
%phitmp = fmul float %sub16, 2.048000e+03
br label %cond.end
cond.end: ; preds = %cond.false, %entry
%v.0 = phi float [ %phitmp, %cond.false ], [ 0.000000e+00, %entry ]
%y.0 = phi i32 [ %conv14, %cond.false ], [ 0, %entry ]
; %y.1: source y, replaced by src_rows - 1 when it is not below src_rows.
%cmp41 = icmp slt i32 %y.0, %src_rows
%sub44 = add nsw i32 %src_rows, -1
%y.1 = select i1 %cmp41, i32 %y.0, i32 %sub44
; Horizontal weights: %mul = frac_x * 2048 (0 where %cmp20), %sub49 = 2048 - %mul.
%.op = fmul <4 x float> %17, <float 2.048000e+03, float 2.048000e+03, float 2.048000e+03, float 2.048000e+03>
%mul = select <4 x i1> %cmp20, <4 x float> zeroinitializer, <4 x float> %.op
%sub49 = fsub <4 x float> <float 2.048000e+03, float 2.048000e+03, float 2.048000e+03, float 2.048000e+03>, %mul
; %vecinit3.i443 = (i32) rint(%mul), converted lane by lane.
%call50 = tail call <4 x float> @llvm.rint.v4f32(<4 x float> %mul)
%19 = extractelement <4 x float> %call50, i32 0
%conv.i.i.i439 = fptosi float %19 to i32
%20 = insertelement <4 x i32> undef, i32 %conv.i.i.i439, i32 0
%21 = extractelement <4 x float> %call50, i32 1
%conv.i4.i.i440 = fptosi float %21 to i32
%22 = insertelement <4 x i32> %20, i32 %conv.i4.i.i440, i32 1
%23 = extractelement <4 x float> %call50, i32 2
%conv.i.i5.i441 = fptosi float %23 to i32
%24 = insertelement <4 x i32> undef, i32 %conv.i.i5.i441, i32 0
%25 = extractelement <4 x float> %call50, i32 3
%conv.i4.i7.i442 = fptosi float %25 to i32
%26 = insertelement <4 x i32> %24, i32 %conv.i4.i7.i442, i32 1
%vecinit3.i443 = shufflevector <4 x i32> %22, <4 x i32> %26, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
; %vecinit3.i = (i32) rint(%sub49), converted lane by lane.
%call52 = tail call <4 x float> @llvm.rint.v4f32(<4 x float> %sub49)
%27 = extractelement <4 x float> %call52, i32 0
%conv.i.i.i = fptosi float %27 to i32
%28 = insertelement <4 x i32> undef, i32 %conv.i.i.i, i32 0
%29 = extractelement <4 x float> %call52, i32 1
%conv.i4.i.i = fptosi float %29 to i32
%30 = insertelement <4 x i32> %28, i32 %conv.i4.i.i, i32 1
%31 = extractelement <4 x float> %call52, i32 2
%conv.i.i5.i = fptosi float %31 to i32
%32 = insertelement <4 x i32> undef, i32 %conv.i.i5.i, i32 0
%33 = extractelement <4 x float> %call52, i32 3
%conv.i4.i7.i = fptosi float %33 to i32
%34 = insertelement <4 x i32> %32, i32 %conv.i4.i7.i, i32 1
%vecinit3.i = shufflevector <4 x i32> %30, <4 x i32> %34, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
; Vertical weights: %conv56 = (i32) rint(v), %conv59 = (i32) rint(2048 - v),
; with v forced to 0 when y was not below src_rows.
%mul54 = select i1 %cmp41, float %v.0, float 0.000000e+00
%call55 = tail call float @llvm.rint.f32(float %mul54)
%conv56 = fptosi float %call55 to i32
%sub57 = fsub float 2.048000e+03, %mul54
%call58 = tail call float @llvm.rint.f32(float %sub57)
%conv59 = fptosi float %call58 to i32
; Neighbour coordinates: %cond67 = y + 1 and %cond77 = x + 1, each kept at
; y / x when the increment would reach src_rows / src_cols.
%add60 = add nsw i32 %y.1, 1
%cmp61 = icmp sge i32 %add60, %src_rows
%cond67 = select i1 %cmp61, i32 %y.1, i32 %add60
%add68 = add <4 x i32> %cond36, <i32 1, i32 1, i32 1, i32 1>
%cmp71 = icmp sge <4 x i32> %add68, %splat.splat19
%sext72 = sext <4 x i1> %cmp71 to <4 x i32>
%cmp73 = icmp ne <4 x i32> %sext72, zeroinitializer
%cond77 = select <4 x i1> %cmp73, <4 x i32> %cond36, <4 x i32> %add68
; Source byte offsets for row y: row * srcstep_in_pixel + column +
; srcoffset_in_pixel. The shl 8 / ashr 8 pairs sign-extend the low 24 bits of
; each multiply operand (presumably an expanded 24-bit multiply - confirm).
%splat.splatinsert78 = insertelement <4 x i32> undef, i32 %y.1, i32 0
%splat.splat79 = shufflevector <4 x i32> %splat.splatinsert78, <4 x i32> undef, <4 x i32> zeroinitializer
%splat.splatinsert80 = insertelement <4 x i32> undef, i32 %srcstep_in_pixel, i32 0
%splat.splat81 = shufflevector <4 x i32> %splat.splatinsert80, <4 x i32> undef, <4 x i32> zeroinitializer
%splat.splatinsert82 = insertelement <4 x i32> undef, i32 %srcoffset_in_pixel, i32 0
%splat.splat83 = shufflevector <4 x i32> %splat.splatinsert82, <4 x i32> undef, <4 x i32> zeroinitializer
%add84 = add <4 x i32> %cond36, %splat.splat83
%shl.i.i433 = shl <4 x i32> %splat.splat79, <i32 8, i32 8, i32 8, i32 8>
%shr.i.i434 = ashr <4 x i32> %shl.i.i433, <i32 8, i32 8, i32 8, i32 8>
%shl1.i.i435 = shl <4 x i32> %splat.splat81, <i32 8, i32 8, i32 8, i32 8>
%shr2.i.i436 = ashr <4 x i32> %shl1.i.i435, <i32 8, i32 8, i32 8, i32 8>
%mul.i.i437 = mul <4 x i32> %shr.i.i434, %shr2.i.i436
%add.i438 = add <4 x i32> %mul.i.i437, %add84
%add92 = add <4 x i32> %cond77, %splat.splat83
%add.i432 = add <4 x i32> %mul.i.i437, %add92
; Same offsets for the neighbouring row %cond67.
%splat.splatinsert94 = insertelement <4 x i32> undef, i32 %cond67, i32 0
%splat.splat95 = shufflevector <4 x i32> %splat.splatinsert94, <4 x i32> undef, <4 x i32> zeroinitializer
%shl.i.i421 = shl <4 x i32> %splat.splat95, <i32 8, i32 8, i32 8, i32 8>
%shr.i.i422 = ashr <4 x i32> %shl.i.i421, <i32 8, i32 8, i32 8, i32 8>
%mul.i.i425 = mul <4 x i32> %shr.i.i422, %shr2.i.i436
%add.i426 = add <4 x i32> %mul.i.i425, %add84
%add.i420 = add <4 x i32> %mul.i.i425, %add92
; Load the four (row y, column x) source bytes, zero-extended, into %46.
%35 = extractelement <4 x i32> %add.i438, i32 0
%arrayidx = getelementptr inbounds i8 addrspace(1)* %src, i32 %35
%36 = load i8 addrspace(1)* %arrayidx, align 1, !tbaa !10
%conv110 = zext i8 %36 to i32
%37 = insertelement <4 x i32> undef, i32 %conv110, i32 0
%38 = extractelement <4 x i32> %add.i438, i32 1
%arrayidx111 = getelementptr inbounds i8 addrspace(1)* %src, i32 %38
%39 = load i8 addrspace(1)* %arrayidx111, align 1, !tbaa !10
%conv112 = zext i8 %39 to i32
%40 = insertelement <4 x i32> %37, i32 %conv112, i32 1
%41 = extractelement <4 x i32> %add.i438, i32 2
%arrayidx113 = getelementptr inbounds i8 addrspace(1)* %src, i32 %41
%42 = load i8 addrspace(1)* %arrayidx113, align 1, !tbaa !10
%conv114 = zext i8 %42 to i32
%43 = insertelement <4 x i32> %40, i32 %conv114, i32 2
%44 = extractelement <4 x i32> %add.i438, i32 3
%arrayidx115 = getelementptr inbounds i8 addrspace(1)* %src, i32 %44
%45 = load i8 addrspace(1)* %arrayidx115, align 1, !tbaa !10
%conv116 = zext i8 %45 to i32
%46 = insertelement <4 x i32> %43, i32 %conv116, i32 3
; Load the four (row y, column x + 1) source bytes into %58.
%47 = extractelement <4 x i32> %add.i432, i32 0
%arrayidx117 = getelementptr inbounds i8 addrspace(1)* %src, i32 %47
%48 = load i8 addrspace(1)* %arrayidx117, align 1, !tbaa !10
%conv118 = zext i8 %48 to i32
%49 = insertelement <4 x i32> undef, i32 %conv118, i32 0
%50 = extractelement <4 x i32> %add.i432, i32 1
%arrayidx119 = getelementptr inbounds i8 addrspace(1)* %src, i32 %50
%51 = load i8 addrspace(1)* %arrayidx119, align 1, !tbaa !10
%conv120 = zext i8 %51 to i32
%52 = insertelement <4 x i32> %49, i32 %conv120, i32 1
%53 = extractelement <4 x i32> %add.i432, i32 2
%arrayidx121 = getelementptr inbounds i8 addrspace(1)* %src, i32 %53
%54 = load i8 addrspace(1)* %arrayidx121, align 1, !tbaa !10
%conv122 = zext i8 %54 to i32
%55 = insertelement <4 x i32> %52, i32 %conv122, i32 2
%56 = extractelement <4 x i32> %add.i432, i32 3
%arrayidx123 = getelementptr inbounds i8 addrspace(1)* %src, i32 %56
%57 = load i8 addrspace(1)* %arrayidx123, align 1, !tbaa !10
%conv124 = zext i8 %57 to i32
%58 = insertelement <4 x i32> %55, i32 %conv124, i32 3
; Load the four (row y + 1, column x) source bytes into %70.
%59 = extractelement <4 x i32> %add.i426, i32 0
%arrayidx125 = getelementptr inbounds i8 addrspace(1)* %src, i32 %59
%60 = load i8 addrspace(1)* %arrayidx125, align 1, !tbaa !10
%conv126 = zext i8 %60 to i32
%61 = insertelement <4 x i32> undef, i32 %conv126, i32 0
%62 = extractelement <4 x i32> %add.i426, i32 1
%arrayidx127 = getelementptr inbounds i8 addrspace(1)* %src, i32 %62
%63 = load i8 addrspace(1)* %arrayidx127, align 1, !tbaa !10
%conv128 = zext i8 %63 to i32
%64 = insertelement <4 x i32> %61, i32 %conv128, i32 1
%65 = extractelement <4 x i32> %add.i426, i32 2
%arrayidx129 = getelementptr inbounds i8 addrspace(1)* %src, i32 %65
%66 = load i8 addrspace(1)* %arrayidx129, align 1, !tbaa !10
%conv130 = zext i8 %66 to i32
%67 = insertelement <4 x i32> %64, i32 %conv130, i32 2
%68 = extractelement <4 x i32> %add.i426, i32 3
%arrayidx131 = getelementptr inbounds i8 addrspace(1)* %src, i32 %68
%69 = load i8 addrspace(1)* %arrayidx131, align 1, !tbaa !10
%conv132 = zext i8 %69 to i32
%70 = insertelement <4 x i32> %67, i32 %conv132, i32 3
; Load the four (row y + 1, column x + 1) source bytes into %82.
%71 = extractelement <4 x i32> %add.i420, i32 0
%arrayidx133 = getelementptr inbounds i8 addrspace(1)* %src, i32 %71
%72 = load i8 addrspace(1)* %arrayidx133, align 1, !tbaa !10
%conv134 = zext i8 %72 to i32
%73 = insertelement <4 x i32> undef, i32 %conv134, i32 0
%74 = extractelement <4 x i32> %add.i420, i32 1
%arrayidx135 = getelementptr inbounds i8 addrspace(1)* %src, i32 %74
%75 = load i8 addrspace(1)* %arrayidx135, align 1, !tbaa !10
%conv136 = zext i8 %75 to i32
%76 = insertelement <4 x i32> %73, i32 %conv136, i32 1
%77 = extractelement <4 x i32> %add.i420, i32 2
%arrayidx137 = getelementptr inbounds i8 addrspace(1)* %src, i32 %77
%78 = load i8 addrspace(1)* %arrayidx137, align 1, !tbaa !10
%conv138 = zext i8 %78 to i32
%79 = insertelement <4 x i32> %76, i32 %conv138, i32 2
%80 = extractelement <4 x i32> %add.i420, i32 3
%arrayidx139 = getelementptr inbounds i8 addrspace(1)* %src, i32 %80
%81 = load i8 addrspace(1)* %arrayidx139, align 1, !tbaa !10
%conv140 = zext i8 %81 to i32
%82 = insertelement <4 x i32> %79, i32 %conv140, i32 3
; Horizontal blend of row y: %add143 = rint(2048 - wx) * %46 + rint(wx) * %58.
%shl.i410 = shl <4 x i32> %vecinit3.i, <i32 8, i32 8, i32 8, i32 8>
%shr.i411 = ashr <4 x i32> %shl.i410, <i32 8, i32 8, i32 8, i32 8>
%shl1.i412 = shl <4 x i32> %46, <i32 8, i32 8, i32 8, i32 8>
%shr2.i413 = ashr <4 x i32> %shl1.i412, <i32 8, i32 8, i32 8, i32 8>
%mul.i414 = mul <4 x i32> %shr.i411, %shr2.i413
%shl.i405 = shl <4 x i32> %vecinit3.i443, <i32 8, i32 8, i32 8, i32 8>
%shr.i406 = ashr <4 x i32> %shl.i405, <i32 8, i32 8, i32 8, i32 8>
%shl1.i407 = shl <4 x i32> %58, <i32 8, i32 8, i32 8, i32 8>
%shr2.i408 = ashr <4 x i32> %shl1.i407, <i32 8, i32 8, i32 8, i32 8>
%mul.i409 = mul <4 x i32> %shr.i406, %shr2.i408
%add143 = add <4 x i32> %mul.i414, %mul.i409
; Horizontal blend of row y + 1 with the same weights: %add146.
%shl1.i402 = shl <4 x i32> %70, <i32 8, i32 8, i32 8, i32 8>
%shr2.i403 = ashr <4 x i32> %shl1.i402, <i32 8, i32 8, i32 8, i32 8>
%mul.i404 = mul <4 x i32> %shr.i411, %shr2.i403
%shl1.i397 = shl <4 x i32> %82, <i32 8, i32 8, i32 8, i32 8>
%shr2.i398 = ashr <4 x i32> %shl1.i397, <i32 8, i32 8, i32 8, i32 8>
%mul.i399 = mul <4 x i32> %shr.i406, %shr2.i398
%add146 = add <4 x i32> %mul.i404, %mul.i399
; Vertical blend: %conv59 * %add143 + %conv56 * %add146 (operands again
; sign-extended from 24 bits before each multiply).
%splat.splatinsert147 = insertelement <4 x i32> undef, i32 %conv59, i32 0
%splat.splat148 = shufflevector <4 x i32> %splat.splatinsert147, <4 x i32> undef, <4 x i32> zeroinitializer
%shl.i390 = shl <4 x i32> %splat.splat148, <i32 8, i32 8, i32 8, i32 8>
%shr.i391 = ashr <4 x i32> %shl.i390, <i32 8, i32 8, i32 8, i32 8>
%shl1.i392 = shl <4 x i32> %add143, <i32 8, i32 8, i32 8, i32 8>
%shr2.i393 = ashr <4 x i32> %shl1.i392, <i32 8, i32 8, i32 8, i32 8>
%mul.i394 = mul <4 x i32> %shr.i391, %shr2.i393
%splat.splatinsert150 = insertelement <4 x i32> undef, i32 %conv56, i32 0
%splat.splat151 = shufflevector <4 x i32> %splat.splatinsert150, <4 x i32> undef, <4 x i32> zeroinitializer
%shl.i = shl <4 x i32> %splat.splat151, <i32 8, i32 8, i32 8, i32 8>
%shr.i = ashr <4 x i32> %shl.i, <i32 8, i32 8, i32 8, i32 8>
%shl1.i = shl <4 x i32> %add146, <i32 8, i32 8, i32 8, i32 8>
%shr2.i = ashr <4 x i32> %shl1.i, <i32 8, i32 8, i32 8, i32 8>
%mul.i = mul <4 x i32> %shr.i, %shr2.i
%add153 = add <4 x i32> %mul.i394, %mul.i
; Add 2^21 and arithmetic-shift right by 22: rounded division by 2048 * 2048.
%add154 = add <4 x i32> %add153, <i32 2097152, i32 2097152, i32 2097152, i32 2097152>
%shr = ashr <4 x i32> %add154, <i32 22, i32 22, i32 22, i32 22>
; Destination byte offsets: %add.i389 = %sub + dstoffset_in_pixel +
; y * dststep_in_pixel, then + 1, + 2, + 3 for the remaining lanes.
%add155 = add nsw i32 %sub, %dstoffset_in_pixel
%shl.i.i = shl i32 %add.i454, 8
%shr.i.i = ashr exact i32 %shl.i.i, 8
%shl1.i.i = shl i32 %dststep_in_pixel, 8
%shr2.i.i = ashr exact i32 %shl1.i.i, 8
%mul.i.i = mul nsw i32 %shr.i.i, %shr2.i.i
%add.i389 = add nsw i32 %add155, %mul.i.i
%inc = add nsw i32 %add.i389, 1
%add159 = add nsw i32 %add.i389, 2
%add160 = add nsw i32 %add.i389, 3
; Clamp every lane to [0, 255], truncate to i8 and pack as <4 x i8>.
%cmp.i.i = icmp sgt <4 x i32> %shr, <i32 255, i32 255, i32 255, i32 255>
%cmp5.i.i = icmp slt <4 x i32> %shr, zeroinitializer
%cond.i.i = select <4 x i1> %cmp5.i.i, <4 x i32> zeroinitializer, <4 x i32> %shr
%cond11.i.i = select <4 x i1> %cmp.i.i, <4 x i32> <i32 255, i32 255, i32 255, i32 255>, <4 x i32> %cond.i.i
%83 = extractelement <4 x i32> %cond11.i.i, i32 0
%conv.i.i.i.i = trunc i32 %83 to i8
%84 = insertelement <4 x i8> undef, i8 %conv.i.i.i.i, i32 0
%85 = extractelement <4 x i32> %cond11.i.i, i32 1
%conv.i4.i.i.i = trunc i32 %85 to i8
%86 = insertelement <4 x i8> %84, i8 %conv.i4.i.i.i, i32 1
%87 = extractelement <4 x i32> %cond11.i.i, i32 2
%conv.i.i5.i.i = trunc i32 %87 to i8
%88 = insertelement <4 x i8> undef, i8 %conv.i.i5.i.i, i32 0
%89 = extractelement <4 x i32> %cond11.i.i, i32 3
%conv.i4.i7.i.i = trunc i32 %89 to i8
%90 = insertelement <4 x i8> %88, i8 %conv.i4.i7.i.i, i32 1
%vecinit3.i.i = shufflevector <4 x i8> %86, <4 x i8> %90, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
; Lane 0 is only considered when its destination x (%sub) is >= 0.
%cmp162 = icmp sgt i32 %sub, -1
br i1 %cmp162, label %land.lhs.true, label %if.end
land.lhs.true: ; preds = %cond.end
; Vector store path: last lane (%add5) < dst_cols, 0 <= y < dst_rows and
; (dstoffset_in_pixel & 3) == 0.
%cmp165 = icmp slt i32 %add5, %dst_cols
%cmp168 = icmp sgt i32 %add.i454, -1
%or.cond = and i1 %cmp165, %cmp168
%cmp171 = icmp slt i32 %add.i454, %dst_rows
%or.cond382 = and i1 %or.cond, %cmp171
%cmp174 = icmp eq i32 %and, 0
%or.cond388 = and i1 %or.cond382, %cmp174
br i1 %or.cond388, label %if.then, label %land.lhs.true178
if.then: ; preds = %land.lhs.true
; Store all four bytes with one <4 x i8> store, then branch directly to the
; return block, skipping the per-lane store chain below.
%add.ptr = getelementptr inbounds i8 addrspace(1)* %dst, i32 %add.i389
%91 = bitcast i8 addrspace(1)* %add.ptr to <4 x i8> addrspace(1)*
store <4 x i8> %vecinit3.i.i, <4 x i8> addrspace(1)* %91, align 4, !tbaa !10
br label %if.end237
land.lhs.true178: ; preds = %land.lhs.true
; Scalar path, lane 0: store only if %sub < dst_cols and 0 <= y < dst_rows.
%cmp179 = icmp slt i32 %sub, %dst_cols
%or.cond238 = and i1 %cmp179, %cmp168
%or.cond384 = and i1 %or.cond238, %cmp171
br i1 %or.cond384, label %if.then187, label %if.end
if.then187: ; preds = %land.lhs.true178
%arrayidx188 = getelementptr inbounds i8 addrspace(1)* %dst, i32 %add.i389
store i8 %conv.i.i.i.i, i8 addrspace(1)* %arrayidx188, align 1, !tbaa !10
br label %if.end
if.end: ; preds = %if.then187, %land.lhs.true178, %cond.end
; Lane 1: %sub > -2 is the same test as %add (= %sub + 1) >= 0.
%cmp190 = icmp sgt i32 %sub, -2
br i1 %cmp190, label %land.lhs.true192, label %if.end204
land.lhs.true192: ; preds = %if.end
%cmp194 = icmp slt i32 %add, %dst_cols
%cmp197 = icmp sgt i32 %add.i454, -1
%or.cond239 = and i1 %cmp194, %cmp197
%cmp200 = icmp slt i32 %add.i454, %dst_rows
%or.cond385 = and i1 %or.cond239, %cmp200
br i1 %or.cond385, label %if.then202, label %if.end204
if.then202: ; preds = %land.lhs.true192
%arrayidx203 = getelementptr inbounds i8 addrspace(1)* %dst, i32 %inc
store i8 %conv.i4.i.i.i, i8 addrspace(1)* %arrayidx203, align 1, !tbaa !10
br label %if.end204
if.end204: ; preds = %if.then202, %land.lhs.true192, %if.end
; Lane 2: bounds-checked store of the third byte at %add159.
%cmp206 = icmp sgt i32 %add3, -1
br i1 %cmp206, label %land.lhs.true208, label %if.end220
land.lhs.true208: ; preds = %if.end204
%cmp210 = icmp slt i32 %add3, %dst_cols
%cmp213 = icmp sgt i32 %add.i454, -1
%or.cond240 = and i1 %cmp210, %cmp213
%cmp216 = icmp slt i32 %add.i454, %dst_rows
%or.cond386 = and i1 %or.cond240, %cmp216
br i1 %or.cond386, label %if.then218, label %if.end220
if.then218: ; preds = %land.lhs.true208
%arrayidx219 = getelementptr inbounds i8 addrspace(1)* %dst, i32 %add159
store i8 %conv.i.i5.i.i, i8 addrspace(1)* %arrayidx219, align 1, !tbaa !10
br label %if.end220
if.end220: ; preds = %if.then218, %land.lhs.true208, %if.end204
; Lane 3: bounds-checked store of the fourth byte at %add160.
%cmp222 = icmp sgt i32 %add5, -1
br i1 %cmp222, label %land.lhs.true224, label %if.end237
land.lhs.true224: ; preds = %if.end220
%cmp226 = icmp slt i32 %add5, %dst_cols
%cmp229 = icmp sgt i32 %add.i454, -1
%or.cond241 = and i1 %cmp226, %cmp229
%cmp232 = icmp slt i32 %add.i454, %dst_rows
%or.cond387 = and i1 %or.cond241, %cmp232
br i1 %or.cond387, label %if.then234, label %if.end237
if.then234: ; preds = %land.lhs.true224
%arrayidx235 = getelementptr inbounds i8 addrspace(1)* %dst, i32 %add160
store i8 %conv.i4.i7.i.i, i8 addrspace(1)* %arrayidx235, align 1, !tbaa !10
br label %if.end237
if.end237: ; preds = %if.then234, %land.lhs.true224, %if.end220, %if.then
; Common exit, reached from both the vector store path and the scalar chain.
ret void
}
; Intrinsic declarations called by @resizeLN_C1_D0: fused multiply-add, floor
; and rint in scalar and <4 x float> form, plus the r600 work-group id,
; local size and work-item id readers for x and y.
; Function Attrs: nounwind readnone
declare <4 x float> @llvm.fmuladd.v4f32(<4 x float>, <4 x float>, <4 x float>) #1
; Function Attrs: nounwind readonly
declare <4 x float> @llvm.floor.v4f32(<4 x float>) #2
; Function Attrs: nounwind readnone
declare float @llvm.fmuladd.f32(float, float, float) #1
; Function Attrs: nounwind readonly
declare float @llvm.floor.f32(float) #2
; Function Attrs: nounwind readonly
declare <4 x float> @llvm.rint.v4f32(<4 x float>) #2
; Function Attrs: nounwind readonly
declare float @llvm.rint.f32(float) #2
; Function Attrs: nounwind readnone
declare i32 @llvm.r600.read.tgid.x() #1
; Function Attrs: nounwind readnone
declare i32 @llvm.r600.read.tgid.y() #1
; Function Attrs: nounwind readnone
declare i32 @llvm.r600.read.local.size.x() #1
; Function Attrs: nounwind readnone
declare i32 @llvm.r600.read.local.size.y() #1
; Function Attrs: nounwind readnone
declare i32 @llvm.r600.read.tidig.x() #1
; Function Attrs: nounwind readnone
declare i32 @llvm.r600.read.tidig.y() #1
; Attribute groups: #0 is attached to the kernel definition, #1 and #2 to the
; intrinsic declarations and call sites above.
attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-realign-stack" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
attributes #1 = { nounwind readnone }
attributes #2 = { nounwind readonly }
; Kernel list: only !0 names a function (@resizeLN_C1_D0); !1 through !7 are
; null entries.
!opencl.kernels = !{!0, !1, !2, !3, !4, !5, !6, !7}
!llvm.ident = !{!8, !9, !9, !9, !9, !9, !9, !9, !9, !9, !9, !9, !9, !9, !9, !9, !9, !9, !9, !9, !9, !9, !9, !9, !9, !9, !9, !9, !9, !9, !9, !9, !9, !9, !9, !9, !9, !9, !9}
!0 = metadata !{void (i8 addrspace(1)*, i8 addrspace(1)*, i32, i32, i32, i32, i32, i32, i32, i32, float, float)* @resizeLN_C1_D0}
!1 = metadata !{null}
!2 = metadata !{null}
!3 = metadata !{null}
!4 = metadata !{null}
!5 = metadata !{null}
!6 = metadata !{null}
!7 = metadata !{null}
!8 = metadata !{metadata !"clang version 3.4 (trunk 194633) (llvm/trunk 195029)"}
!9 = metadata !{metadata !"clang version 3.4 (trunk 194830) (llvm/trunk 194831)"}
; TBAA nodes: !10 is the tag used by every load and store in the kernel.
!10 = metadata !{metadata !11, metadata !11, i64 0}
!11 = metadata !{metadata !"omnipotent char", metadata !12, i64 0}
!12 = metadata !{metadata !"Simple C/C++ TBAA"}
More information about the llvm-commits
mailing list