[llvm] r303111 - Re-submit AMDGPUMachineCFGStructurizer.

Jan Sjodin via llvm-commits llvm-commits at lists.llvm.org
Mon May 15 13:18:37 PDT 2017


Author: jsjodin
Date: Mon May 15 15:18:37 2017
New Revision: 303111

URL: http://llvm.org/viewvc/llvm-project?rev=303111&view=rev
Log:
Re-submit AMDGPUMachineCFGStructurizer.

Differential Revision: https://reviews.llvm.org/D23209


Added:
    llvm/trunk/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp
      - copied, changed from r303097, llvm/trunk/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp
Modified:
    llvm/trunk/lib/Target/AMDGPU/AMDGPU.h
    llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
    llvm/trunk/lib/Target/AMDGPU/CMakeLists.txt
    llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp
    llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.h
    llvm/trunk/lib/Target/AMDGPU/SIInstructions.td

Modified: llvm/trunk/lib/Target/AMDGPU/AMDGPU.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AMDGPU.h?rev=303111&r1=303110&r2=303111&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPU.h (original)
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPU.h Mon May 15 15:18:37 2017
@@ -50,6 +50,10 @@ FunctionPass *createSIDebuggerInsertNops
 FunctionPass *createSIInsertWaitsPass();
 FunctionPass *createSIInsertWaitcntsPass();
 FunctionPass *createAMDGPUCodeGenPreparePass(const GCNTargetMachine *TM = nullptr);
+FunctionPass *createAMDGPUMachineCFGStructurizerPass();
+
+void initializeAMDGPUMachineCFGStructurizerPass(PassRegistry&);
+extern char &AMDGPUMachineCFGStructurizerID;
 
 ModulePass *createAMDGPUAnnotateKernelFeaturesPass(const TargetMachine *TM = nullptr);
 void initializeAMDGPUAnnotateKernelFeaturesPass(PassRegistry &);

Copied: llvm/trunk/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp (from r303097, llvm/trunk/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp)
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp?p2=llvm/trunk/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp&p1=llvm/trunk/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp&r1=303097&r2=303111&rev=303111&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp Mon May 15 15:18:37 2017
@@ -1,4 +1,4 @@
-//===-- AMDGPUMachineCFGStructurizer.cpp - Machine code if conversion pass. -----===//
+//===- AMDGPUMachineCFGStructurizer.cpp - Machine code if conversion pass. ===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -75,7 +75,6 @@ public:
   void addDest(unsigned DestReg, const DebugLoc &DL);
   void replaceDef(unsigned OldDestReg, unsigned NewDestReg);
   void deleteDef(unsigned DestReg);
-  DebugLoc getDebugLoc(unsigned DestReg);
   void addSource(unsigned DestReg, unsigned SourceReg,
                  MachineBasicBlock *SourceMBB);
   void removeSource(unsigned DestReg, unsigned SourceReg,
@@ -224,10 +223,6 @@ void PHILinearize::deleteDef(unsigned De
   delete InfoElement;
 }
 
-DebugLoc PHILinearize::getDebugLoc(unsigned DestReg) {
-  return phiInfoElementGetDebugLoc(findPHIInfoElement(DestReg));
-}
-
 void PHILinearize::addSource(unsigned DestReg, unsigned SourceReg,
                              MachineBasicBlock *SourceMBB) {
   phiInfoElementAddSource(findPHIInfoElement(DestReg), SourceReg, SourceMBB);
@@ -260,18 +255,18 @@ unsigned PHILinearize::getNumSources(uns
 
 void PHILinearize::dump(MachineRegisterInfo *MRI) {
   const TargetRegisterInfo *TRI = MRI->getTargetRegisterInfo();
-  DEBUG(dbgs() << "=PHIInfo Start=\n");
+  dbgs() << "=PHIInfo Start=\n";
   for (auto PII : this->PHIInfo) {
     PHIInfoElementT &Element = *PII;
-    DEBUG(dbgs() << "Dest: " << PrintReg(Element.DestReg, TRI)
-                 << " Sources: {");
+    dbgs() << "Dest: " << PrintReg(Element.DestReg, TRI)
+           << " Sources: {";
     for (auto &SI : Element.Sources) {
-      DEBUG(dbgs() << PrintReg(SI.first, TRI) << "(BB#"
-                   << SI.second->getNumber() << "),");
+      dbgs() << PrintReg(SI.first, TRI) << "(BB#"
+             << SI.second->getNumber() << "),";
     }
-    DEBUG(dbgs() << "}\n");
+    dbgs() << "}\n";
   }
-  DEBUG(dbgs() << "=PHIInfo End=\n");
+  dbgs() << "=PHIInfo End=\n";
 }
 
 void PHILinearize::clear() { PHIInfo = PHIInfoT(); }
@@ -379,8 +374,6 @@ public:
 
   void addLiveOut(unsigned VReg);
 
-  void addLiveOuts(LinearizedRegion *LRegion);
-
   void removeLiveOut(unsigned Reg);
 
   void replaceLiveOut(unsigned OldReg, unsigned NewReg);
@@ -417,8 +410,6 @@ public:
 
   bool hasNoDef(unsigned Reg, MachineRegisterInfo *MRI);
 
-  bool isDefinedInRegion(unsigned Reg, MachineRegisterInfo *MRI);
-
   void removeFalseRegisterKills(MachineRegisterInfo *MRI);
 
   void initLiveOut(RegionMRT *Region, const MachineRegisterInfo *MRI,
@@ -427,9 +418,6 @@ public:
   LinearizedRegion(MachineBasicBlock *MBB, const MachineRegisterInfo *MRI,
                    const TargetRegisterInfo *TRI, PHILinearize &PHIInfo);
 
-  LinearizedRegion(RegionMRT *Region, const MachineRegisterInfo *MRI,
-                   const TargetRegisterInfo *TRI, PHILinearize &PHIInfo);
-
   LinearizedRegion();
 
   ~LinearizedRegion();
@@ -889,13 +877,6 @@ bool LinearizedRegion::getHasLoop() { re
 
 void LinearizedRegion::addLiveOut(unsigned VReg) { LiveOuts.insert(VReg); }
 
-void LinearizedRegion::addLiveOuts(LinearizedRegion *LRegion) {
-  DenseSet<unsigned> *RegionLiveOuts = LRegion->getLiveOuts();
-  for (auto R : *RegionLiveOuts) {
-    addLiveOut(R);
-  }
-}
-
 void LinearizedRegion::removeLiveOut(unsigned Reg) {
   if (isLiveOut(Reg))
     LiveOuts.erase(Reg);
@@ -1013,23 +994,6 @@ bool LinearizedRegion::hasNoDef(unsigned
   return MRI->def_begin(Reg) == MRI->def_end();
 }
 
-bool LinearizedRegion::isDefinedInRegion(unsigned Reg,
-                                         MachineRegisterInfo *MRI) {
-  bool NoDef = hasNoDef(Reg, MRI);
-  if (NoDef) {
-    return false;
-  }
-
-  if (!MRI->hasOneDef(Reg)) {
-    DEBUG(dbgs() << "Register " << PrintReg(Reg, MRI->getTargetRegisterInfo())
-                 << " has multiple defs\n");
-  }
-
-  assert(MRI->hasOneDef(Reg) && "Register has multiple definitions");
-  MachineOperand *Def = &(*(MRI->def_begin(Reg)));
-  return contains(Def->getParent()->getParent());
-}
-
 // After the code has been structurized, what was flagged as kills
 // before are no longer register kills.
 void LinearizedRegion::removeFalseRegisterKills(MachineRegisterInfo *MRI) {
@@ -1092,19 +1056,6 @@ LinearizedRegion::LinearizedRegion(Machi
   Parent = nullptr;
 }
 
-LinearizedRegion::LinearizedRegion(RegionMRT *Region,
-                                   const MachineRegisterInfo *MRI,
-                                   const TargetRegisterInfo *TRI,
-                                   PHILinearize &PHIInfo) {
-  setEntry(Region->getEntry());
-  // We don't have a single exit block that is part of the region
-  // at this point. When the transform is performed this block
-  // will be created.
-  setExit(nullptr);
-  storeLiveOuts(Region, MRI, TRI, PHIInfo);
-  Parent = nullptr;
-}
-
 LinearizedRegion::LinearizedRegion() {
   setEntry(nullptr);
   setExit(nullptr);
@@ -1138,7 +1089,6 @@ private:
                                      SmallVector<unsigned, 2> *RegionIndices);
 
   void extractKilledPHIs(MachineBasicBlock *MBB);
-  void extractKilledPHIs(LinearizedRegion *LRegion);
 
   bool shrinkPHI(MachineInstr &PHI, SmallVector<unsigned, 2> &PHIIndices,
                  unsigned *ReplaceReg);
@@ -1171,10 +1121,6 @@ private:
 
   void transformSimpleIfRegion(RegionMRT *Region);
 
-  bool regionIsSimpleLoop(RegionMRT *Region);
-
-  void transformSimpleLoopRegion(RegionMRT *Region);
-
   void eliminateDeadBranchOperands(MachineBasicBlock::instr_iterator &II);
 
   void insertUnconditionalBranch(MachineBasicBlock *MBB,
@@ -1183,9 +1129,6 @@ private:
 
   MachineBasicBlock *createLinearizedExitBlock(RegionMRT *Region);
 
-  void replaceRegisterOutsideMBB(MachineBasicBlock *MBB, unsigned Register,
-                                 unsigned NewRegister);
-
   void insertMergePHI(MachineBasicBlock *IfBB, MachineBasicBlock *CodeBB,
                       MachineBasicBlock *MergeBB, unsigned DestRegister,
                       unsigned IfSourceRegister, unsigned CodeSourceRegister,
@@ -1313,53 +1256,6 @@ void AMDGPUMachineCFGStructurizer::trans
   TII->convertNonUniformIfRegion(Entry, Exit);
 }
 
-bool AMDGPUMachineCFGStructurizer::regionIsSimpleLoop(RegionMRT *Region) {
-  MachineBasicBlock *Entry = Region->getEntry();
-
-  if (Entry->succ_size() != 1) {
-    return false;
-  }
-
-  int NumRegionExitEdges = 0;
-  MachineBasicBlock *BackBlock = nullptr;
-  for (MachineBasicBlock::const_pred_iterator PI = Entry->succ_begin(),
-                                              PE = Entry->succ_end();
-       PI != PE; ++PI) {
-    MachineBasicBlock *CurrentSP = *PI;
-    if (Region->contains(CurrentSP)) {
-      BackBlock = CurrentSP;
-    }
-  }
-
-  bool HasBackedge = false;
-
-  for (MachineBasicBlock::const_succ_iterator SI = Entry->succ_begin(),
-                                              SE = Entry->succ_end();
-       SI != SE; ++SI) {
-    MachineBasicBlock *CurrentBS = *SI;
-    if (CurrentBS == Entry) {
-      HasBackedge = true;
-    }
-  }
-
-  return NumRegionExitEdges == 1 && BackBlock->succ_size() == 2 && HasBackedge;
-}
-
-void AMDGPUMachineCFGStructurizer::transformSimpleLoopRegion(RegionMRT *Region) {
-  MachineBasicBlock *Entry = Region->getEntry();
-  MachineBasicBlock *BackBlock = nullptr;
-  for (MachineBasicBlock::const_pred_iterator PI = Entry->succ_begin(),
-                                              PE = Entry->succ_end();
-       PI != PE; ++PI) {
-    MachineBasicBlock *CurrentSP = *PI;
-    if (Region->contains(CurrentSP)) {
-      BackBlock = CurrentSP;
-    }
-  }
-
-  TII->convertNonUniformLoopRegion(Entry, BackBlock);
-}
-
 static void fixMBBTerminator(MachineBasicBlock *MBB) {
 
   if (MBB->succ_size() == 1) {
@@ -1518,11 +1414,6 @@ void AMDGPUMachineCFGStructurizer::extra
   }
 }
 
-void AMDGPUMachineCFGStructurizer::extractKilledPHIs(LinearizedRegion *LRegion) {
-  // PHIs can only exist in the entry block.
-  extractKilledPHIs(LRegion->getEntry());
-}
-
 static bool isPHIRegionIndex(SmallVector<unsigned, 2> PHIRegionIndices,
                              unsigned Index) {
   for (auto i : PHIRegionIndices) {
@@ -1835,28 +1726,6 @@ AMDGPUMachineCFGStructurizer::createLine
   return LastMerge;
 }
 
-void AMDGPUMachineCFGStructurizer::replaceRegisterOutsideMBB(MachineBasicBlock *MBB,
-                                                       unsigned Register,
-                                                       unsigned NewRegister) {
-  assert(Register != NewRegister && "Cannot replace a reg with itself");
-
-  for (MachineRegisterInfo::reg_iterator I = MRI->reg_begin(Register),
-                                         E = MRI->reg_end();
-       I != E;) {
-    MachineOperand &O = *I;
-    ++I;
-    if (O.getParent()->getParent() != MBB) {
-      if (TargetRegisterInfo::isPhysicalRegister(NewRegister)) {
-        llvm_unreachable("Cannot substitute physical registers");
-        // We don't handle physical registers, but if we need to in the future
-        // This is how we do it: O.substPhysReg(NewRegister, *TRI);
-      } else {
-        O.setReg(NewRegister);
-      }
-    }
-  }
-}
-
 void AMDGPUMachineCFGStructurizer::insertMergePHI(MachineBasicBlock *IfBB,
                                             MachineBasicBlock *CodeBB,
                                             MachineBasicBlock *MergeBB,
@@ -1984,7 +1853,7 @@ MachineBasicBlock *AMDGPUMachineCFGStruc
                     SelectBB->getNumber() /* CodeBBStart->getNumber() */);
   if (&(*(IfBB->getParent()->begin())) == IfBB) {
     TII->materializeImmediate(*IfBB, IfBB->begin(), DL, IfReg,
-			      CodeBBStart->getNumber());
+                              CodeBBStart->getNumber());
   }
   MachineOperand RegOp = MachineOperand::CreateReg(Reg, false, false, true);
   ArrayRef<MachineOperand> Cond(RegOp);
@@ -2021,8 +1890,8 @@ void AMDGPUMachineCFGStructurizer::rewri
     // This is an exit block, hence no successors. We will assign the
     // bb select register to the entry block.
     TII->materializeImmediate(*CodeBB, CodeBB->getFirstTerminator(), DL,
-			      BBSelectReg,
-			      CodeBB->getParent()->begin()->getNumber());
+                              BBSelectReg,
+                              CodeBB->getParent()->begin()->getNumber());
     insertUnconditionalBranch(CodeBB, MergeBB, DL);
     return;
   }
@@ -2036,15 +1905,15 @@ void AMDGPUMachineCFGStructurizer::rewri
 
   if ((TrueBB != nullptr && FalseBB == nullptr) || (TrueBB == FalseBB)) {
     TII->materializeImmediate(*CodeBB, CodeBB->getFirstTerminator(), DL,
-			      BBSelectReg, TrueBB->getNumber());
+                              BBSelectReg, TrueBB->getNumber());
   } else {
     const TargetRegisterClass *RegClass = MRI->getRegClass(BBSelectReg);
     unsigned TrueBBReg = MRI->createVirtualRegister(RegClass);
     unsigned FalseBBReg = MRI->createVirtualRegister(RegClass);
     TII->materializeImmediate(*CodeBB, CodeBB->getFirstTerminator(), DL,
-			      TrueBBReg, TrueBB->getNumber());
+                              TrueBBReg, TrueBB->getNumber());
     TII->materializeImmediate(*CodeBB, CodeBB->getFirstTerminator(), DL,
-			      FalseBBReg, FalseBB->getNumber());
+                              FalseBBReg, FalseBB->getNumber());
     ensureCondIsNotKilled(Cond);
     TII->insertVectorSelect(*CodeBB, CodeBB->getFirstTerminator(), DL,
                             BBSelectReg, Cond, TrueBBReg, FalseBBReg);
@@ -2114,7 +1983,7 @@ void AMDGPUMachineCFGStructurizer::inser
     if (IsLastDef) {
       const DebugLoc &DL = IfBB->findDebugLoc(IfBB->getFirstTerminator());
       TII->materializeImmediate(*IfBB, IfBB->getFirstTerminator(), DL,
-				NextDestReg, 0);
+                                NextDestReg, 0);
       PHIInfo.deleteDef(DestReg);
     } else {
       PHIInfo.replaceDef(DestReg, NextDestReg);
@@ -2193,7 +2062,7 @@ void AMDGPUMachineCFGStructurizer::rewri
     DEBUG(dbgs() << "Insertion done.\n");
   }
 
-  PHIInfo.dump(MRI);
+  DEBUG(PHIInfo.dump(MRI));
 }
 
 void AMDGPUMachineCFGStructurizer::prunePHIInfo(MachineBasicBlock *MBB) {
@@ -2246,10 +2115,9 @@ void AMDGPUMachineCFGStructurizer::creat
                                             unsigned DestReg) {
   MachineBasicBlock *Entry = CurrentRegion->getEntry();
   MachineBasicBlock *Exit = CurrentRegion->getExit();
-  MachineBasicBlock *Pred = *(Entry->pred_begin());
 
   DEBUG(dbgs() << "RegionExit: " << Exit->getNumber()
-               << " Pred: " << Pred->getNumber() << "\n");
+               << " Pred: " << (*(Entry->pred_begin()))->getNumber() << "\n");
 
   int NumSources = 0;
   auto SE = PHIInfo.sources_end(DestReg);
@@ -2744,8 +2612,6 @@ bool AMDGPUMachineCFGStructurizer::struc
 
   DEBUG(PHIInfo.dump(MRI));
 
-  auto Entry = Region->getEntry();
-
   SetVector<MRT *> *Children = Region->getChildren();
   DEBUG(dbgs() << "===========If Region Start===============\n");
   if (LRegion->getHasLoop()) {
@@ -2847,8 +2713,8 @@ bool AMDGPUMachineCFGStructurizer::struc
         MRI->createVirtualRegister(MRI->getRegClass(InReg));
     unsigned NewInReg = MRI->createVirtualRegister(MRI->getRegClass(InReg));
     TII->materializeImmediate(*(LRegion->getEntry()),
-			      LRegion->getEntry()->getFirstTerminator(), DL,
-			      NewInReg, Region->getEntry()->getNumber());
+                              LRegion->getEntry()->getFirstTerminator(), DL,
+                              NewInReg, Region->getEntry()->getNumber());
     // Need to be careful about updating the registers inside the region.
     LRegion->replaceRegisterInsideRegion(InReg, InnerSelectReg, false, MRI);
     DEBUG(dbgs() << "Loop BBSelect Merge PHI:\n");
@@ -2863,7 +2729,7 @@ bool AMDGPUMachineCFGStructurizer::struc
     TII->insertReturn(*LastMerge);
   }
 
-  DEBUG(Entry->getParent()->dump());
+  DEBUG(Region->getEntry()->getParent()->dump());
   DEBUG(LRegion->print(dbgs(), TRI));
   DEBUG(PHIInfo.dump(MRI));
 
@@ -3016,4 +2882,3 @@ bool AMDGPUMachineCFGStructurizer::runOn
 FunctionPass *llvm::createAMDGPUMachineCFGStructurizerPass() {
   return new AMDGPUMachineCFGStructurizer();
 }
-

Modified: llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp?rev=303111&r1=303110&r2=303111&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp Mon May 15 15:18:37 2017
@@ -118,6 +118,13 @@ static cl::opt<bool> EnableSIInsertWaitc
   cl::desc("Use new waitcnt insertion pass"),
   cl::init(false));
 
+// Option to run the late machine CFG structurizer instead of StructurizeCFG.
+static cl::opt<bool> LateCFGStructurize(
+  "amdgpu-late-structurize",
+  cl::desc("Enable late CFG structurization"),
+  cl::init(false),
+  cl::Hidden);
+
 extern "C" void LLVMInitializeAMDGPUTarget() {
   // Register the target
   RegisterTargetMachine<R600TargetMachine> X(getTheAMDGPUTarget());
@@ -702,11 +709,15 @@ bool GCNPassConfig::addPreISel() {
   // Merge divergent exit nodes. StructurizeCFG won't recognize the multi-exit
   // regions formed by them.
   addPass(&AMDGPUUnifyDivergentExitNodesID);
-  addPass(createStructurizeCFGPass(true)); // true -> SkipUniformRegions
+  if (!LateCFGStructurize) {
+    addPass(createStructurizeCFGPass(true)); // true -> SkipUniformRegions
+  }
   addPass(createSinkingPass());
   addPass(createSITypeRewriter());
   addPass(createAMDGPUAnnotateUniformValues());
-  addPass(createSIAnnotateControlFlowPass());
+  if (!LateCFGStructurize) {
+    addPass(createSIAnnotateControlFlowPass());
+  }
 
   return false;
 }
@@ -770,6 +781,9 @@ bool GCNPassConfig::addGlobalInstruction
 #endif
 
 void GCNPassConfig::addPreRegAlloc() {
+  if (LateCFGStructurize) {
+    addPass(createAMDGPUMachineCFGStructurizerPass());
+  }
   addPass(createSIWholeQuadModePass());
 }
 

Modified: llvm/trunk/lib/Target/AMDGPU/CMakeLists.txt
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/CMakeLists.txt?rev=303111&r1=303110&r2=303111&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/CMakeLists.txt (original)
+++ llvm/trunk/lib/Target/AMDGPU/CMakeLists.txt Mon May 15 15:18:37 2017
@@ -48,6 +48,7 @@ add_llvm_target(AMDGPUCodeGen
   AMDGPUISelDAGToDAG.cpp
   AMDGPULowerIntrinsics.cpp
   AMDGPUMCInstLower.cpp
+  AMDGPUMachineCFGStructurizer.cpp
   AMDGPUMachineFunction.cpp
   AMDGPUUnifyMetadata.cpp
   AMDGPUOpenCLImageTypeLoweringPass.cpp

Modified: llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp?rev=303111&r1=303110&r2=303111&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.cpp Mon May 15 15:18:37 2017
@@ -496,6 +496,188 @@ int SIInstrInfo::commuteOpcode(unsigned
   return Opcode;
 }
 
+/// Emit move(s) before \p MI in \p MBB that set \p DestReg to \p Value.
+/// For classes without a single-move case below, one move is emitted per
+/// split part: the first part receives \p Value, every other part zero.
+void SIInstrInfo::materializeImmediate(MachineBasicBlock &MBB,
+                                       MachineBasicBlock::iterator MI,
+                                       const DebugLoc &DL, unsigned DestReg,
+                                       int64_t Value) const {
+  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+  const TargetRegisterClass *RegClass = MRI.getRegClass(DestReg);
+  if (RegClass == &AMDGPU::SReg_32RegClass ||
+      RegClass == &AMDGPU::SGPR_32RegClass ||
+      RegClass == &AMDGPU::SReg_32_XM0RegClass ||
+      RegClass == &AMDGPU::SReg_32_XM0_XEXECRegClass) {
+    BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg).addImm(Value);
+    return;
+  }
+
+  if (RegClass == &AMDGPU::SReg_64RegClass ||
+      RegClass == &AMDGPU::SGPR_64RegClass ||
+      RegClass == &AMDGPU::SReg_64_XEXECRegClass) {
+    BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg).addImm(Value);
+    return;
+  }
+
+  if (RegClass == &AMDGPU::VGPR_32RegClass) {
+    BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg).addImm(Value);
+    return;
+  }
+  if (RegClass == &AMDGPU::VReg_64RegClass) {
+    BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO), DestReg).addImm(Value);
+    return;
+  }
+
+  unsigned EltSize = 4;
+  unsigned Opcode = AMDGPU::V_MOV_B32_e32;
+  if (RI.isSGPRClass(RegClass)) {
+    if (RI.getRegSizeInBits(*RegClass) > 32) {
+      Opcode = AMDGPU::S_MOV_B64;
+      EltSize = 8;
+    } else {
+      Opcode = AMDGPU::S_MOV_B32;
+      EltSize = 4;
+    }
+  }
+
+  ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RegClass, EltSize);
+  for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) {
+    int64_t IdxValue = Idx == 0 ? Value : 0;
+
+    // Look up the sub-register by its sub-register index, not by loop position.
+    MachineInstrBuilder Builder = BuildMI(MBB, MI, DL,
+      get(Opcode), RI.getSubReg(DestReg, SubIndices[Idx]));
+    Builder.addImm(IdxValue);
+  }
+}
+
+const TargetRegisterClass *
+SIInstrInfo::getPreferredSelectRegClass(unsigned Size) const {
+  return &AMDGPU::VGPR_32RegClass; // Size is not consulted.
+}
+
+/// Emit a V_CNDMASK_B32_e64 into \p DstReg before \p I, steered by \p Cond.
+void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB,
+                                     MachineBasicBlock::iterator I,
+                                     const DebugLoc &DL, unsigned DstReg,
+                                     ArrayRef<MachineOperand> Cond,
+                                     unsigned TrueReg,
+                                     unsigned FalseReg) const {
+  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+  const TargetRegisterClass *RegClass = MRI.getRegClass(DstReg);
+  assert(RegClass == &AMDGPU::VGPR_32RegClass && "Not a VGPR32 reg");
+  if (Cond.size() == 1) { // Cond[0] is used directly as the mask operand.
+    BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
+      .addReg(FalseReg)
+      .addReg(TrueReg)
+      .add(Cond[0]);
+  } else if (Cond.size() == 2) { // Cond[0] is the predicate immediate.
+    assert(Cond[0].isImm() && "Cond[0] is not an immediate");
+    switch (Cond[0].getImm()) {
+    case SIInstrInfo::SCC_TRUE: { // mask = S_CSELECT_B64 -1, 0
+      unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+      BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), SReg)
+        .addImm(-1)
+        .addImm(0);
+      BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
+        .addReg(FalseReg)
+        .addReg(TrueReg)
+        .addReg(SReg);
+      break;
+    }
+    case SIInstrInfo::SCC_FALSE: { // mask = S_CSELECT_B64 0, -1
+      unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+      BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), SReg)
+        .addImm(0)
+        .addImm(-1);
+      BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
+        .addReg(FalseReg)
+        .addReg(TrueReg)
+        .addReg(SReg);
+      break;
+    }
+    case SIInstrInfo::VCCNZ: { // mask = Cond[1] with the implicit flag cleared
+      MachineOperand RegOp = Cond[1];
+      RegOp.setImplicit(false);
+      BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
+          .addReg(FalseReg)
+          .addReg(TrueReg)
+          .add(RegOp);
+      break;
+    }
+    case SIInstrInfo::VCCZ: { // as VCCNZ, but TrueReg/FalseReg are swapped
+      MachineOperand RegOp = Cond[1];
+      RegOp.setImplicit(false);
+      BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
+          .addReg(TrueReg)
+          .addReg(FalseReg)
+          .add(RegOp);
+      break;
+    }
+    case SIInstrInfo::EXECNZ: { // S_OR_SAVEEXEC_B64 0, then as SCC_TRUE
+      unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+      unsigned SReg2 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+      BuildMI(MBB, I, DL, get(AMDGPU::S_OR_SAVEEXEC_B64), SReg2)
+        .addImm(0);
+      BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), SReg)
+        .addImm(-1)
+        .addImm(0);
+      BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
+        .addReg(FalseReg)
+        .addReg(TrueReg)
+        .addReg(SReg);
+      break;
+    }
+    case SIInstrInfo::EXECZ: { // S_OR_SAVEEXEC_B64 0, then as SCC_FALSE
+      unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+      unsigned SReg2 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+      BuildMI(MBB, I, DL, get(AMDGPU::S_OR_SAVEEXEC_B64), SReg2)
+        .addImm(0);
+      BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), SReg)
+        .addImm(0)
+        .addImm(-1);
+      BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
+        .addReg(FalseReg)
+        .addReg(TrueReg)
+        .addReg(SReg);
+      llvm_unreachable("Unhandled branch predicate EXECZ");
+      break; // NOTE(review): the llvm_unreachable above runs after emission.
+    }
+    default:
+      llvm_unreachable("invalid branch predicate");
+    }
+  } else {
+    llvm_unreachable("Can only handle Cond size 1 or 2");
+  }
+}
+
+/// Emit V_CMP_EQ_I32_e64 of \p Value and \p SrcReg; return the result vreg.
+unsigned SIInstrInfo::insertEQ(MachineBasicBlock *MBB,
+                               MachineBasicBlock::iterator I,
+                               const DebugLoc &DL,
+                               unsigned SrcReg, int Value) const {
+  MachineRegisterInfo &RegInfo = MBB->getParent()->getRegInfo();
+  unsigned CmpReg = RegInfo.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+  MachineInstrBuilder Cmp =
+      BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_EQ_I32_e64), CmpReg);
+  Cmp.addImm(Value).addReg(SrcReg);
+  return CmpReg;
+}
+
+/// Emit V_CMP_NE_I32_e64 of \p Value and \p SrcReg; return the result vreg.
+unsigned SIInstrInfo::insertNE(MachineBasicBlock *MBB,
+                               MachineBasicBlock::iterator I,
+                               const DebugLoc &DL,
+                               unsigned SrcReg, int Value) const {
+  MachineRegisterInfo &RegInfo = MBB->getParent()->getRegInfo();
+  unsigned CmpReg = RegInfo.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+  MachineInstrBuilder Cmp =
+      BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_NE_I32_e64), CmpReg);
+  Cmp.addImm(Value).addReg(SrcReg);
+  return CmpReg;
+}
+
 unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const {
 
   if (RI.getRegSizeInBits(*DstRC) == 32) {
@@ -834,6 +1016,20 @@ void SIInstrInfo::insertNoop(MachineBasi
   insertWaitStates(MBB, MI, 1);
 }
 
+/// Append S_ENDPGM or SI_RETURN_TO_EPILOG to \p MBB if it has no successors
+/// and no terminator yet; otherwise leave the block untouched.
+void SIInstrInfo::insertReturn(MachineBasicBlock &MBB) const {
+  MachineFunction *ParentFn = MBB.getParent();
+  auto *FnInfo = ParentFn->getInfo<SIMachineFunctionInfo>();
+  assert(FnInfo->isEntryFunction());
+
+  if (!MBB.succ_empty() || MBB.getFirstTerminator() != MBB.end())
+    return;
+  unsigned RetOpc =
+      FnInfo->returnsVoid() ? AMDGPU::S_ENDPGM : AMDGPU::SI_RETURN_TO_EPILOG;
+  BuildMI(MBB, MBB.end(), DebugLoc(), get(RetOpc));
+}
+
 unsigned SIInstrInfo::getNumWaitStates(const MachineInstr &MI) const {
   switch (MI.getOpcode()) {
   default: return 1; // FIXME: Do wait states equal cycles?
@@ -1241,14 +1437,20 @@ bool SIInstrInfo::analyzeBranchImpl(Mach
     return false;
   }
 
-  BranchPredicate Pred = getBranchPredicate(I->getOpcode());
-  if (Pred == INVALID_BR)
-    return true;
+  MachineBasicBlock *CondBB = nullptr;
 
-  MachineBasicBlock *CondBB = I->getOperand(0).getMBB();
-  Cond.push_back(MachineOperand::CreateImm(Pred));
-  Cond.push_back(I->getOperand(1)); // Save the branch register.
+  if (I->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) {
+    CondBB = I->getOperand(1).getMBB();
+    Cond.push_back(I->getOperand(0));
+  } else {
+    BranchPredicate Pred = getBranchPredicate(I->getOpcode());
+    if (Pred == INVALID_BR)
+      return true;
 
+    CondBB = I->getOperand(0).getMBB();
+    Cond.push_back(MachineOperand::CreateImm(Pred));
+    Cond.push_back(I->getOperand(1)); // Save the branch register.
+  }
   ++I;
 
   if (I == MBB.end()) {
@@ -1351,6 +1553,13 @@ unsigned SIInstrInfo::insertBranch(Machi
     return 1;
   }
 
+  if(Cond.size() == 1 && Cond[0].isReg()) {
+     BuildMI(&MBB, DL, get(AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO))
+       .add(Cond[0])
+       .addMBB(TBB);
+     return 1;
+  }
+
   assert(TBB && Cond[0].isImm());
 
   unsigned Opcode
@@ -1390,9 +1599,16 @@ unsigned SIInstrInfo::insertBranch(Machi
 
 bool SIInstrInfo::reverseBranchCondition(
   SmallVectorImpl<MachineOperand> &Cond) const {
-  assert(Cond.size() == 2);
-  Cond[0].setImm(-Cond[0].getImm());
-  return false;
+  if (Cond.size() != 2) {
+    return true;
+  }
+
+  if (Cond[0].isImm()) {
+    Cond[0].setImm(-Cond[0].getImm());
+    return false;
+  }
+
+  return true;
 }
 
 bool SIInstrInfo::canInsertSelect(const MachineBasicBlock &MBB,
@@ -3920,6 +4136,82 @@ bool SIInstrInfo::mayAccessFlatAddressSp
   return false;
 }
 
+bool SIInstrInfo::isNonUniformBranchInstr(MachineInstr &Branch) const {
+  return Branch.getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO; // opcode only
+}
+
+/// If the first terminator of \p IfEntry is SI_NON_UNIFORM_BRCOND_PSEUDO,
+/// replace it with SI_IF and put SI_END_CF before \p IfEnd's first non-PHI.
+void SIInstrInfo::convertNonUniformIfRegion(MachineBasicBlock *IfEntry,
+                                            MachineBasicBlock *IfEnd) const {
+  MachineBasicBlock::iterator Term = IfEntry->getFirstTerminator();
+  assert(Term != IfEntry->end());
+  MachineInstr &BrInst = *Term;
+  if (BrInst.getOpcode() != AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO)
+    return;
+
+  MachineFunction &Fn = *IfEntry->getParent();
+  const DebugLoc &BrDL = BrInst.getDebugLoc();
+  unsigned MaskReg =
+      Fn.getRegInfo().createVirtualRegister(&AMDGPU::SReg_64RegClass);
+  MachineInstr *IfMI = BuildMI(Fn, BrDL, get(AMDGPU::SI_IF), MaskReg)
+                           .add(BrInst.getOperand(0))
+                           .add(BrInst.getOperand(1));
+  MachineInstr *EndMI =
+      BuildMI(Fn, BrDL, get(AMDGPU::SI_END_CF)).addReg(MaskReg);
+
+  IfEntry->erase(Term);
+  IfEntry->insert(IfEntry->end(), IfMI);
+  IfEnd->insert(IfEnd->getFirstNonPHI(), EndMI);
+}
+
+/// If \p LoopEnd's first terminator is SI_NON_UNIFORM_BRCOND_PSEUDO, replace
+/// it with SI_IF_BREAK + SI_LOOP and add a header PHI to \p LoopEntry whose
+/// back-edge input is the SI_IF_BREAK result and whose other inputs are zero.
+void SIInstrInfo::convertNonUniformLoopRegion(
+    MachineBasicBlock *LoopEntry, MachineBasicBlock *LoopEnd) const {
+  MachineBasicBlock::iterator Term = LoopEnd->getFirstTerminator();
+  // We expect 2 terminators, one conditional and one unconditional.
+  assert(Term != LoopEnd->end());
+  MachineInstr &BrInst = *Term;
+  if (BrInst.getOpcode() != AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO)
+    return;
+
+  MachineFunction &Fn = *LoopEnd->getParent();
+  MachineRegisterInfo &RegInfo = Fn.getRegInfo();
+  const TargetRegisterClass *MaskRC = &AMDGPU::SReg_64RegClass;
+  const DebugLoc &BrDL = BrInst.getDebugLoc();
+
+  unsigned PhiReg = RegInfo.createVirtualRegister(MaskRC);
+  unsigned BackEdgeReg = RegInfo.createVirtualRegister(MaskRC);
+  MachineInstrBuilder PhiBuilder =
+      BuildMI(Fn, BrDL, get(TargetOpcode::PHI), PhiReg);
+  for (MachineBasicBlock *Pred : LoopEntry->predecessors()) {
+    unsigned IncomingReg = BackEdgeReg;
+    if (Pred != LoopEnd) {
+      // Not the back edge: feed a zero materialized at the end of Pred.
+      IncomingReg = RegInfo.createVirtualRegister(MaskRC);
+      materializeImmediate(*Pred, Pred->getFirstTerminator(), DebugLoc(),
+                           IncomingReg, 0);
+    }
+    PhiBuilder.addReg(IncomingReg).addMBB(Pred);
+  }
+
+  MachineInstr *HeaderPhi = PhiBuilder;
+  MachineInstr *IfBreakMI = BuildMI(Fn, BrDL, get(AMDGPU::SI_IF_BREAK),
+                                    BackEdgeReg)
+                                .addReg(PhiReg)
+                                .add(BrInst.getOperand(0));
+  MachineInstr *LoopMI = BuildMI(Fn, BrDL, get(AMDGPU::SI_LOOP))
+                             .addReg(BackEdgeReg)
+                             .addMBB(LoopEntry);
+
+  LoopEntry->insert(LoopEntry->begin(), HeaderPhi);
+  LoopEnd->erase(Term);
+  LoopEnd->insert(LoopEnd->end(), IfBreakMI);
+  LoopEnd->insert(LoopEnd->end(), LoopMI);
+}
+
 ArrayRef<std::pair<int, const char *>>
 SIInstrInfo::getSerializableTargetIndices() const {
   static const std::pair<int, const char *> TargetIndices[] = {

Modified: llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.h?rev=303111&r1=303110&r2=303111&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.h (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIInstrInfo.h Mon May 15 15:18:37 2017
@@ -143,6 +143,23 @@ public:
                                     RegScavenger *RS, unsigned TmpReg,
                                     unsigned Offset, unsigned Size) const;
 
+  void materializeImmediate(MachineBasicBlock &MBB,
+                            MachineBasicBlock::iterator MI,
+                            const DebugLoc &DL,
+                            unsigned DestReg,
+                            int64_t Value) const;
+
+  const TargetRegisterClass *getPreferredSelectRegClass(
+                               unsigned Size) const;
+
+  unsigned insertNE(MachineBasicBlock *MBB,
+                    MachineBasicBlock::iterator I, const DebugLoc &DL,
+                    unsigned SrcReg, int Value)  const;
+
+  unsigned insertEQ(MachineBasicBlock *MBB,
+                    MachineBasicBlock::iterator I, const DebugLoc &DL,
+                    unsigned SrcReg, int Value)  const;
+
   void storeRegToStackSlot(MachineBasicBlock &MBB,
                            MachineBasicBlock::iterator MI, unsigned SrcReg,
                            bool isKill, int FrameIndex,
@@ -193,7 +210,7 @@ public:
   bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
                      MachineBasicBlock *&FBB,
                      SmallVectorImpl<MachineOperand> &Cond,
-                     bool AllowModify) const override;
+                     bool AllowModify = false) const override;
 
   unsigned removeBranch(MachineBasicBlock &MBB,
                         int *BytesRemoved = nullptr) const override;
@@ -218,6 +235,11 @@ public:
                     unsigned DstReg, ArrayRef<MachineOperand> Cond,
                     unsigned TrueReg, unsigned FalseReg) const override;
 
+  void insertVectorSelect(MachineBasicBlock &MBB,
+                          MachineBasicBlock::iterator I, const DebugLoc &DL,
+                          unsigned DstReg, ArrayRef<MachineOperand> Cond,
+                          unsigned TrueReg, unsigned FalseReg) const;
+
   bool
   areMemAccessesTriviallyDisjoint(MachineInstr &MIa, MachineInstr &MIb,
                                   AliasAnalysis *AA = nullptr) const override;
@@ -705,6 +727,7 @@ public:
   void insertNoop(MachineBasicBlock &MBB,
                   MachineBasicBlock::iterator MI) const override;
 
+  void insertReturn(MachineBasicBlock &MBB) const;
   /// \brief Return the number of wait states that result from executing this
   /// instruction.
   unsigned getNumWaitStates(const MachineInstr &MI) const;
@@ -750,6 +773,14 @@ public:
 
   bool mayAccessFlatAddressSpace(const MachineInstr &MI) const;
 
+  bool isNonUniformBranchInstr(MachineInstr &Instr) const;
+
+  void convertNonUniformIfRegion(MachineBasicBlock *IfEntry,
+                                 MachineBasicBlock *IfEnd) const;
+
+  void convertNonUniformLoopRegion(MachineBasicBlock *LoopEntry,
+                                   MachineBasicBlock *LoopEnd) const;
+
   ArrayRef<std::pair<int, const char *>>
   getSerializableTargetIndices() const override;
 

Modified: llvm/trunk/lib/Target/AMDGPU/SIInstructions.td
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/SIInstructions.td?rev=303111&r1=303110&r2=303111&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIInstructions.td (original)
+++ llvm/trunk/lib/Target/AMDGPU/SIInstructions.td Mon May 15 15:18:37 2017
@@ -174,6 +174,13 @@ def SI_MASK_BRANCH : VPseudoInstSI <
 
 let isTerminator = 1 in {
 
+ def SI_NON_UNIFORM_BRCOND_PSEUDO : CFPseudoInstSI <
+  (outs),
+  (ins SReg_64:$vcc, brtarget:$target),
+  [(brcond i1:$vcc, bb:$target)]> {
+    let Size = 12;
+}
+
 def SI_IF: CFPseudoInstSI <
   (outs SReg_64:$dst), (ins SReg_64:$vcc, brtarget:$target),
   [(set i64:$dst, (AMDGPUif i1:$vcc, bb:$target))], 1, 1> {




More information about the llvm-commits mailing list