[llvm] r359704 - [AMDGPU] gfx1010 GCNRegBankReassign pass

Stanislav Mekhanoshin via llvm-commits llvm-commits at lists.llvm.org
Wed May 1 09:49:31 PDT 2019


Author: rampitec
Date: Wed May  1 09:49:31 2019
New Revision: 359704

URL: http://llvm.org/viewvc/llvm-project?rev=359704&view=rev
Log:
[AMDGPU] gfx1010 GCNRegBankReassign pass

Reassign registers to reduce register bank conflicts.

Differential Revision: https://reviews.llvm.org/D61344

Added:
    llvm/trunk/lib/Target/AMDGPU/GCNRegBankReassign.cpp
    llvm/trunk/test/CodeGen/AMDGPU/regbank-reassign.mir
Modified:
    llvm/trunk/lib/Target/AMDGPU/AMDGPU.h
    llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
    llvm/trunk/lib/Target/AMDGPU/CMakeLists.txt

Modified: llvm/trunk/lib/Target/AMDGPU/AMDGPU.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AMDGPU.h?rev=359704&r1=359703&r2=359704&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPU.h (original)
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPU.h Wed May  1 09:49:31 2019
@@ -221,6 +221,9 @@ ModulePass *createAMDGPUOpenCLEnqueuedBl
 void initializeAMDGPUOpenCLEnqueuedBlockLoweringPass(PassRegistry &);
 extern char &AMDGPUOpenCLEnqueuedBlockLoweringID;
 
+void initializeGCNRegBankReassignPass(PassRegistry &);
+extern char &GCNRegBankReassignID;
+
 void initializeGCNNSAReassignPass(PassRegistry &);
 extern char &GCNNSAReassignID;
 

Modified: llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp?rev=359704&r1=359703&r2=359704&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp (original)
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp Wed May  1 09:49:31 2019
@@ -234,6 +234,7 @@ extern "C" void LLVMInitializeAMDGPUTarg
   initializeAMDGPUUseNativeCallsPass(*PR);
   initializeAMDGPUSimplifyLibCallsPass(*PR);
   initializeAMDGPUInlinerPass(*PR);
+  initializeGCNRegBankReassignPass(*PR);
   initializeGCNNSAReassignPass(*PR);
 }
 
@@ -937,6 +938,7 @@ void GCNPassConfig::addOptimizedRegAlloc
 bool GCNPassConfig::addPreRewrite() {
   if (EnableRegReassign) {
     addPass(&GCNNSAReassignID);
+    addPass(&GCNRegBankReassignID);
   }
   return true;
 }

Modified: llvm/trunk/lib/Target/AMDGPU/CMakeLists.txt
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/CMakeLists.txt?rev=359704&r1=359703&r2=359704&view=diff
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/CMakeLists.txt (original)
+++ llvm/trunk/lib/Target/AMDGPU/CMakeLists.txt Wed May  1 09:49:31 2019
@@ -116,6 +116,7 @@ add_llvm_target(AMDGPUCodeGen
   SIShrinkInstructions.cpp
   SIWholeQuadMode.cpp
   GCNILPSched.cpp
+  GCNRegBankReassign.cpp
   GCNNSAReassign.cpp
   GCNDPPCombine.cpp
   SIModeRegister.cpp

Added: llvm/trunk/lib/Target/AMDGPU/GCNRegBankReassign.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/AMDGPU/GCNRegBankReassign.cpp?rev=359704&view=auto
==============================================================================
--- llvm/trunk/lib/Target/AMDGPU/GCNRegBankReassign.cpp (added)
+++ llvm/trunk/lib/Target/AMDGPU/GCNRegBankReassign.cpp Wed May  1 09:49:31 2019
@@ -0,0 +1,797 @@
+//===-- GCNRegBankReassign.cpp - Reassign registers after regalloc --------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief Try to reassign registers on GFX10+ to reduce register bank
+/// conflicts.
+///
+/// On GFX10 registers are organized in banks. VGPRs have 4 banks assigned in
+/// a round-robin fashion: v0, v4, v8... belong to bank 0. v1, v5, v9... to
+/// bank 1, etc. SGPRs have 8 banks and are allocated in pairs, so that s0:s1,
+/// s16:s17, s32:s33 are at bank 0. s2:s3, s18:s19, s34:s35 are at bank 1 etc.
+///
+/// The shader can read one dword from each of these banks once per cycle.
+/// If an instruction has to read more register operands from the same bank
+/// an additional cycle is needed. HW attempts to pre-load registers through
+/// input operand gathering, but a stall cycle may occur if that fails. For
+/// example V_FMA_F32 V111 = V0 + V4 * V8 will need 3 cycles to read operands,
+/// potentially incurring 2 stall cycles.
+///
+/// The pass tries to reassign registers to reduce bank conflicts.
+///
+/// In this pass bank numbers 0-3 are VGPR banks and 4-11 are SGPR banks, so
+/// that 4 has to be subtracted from an SGPR bank number to get the real value.
+/// This also corresponds to bit numbers in bank masks used in the pass.
+///
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
+#include "SIInstrInfo.h"
+#include "SIMachineFunctionInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/LiveInterval.h"
+#include "llvm/CodeGen/LiveIntervals.h"
+#include "llvm/CodeGen/LiveRegMatrix.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/VirtRegMap.h"
+#include "llvm/Support/MathExtras.h"
+
+using namespace llvm;
+
+static cl::opt<unsigned> VerifyStallCycles("amdgpu-verify-regbanks-reassign",
+  cl::desc("Verify stall cycles in the regbanks reassign pass"),
+  cl::value_desc("0|1|2"),
+  cl::init(0), cl::Hidden); // 1: check at the end, >1: after every candidate
+
+#define DEBUG_TYPE "amdgpu-regbanks-reassign"
+// Bank ids: 0-3 are VGPR banks, 4-11 are SGPR banks; masks use one bit per id.
+#define NUM_VGPR_BANKS 4
+#define NUM_SGPR_BANKS 8
+#define NUM_BANKS (NUM_VGPR_BANKS + NUM_SGPR_BANKS)
+#define SGPR_BANK_OFFSET NUM_VGPR_BANKS
+#define VGPR_BANK_MASK 0xf
+#define SGPR_BANK_MASK 0xff0
+#define SGPR_BANK_SHIFTED_MASK (SGPR_BANK_MASK >> SGPR_BANK_OFFSET)
+
+STATISTIC(NumStallsDetected,
+          "Number of operand read stalls detected");
+STATISTIC(NumStallsRecovered,
+          "Number of operand read stalls recovered");
+
+namespace {
+
+class GCNRegBankReassign : public MachineFunctionPass {
+  // Bank mask (see getRegBankMask) recorded for one register operand.
+  class OperandMask {
+  public:
+    OperandMask(unsigned r, unsigned s, unsigned m)
+      : Reg(r), SubReg(s), Mask(m) {}
+    unsigned Reg;
+    unsigned SubReg;
+    unsigned Mask;
+  };
+  // Reassignment candidate: Reg, an operand of MI, may move to FreeBanks.
+  class Candidate {
+  public:
+    Candidate(MachineInstr *mi, unsigned reg, unsigned freebanks,
+              unsigned weight)
+      : MI(mi), Reg(reg), FreeBanks(freebanks), Weight(weight) {}
+
+    bool operator< (const Candidate& RHS) const { return Weight < RHS.Weight; }
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+    void dump(const GCNRegBankReassign *P) const {
+      MI->dump();
+      dbgs() << P->printReg(Reg) << " to banks ";
+      dumpFreeBanks(FreeBanks);
+      dbgs() << " weight " << Weight << '\n';
+    }
+#endif
+
+    MachineInstr *MI;
+    unsigned Reg;
+    unsigned FreeBanks;
+    unsigned Weight;
+  };
+
+  class CandidateList : public std::list<Candidate> {
+  public:
+    // Speed up the subsequent sort: zero weights go first, others last.
+    void push(const Candidate&& C) {
+      if (C.Weight) push_back(C);
+      else push_front(C);
+    }
+  };
+
+public:
+  static char ID;
+
+public:
+  GCNRegBankReassign() : MachineFunctionPass(ID) {
+    initializeGCNRegBankReassignPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  StringRef getPassName() const override { return "GCN RegBank Reassign"; }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<MachineLoopInfo>();
+    AU.addRequired<LiveIntervals>();
+    AU.addRequired<VirtRegMap>();
+    AU.addRequired<LiveRegMatrix>();
+    AU.setPreservesAll();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+
+private:
+  const GCNSubtarget *ST;
+
+  const MachineRegisterInfo *MRI;
+
+  const SIRegisterInfo *TRI;
+
+  MachineLoopInfo *MLI;
+
+  VirtRegMap *VRM;
+
+  LiveRegMatrix *LRM;
+
+  LiveIntervals *LIS;
+  // Register budgets: the function's limits clamped by the current occupancy.
+  unsigned MaxNumVGPRs;
+
+  unsigned MaxNumSGPRs;
+  // Registers already counted by getRegBankMask for the current instruction.
+  BitVector RegsUsed;
+  // Operand bank masks recorded by the most recent analyzeInst call.
+  SmallVector<OperandMask, 8> OperandMasks;
+  // Pending reassignment candidates; sorted by weight before they are tried.
+  CandidateList Candidates;
+  // Callee saved registers of the current function.
+  const MCPhysReg *CSRegs;
+
+  // Returns bank for a phys reg.
+  unsigned getPhysRegBank(unsigned Reg) const;
+
+  // Return a bit set for each register bank used. 4 banks for VGPRs and
+  // 8 banks for SGPRs.
+  // Registers already processed and recorded in RegsUsed are excluded.
+  // If Bank is not -1 assume Reg:SubReg to belong to that Bank.
+  unsigned getRegBankMask(unsigned Reg, unsigned SubReg, int Bank);
+
+  // Return number of stalls in the instructions.
+  // UsedBanks has bits set for the banks used by all operands.
+  // If Reg and Bank provided substitute the Reg with the Bank.
+  unsigned analyzeInst(const MachineInstr& MI, unsigned& UsedBanks,
+                       unsigned Reg = AMDGPU::NoRegister, int Bank = -1);
+
+  // Return true if register is regular VGPR or SGPR or their tuples.
+  // Returns false for special registers like m0, vcc etc.
+  bool isReassignable(unsigned Reg) const;
+
+  // Check if registers' defs are old and may be pre-loaded.
+  // Returns 0 if both registers are old enough, 1 or 2 if one or both
+  // registers will not likely be pre-loaded.
+  unsigned getOperandGatherWeight(const MachineInstr& MI,
+                                  unsigned Reg1,
+                                  unsigned Reg2,
+                                  unsigned StallCycles) const;
+
+
+  // Find all bank bits in UsedBanks where Mask can be relocated to.
+  unsigned getFreeBanks(unsigned Mask, unsigned UsedBanks) const;
+
+  // Find all bank bits in UsedBanks where Mask can be relocated to.
+  // Bank is relative to the register and not its subregister component.
+  // Returns 0 if a register is not reassignable.
+  unsigned getFreeBanks(unsigned Reg, unsigned SubReg, unsigned Mask,
+                        unsigned UsedBanks) const;
+
+  // Add candidate instruction to the work list.
+  void collectCandidates(MachineInstr& MI, unsigned UsedBanks,
+                         unsigned StallCycles);
+
+  // Collect candidate instructions across function. Returns a number of stall
+  // cycles detected. Only counts stalls if Collect is false.
+  unsigned collectCandidates(MachineFunction &MF, bool Collect = true);
+
+  // Remove all candidates that read specified register.
+  void removeCandidates(unsigned Reg);
+
+  // Compute stalls within the uses of SrcReg replaced by a register from
+  // Bank. If Bank is -1 does not perform substitution. If Collect is set
+  // candidates are collected and added to work list.
+  unsigned computeStallCycles(unsigned SrcReg,
+                              unsigned Reg = AMDGPU::NoRegister,
+                              int Bank = -1, bool Collect = false);
+
+  // Search for a register in Bank unused within LI.
+  // Returns phys reg or NoRegister.
+  unsigned scavengeReg(LiveInterval& LI, unsigned Bank) const;
+
+  // Try to reassign candidate. Returns number of stall cycles saved.
+  unsigned tryReassign(Candidate &C);
+
+  bool verifyCycles(MachineFunction &MF,
+                    unsigned OriginalCycles, unsigned CyclesSaved);
+
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+public:
+  Printable printReg(unsigned Reg, unsigned SubReg = 0) const {
+    return Printable([Reg, SubReg, this](raw_ostream &OS) {
+      if (TargetRegisterInfo::isPhysicalRegister(Reg)) {
+        OS << llvm::printReg(Reg, TRI);
+        return;
+      }
+      if (!VRM->isAssignedReg(Reg))
+        OS << "<unassigned> " << llvm::printReg(Reg, TRI);
+      else
+        OS << llvm::printReg(Reg, TRI) << '('
+           << llvm::printReg(VRM->getPhys(Reg), TRI) << ')';
+      if (SubReg)
+        OS << ':' << TRI->getSubRegIndexName(SubReg);
+    });
+  }
+
+  static Printable printBank(unsigned Bank) {
+    return Printable([Bank](raw_ostream &OS) {
+      OS << ((Bank >= SGPR_BANK_OFFSET) ? Bank - SGPR_BANK_OFFSET : Bank);
+    });
+  }
+
+  static void dumpFreeBanks(unsigned FreeBanks) {
+    for (unsigned L = 0; L < NUM_BANKS; ++L)
+      if (FreeBanks & (1 << L))
+        dbgs() << printBank(L) << ' ';
+  }
+#endif
+};
+
+} // End anonymous namespace.
+
+INITIALIZE_PASS_BEGIN(GCNRegBankReassign, DEBUG_TYPE, "GCN RegBank Reassign",
+                      false, false)
+INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
+INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
+INITIALIZE_PASS_DEPENDENCY(VirtRegMap)
+INITIALIZE_PASS_DEPENDENCY(LiveRegMatrix)
+INITIALIZE_PASS_END(GCNRegBankReassign, DEBUG_TYPE, "GCN RegBank Reassign",
+                    false, false)
+
+// The pass is handed to the pass manager through the exported ID reference.
+char GCNRegBankReassign::ID = 0;
+
+char &llvm::GCNRegBankReassignID = GCNRegBankReassign::ID;
+
+unsigned GCNRegBankReassign::getPhysRegBank(unsigned Reg) const {
+  assert (TargetRegisterInfo::isPhysicalRegister(Reg));
+  // The bank of a tuple is the bank of its first 32-bit component.
+  const TargetRegisterClass *PhysRC = TRI->getMinimalPhysRegClass(Reg);
+  const unsigned First = TRI->getRegSizeInBits(*PhysRC) > 32
+                             ? TRI->getSubReg(Reg, AMDGPU::sub0)
+                             : Reg;
+
+  // VGPR banks are assigned round-robin by register number.
+  if (TRI->hasVGPRs(PhysRC))
+    return (First - AMDGPU::VGPR0) % NUM_VGPR_BANKS;
+
+  // SGPR banks hold register pairs and are numbered after the VGPR banks.
+  const unsigned Pair = TRI->getEncodingValue(First) / 2;
+  return Pair % NUM_SGPR_BANKS + SGPR_BANK_OFFSET;
+}
+
+unsigned GCNRegBankReassign::getRegBankMask(unsigned Reg, unsigned SubReg,
+                                            int Bank) {
+  if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+    if (!VRM->isAssignedReg(Reg))
+      return 0;
+    // Continue with the assigned physical register, narrowed to SubReg if set.
+    Reg = VRM->getPhys(Reg);
+    if (!Reg)
+      return 0;
+    if (SubReg)
+      Reg = TRI->getSubReg(Reg, SubReg);
+  }
+  // Size is counted in dwords; for a tuple Reg becomes its first dword.
+  const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
+  unsigned Size = TRI->getRegSizeInBits(*RC) / 32;
+  if (Size > 1)
+    Reg = TRI->getSubReg(Reg, AMDGPU::sub0);
+
+  if (TRI->hasVGPRs(RC)) {
+    // VGPRs have 4 banks assigned in a round-robin fashion.
+    Reg -= AMDGPU::VGPR0;
+    unsigned Mask = (1 << Size) - 1;
+    unsigned Used = 0;
+    // Bitmask lacks an extract method
+    for (unsigned I = 0; I < Size; ++I)
+      if (RegsUsed.test(Reg + I))
+        Used |= 1 << I;
+    RegsUsed.set(Reg, Reg + Size);
+    Mask &= ~Used; // Dwords already recorded in RegsUsed are not counted again.
+    Mask <<= (Bank == -1) ? Reg % NUM_VGPR_BANKS : unsigned(Bank);
+    return (Mask | (Mask >> NUM_VGPR_BANKS)) & VGPR_BANK_MASK;
+  }
+
+  // SGPRs have 8 banks holding 2 consecutive registers each.
+  Reg = TRI->getEncodingValue(Reg) / 2;
+  unsigned StartBit = AMDGPU::VGPR_32RegClass.getNumRegs();
+  if (Reg + StartBit >= RegsUsed.size())
+    return 0; // This register has no bit in RegsUsed.
+
+  if (Size > 1)
+    Size /= 2; // From dwords to register pairs.
+  unsigned Mask = (1 << Size) - 1;
+  unsigned Used = 0;
+  for (unsigned I = 0; I < Size; ++I)
+    if (RegsUsed.test(StartBit + Reg + I))
+      Used |= 1 << I;
+  RegsUsed.set(StartBit + Reg, StartBit + Reg + Size);
+  Mask &= ~Used;
+  Mask <<= (Bank == -1) ? Reg % NUM_SGPR_BANKS
+                        : unsigned(Bank - SGPR_BANK_OFFSET);
+  Mask = (Mask | (Mask >> NUM_SGPR_BANKS)) & SGPR_BANK_SHIFTED_MASK;
+  // Reserve 4 bank ids for VGPRs.
+  return Mask << SGPR_BANK_OFFSET;
+}
+
+// Count operand read stalls of MI and report its banks in UsedBanks. When Bank
+// is not -1, uses of Reg are evaluated as if Reg were placed in that bank.
+unsigned GCNRegBankReassign::analyzeInst(const MachineInstr &MI,
+                                         unsigned &UsedBanks, unsigned Reg,
+                                         int Bank) {
+  unsigned StallCycles = 0;
+  UsedBanks = 0;
+  if (MI.isDebugValue())
+    return 0;
+
+  RegsUsed.reset();
+  OperandMasks.clear();
+  for (const auto& Op : MI.explicit_uses()) {
+    // Undef can be assigned to any register, so two vregs can be assigned
+    // the same phys reg within the same instruction.
+    if (!Op.isReg() || Op.isUndef())
+      continue;
+
+    unsigned R = Op.getReg();
+    // Kept signed: -1 means "no substitution" and is passed on unchanged.
+    int ShiftedBank = Bank;
+    if (Bank != -1 && R == Reg && Op.getSubReg()) {
+      unsigned LM = TRI->getSubRegIndexLaneMask(Op.getSubReg()).getAsInteger();
+      if (!(LM & 1) && (Bank < NUM_VGPR_BANKS)) {
+        // If a register spans all banks we cannot shift it to avoid conflict.
+        if (countPopulation(LM) >= NUM_VGPR_BANKS)
+          continue;
+        ShiftedBank = (Bank + countTrailingZeros(LM)) % NUM_VGPR_BANKS;
+      } else if (!(LM & 3) && (Bank >= SGPR_BANK_OFFSET)) {
+        // If a register spans all banks we cannot shift it to avoid conflict.
+        if (countPopulation(LM) / 2 >= NUM_SGPR_BANKS)
+          continue;
+        ShiftedBank = SGPR_BANK_OFFSET + (Bank - SGPR_BANK_OFFSET +
+                                          (countTrailingZeros(LM) >> 1)) %
+                                             NUM_SGPR_BANKS;
+      }
+    }
+
+    unsigned Mask = getRegBankMask(R, Op.getSubReg(),
+                                   (Reg == R) ? ShiftedBank : -1);
+    StallCycles += countPopulation(UsedBanks & Mask);
+    UsedBanks |= Mask;
+    OperandMasks.push_back(OperandMask(Op.getReg(), Op.getSubReg(), Mask));
+  }
+
+  return StallCycles;
+}
+
+// Count how many of Reg1 and Reg2 (0, 1 or 2) are modified within the last
+// StallCycles non-debug instructions before MI in its basic block.
+unsigned GCNRegBankReassign::getOperandGatherWeight(
+    const MachineInstr &MI, unsigned Reg1, unsigned Reg2,
+    unsigned StallCycles) const {
+  unsigned Defs = 0;
+  MachineBasicBlock::const_instr_iterator Def(MI.getIterator());
+  MachineBasicBlock::const_instr_iterator B(MI.getParent()->instr_begin());
+  for (unsigned S = StallCycles; S && Def != B && Defs != 3;) {
+    if ((--Def)->isDebugInstr())
+      continue; // Debug instructions must not consume the lookback window.
+    --S;
+    if (Def->getOpcode() == TargetOpcode::IMPLICIT_DEF)
+      continue;
+    if (Def->modifiesRegister(Reg1, TRI))
+      Defs |= 1;
+    if (Def->modifiesRegister(Reg2, TRI))
+      Defs |= 2;
+  }
+  return countPopulation(Defs);
+}
+
+bool GCNRegBankReassign::isReassignable(unsigned Reg) const {
+  if (TargetRegisterInfo::isPhysicalRegister(Reg) || !VRM->isAssignedReg(Reg))
+    return false;
+
+  const MachineInstr *Def = MRI->getUniqueVRegDef(Reg);
+  unsigned PhysReg = VRM->getPhys(Reg);
+  // Do not move a register copied from or into its own assigned phys reg.
+  if (Def && Def->isCopy() && Def->getOperand(1).getReg() == PhysReg)
+    return false;
+
+  for (const MachineOperand &U : MRI->use_nodbg_operands(Reg)) {
+    if (U.isImplicit())
+      return false;
+    const MachineInstr *UseInst = U.getParent();
+    if (UseInst->isCopy() && UseInst->getOperand(0).getReg() == PhysReg)
+      return false;
+  }
+
+  // VGPRs always qualify; SGPRs only if their first dword is in SGPR_32.
+  const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(PhysReg);
+  if (TRI->hasVGPRs(RC))
+    return true;
+
+  unsigned Size = TRI->getRegSizeInBits(*RC);
+  if (Size > 32)
+    PhysReg = TRI->getSubReg(PhysReg, AMDGPU::sub0);
+
+  return AMDGPU::SGPR_32RegClass.contains(PhysReg);
+}
+
+unsigned GCNRegBankReassign::getFreeBanks(unsigned Mask,
+                                          unsigned UsedBanks) const {
+  unsigned Size = countPopulation(Mask);
+  unsigned FreeBanks = 0;
+  unsigned Bank = findFirstSet(Mask);
+  UsedBanks &= ~Mask; // The register's own banks do not count as taken.
+
+  if (Mask & VGPR_BANK_MASK) {
+    // A register covering every VGPR bank cannot be moved off a conflict and
+    // must not fall through to be offered SGPR banks.
+    if (Size >= NUM_VGPR_BANKS)
+      return 0;
+    for (unsigned I = 0; I < NUM_VGPR_BANKS; ++I) {
+      if (Bank == I)
+        continue;
+      unsigned NewMask = ((1 << Size) - 1) << I;
+      NewMask = (NewMask | (NewMask >> NUM_VGPR_BANKS)) & VGPR_BANK_MASK;
+      if (!(UsedBanks & NewMask))
+        FreeBanks |= 1 << I;
+    }
+    return FreeBanks;
+  }
+
+  // Find free SGPR banks. SGPR tuples must be aligned, so the step is the
+  // number of banks the register crosses.
+  Bank -= SGPR_BANK_OFFSET;
+  for (unsigned I = 0; I < NUM_SGPR_BANKS; I += Size) {
+    if (Bank == I)
+      continue;
+    unsigned NewMask = ((1 << Size) - 1) << I;
+    NewMask = (NewMask | (NewMask >> NUM_SGPR_BANKS)) & SGPR_BANK_SHIFTED_MASK;
+    if (!(UsedBanks & (NewMask << SGPR_BANK_OFFSET)))
+      FreeBanks |= (1 << SGPR_BANK_OFFSET) << I;
+  }
+  return FreeBanks;
+}
+
+// Banks the whole register Reg may be moved to, derived from the free banks
+// of its operand Reg:SubReg whose current bank mask is Mask.
+unsigned GCNRegBankReassign::getFreeBanks(unsigned Reg, unsigned SubReg,
+                                          unsigned Mask,
+                                          unsigned UsedBanks) const {
+  if (!isReassignable(Reg))
+    return 0;
+
+  unsigned Banks = getFreeBanks(Mask, UsedBanks);
+  const unsigned Lanes = TRI->getSubRegIndexLaneMask(SubReg).getAsInteger();
+  if ((Mask & VGPR_BANK_MASK) && (Lanes & 1) == 0) {
+    // The operand starts Rot banks into Reg: rotate back to refer to Reg.
+    const unsigned Rot = countTrailingZeros(Lanes);
+    if (Rot >= NUM_VGPR_BANKS)
+      return 0;
+    const unsigned VGPRBanks = Banks & VGPR_BANK_MASK;
+    Banks = (VGPRBanks >> Rot) | (VGPRBanks << (NUM_VGPR_BANKS - Rot));
+    Banks &= VGPR_BANK_MASK;
+  } else if ((Mask & SGPR_BANK_MASK) && (Lanes & 3) == 0) {
+    const unsigned Rot = countTrailingZeros(Lanes) >> 1;
+    if (Rot >= NUM_SGPR_BANKS)
+      return 0;
+    const unsigned SGPRBanks = Banks >> SGPR_BANK_OFFSET;
+    Banks = (SGPRBanks >> Rot) | (SGPRBanks << (NUM_SGPR_BANKS - Rot));
+    Banks = (Banks & SGPR_BANK_SHIFTED_MASK) << SGPR_BANK_OFFSET;
+  }
+
+  LLVM_DEBUG(if (Banks) {
+          dbgs() << "Potential reassignments of " << printReg(Reg, SubReg)
+                 << " to banks: "; dumpFreeBanks(Banks);
+          dbgs() << '\n'; });
+
+  return Banks;
+}
+
+void GCNRegBankReassign::collectCandidates(MachineInstr& MI,
+                                           unsigned UsedBanks,
+                                           unsigned StallCycles) {
+  LLVM_DEBUG(MI.dump());
+
+  if (StallCycles == 0)
+    return;
+
+  LLVM_DEBUG(dbgs() << "Stall cycles = " << StallCycles << '\n');
+
+  // Examine every pair of operand masks recorded by analyzeInst for MI.
+  const unsigned NumOps = OperandMasks.size();
+  for (unsigned First = 0; First + 1 < NumOps; ++First) {
+    for (unsigned Second = First + 1; Second != NumOps; ++Second) {
+      const OperandMask &Op1 = OperandMasks[First];
+      const OperandMask &Op2 = OperandMasks[Second];
+      if ((Op1.Mask & Op2.Mask) == 0)
+        continue; // No shared bank, no conflict between the two.
+
+      LLVM_DEBUG(dbgs() << "Conflicting operands: "
+                        << printReg(Op1.Reg, Op1.SubReg) << " and "
+                        << printReg(Op2.Reg, Op2.SubReg) << '\n');
+
+      // Weight: defs shortly before MI plus 10 per loop nesting level.
+      unsigned Weight =
+          getOperandGatherWeight(MI, Op1.Reg, Op2.Reg, StallCycles) +
+          MLI->getLoopDepth(MI.getParent()) * 10;
+      LLVM_DEBUG(dbgs() << "Stall weight = " << Weight << '\n');
+
+      const unsigned Size1 = countPopulation(Op1.Mask);
+      const unsigned Size2 = countPopulation(Op2.Mask);
+      unsigned Free1 = getFreeBanks(Op1.Reg, Op1.SubReg, Op1.Mask, UsedBanks);
+      unsigned Free2 = getFreeBanks(Op2.Reg, Op2.SubReg, Op2.Mask, UsedBanks);
+      // The operand spanning fewer banks gets one extra point of weight.
+      if (Free1)
+        Candidates.push(Candidate(&MI, Op1.Reg, Free1,
+                                  Weight + (Size2 > Size1 ? 1 : 0)));
+      if (Free2)
+        Candidates.push(Candidate(&MI, Op2.Reg, Free2,
+                                  Weight + (Size1 > Size2 ? 1 : 0)));
+    }
+  }
+}
+
+unsigned GCNRegBankReassign::computeStallCycles(unsigned SrcReg,
+                                                unsigned Reg, int Bank,
+                                                bool Collect) {
+  // An instruction may use SrcReg more than once; analyze each one only once.
+  SmallSet<const MachineInstr *, 16> Seen;
+  unsigned Banks = 0;
+  unsigned Sum = 0;
+
+  for (MachineInstr &UseMI : MRI->use_nodbg_instructions(SrcReg)) {
+    // Bundle headers are skipped, as are instructions already analyzed.
+    if (UseMI.isBundle() || !Seen.insert(&UseMI).second)
+      continue;
+    const unsigned Stalls = analyzeInst(UseMI, Banks, Reg, Bank);
+    Sum += Stalls;
+    if (Collect)
+      collectCandidates(UseMI, Banks, Stalls);
+  }
+
+  return Sum;
+}
+
+unsigned GCNRegBankReassign::scavengeReg(LiveInterval& LI,
+                                         unsigned Bank) const {
+  const TargetRegisterClass *RC = MRI->getRegClass(LI.reg);
+  unsigned MaxNumRegs = (Bank < NUM_VGPR_BANKS) ? MaxNumVGPRs
+                                                : MaxNumSGPRs;
+  unsigned MaxReg = MaxNumRegs + (Bank < NUM_VGPR_BANKS ? AMDGPU::VGPR0
+                                                        : AMDGPU::SGPR0);
+  // MaxReg is VGPR0/SGPR0 plus the budget; reaching it ends the walk below.
+  for (unsigned Reg : RC->getRegisters()) {
+    // Check occupancy limit.
+    if (TRI->isSubRegisterEq(Reg, MaxReg))
+      break;
+
+    if (!MRI->isAllocatable(Reg) || getPhysRegBank(Reg) != Bank)
+      continue;
+    // NOTE(review): an unused callee saved register ends the whole search.
+    for (unsigned I = 0; CSRegs[I]; ++I)
+      if (TRI->isSubRegisterEq(Reg, CSRegs[I]) &&
+          !LRM->isPhysRegUsed(CSRegs[I]))
+        return AMDGPU::NoRegister;
+
+    LLVM_DEBUG(dbgs() << "Trying register " << printReg(Reg) << '\n');
+    // Take the first register the live reg matrix reports as free for LI.
+    if (!LRM->checkInterference(LI, Reg))
+      return Reg;
+  }
+
+  return AMDGPU::NoRegister;
+}
+
+unsigned GCNRegBankReassign::tryReassign(Candidate &C) {
+  if (!LIS->hasInterval(C.Reg))
+    return 0;
+
+  LiveInterval &LI = LIS->getInterval(C.Reg);
+  LLVM_DEBUG(dbgs() << "Try reassign " << printReg(C.Reg) << " in "; C.MI->dump();
+             LI.dump());
+  // For each candidate bank walk all instructions in the range of live
+  // interval and check if replacing the register with one belonging to
+  // the candidate bank reduces conflicts.
+
+  unsigned OrigStalls = computeStallCycles(C.Reg);
+  LLVM_DEBUG(dbgs() << "--- Stall cycles in range = " << OrigStalls << '\n');
+  if (!OrigStalls)
+    return 0;
+  // Sorted so that the bank with the fewest stalls is at the back; equal
+  // stall counts are ordered by bank to keep the result deterministic.
+  struct BankStall {
+    BankStall(unsigned b, unsigned s) : Bank(b), Stalls(s) {}
+    bool operator< (const BankStall &RHS) const {
+      return Stalls != RHS.Stalls ? Stalls > RHS.Stalls : Bank < RHS.Bank;
+    }
+    unsigned Bank;
+    unsigned Stalls;
+  };
+  SmallVector<BankStall, 8> BankStalls;
+
+  for (int Bank = 0; Bank < NUM_BANKS; ++Bank) {
+    if (C.FreeBanks & (1 << Bank)) {
+      LLVM_DEBUG(dbgs() << "Trying bank " << printBank(Bank) << '\n');
+      unsigned Stalls = computeStallCycles(C.Reg, C.Reg, Bank);
+      if (Stalls < OrigStalls) {
+        LLVM_DEBUG(dbgs() << "With bank " << printBank(Bank) << " -> "
+                     << Stalls << '\n');
+        BankStalls.push_back(BankStall((unsigned)Bank, Stalls));
+      }
+    }
+  }
+  std::sort(BankStalls.begin(), BankStalls.end());
+
+  // The interval is unassigned while searching and restored if nothing fits.
+  unsigned OrigReg = VRM->getPhys(C.Reg);
+  LRM->unassign(LI);
+  while (!BankStalls.empty()) {
+    BankStall BS = BankStalls.pop_back_val();
+    unsigned Reg = scavengeReg(LI, BS.Bank);
+    if (Reg == AMDGPU::NoRegister) {
+      LLVM_DEBUG(dbgs() << "No free registers in bank " << printBank(BS.Bank)
+                   << '\n');
+      continue;
+    }
+    LLVM_DEBUG(dbgs() << "Found free register " << printReg(Reg)
+                 << (LRM->isPhysRegUsed(Reg) ? "" : " (new)")
+                 << " in bank " << printBank(BS.Bank) << '\n');
+    LRM->assign(LI, Reg);
+    LLVM_DEBUG(dbgs() << "--- Cycles saved: " << OrigStalls - BS.Stalls << '\n');
+    return OrigStalls - BS.Stalls;
+  }
+  LRM->assign(LI, OrigReg);
+
+  return 0;
+}
+
+unsigned GCNRegBankReassign::collectCandidates(MachineFunction &MF,
+                                               bool Collect) {
+  unsigned Sum = 0;
+  for (MachineBasicBlock &Block : MF) {
+    // Print the block label: its name, or "bb.<number>" when unnamed.
+    LLVM_DEBUG(if (Collect) {
+      if (Block.getName().empty())
+        dbgs() << "bb." << Block.getNumber();
+      else
+        dbgs() << Block.getName();
+      dbgs() << ":\n";
+    });
+
+    for (MachineInstr &Inst : Block.instrs()) {
+      // Skip the bundle header; the bundled instructions are visited singly.
+      if (Inst.isBundle())
+        continue;
+
+      unsigned Banks = 0;
+      const unsigned Stalls = analyzeInst(Inst, Banks);
+      Sum += Stalls;
+      if (Collect)
+        collectCandidates(Inst, Banks, Stalls);
+    }
+
+    LLVM_DEBUG(if (Collect) { dbgs() << '\n'; });
+  }
+  return Sum;
+}
+
+void GCNRegBankReassign::removeCandidates(unsigned Reg) {
+  // Erase each queued candidate whose instruction reads Reg.
+  for (auto It = Candidates.begin(); It != Candidates.end();)
+    It = It->MI->readsRegister(Reg, TRI) ? Candidates.erase(It) : std::next(It);
+}
+
+bool GCNRegBankReassign::verifyCycles(
+    MachineFunction &MF, unsigned OriginalCycles, unsigned CyclesSaved) {
+  // Recount without collecting: remaining plus saved must match the original.
+  const unsigned Remaining = collectCandidates(MF, /*Collect=*/false);
+  LLVM_DEBUG(dbgs() << "=== After the pass " << Remaining
+                    << " stall cycles left\n");
+  return OriginalCycles == Remaining + CyclesSaved;
+}
+
+bool GCNRegBankReassign::runOnMachineFunction(MachineFunction &MF) {
+  ST = &MF.getSubtarget<GCNSubtarget>();
+  if (!ST->hasRegisterBanking() || skipFunction(MF.getFunction()))
+    return false;
+
+  MRI = &MF.getRegInfo();
+  TRI = ST->getRegisterInfo();
+  MLI = &getAnalysis<MachineLoopInfo>();
+  VRM = &getAnalysis<VirtRegMap>();
+  LRM = &getAnalysis<LiveRegMatrix>();
+  LIS = &getAnalysis<LiveIntervals>();
+  // Register budgets: the function's limits clamped by the current occupancy.
+  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+  unsigned Occupancy = MFI->getOccupancy();
+  MaxNumVGPRs = ST->getMaxNumVGPRs(MF);
+  MaxNumSGPRs = ST->getMaxNumSGPRs(MF);
+  MaxNumVGPRs = std::min(ST->getMaxNumVGPRs(Occupancy), MaxNumVGPRs);
+  MaxNumSGPRs = std::min(ST->getMaxNumSGPRs(Occupancy, true), MaxNumSGPRs);
+
+  CSRegs = TRI->getCalleeSavedRegs(&MF);
+  // RegsUsed holds one bit per VGPR followed by one bit per SGPR pair.
+  RegsUsed.resize(AMDGPU::VGPR_32RegClass.getNumRegs() +
+                  TRI->getEncodingValue(AMDGPU::SGPR_NULL) / 2 + 1);
+
+  LLVM_DEBUG(dbgs() << "=== RegBanks reassign analysis on function " << MF.getName()
+               << '\n');
+
+  unsigned StallCycles = collectCandidates(MF);
+  NumStallsDetected += StallCycles;
+
+  LLVM_DEBUG(dbgs() << "=== " << StallCycles << " stall cycles detected in "
+                  "function " << MF.getName() << '\n');
+  // Sorted by ascending weight: the heaviest candidate is taken from the back.
+  Candidates.sort();
+
+  LLVM_DEBUG(dbgs() << "\nCandidates:\n\n";
+        for (auto C : Candidates) C.dump(this);
+        dbgs() << "\n\n");
+
+  unsigned CyclesSaved = 0;
+  while (!Candidates.empty()) {
+    Candidate C = Candidates.back();
+    unsigned LocalCyclesSaved = tryReassign(C);
+    CyclesSaved += LocalCyclesSaved;
+    // Verification level above 1 recounts all stalls after every candidate.
+    if (VerifyStallCycles > 1 && !verifyCycles(MF, StallCycles, CyclesSaved))
+      report_fatal_error("RegBank reassign stall cycles verification failed.");
+
+    Candidates.pop_back();
+    if (LocalCyclesSaved) {
+      removeCandidates(C.Reg); // Users of C.Reg are re-collected below.
+      computeStallCycles(C.Reg, AMDGPU::NoRegister, -1, true);
+      Candidates.sort();
+
+      LLVM_DEBUG(dbgs() << "\nCandidates:\n\n";
+            for (auto C : Candidates)
+              C.dump(this);
+            dbgs() << "\n\n");
+    }
+  }
+  NumStallsRecovered += CyclesSaved;
+
+  LLVM_DEBUG(dbgs() << "=== After the pass " << CyclesSaved
+               << " cycles saved in function " << MF.getName() << '\n');
+
+  Candidates.clear();
+  // Verification level 1 recounts the stalls once, after all reassignments.
+  if (VerifyStallCycles == 1 && !verifyCycles(MF, StallCycles, CyclesSaved))
+    report_fatal_error("RegBank reassign stall cycles verification failed.");
+
+  RegsUsed.clear();
+
+  return CyclesSaved > 0;
+}

Added: llvm/trunk/test/CodeGen/AMDGPU/regbank-reassign.mir
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/AMDGPU/regbank-reassign.mir?rev=359704&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/regbank-reassign.mir (added)
+++ llvm/trunk/test/CodeGen/AMDGPU/regbank-reassign.mir Wed May  1 09:49:31 2019
@@ -0,0 +1,336 @@
+# RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -run-pass greedy,amdgpu-regbanks-reassign,virtregrewriter -o - %s | FileCheck -check-prefix=GCN %s
+
+# GCN-LABEL: v1_vs_v5{{$}}
+# GCN: V_AND_B32_e32 killed $vgpr3, killed $vgpr1,
+---
+name:            v1_vs_v5
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: vgpr_32, preferred-register: '$vgpr1' }
+  - { id: 1, class: vgpr_32, preferred-register: '$vgpr5' }
+  - { id: 2, class: vgpr_32 }
+body: |
+  bb.0:
+    %0 = IMPLICIT_DEF
+    %1 = IMPLICIT_DEF
+    %2 = V_AND_B32_e32 %1, %0, implicit $exec
+    S_ENDPGM 0
+...
+
+# GCN-LABEL: v0_1_vs_v4{{$}}
+# GCN: GLOBAL_STORE_DWORD killed renamable $vgpr0_vgpr1, killed renamable $vgpr3,
+---
+name:            v0_1_vs_v4
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: vgpr_32, preferred-register: '$vgpr4' }
+  - { id: 1, class: vreg_64, preferred-register: '$vgpr0_vgpr1' }
+body: |
+  bb.0:
+    %0 = IMPLICIT_DEF
+    %1 = IMPLICIT_DEF
+    GLOBAL_STORE_DWORD %1, %0, 0, 0, 0, 0, implicit $exec
+    S_ENDPGM 0
+...
+
+# GCN-LABEL: v1_2_vs_v4_5{{$}}
+# GCN: GLOBAL_STORE_DWORDX2 killed renamable $vgpr2_vgpr3, killed renamable $vgpr4_vgpr5,
+---
+name:            v1_2_vs_v4_5
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: vreg_64, preferred-register: '$vgpr4_vgpr5' }
+  - { id: 1, class: vreg_64, preferred-register: '$vgpr1_vgpr2' }
+body: |
+  bb.0:
+    %0 = IMPLICIT_DEF
+    %1 = IMPLICIT_DEF
+    GLOBAL_STORE_DWORDX2 %1, %0, 0, 0, 0, 0, implicit $exec
+    S_ENDPGM 0
+...
+
+# GCN-LABEL: s0_vs_s16{{$}}
+# GCN: S_AND_B32 killed renamable $sgpr14, $sgpr0,
+---
+name:            s0_vs_s16
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: sgpr_32, preferred-register: '$sgpr16' }
+  - { id: 1, class: sgpr_32 }
+body: |
+  bb.0:
+    %0 = IMPLICIT_DEF
+    $sgpr0 = IMPLICIT_DEF
+    %1 = S_AND_B32 %0, $sgpr0, implicit-def $scc
+    S_ENDPGM 0
+...
+
+# GCN-LABEL: s1_vs_s16{{$}}
+# GCN: S_AND_B32 killed renamable $sgpr14, $sgpr1,
+---
+name:            s1_vs_s16
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: sgpr_32, preferred-register: '$sgpr16' }
+  - { id: 1, class: sgpr_32 }
+body: |
+  bb.0:
+    %0 = IMPLICIT_DEF
+    $sgpr1 = IMPLICIT_DEF
+    %1 = S_AND_B32 %0, $sgpr1, implicit-def $scc
+    S_ENDPGM 0
+...
+
+# GCN-LABEL: s12_vs_null{{$}}
+# GCN: S_AND_B32 $sgpr_null, killed renamable $sgpr14,
+---
+name:            s12_vs_null
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: sgpr_32, preferred-register: '$sgpr12' }
+  - { id: 1, class: sgpr_32 }
+body: |
+  bb.0:
+    %0 = IMPLICIT_DEF
+    %1 = S_AND_B32 $sgpr_null, %0, implicit-def $scc
+    S_ENDPGM 0
+...
+
+# GCN-LABEL: s13_vs_m0{{$}}
+# GCN: S_AND_B32 $m0, killed renamable $sgpr14,
+---
+name:            s13_vs_m0
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: sgpr_32, preferred-register: '$sgpr13' }
+  - { id: 1, class: sgpr_32 }
+body: |
+  bb.0:
+    %0 = IMPLICIT_DEF
+    %1 = S_AND_B32 $m0, %0, implicit-def $scc
+    S_ENDPGM 0
+...
+
+# GCN-LABEL: s12_13_vs_s28_s29{{$}}
+# GCN: S_AND_B64 $sgpr28_sgpr29, killed renamable $sgpr14_sgpr15,
+---
+name:            s12_13_vs_s28_s29
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: sreg_64, preferred-register: '$sgpr12_sgpr13' }
+  - { id: 1, class: sreg_64 }
+body: |
+  bb.0:
+    %0 = IMPLICIT_DEF
+    $sgpr28_sgpr29 = IMPLICIT_DEF
+    %1 = S_AND_B64 $sgpr28_sgpr29, %0, implicit-def $scc
+    S_ENDPGM 0
+...
+
+# GCN-LABEL: livein{{$}}
+# GCN: V_AND_B32_e32 killed $vgpr4, killed $vgpr0,
+---
+name:            livein
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: vgpr_32, preferred-register: '$vgpr0' }
+  - { id: 1, class: vgpr_32, preferred-register: '$vgpr4' }
+  - { id: 2, class: vgpr_32 }
+liveins:
+  - { reg: '$vgpr0', virtual-reg: '' }
+  - { reg: '$vgpr4', virtual-reg: '' }
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr4
+
+    %0 = COPY $vgpr0
+    %1 = COPY $vgpr4
+    %2 = V_AND_B32_e32 %1, %0, implicit $exec
+    S_ENDPGM 0
+...
+
+# GCN-LABEL: liveout{{$}}
+# GCN: V_AND_B32_e32 $vgpr4, $vgpr0,
+---
+name:            liveout
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: vgpr_32, preferred-register: '$vgpr0' }
+  - { id: 1, class: vgpr_32, preferred-register: '$vgpr4' }
+  - { id: 2, class: vgpr_32 }
+body: |
+  bb.0:
+    %0 = IMPLICIT_DEF
+    %1 = IMPLICIT_DEF
+    %2 = V_AND_B32_e32 %1, %0, implicit $exec
+    $vgpr0 = COPY %0
+    $vgpr4 = COPY %1
+    S_ENDPGM 0
+...
+
+# GCN-LABEL: implicit{{$}}
+# GCN: V_MOV_B32_indirect undef $vgpr4, undef $vgpr0, implicit $exec, implicit-def dead renamable $vgpr0_vgpr1_vgpr2_vgpr3, implicit killed $vgpr4_vgpr5_vgpr6_vgpr7, implicit $m0
+---
+name:            implicit
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: vreg_128 }
+  - { id: 1, class: vreg_128, preferred-register: '$vgpr4_vgpr5_vgpr6_vgpr7' }
+body: |
+  bb.0:
+    %1 = IMPLICIT_DEF
+    V_MOV_B32_indirect undef %1.sub0:vreg_128, undef $vgpr0, implicit $exec, implicit-def %0:vreg_128, implicit %1:vreg_128, implicit $m0
+    S_ENDPGM 0
+...
+
+# GCN-LABEL: occupancy_limit{{$}}
+# GCN: V_AND_B32_e32 $vgpr4, $vgpr0,
+---
+name:            occupancy_limit
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: vgpr_32, preferred-register: '$vgpr0' }
+  - { id: 1, class: vgpr_32, preferred-register: '$vgpr4' }
+  - { id: 2, class: vgpr_32, preferred-register: '$vgpr1' }
+  - { id: 3, class: vreg_64, preferred-register: '$vgpr2_vgpr3' }
+  - { id: 4, class: vgpr_32, preferred-register: '$vgpr5' }
+  - { id: 5, class: vreg_64, preferred-register: '$vgpr6_vgpr7' }
+  - { id: 6, class: vreg_128, preferred-register: '$vgpr8_vgpr9_vgpr10_vgpr11' }
+  - { id: 7, class: vreg_128, preferred-register: '$vgpr12_vgpr13_vgpr14_vgpr15' }
+  - { id: 8, class: vreg_128, preferred-register: '$vgpr16_vgpr17_vgpr18_vgpr19' }
+  - { id: 9, class: vreg_128, preferred-register: '$vgpr20_vgpr21_vgpr22_vgpr23' }
+body: |
+  bb.0:
+    %0 = IMPLICIT_DEF
+    %1 = IMPLICIT_DEF
+    %3 = IMPLICIT_DEF
+    %4 = IMPLICIT_DEF
+    %5 = IMPLICIT_DEF
+    %6 = IMPLICIT_DEF
+    %7 = IMPLICIT_DEF
+    %8 = IMPLICIT_DEF
+    %9 = IMPLICIT_DEF
+    %2 = V_AND_B32_e32 %1, %0, implicit $exec
+    GLOBAL_STORE_DWORD %3, %0, 0, 0, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD %3, %1, 0, 0, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD %3, %2, 0, 0, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD %3, %4, 0, 0, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORDX2 %3, %5, 0, 0, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORDX4 %3, %6, 0, 0, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORDX4 %3, %7, 0, 0, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORDX4 %3, %8, 0, 0, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORDX4 %3, %9, 0, 0, 0, 0, implicit $exec
+    S_ENDPGM 0
+...
+
+# GCN-LABEL: csr{{$}}
+# GCN: V_AND_B32_e32 $vgpr4, $vgpr0,
+---
+name:            csr
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: vgpr_32, preferred-register: '$vgpr0' }
+  - { id: 1, class: vgpr_32, preferred-register: '$vgpr4' }
+  - { id: 2, class: vgpr_32, preferred-register: '$vgpr1' }
+  - { id: 3, class: vreg_64, preferred-register: '$vgpr2_vgpr3' }
+  - { id: 4, class: vgpr_32, preferred-register: '$vgpr5' }
+  - { id: 5, class: vreg_64, preferred-register: '$vgpr6_vgpr7' }
+  - { id: 6, class: vreg_128, preferred-register: '$vgpr8_vgpr9_vgpr10_vgpr11' }
+  - { id: 7, class: vreg_128, preferred-register: '$vgpr12_vgpr13_vgpr14_vgpr15' }
+  - { id: 8, class: vreg_128, preferred-register: '$vgpr16_vgpr17_vgpr18_vgpr19' }
+  - { id: 9, class: vreg_128, preferred-register: '$vgpr20_vgpr21_vgpr22_vgpr23' }
+  - { id: 10, class: vreg_128, preferred-register: '$vgpr24_vgpr25_vgpr26_vgpr27' }
+  - { id: 11, class: vreg_128, preferred-register: '$vgpr28_vgpr29_vgpr30_vgpr31' }
+  - { id: 12, class: vgpr_32, preferred-register: '$vgpr33' }
+body: |
+  bb.0:
+    %0 = IMPLICIT_DEF
+    %1 = IMPLICIT_DEF
+    %3 = IMPLICIT_DEF
+    %4 = IMPLICIT_DEF
+    %5 = IMPLICIT_DEF
+    %6 = IMPLICIT_DEF
+    %7 = IMPLICIT_DEF
+    %8 = IMPLICIT_DEF
+    %9 = IMPLICIT_DEF
+    %10 = IMPLICIT_DEF
+    %11 = IMPLICIT_DEF
+    %12 = IMPLICIT_DEF
+    %2 = V_AND_B32_e32 %1, %0, implicit $exec
+    GLOBAL_STORE_DWORD %3, %0, 0, 0, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD %3, %1, 0, 0, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD %3, %2, 0, 0, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD %3, %4, 0, 0, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORDX2 %3, %5, 0, 0, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORDX4 %3, %6, 0, 0, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORDX4 %3, %7, 0, 0, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORDX4 %3, %8, 0, 0, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORDX4 %3, %9, 0, 0, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORDX4 %3, %10, 0, 0, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORDX4 %3, %11, 0, 0, 0, 0, implicit $exec
+    GLOBAL_STORE_DWORD %3, %12, 0, 0, 0, 0, implicit $exec
+    S_ENDPGM 0
+...
+
+# Do not touch undefs
+# GCN-LABEL: s0_vs_s16_undef{{$}}
+# GCN: S_AND_B32 killed renamable $sgpr16, undef $sgpr0,
+---
+name:            s0_vs_s16_undef
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: sgpr_32, preferred-register: '$sgpr16' }
+  - { id: 1, class: sgpr_32 }
+body: |
+  bb.0:
+    %0 = IMPLICIT_DEF
+    %1 = S_AND_B32 %0, undef $sgpr0, implicit-def $scc
+    S_ENDPGM 0
+...
+
+# GCN-LABEL: smem_bundle{{$}}
+# GCN: S_BUFFER_LOAD_DWORD_SGPR renamable $sgpr0_sgpr1_sgpr2_sgpr3, renamable $sgpr15, 0, 0
+# GCN: S_BUFFER_LOAD_DWORD_SGPR renamable $sgpr0_sgpr1_sgpr2_sgpr3, renamable $sgpr14, 0, 0
+---
+name:          smem_bundle
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: sreg_128, preferred-register: '$sgpr0_sgpr1_sgpr2_sgpr3' }
+  - { id: 1, class: sreg_32_xm0_xexec, preferred-register: '$sgpr16' }
+  - { id: 2, class: sreg_32_xm0_xexec, preferred-register: '$sgpr17' }
+  - { id: 3, class: sreg_32_xm0_xexec, preferred-register: '$sgpr4' }
+  - { id: 4, class: sreg_32_xm0_xexec, preferred-register: '$sgpr5' }
+body: |
+  bb.0:
+    %0 = IMPLICIT_DEF
+    %1 = IMPLICIT_DEF
+    %2 = IMPLICIT_DEF
+    early-clobber %3, early-clobber %4 = BUNDLE %0, %1, %2 {
+      %3 = S_BUFFER_LOAD_DWORD_SGPR %0, %1, 0, 0
+      %4 = S_BUFFER_LOAD_DWORD_SGPR %0, %2, 0, 0
+    }
+    S_ENDPGM 0
+...
+
+# GCN-LABEL: vreg_512_subs{{$}}
+# The resulting assignment does not matter here: this test case used to trigger an infinite loop.
+---
+name:            vreg_512_subs
+tracksRegLiveness: true
+registers:
+  - { id: 1, class: vreg_512, preferred-register: '$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15' }
+  - { id: 2, class: vgpr_32, preferred-register: '$vgpr28' }
+body:             |
+  bb.0:
+    %1 = IMPLICIT_DEF
+    %2 = IMPLICIT_DEF
+    DS_WRITE2_B32_gfx9 %2, %1.sub0, %1.sub1, 0, 1, 0, implicit $exec
+    DS_WRITE2_B32_gfx9 %2, %1.sub2, %1.sub3, 2, 3, 0, implicit $exec
+    DS_WRITE2_B32_gfx9 %2, %1.sub4, %1.sub5, 4, 5, 0, implicit $exec
+    DS_WRITE2_B32_gfx9 %2, %1.sub6, %1.sub7, 6, 7, 0, implicit $exec
+    DS_WRITE2_B32_gfx9 %2, %1.sub8, %1.sub9, 8, 9, 0, implicit $exec
+    DS_WRITE2_B32_gfx9 %2, %1.sub10, %1.sub11, 10, 11, 0, implicit $exec
+    DS_WRITE2_B32_gfx9 %2, %1.sub12, %1.sub13, 12, 13, 0, implicit $exec
+    DS_WRITE2_B32_gfx9 %2, %1.sub14, %1.sub15, 14, 15, 0, implicit $exec
+    S_ENDPGM 0
+...




More information about the llvm-commits mailing list