[llvm] [AMDGPU] Add IR LiveReg type-based optimization (PR #66838)

Matt Arsenault via llvm-commits llvm-commits at lists.llvm.org
Wed Jun 12 05:01:22 PDT 2024


================
@@ -102,14 +144,248 @@ bool AMDGPULateCodeGenPrepare::runOnFunction(Function &F) {
   AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
   UA = &getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();
 
+  // "Optimize" the virtual regs that cross basic block boundaries. When
+  // building the SelectionDAG, vectors of illegal types that cross basic blocks
+  // will be scalarized and widened, with each scalar living in its
+  // own register. To work around this, this optimization converts the
+  // vectors to equivalent vectors of legal type (which are converted back
+  // before uses in subsequent blocks), to pack the bits into fewer physical
+  // registers (used in CopyToReg/CopyFromReg pairs).
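+  // For example (illustrative; the exact packing depends on the target's
+  // chosen legal scalar type), with a 32-bit convert scalar a live-out
+  // <4 x i8> is packed into a single i32, and a <6 x i8> is widened to
+  // <8 x i8> and bitcast to <2 x i32>.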
+  LiveRegOptimizer LRO(Mod);
+
   bool Changed = false;
+
   for (auto &BB : F)
-    for (Instruction &I : llvm::make_early_inc_range(BB))
+    for (Instruction &I : make_early_inc_range(BB)) {
       Changed |= visit(I);
+      Changed |= LRO.optimizeLiveType(&I);
+    }
 
+  LRO.removeDeadInstrs();
   return Changed;
 }
 
+Type *LiveRegOptimizer::calculateConvertType(Type *OriginalType) {
+  assert(OriginalType->getScalarSizeInBits() <=
+         ConvertToScalar->getScalarSizeInBits());
+
+  FixedVectorType *VTy = dyn_cast<FixedVectorType>(OriginalType);
+  if (!VTy)
+    return nullptr;
+
+  TypeSize OriginalSize = DL->getTypeSizeInBits(VTy);
+  TypeSize ConvertScalarSize = DL->getTypeSizeInBits(ConvertToScalar);
+  unsigned ConvertEltCount =
+      (OriginalSize + ConvertScalarSize - 1) / ConvertScalarSize;
+
+  if (OriginalSize <= ConvertScalarSize)
+    return IntegerType::get(Mod->getContext(), ConvertScalarSize);
+
+  return VectorType::get(Type::getIntNTy(Mod->getContext(), ConvertScalarSize),
+                         ElementCount::getFixed(ConvertEltCount));
+}
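+// For instance (illustrative, assuming a 32-bit ConvertToScalar):
+// calculateConvertType(<2 x i8>) == i32, calculateConvertType(<4 x i8>) == i32,
+// and calculateConvertType(<6 x i8>) == <2 x i32>.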
+
+Value *LiveRegOptimizer::convertToOptType(Instruction *V,
+                                          BasicBlock::iterator &InsertPt) {
+  VectorType *VTy = cast<VectorType>(V->getType());
+  Type *NewTy = calculateConvertType(V->getType());
+  if (!NewTy)
+    return nullptr;
+
+  TypeSize OriginalSize = DL->getTypeSizeInBits(VTy);
+  TypeSize NewSize = DL->getTypeSizeInBits(NewTy);
+
+  IRBuilder<> Builder(V->getParent(), InsertPt);
+  // If the bitwidths match, we can simply bitcast to the desired type (which
+  // may be a single scalar, e.g. <4 x i8> -> i32).
+  if (OriginalSize == NewSize)
+    return cast<Instruction>(
+        Builder.CreateBitCast(V, NewTy, V->getName() + ".bc"));
+
+  // If there is a bitsize mismatch, we must use a wider vector.
+  assert(NewSize > OriginalSize);
+  ElementCount ExpandedVecElementCount =
+      ElementCount::getFixed(NewSize / VTy->getScalarSizeInBits());
+
+  SmallVector<int, 8> ShuffleMask;
+  for (unsigned I = 0; I < VTy->getElementCount().getFixedValue(); I++)
+    ShuffleMask.push_back(I);
+
+  for (uint64_t I = VTy->getElementCount().getFixedValue();
+       I < ExpandedVecElementCount.getFixedValue(); I++)
+    ShuffleMask.push_back(VTy->getElementCount().getFixedValue());
+
+  Instruction *ExpandedVec =
+      cast<Instruction>(Builder.CreateShuffleVector(V, ShuffleMask));
+  return cast<Instruction>(
+      Builder.CreateBitCast(ExpandedVec, NewTy, V->getName() + ".bc"));
+}
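+// For a <6 x i8> input, for instance, convertToOptType above would emit:
+//   %v.ext = shufflevector <6 x i8> %v, <6 x i8> poison,
+//            <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 6>
+//   %v.bc  = bitcast <8 x i8> %v.ext to <2 x i32>
+// where the two trailing lanes are don't-care padding.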
+
+Value *LiveRegOptimizer::convertFromOptType(Type *ConvertType, Instruction *V,
+                                            BasicBlock::iterator &InsertPt,
+                                            BasicBlock *InsertBB) {
+  VectorType *NewVTy = cast<VectorType>(ConvertType);
+
+  TypeSize OriginalSize = DL->getTypeSizeInBits(V->getType());
+  TypeSize NewSize = DL->getTypeSizeInBits(NewVTy);
+
+  IRBuilder<> Builder(InsertBB, InsertPt);
+  // If there is a bitsize match, we simply convert back to the original type.
+  if (OriginalSize == NewSize) {
+    return cast<Instruction>(
+        Builder.CreateBitCast(V, NewVTy, V->getName() + ".bc"));
+  }
+
+  // If there is a bitsize mismatch, then we must have used a wider value to
+  // hold the bits.
+  assert(OriginalSize > NewSize);
+  // For wide scalars, we can just truncate the value.
+  if (!V->getType()->isVectorTy()) {
+    Instruction *Trunc = cast<Instruction>(
+        Builder.CreateTrunc(V, IntegerType::get(Mod->getContext(), NewSize)));
+    return cast<Instruction>(Builder.CreateBitCast(Trunc, NewVTy));
+  }
+
+  // For wider vectors, we must strip the MSBs to convert back to the original
+  // type.
+  ElementCount ExpandedVecElementCount =
+      ElementCount::getFixed(OriginalSize / NewVTy->getScalarSizeInBits());
+  VectorType *ExpandedVT = VectorType::get(
+      Type::getIntNTy(Mod->getContext(), NewVTy->getScalarSizeInBits()),
+      ExpandedVecElementCount);
+  Instruction *Converted =
+      cast<Instruction>(Builder.CreateBitCast(V, ExpandedVT));
+
+  unsigned NarrowElementCount = NewVTy->getElementCount().getFixedValue();
+  SmallVector<int, 8> ShuffleMask(NarrowElementCount);
+  std::iota(ShuffleMask.begin(), ShuffleMask.end(), 0);
+
+  return cast<Instruction>(Builder.CreateShuffleVector(Converted, ShuffleMask));
+}
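+// Reversing that example, convertFromOptType of a <2 x i32> holding the bits
+// of a <6 x i8> would emit:
+//   %v.bc     = bitcast <2 x i32> %v to <8 x i8>
+//   %v.narrow = shufflevector <8 x i8> %v.bc, <8 x i8> poison,
+//               <6 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5>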
+
+bool LiveRegOptimizer::optimizeLiveType(Instruction *I) {
+  SmallVector<Instruction *, 4> Worklist;
+  SmallPtrSet<PHINode *, 4> PhiNodes;
+  SmallPtrSet<Instruction *, 4> Defs;
+  SmallPtrSet<Instruction *, 4> Uses;
+
+  Worklist.push_back(cast<Instruction>(I));
+  while (!Worklist.empty()) {
+    Instruction *II = Worklist.pop_back_val();
+    if (Visited.count(II))
+      continue;
+    Visited.insert(II);
+
+    Type *ITy = II->getType();
+    // Only vectors of illegal type will be scalarized when building the
+    // selection DAG.
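+    // For instance, a <4 x i8> qualifies, while 16-bit element vectors are
+    // left alone since 16-bit scalars are presumably legal on the target.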
+    bool ShouldReplace = ITy->isVectorTy() && ITy->getScalarSizeInBits() < 16 &&
+                         !ITy->getScalarType()->isPointerTy();
+
+    if (!ShouldReplace)
+      continue;
+
+    if (auto *Phi = dyn_cast<PHINode>(II)) {
+      PhiNodes.insert(Phi);
+      // Collect all the incoming values of problematic PHI nodes.
+      for (Value *V : Phi->incoming_values()) {
+        // Repeat the collection process for newly found PHI nodes.
+        if (auto *OpPhi = dyn_cast<PHINode>(V)) {
+          if (!PhiNodes.count(OpPhi) && !Visited.count(OpPhi))
+            Worklist.push_back(OpPhi);
+          continue;
+        }
+
+        auto IncInst = dyn_cast<Instruction>(V);
----------------
arsenm wrote:

auto *
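
That is, per the LLVM coding standard of spelling out the pointer when using
auto, the line becomes:

    auto *IncInst = dyn_cast<Instruction>(V);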

https://github.com/llvm/llvm-project/pull/66838

