[llvm] [Transforms] Vectorizing Mem2Reg Pass (PR #107688)

Tue Sep 10 16:09:40 PDT 2024

================
@@ -1060,6 +1349,194 @@ void PromoteMem2Reg::ComputeLiveInBlocks(
   }
 }
 
+bool VectorizedMem2Reg::GatherAlloca(AllocaInst *AI, const AllocaInfo &Info) {
+  assert(Allocas.size() < MaxAllocaNum && "Allocas vector is full.");
+  // Add new alloca to the current batch.
+  size_t Index = Allocas.size();
+  Allocas.push_back(AI);
+
+  // Populate DEF states.
+  for (BasicBlock* Def : Info.DefiningBlocks) {
+    // We need to calculate IDF of every DEF block, adding them to the PQ here
+    // so that a BB is only added once at most.
+    if (BlockStates.get<DEF_STATE>(Def->getNumber()).none())
+      if (DomTreeNodeBase<BasicBlock> *Node = DT->getNode(Def))
+        PQ.push({Node, std::make_pair(Node->getLevel(), Node->getDFSNumIn())});
+    BlockStates.get<DEF_STATE>(Def->getNumber()).set(Index);
+  }
+
+  // Initialize Worklist to compute ALIVE state. Find all uses of the value
+  // where it is defined in another block and add them to Worklist.
+  for (BasicBlock *Use : Info.UsingBlocks) {
+    BBNumberTy BN = Use->getNumber();
+
+    // If the use block is not the def block, the use block is live-in. It is
+    // possible that a previous alloca lives in this block, so we should merge
+    // their UPDATE states.
+    if (!BlockStates.get<DEF_STATE>(BN)[Index]) {
+      PushWorkList(BN, AllocaState().set(Index));
+      continue;
+    }
+
+    // If use and def happen in the same block, check if the def occurs before
+    // the use; in this case the value is not live-in at this block.
+    for (BasicBlock::iterator I = Use->begin();; ++I) {
+      if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
+        if (SI->getOperand(1) != AI)
+          continue;
+
+        // We found a store to the alloca before a load. The alloca is not
+        // actually live-in here.
+        break;
+      }
+
+      if (LoadInst *LI = dyn_cast<LoadInst>(I))
+        // Okay, we found a load before a store to the alloca.  It is actually
+        // live into this block. Add it to the worklist.
+        if (LI->getOperand(0) == AI) {
+          PushWorkList(BN, AllocaState().set(Index));
+          break;
+        }
+    }
+  }
+
+  return Allocas.size() == MaxAllocaNum;
+}
+
+void VectorizedMem2Reg::Calculate() {
+  // @TODO: Assign BB number in a way such that the BB deepest in the CFG has
+  // the largest number, and traverse it first using a priority queue. This
+  // allows faster convergence to the fixed-point.
+
+  // Compute ALIVE state for every block.
+  // Now Worklist is initialized with blocks containing any live-in allocas. We
+  // recursively add their predecessors to Worklist until we find the full
+  // region where the value is alive. Whenever a block's ALIVE state is updated,
+  // we need to check if the state value is actually modified, if so, we need
+  // to iterate its predecessors again to propagate the new state until reaching
+  // a fixed point.
+  while (!Worklist.empty()) {
+    auto [BN, State] = PopWorkList();
+
+    // Update the ALIVE state of this block. If the state remains unchanged, we
+    // have reached a fixed point, there is no more new liveness info to be
+    // propagated to its predecessors.
+    AllocaState OldState = BlockStates.get<ALIVE_STATE>(BN);
+    AllocaState NewState = (BlockStates.get<ALIVE_STATE>(BN) |= State);
+    if (NewState == OldState)
+      continue;
+
+    // If a fixed point is not reached, this is because either block BN is
+    // visited for the first time, or a loop in the CFG brings new liveness info
+    // back to this block. Either case, we add its predecessors to the worklist.
+    for (BBNumberTy Pred : Predecessors[BN]) {
+      // The value is not ALIVE in a predecessor if it contains a DEF, so we
+      // need to exclude such values, and only find those values not defined in
+      // this block.
+      AllocaState UpdateState = NewState & ~BlockStates.get<DEF_STATE>(Pred);
+      // Only add to Worklist if there is any value ALIVE at Pred.
+      if (UpdateState.any())
+        PushWorkList(Pred, UpdateState);
+    }
+  }
+
+  // Initialize UPDATE states of blocks in PQ to maintain the invariant. We
+  // calculate IDF of every DEF, so the initial UPDATE state is DEF state.
+  for (auto &Node : GetPQContainer()) {
+    unsigned BN = Node.first->getBlock()->getNumber();
+    BlockStates.get<UPDATE_STATE>(BN) = BlockStates.get<DEF_STATE>(BN);
+  }
+
+  // Compute IDF for every block containing alloca. Visiting blocks from the
+  // largest to the smallest DT level number.
+  while (!PQ.empty()) {
+    // RootState is the values available at Root, which will be propagated to
+    // the successors of its dominatees per the algorithm of IDF.
+    auto [Root, RootState] = PopPQ();
+    unsigned RootLevel = Root->getLevel();
+    BBNumberTy RootBN = Root->getBlock()->getNumber();
+
+    // Perform one iteration of dominance frontier computation on all blocks
+    // dominated by root. Here Worklist is not associated with UPDATE state
+    // because visited nodes are updated with the same RootState instead.
+    Worklist.push_back(RootBN);
+    while (!Worklist.empty()) {
+      unsigned BN = Worklist.back();
+      Worklist.pop_back();
+
+      for (BBNumberTy Succ : Successors[BN]) {
+        auto SuccNode = DT->getNode(Succ);
+        unsigned SuccLevel = SuccNode->getLevel();
+
+        // Successor node Succ with higher level in DT must be dominated by
+        // current node BN, so PHI will not be placed in it.
+        if (SuccLevel > RootLevel)
+          continue;
+
+        // Update IDF state of Succ by merging its previous state with available
+        // values from Root. Values no longer alive need not be propagated,
+        // so that the algorithm converges faster.
+        AllocaState AliveState = BlockStates.get<ALIVE_STATE>(Succ);
+        AllocaState OldState = BlockStates.get<IDF_STATE>(Succ);
+        AllocaState NewState =
+            (BlockStates.get<IDF_STATE>(Succ) |= (RootState & AliveState));
+        // If IDF state is unchanged, we reached a fixed point, and there will
+        // be no more new value to propagate. This includes the case that no
+        // value from Root is alive at Succ ((RootState & AliveState) == 0).
+        if (NewState == OldState)
+          continue;
+
+        // We always filter UPDATE state with ALIVE state, so it is an invariant
+        // that IDF values are a subset of ALIVE values.
+        assert((AliveState | OldState) == AliveState);
+        assert((AliveState | NewState) == AliveState);
+
+        // Any newly set bit in IDF state represents inserted PHI, add it to the
+        // output.
+        AllocaState Inserted = NewState ^ OldState;
+        do {
+          size_t Index = 0;
+          if constexpr (MaxAllocaNum <= sizeof(unsigned long long) * CHAR_BIT) {
+            Index = llvm::countr_zero(Inserted.to_ullong());
+          } else {
+            while (!Inserted.test(Index))
+              ++Index;
+          }
+          PHIBlocks[Index].push_back(Succ);
+          Inserted.reset(Index);
+        } while (Inserted.any());
+
+        // If any new PHI is inserted at Succ, we need to iterate it too since
+        // it will propagate the PHI to blocks it does not dominate. An existing
+        // value is killed by DEF so the UPDATE state should exclude it.
+        AllocaState UpdateState = NewState & ~BlockStates.get<DEF_STATE>(Succ);
+        if (UpdateState.any())
+          PushPQ(SuccNode, UpdateState);
+      }
+
+      // Visit every node in DT subtree.
+      for (auto DomChild : *(DT->getNode(BN))) {
+        BBNumberTy DomChildBN = DomChild->getBlock()->getNumber();
+        // Since any value available at the dominator is available at the child
+        // node, we merge the dominator's IDF state into it. If the child's IDF
+        // state is unchanged, we reached a fixed point, so we do not need to
+        // visit it. Values no longer alive need not be propagated, so that
+        // the algorithm converges faster.
+        AllocaState OldState = BlockStates.get<IDF_STATE>(DomChildBN);
+        AllocaState NewState = BlockStates.get<IDF_STATE>(DomChildBN) |=
+            (RootState & BlockStates.get<ALIVE_STATE>(DomChildBN));
+        // Since DT is a tree, there will be no dups in Worklist.
+        if (OldState != NewState)
+          Worklist.push_back(DomChildBN);
+      }
+    }
+  }
+
+  // Order inserted PHI nodes in a deterministic way.
+  for (size_t I = 0; I < Allocas.size(); ++I)
+    llvm::sort(PHIBlocks[I]);
----------------
huangjd wrote:

This is to be consistent with whatever sort is used by the scalar algorithm.

https://github.com/llvm/llvm-project/pull/107688