[llvm] VectorWiden pass to widen already vectorized instructions (PR #67029)

Sander de Smalen via llvm-commits llvm-commits at lists.llvm.org
Wed Oct 4 02:48:28 PDT 2023


================
@@ -0,0 +1,356 @@
+//===--- VectorWiden.cpp - Combine vector operations to wider types ----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass tries to widen vector operations to a wider type. It finds
+// mutually independent operations of a given vector type, bottom-up, much as
+// SLP does with scalars. It detects consecutive stores that can be combined
+// into a single wider vector store. Next, it attempts to construct a
+// vectorizable tree using the use-def chains.
+//
+//==------------------------------------------------------------------------==//
+
+#include "llvm/Transforms/Vectorize/VectorWiden.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/DependenceAnalysis.h"
+#include "llvm/Analysis/LoopAccessAnalysis.h"
+#include "llvm/Analysis/PostDominators.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Analysis/VectorUtils.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/IR/ConstantRange.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Transforms/Utils/CodeMoverUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "vector-widen"
+
+// Widening considers independent operations that can be merged into one, and
+// may also widen a store when a later store instruction is found. We have to
+// bound the distance between those independent operations or we might
+// introduce bad register pressure, etc.
+
+static cl::opt<unsigned>
+    MaxInstDistance("vw-max-instr-distance", cl::init(30), cl::Hidden,
+                    cl::desc("Maximum distance between instructions to "
+                             "consider to widen"));
+
+namespace {
+/// Bottom-up pass implementation: scans each basic block in reverse and tries
+/// to merge pairs of identical, independent operations on scalable vectors
+/// into a single operation on a double-width scalable vector.
+class VectorWiden {
+public:
+  using InstrList = SmallVector<Instruction *, 2>;
+  using ValueList = SmallVector<Value *, 2>;
+  VectorWiden(Function &F, const TargetTransformInfo &TTI, DominatorTree &DT,
+              DependenceInfo &DI, const PostDominatorTree &PDT)
+      : F(F), Builder(F.getContext()), TTI(TTI), DT(DT), DI(DI), PDT(PDT) {}
+
+  /// Runs the transformation over the whole function. Returns true if any
+  /// change was made to the IR.
+  bool run();
+
+private:
+  Function &F;
+  IRBuilder<> Builder;
+  const TargetTransformInfo &TTI;
+  DominatorTree &DT;
+  DependenceInfo &DI;
+  const PostDominatorTree &PDT;
+  // NOTE(review): TLI is never initialized or read anywhere in this file --
+  // confirm whether it is needed or should be removed.
+  TargetLibraryInfo *TLI;
+
+  /// Instructions replaced by widened ones; they are only marked here and
+  /// physically removed at the end of run().
+  DenseSet<Instruction *> DeletedInstructions;
+
+  /// Checks if the instruction is marked for deletion.
+  bool isDeleted(Instruction *I) const { return DeletedInstructions.count(I); }
+
+  /// Marks an instruction for deletion; the actual removal happens in run().
+  void eraseInstruction(Instruction *I) { DeletedInstructions.insert(I); }
+
+  /// Scans one basic block bottom-up for a pair of widenable operations.
+  /// Returns true (after performing one widening) if the IR was changed.
+  bool processBB(BasicBlock &BB, LLVMContext &Context);
+
+  /// Checks legality and profitability of widening the candidate pair IL and,
+  /// on success, emits the widened replacement. Returns true on success.
+  bool widenNode(ArrayRef<Instruction *> IL, LLVMContext &Context);
+
+  /// Replaces the two FPTrunc instructions in IL with one double-width
+  /// FPTrunc.
+  void widenFPTrunc(ArrayRef<Instruction *> IL);
+
+  /// Replaces the two Add instructions in IL with one double-width Add.
+  /// If Reorder is set, the operands of each add are swapped first.
+  void widenAdd(ArrayRef<Instruction *> IL, bool Reorder);
+
+  /// Queries TTI for the cost of performing Opcode on the given types.
+  InstructionCost getOpCost(unsigned Opcode, Type *To, Type *From,
+                            Instruction *I);
+};
+} // namespace
+
+// Replaces the pair of FPTrunc instructions IL[0]/IL[1] with a single FPTrunc
+// on a scalable vector of twice the width: both sources are packed into one
+// wide vector via llvm.vector.insert, truncated once, and the two result
+// halves are handed back to the original users via llvm.vector.extract.
+void VectorWiden::widenFPTrunc(ArrayRef<Instruction *> IL) {
+  Instruction *I = IL[0];
+  Instruction *I1 = IL[1];
+  ScalableVectorType *RetOrigType = cast<ScalableVectorType>(I->getType());
+  ScalableVectorType *OrigType =
+      cast<ScalableVectorType>(I->getOperand(0)->getType());
+  ScalableVectorType *RetType =
+      ScalableVectorType::getDoubleElementsVectorType(RetOrigType);
+  ScalableVectorType *OpType =
+      ScalableVectorType::getDoubleElementsVectorType(OrigType);
+  Value *WideVec = UndefValue::get(OpType);
+  Builder.SetInsertPoint(I);
+  Function *InsertIntr = llvm::Intrinsic::getDeclaration(
+      F.getParent(), Intrinsic::vector_insert, {OpType, OrigType});
+  // NOTE(review): the insert/extract offset 4 hard-codes a 4-element (minimum)
+  // source vector type -- confirm this matches every type accepted by
+  // TTI.considerForWidening.
+  Value *Insert1 = Builder.CreateCall(
+      InsertIntr, {WideVec, I->getOperand(0), Builder.getInt64(0)});
+  Value *Insert2 = Builder.CreateCall(
+      InsertIntr, {Insert1, I1->getOperand(0), Builder.getInt64(4)});
+  Value *ResFPTrunc =
+      Builder.CreateCast(Instruction::FPTrunc, Insert2, RetType);
+  Function *ExtractIntr = llvm::Intrinsic::getDeclaration(
+      F.getParent(), Intrinsic::vector_extract, {RetOrigType, RetType});
+  // NOTE(review): I's operand was inserted at offset 0 but its result is
+  // extracted at offset 4 (and vice versa for I1) -- the opposite pairing to
+  // widenAdd below; confirm this cross-over is intentional.
+  if (!I->users().empty()) {
+    Value *Res =
+        Builder.CreateCall(ExtractIntr, {ResFPTrunc, Builder.getInt64(4)});
+    I->replaceAllUsesWith(Res);
+  }
+  if (!I1->users().empty()) {
+    Value *Res2 =
+        Builder.CreateCall(ExtractIntr, {ResFPTrunc, Builder.getInt64(0)});
+    I1->replaceAllUsesWith(Res2);
+  }
+}
+
+// Replaces the pair of Add instructions IL[0]/IL[1] with a single Add on a
+// scalable vector of twice the width. The corresponding operands of the two
+// adds are packed into wide vectors with llvm.vector.insert, added once, and
+// the result halves are returned to the original users via
+// llvm.vector.extract.
+void VectorWiden::widenAdd(ArrayRef<Instruction *> IL, bool Reorder) {
+  Instruction *I = IL[0];
+  Instruction *I1 = IL[1];
+
+  Value *XHi = I->getOperand(0);
+  Value *XLo = I1->getOperand(0);
+  Value *YHi = I->getOperand(1);
+  Value *YLo = I1->getOperand(1);
+  // NOTE(review): this swaps op0/op1 of *both* adds, which merely exchanges
+  // the two wide vectors X and Y -- for a commutative add the packed result
+  // is unchanged, so Reorder appears to be a no-op; confirm the intent.
+  if (Reorder) {
+    std::swap(XHi, YHi);
+    std::swap(XLo, YLo);
+  }
+
+  ScalableVectorType *RetOrigType = cast<ScalableVectorType>(I->getType());
+  ScalableVectorType *OrigType =
+      cast<ScalableVectorType>(I->getOperand(0)->getType());
+  ScalableVectorType *RetType =
+      ScalableVectorType::getDoubleElementsVectorType(RetOrigType);
+  ScalableVectorType *OpType =
+      ScalableVectorType::getDoubleElementsVectorType(OrigType);
+  Value *WideVec = UndefValue::get(OpType);
+  Builder.SetInsertPoint(I);
+  Function *InsertIntr = llvm::Intrinsic::getDeclaration(
+      F.getParent(), Intrinsic::vector_insert, {OpType, OrigType});
+  // NOTE(review): offset 4 hard-codes a 4-element (minimum) source vector
+  // type, as in widenFPTrunc -- confirm.
+  Value *X1 =
+      Builder.CreateCall(InsertIntr, {WideVec, XHi, Builder.getInt64(0)});
+  Value *X2 = Builder.CreateCall(InsertIntr, {X1, XLo, Builder.getInt64(4)});
+  Value *Y1 =
+      Builder.CreateCall(InsertIntr, {WideVec, YHi, Builder.getInt64(0)});
+  Value *Y2 = Builder.CreateCall(InsertIntr, {Y1, YLo, Builder.getInt64(4)});
+  Value *ResAdd = Builder.CreateAdd(X2, Y2);
+  Function *ExtractIntr = llvm::Intrinsic::getDeclaration(
+      F.getParent(), Intrinsic::vector_extract, {RetOrigType, RetType});
+  if (!I->users().empty()) {
+    Value *Res = Builder.CreateCall(ExtractIntr, {ResAdd, Builder.getInt64(0)});
+    I->replaceAllUsesWith(Res);
+  }
+  if (!I1->users().empty()) {
+    Value *Res2 =
+        Builder.CreateCall(ExtractIntr, {ResAdd, Builder.getInt64(4)});
+    I1->replaceAllUsesWith(Res2);
+  }
+}
+
+// Checks that the candidate pair IL is legal and profitable to widen and, if
+// so, performs the rewrite. Returns true if the IR was changed.
+bool VectorWiden::widenNode(ArrayRef<Instruction *> IL, LLVMContext &Context) {
+  LLVM_DEBUG(dbgs() << "VW: widenNode: " << *IL[0] << " " << *IL[1] << "\n");
+  if (!TTI.considerForWidening(Context, IL))
+    return false;
+  if (IL[0] == IL[1])
+    return false;
+  if (IL[0]->getOpcode() != IL[1]->getOpcode())
+    return false;
+  // Ignore the pair if the two instructions live in different basic blocks.
+  if (IL[0]->getParent() != IL[1]->getParent())
+    return false;
+  // Ignore the pair if the two instructions are too far apart.
+  // NOTE(review): std::distance walks the block linearly and has undefined
+  // behaviour when IL[0] does not come after IL[1]; processBB visits the
+  // block bottom-up, so IL[1] should precede IL[0] -- confirm.
+  if (abs(std::distance(IL[1]->getIterator(), IL[0]->getIterator())) >
+      MaxInstDistance)
+    return false;
+  // Bail out if either instruction is already marked for deletion.
+  if (isDeleted(IL[0]) || isDeleted(IL[1]))
+    return false;
+  // The two instructions must not feed each other directly.
+  if (IL[1] == IL[0]->getOperand(0) || IL[0] == IL[1]->getOperand(0))
+    return false;
+  if (IL[0]->getNumOperands() > 1 &&
+      (IL[1] == IL[0]->getOperand(1) || IL[0] == IL[1]->getOperand(1)))
+    return false;
+  // Widening effectively hoists IL[1] up to IL[0]; ensure that is legal.
+  if (!isSafeToMoveBefore(*IL[1], *IL[0], DT, &PDT, &DI))
+    return false;
+  switch (IL[0]->getOpcode()) {
+  case Instruction::FPTrunc: {
+    ScalableVectorType *RetOrigType =
+        cast<ScalableVectorType>(IL[0]->getType());
+    ScalableVectorType *OrigType =
+        cast<ScalableVectorType>(IL[0]->getOperand(0)->getType());
+    InstructionCost Cost =
+        getOpCost(Instruction::FPTrunc, RetOrigType, OrigType, IL[0]);
+    ScalableVectorType *RetType =
+        ScalableVectorType::getDoubleElementsVectorType(RetOrigType);
+    ScalableVectorType *OpType =
+        ScalableVectorType::getDoubleElementsVectorType(OrigType);
+    InstructionCost CostNew =
+        getOpCost(Instruction::FPTrunc, RetType, OpType, IL[0]);
+    // Widen only if one wide op costs no more than the two ops it replaces.
+    if (2 * Cost < CostNew)
+      return false;
+    LLVM_DEBUG(dbgs() << "VW: Decided to widen FPTrunc, safe to merge : "
+                      << *IL[0] << " with  " << *IL[1] << "\n");
+    widenFPTrunc(IL);
+    return true;
+  }
+  case Instruction::Add: {
+    ScalableVectorType *OrigType =
+        cast<ScalableVectorType>(IL[0]->getOperand(0)->getType());
+    ScalableVectorType *OpType =
+        ScalableVectorType::getDoubleElementsVectorType(OrigType);
+    InstructionCost Cost =
+        getOpCost(Instruction::Add, OrigType, OrigType, IL[0]);
+    InstructionCost CostNew =
+        getOpCost(Instruction::Add, OpType, OpType, IL[0]);
+    // Widen only if one wide op costs no more than the two ops it replaces.
+    if (2 * Cost < CostNew)
+      return false;
+    LLVM_DEBUG(dbgs() << "VW: Decided to widen Add, safe to merge : " << *IL[0]
+                      << " with  " << *IL[1] << "\n");
+    // Request reordering when the two adds do not share the same second
+    // operand (i.e. the second add looks commuted relative to the first).
+    widenAdd(IL, IL[0]->getOperand(1) != IL[1]->getOperand(1));
+    return true;
+  }
+
+  default:
+    break;
+  }
+  return false;
+}
+
+// Returns the TTI cost (reciprocal throughput) of performing Opcode with
+// result type To; From is only meaningful for casts. I supplies context
+// (cast-context hint / operand info) for a more precise estimate.
+InstructionCost VectorWiden::getOpCost(unsigned Opcode, Type *To, Type *From,
+                                       Instruction *I) {
+  InstructionCost Cost = 0;
+  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+  switch (Opcode) {
+  case Instruction::FPTrunc: {
+    Cost = TTI.getCastInstrCost(Opcode, To, From, TTI::getCastContextHint(I),
+                                CostKind, I);
+    break;
+  }
+  case Instruction::Add: {
+    // Binary ops take their second operand from index 1; a unary op reuses
+    // operand 0 so the query below stays in bounds.
+    unsigned OpIdx = isa<UnaryOperator>(I) ? 0 : 1;
+    TTI::OperandValueInfo Op1Info = TTI::getOperandInfo(I->getOperand(0));
+    TTI::OperandValueInfo Op2Info = TTI::getOperandInfo(I->getOperand(OpIdx));
+    SmallVector<const Value *> Operands(I->operand_values());
+    Cost = TTI.getArithmeticInstrCost(I->getOpcode(), To, CostKind, Op1Info,
+                                      Op2Info, Operands, I);
+    break;
+  }
+  default:
+    llvm_unreachable("Unknown instruction");
+  }
+  return Cost;
+}
+
+// Walks BB bottom-up collecting, per opcode, the most recent candidate
+// instruction; when a second candidate of the same opcode is found close
+// enough, tries to widen the pair. Returns true after the first successful
+// widening (the caller re-invokes until no further change is made).
+bool VectorWiden::processBB(BasicBlock &BB, LLVMContext &Context) {
+  // Maps opcode -> (candidate instructions, position of the first candidate).
+  DenseMap<unsigned, std::pair<InstrList, unsigned>> Operations;
+  unsigned Counter = 0;
+  // NOTE(review): the dereference in "*IP++" is computed and discarded;
+  // "++IP" would be clearer.
+  for (BasicBlock::reverse_iterator IP(BB.rbegin()); IP != BB.rend();
+       *IP++, ++Counter) {
+    Instruction *I = &*IP;
+    unsigned OpFound = 0;
+
+    if (I->isDebugOrPseudoInst() || isDeleted(I))
+      continue;
+
+    switch (I->getOpcode()) {
+    default:
+      continue;
+    case Instruction::Add:
+    case Instruction::FPTrunc: {
+      if (Operations.find(I->getOpcode()) != Operations.end()) {
+        auto *OpRec = &Operations[I->getOpcode()];
+        // If the instructions are too far apart, drop the old candidate and
+        // reset the recorded position to this instruction.
+        if (Counter - Operations[I->getOpcode()].second > MaxInstDistance) {
+          OpRec->second = Counter;
+          OpRec->first.clear();
+          OpRec->first.push_back(I);
+        } else {
+          OpRec->first.push_back(I);
+          OpFound = I->getOpcode();
+        }
+      } else {
+        // First candidate seen for this opcode.
+        Operations[I->getOpcode()] = {{I}, Counter};
+      }
+      break;
+    }
+    }
+    // A pair is complete: try to widen it.
+    if (OpFound && Operations.find(OpFound) != Operations.end()) {
+      auto *OpRec = &Operations[OpFound];
+      for (Instruction *Op : OpRec->first)
+        LLVM_DEBUG(dbgs() << "VW Op to check : " << *Op << "\n");
+      if (!widenNode(OpRec->first, Context)) {
+        LLVM_DEBUG(dbgs() << "VW Unable to consturct the tree.\n");
+        // Keep only the newer instruction as the next candidate.
+        OpRec->first.erase(OpRec->first.begin());
+        OpRec->second = Counter;
+      } else {
+        // Mark the pair replaced by the widened operation for deletion.
+        for (Instruction *Instr : OpRec->first)
+          eraseInstruction(Instr);
+        return true;
+      }
+    }
+  }
+  return false;
+}
+
+// Entry point: repeatedly processes each basic block until no more pairs can
+// be widened, then physically deletes the instructions the widening replaced.
+bool VectorWiden::run() {
+  bool Changed = false;
+  LLVMContext &Context = F.getContext();
+
+  LLVM_DEBUG(dbgs() << "VW Function:" << F.getName() << "\n");
+  for (BasicBlock &BB : F) {
+    LLVM_DEBUG(dbgs() << "VW BB:" << BB.getName() << "\n");
+
+    // processBB performs at most one widening per call; iterate to a fixed
+    // point.
+    while (processBB(BB, Context))
+      Changed = true;
+  }
+
+  // Deletion is deferred so iterators stay valid during the scan above.
+  if (Changed)
+    for (auto *I : DeletedInstructions)
+      RecursivelyDeleteTriviallyDeadInstructions(I);
+
+  return Changed;
+}
+
+PreservedAnalyses VectorWidenPass::run(Function &F,
+                                       FunctionAnalysisManager &FAM) {
+  TargetTransformInfo &TTI = FAM.getResult<TargetIRAnalysis>(F);
+  DominatorTree &DT = FAM.getResult<DominatorTreeAnalysis>(F);
+  DependenceInfo &DI = FAM.getResult<DependenceAnalysis>(F);
----------------
sdesmalen-arm wrote:

It seems we can indeed use simpler analysis and use AA to check for any dependences between operations that access memory (which the current patch doesn't yet require, because it only supports `add` and `fptrunc`).

`DependenceInfo` is used by `llvm::isSafeToMoveBefore` and several utility functions in `CodeMoverUtils` (all in the `llvm::` namespace). The documentation for `DependenceInfo` says that it's work in progress and not yet ready to use; it would be good to add a similar comment to `CodeMoverUtils.h`.

https://github.com/llvm/llvm-project/pull/67029


More information about the llvm-commits mailing list