[llvm] VectorWiden pass to widen already vectorized instructions (PR #67029)

Sander de Smalen via llvm-commits llvm-commits at lists.llvm.org
Wed Oct 4 02:48:25 PDT 2023


================
@@ -0,0 +1,356 @@
+//===--- VectorWiden.cpp - Combining Vector Operations to wider types ---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass tries to widen vector operations to a wider type. It finds
+// operations of a given vector type that are independent of each other,
+// much as SLP does with scalars, working bottom-up. It detects consecutive
+// stores that can be combined into a wider vector store, and then attempts
+// to construct a vectorizable tree using the use-def chains.
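+//
+// For example (illustrative IR only), given two independent truncates
+//   %r0 = fptrunc <vscale x 4 x float> %a to <vscale x 4 x half>
+//   %r1 = fptrunc <vscale x 4 x float> %b to <vscale x 4 x half>
+// the pass packs %a and %b into one <vscale x 8 x float> value using
+// llvm.vector.insert, emits a single wide fptrunc, and recovers the two
+// results with llvm.vector.extract.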
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Vectorize/VectorWiden.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/DependenceAnalysis.h"
+#include "llvm/Analysis/LoopAccessAnalysis.h"
+#include "llvm/Analysis/PostDominators.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Analysis/VectorUtils.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/IR/ConstantRange.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Transforms/Utils/CodeMoverUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "vector-widen"
+
+// We consider widening independent operations by merging them into a single
+// wider operation, and also widening stores when matching store instructions
+// are found later. The distance between such independent operations has to
+// be bounded, otherwise we might introduce excessive register pressure, etc.
+
+static cl::opt<unsigned>
+    MaxInstDistance("vw-max-instr-distance", cl::init(30), cl::Hidden,
+                    cl::desc("Maximum distance between instructions to "
+                             "consider for widening"));
+
+namespace {
+class VectorWiden {
+public:
+  using InstrList = SmallVector<Instruction *, 2>;
+  using ValueList = SmallVector<Value *, 2>;
+  VectorWiden(Function &F, const TargetTransformInfo &TTI, DominatorTree &DT,
+              DependenceInfo &DI, const PostDominatorTree &PDT)
+      : F(F), Builder(F.getContext()), TTI(TTI), DT(DT), DI(DI), PDT(PDT) {}
+
+  bool run();
+
+private:
+  Function &F;
+  IRBuilder<> Builder;
+  const TargetTransformInfo &TTI;
+  DominatorTree &DT;
+  DependenceInfo &DI;
+  const PostDominatorTree &PDT;
+  TargetLibraryInfo *TLI;
+
+  DenseSet<Instruction *> DeletedInstructions;
+
+  /// Checks if the instruction is marked for deletion.
+  bool isDeleted(Instruction *I) const { return DeletedInstructions.count(I); }
+
+  /// Removes an instruction from its block and eventually deletes it.
+  void eraseInstruction(Instruction *I) { DeletedInstructions.insert(I); }
+
+  bool processBB(BasicBlock &BB, LLVMContext &Context);
+
+  bool widenNode(ArrayRef<Instruction *> IL, LLVMContext &Context);
+
+  void widenFPTrunc(ArrayRef<Instruction *> IL);
+
+  void widenAdd(ArrayRef<Instruction *> IL, bool Reorder);
+
+  InstructionCost getOpCost(unsigned Opcode, Type *To, Type *From,
+                            Instruction *I);
+};
+} // namespace
+
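+/// Widen a pair of independent FPTrunc instructions: pack both source
+/// vectors into one double-length vector with llvm.vector.insert, emit a
+/// single wide FPTrunc, and hand each original instruction its half of the
+/// result via llvm.vector.extract.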
+void VectorWiden::widenFPTrunc(ArrayRef<Instruction *> IL) {
+  Instruction *I = IL[0];
+  Instruction *I1 = IL[1];
+  ScalableVectorType *RetOrigType = cast<ScalableVectorType>(I->getType());
+  ScalableVectorType *OrigType =
+      cast<ScalableVectorType>(I->getOperand(0)->getType());
+  ScalableVectorType *RetType =
+      ScalableVectorType::getDoubleElementsVectorType(RetOrigType);
+  ScalableVectorType *OpType =
+      ScalableVectorType::getDoubleElementsVectorType(OrigType);
+  Value *WideVec = UndefValue::get(OpType);
+  Builder.SetInsertPoint(I);
+  Function *InsertIntr = llvm::Intrinsic::getDeclaration(
+      F.getParent(), Intrinsic::vector_insert, {OpType, OrigType});
+  Value *Insert1 = Builder.CreateCall(
+      InsertIntr, {WideVec, I->getOperand(0), Builder.getInt64(0)});
+  Value *Insert2 = Builder.CreateCall(
+      InsertIntr, {Insert1, I1->getOperand(0), Builder.getInt64(4)});
+  Value *ResFPTrunc =
+      Builder.CreateCast(Instruction::FPTrunc, Insert2, RetType);
+  Function *ExtractIntr = llvm::Intrinsic::getDeclaration(
+      F.getParent(), Intrinsic::vector_extract, {RetOrigType, RetType});
+  if (!I->users().empty()) {
+    Value *Res =
+        Builder.CreateCall(ExtractIntr, {ResFPTrunc, Builder.getInt64(4)});
+    I->replaceAllUsesWith(Res);
+  }
+  if (!I1->users().empty()) {
+    Value *Res2 =
+        Builder.CreateCall(ExtractIntr, {ResFPTrunc, Builder.getInt64(0)});
+    I1->replaceAllUsesWith(Res2);
+  }
+}
+
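+/// Widen a pair of independent Add instructions: pack the corresponding
+/// operands of both instructions into double-length vectors, emit a single
+/// wide Add, and extract the two result halves back. If Reorder is set,
+/// each instruction's operands are commuted before packing.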
+void VectorWiden::widenAdd(ArrayRef<Instruction *> IL, bool Reorder) {
+  Instruction *I = IL[0];
+  Instruction *I1 = IL[1];
+
+  Value *XHi = I->getOperand(0);
+  Value *XLo = I1->getOperand(0);
+  Value *YHi = I->getOperand(1);
+  Value *YLo = I1->getOperand(1);
+  if (Reorder) {
+    std::swap(XHi, YHi);
+    std::swap(XLo, YLo);
+  }
+
+  ScalableVectorType *RetOrigType = cast<ScalableVectorType>(I->getType());
+  ScalableVectorType *OrigType =
+      cast<ScalableVectorType>(I->getOperand(0)->getType());
+  ScalableVectorType *RetType =
+      ScalableVectorType::getDoubleElementsVectorType(RetOrigType);
+  ScalableVectorType *OpType =
+      ScalableVectorType::getDoubleElementsVectorType(OrigType);
+  Value *WideVec = UndefValue::get(OpType);
+  Builder.SetInsertPoint(I);
+  Function *InsertIntr = llvm::Intrinsic::getDeclaration(
+      F.getParent(), Intrinsic::vector_insert, {OpType, OrigType});
+  Value *X1 =
+      Builder.CreateCall(InsertIntr, {WideVec, XHi, Builder.getInt64(0)});
+  Value *X2 = Builder.CreateCall(InsertIntr, {X1, XLo, Builder.getInt64(4)});
+  Value *Y1 =
+      Builder.CreateCall(InsertIntr, {WideVec, YHi, Builder.getInt64(0)});
+  Value *Y2 = Builder.CreateCall(InsertIntr, {Y1, YLo, Builder.getInt64(4)});
+  Value *ResAdd = Builder.CreateAdd(X2, Y2);
+  Function *ExtractIntr = llvm::Intrinsic::getDeclaration(
+      F.getParent(), Intrinsic::vector_extract, {RetOrigType, RetType});
+  if (!I->users().empty()) {
+    Value *Res = Builder.CreateCall(ExtractIntr, {ResAdd, Builder.getInt64(0)});
+    I->replaceAllUsesWith(Res);
+  }
+  if (!I1->users().empty()) {
+    Value *Res2 =
+        Builder.CreateCall(ExtractIntr, {ResAdd, Builder.getInt64(4)});
+    I1->replaceAllUsesWith(Res2);
+  }
+}
+
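+/// Try to widen the instruction pair IL into one operation on a
+/// double-length vector type; returns false if the pair is not a legal
+/// candidate (different opcodes or blocks, mutual dependence, etc.).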
+bool VectorWiden::widenNode(ArrayRef<Instruction *> IL, LLVMContext &Context) {
+  LLVM_DEBUG(dbgs() << "VW: widenNode: " << *IL[0] << " " << *IL[1] << "\n");
+  if (!TTI.considerForWidening(Context, IL))
+    return false;
+  if (IL[0] == IL[1])
+    return false;
+  if (IL[0]->getOpcode() != IL[1]->getOpcode())
+    return false;
+  // Ignore if the instructions live in different basic blocks.
+  if (IL[0]->getParent() != IL[1]->getParent())
+    return false;
+  // Ignore if the two instructions are too far apart.
+  if (abs(std::distance(IL[1]->getIterator(), IL[0]->getIterator())) >
+      MaxInstDistance)
+    return false;
+  // Bail out if either instruction has already been deleted.
+  if (isDeleted(IL[0]) || isDeleted(IL[1]))
+    return false;
+  if (IL[1] == IL[0]->getOperand(0) || IL[0] == IL[1]->getOperand(0))
+    return false;
+  if (IL[0]->getNumOperands() > 1 &&
+      (IL[1] == IL[0]->getOperand(1) || IL[0] == IL[1]->getOperand(1)))
+    return false;
+  if (!isSafeToMoveBefore(*IL[1], *IL[0], DT, &PDT, &DI))
----------------
sdesmalen-arm wrote:

If you have code like this:
```
  %add = add <vscale x 4 x i32> %x, %z
  ...
  %sub = sub <vscale x 4 x i32> %add, ... ; basically just any other use of `%add`
  ...
  %add2 = add <vscale x 4 x i32> %y, %z
```

Then I suspect that `%add` cannot be moved to `%add2` because there is another use of `%add` in between. This will need some kind of DAG scheduling mechanism, but I'm happy for the first iteration of this pass to only handle trivial cases and for more advanced cases like the above to be added in later iterations of this pass.

Can you instead just check for a `hasOneUse()` of the instructions? That avoids having to use much of the functionality in `isSafeToMoveBefore` (although you still need to use AliasAnalysis in case the operation you're moving is a load/store)
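
For the trivial cases, a minimal sketch of such a filter could look like the following (a hypothetical helper, not part of this patch):

```cpp
#include "llvm/IR/Instruction.h"
using namespace llvm;

// Conservative filter in the spirit of the suggestion above: only consider
// a pair whose results each have a single use and which cannot touch
// memory. Loads/stores would additionally need an AliasAnalysis query
// before being moved.
static bool isTriviallyWidenable(const Instruction *I0,
                                 const Instruction *I1) {
  return I0->hasOneUse() && I1->hasOneUse() &&
         !I0->mayReadOrWriteMemory() && !I1->mayReadOrWriteMemory();
}
```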

https://github.com/llvm/llvm-project/pull/67029

