[llvm] VectorWiden pass to widen already vectorized instructions (PR #67029)

Dinar Temirbulatov via llvm-commits llvm-commits at lists.llvm.org
Wed Nov 1 00:40:38 PDT 2023


https://github.com/dtemirbulatov updated https://github.com/llvm/llvm-project/pull/67029

>From 2633f954b5935e8cc99ab0ed8d47fbd3d6e71748 Mon Sep 17 00:00:00 2001
From: Dinar Temirbulatov <dinar.temirbulatov at arm.com>
Date: Thu, 21 Sep 2023 13:22:46 +0000
Subject: [PATCH 1/5] This pass allows us to widen already vectorized
 instructions to wider vector types. We encountered an issue with the current
 auto-vectorization passes: they would not easily let us implement the
 required functionality without a new pass. For example, the SME2 ADD
 instruction has its first operand and its result in multi-vector form with
 scalable vector types, while the third operand is a regular scalable vector:

add     { z4.s, z5.s }, { z4.s, z5.s }, z3.s

With the loop-vectorizer pass, choosing a VF such that one of the operands and
the result end up in a wider vector type can be difficult. With the new pass,
we want to consider a group of operations rather than a single one, similar to
what SLP does with scalars, in order to make more profitable transformations,
including, for example, loads and stores, arithmetic operations, etc. For
example, we could combine these independent ADD operations in a single basic
block into one on ARM SVE:
typedef int v4si __attribute__ ((vector_size (16)));

void add(v4si *ptr, v4si *ptr1) {
  v4si a = *ptr;
  ptr++;
  v4si b = *ptr;
  ptr++;
  v4si c = *ptr;
  ptr++;
  v4si d = *ptr;
  *ptr1 = a+b;
  ptr1++;
  *ptr1 = c+d;
}

On ARM SVE hardware, we could produce:
        ptrue   p0.s, vl8
        mov     x8, #8             // =0x8
        ld1w    { z0.s }, p0/z, [x0]
        ld1w    { z1.s }, p0/z, [x0, x8, lsl #2]
        add     z0.s, z1.s, z0.s
        st1w    { z0.s }, p0, [x1]
        ret

Currently, we produce this output (https://godbolt.org/z/z5n78TWsc):
        ldp     q0, q1, [x0]
        ldp     q2, q3, [x0, #32]
        add     v0.4s, v1.4s, v0.4s
        add     v1.4s, v3.4s, v2.4s
        stp     q0, q1, [x1]
        ret

I noticed similar opportunities where the SLP vectorizer does not choose a
wider VF due to its implementation, for example with reductions, which it can
only handle for types four or fewer elements wide. Currently, the pass supports
widening only ADD and FP_ROUND operations, and only for scalable vector types.
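
To illustrate the transformation at the IR level: two independent additions of
<vscale x 4 x i32> values are packed into a single <vscale x 8 x i32> addition
via vector.insert/vector.extract (a simplified sketch of what the add.ll test
below checks; the value names are illustrative):

  %lhs   = call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> undef, <vscale x 4 x i32> %a0, i64 0)
  %lhs.1 = call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> %lhs, <vscale x 4 x i32> %a1, i64 4)
  %rhs   = call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> undef, <vscale x 4 x i32> %b0, i64 0)
  %rhs.1 = call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> %rhs, <vscale x 4 x i32> %b1, i64 4)
  %sum   = add <vscale x 8 x i32> %lhs.1, %rhs.1
  %res0  = call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> %sum, i64 0)
  %res1  = call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> %sum, i64 4)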
---
 .../llvm/Analysis/TargetTransformInfo.h       |  10 +
 .../llvm/Analysis/TargetTransformInfoImpl.h   |   4 +
 .../llvm/Transforms/Vectorize/VectorWiden.h   |  25 ++
 llvm/lib/Analysis/TargetTransformInfo.cpp     |   5 +
 llvm/lib/Passes/PassBuilder.cpp               |   3 +-
 llvm/lib/Passes/PassRegistry.def              |   1 +
 .../AArch64/AArch64TargetTransformInfo.cpp    |   8 +
 .../AArch64/AArch64TargetTransformInfo.h      |  20 +
 llvm/lib/Transforms/Vectorize/CMakeLists.txt  |   1 +
 llvm/lib/Transforms/Vectorize/VectorWiden.cpp | 349 ++++++++++++++++++
 llvm/test/Transforms/VectorWiden/add.ll       |  72 ++++
 .../Transforms/VectorWiden/fptrunc-bad-dep.ll |  45 +++
 llvm/test/Transforms/VectorWiden/fptrunc.ll   |  40 ++
 .../llvm/lib/Transforms/Vectorize/BUILD.gn    |   1 +
 14 files changed, 583 insertions(+), 1 deletion(-)
 create mode 100644 llvm/include/llvm/Transforms/Vectorize/VectorWiden.h
 create mode 100644 llvm/lib/Transforms/Vectorize/VectorWiden.cpp
 create mode 100644 llvm/test/Transforms/VectorWiden/add.ll
 create mode 100644 llvm/test/Transforms/VectorWiden/fptrunc-bad-dep.ll
 create mode 100644 llvm/test/Transforms/VectorWiden/fptrunc.ll

diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h
index 5234ef8788d9e96..2acd0117decfb22 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1672,6 +1672,9 @@ class TargetTransformInfo {
   /// \return The maximum number of function arguments the target supports.
   unsigned getMaxNumArgs() const;
 
+  /// \returns Whether the vector operations are good candidates for widening.
+  bool considerToWiden(LLVMContext &Context, ArrayRef<Instruction *> IL) const;
+
   /// @}
 
 private:
@@ -2041,6 +2044,8 @@ class TargetTransformInfo::Concept {
   getVPLegalizationStrategy(const VPIntrinsic &PI) const = 0;
   virtual bool hasArmWideBranch(bool Thumb) const = 0;
   virtual unsigned getMaxNumArgs() const = 0;
+  virtual bool considerToWiden(LLVMContext &Context,
+                               ArrayRef<Instruction *> IL) const = 0;
 };
 
 template <typename T>
@@ -2757,6 +2762,11 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept {
   unsigned getMaxNumArgs() const override {
     return Impl.getMaxNumArgs();
   }
+
+  bool considerToWiden(LLVMContext &Context,
+                       ArrayRef<Instruction *> IL) const override {
+    return Impl.considerToWiden(Context, IL);
+  }
 };
 
 template <typename T>
diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
index c1ff314ae51c98b..5278c9d4dc8c39a 100644
--- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -895,6 +895,10 @@ class TargetTransformInfoImplBase {
 
   unsigned getMaxNumArgs() const { return UINT_MAX; }
 
+  bool considerToWiden(LLVMContext &Context, ArrayRef<Instruction *> IL) const {
+    return false;
+  }
+
 protected:
   // Obtain the minimum required size to hold the value (without the sign)
   // In case of a vector it returns the min required size for one element.
diff --git a/llvm/include/llvm/Transforms/Vectorize/VectorWiden.h b/llvm/include/llvm/Transforms/Vectorize/VectorWiden.h
new file mode 100644
index 000000000000000..6988785a92ce09c
--- /dev/null
+++ b/llvm/include/llvm/Transforms/Vectorize/VectorWiden.h
@@ -0,0 +1,25 @@
+//===--- VectorWiden.h - Combining Vector Operations to wider types ---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===---------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_VECTORIZE_VECTORWIDENING_H
+#define LLVM_TRANSFORMS_VECTORIZE_VECTORWIDENING_H
+
+#include "llvm/IR/PassManager.h"
+
+namespace llvm {
+
+class VectorWidenPass : public PassInfoMixin<VectorWidenPass> {
+public:
+  VectorWidenPass() {}
+
+  PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
+};
+
+} // namespace llvm
+
+#endif // LLVM_TRANSFORMS_VECTORIZE_VECTORWIDENING_H
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index aad14f21d114619..d01ed739a5a7152 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -1248,6 +1248,11 @@ bool TargetTransformInfo::hasActiveVectorLength(unsigned Opcode, Type *DataType,
   return TTIImpl->hasActiveVectorLength(Opcode, DataType, Alignment);
 }
 
+bool TargetTransformInfo::considerToWiden(LLVMContext &Context,
+                                          ArrayRef<Instruction *> IL) const {
+  return TTIImpl->considerToWiden(Context, IL);
+}
+
 TargetTransformInfo::Concept::~Concept() = default;
 
 TargetIRAnalysis::TargetIRAnalysis() : TTICallback(&getDefaultTTI) {}
diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp
index 985ff88139323c6..8ef983e5d865194 100644
--- a/llvm/lib/Passes/PassBuilder.cpp
+++ b/llvm/lib/Passes/PassBuilder.cpp
@@ -233,8 +233,8 @@
 #include "llvm/Transforms/Utils/CanonicalizeAliases.h"
 #include "llvm/Transforms/Utils/CanonicalizeFreezeInLoops.h"
 #include "llvm/Transforms/Utils/CountVisits.h"
-#include "llvm/Transforms/Utils/Debugify.h"
 #include "llvm/Transforms/Utils/DXILUpgrade.h"
+#include "llvm/Transforms/Utils/Debugify.h"
 #include "llvm/Transforms/Utils/EntryExitInstrumenter.h"
 #include "llvm/Transforms/Utils/FixIrreducible.h"
 #include "llvm/Transforms/Utils/HelloWorld.h"
@@ -263,6 +263,7 @@
 #include "llvm/Transforms/Vectorize/LoopVectorize.h"
 #include "llvm/Transforms/Vectorize/SLPVectorizer.h"
 #include "llvm/Transforms/Vectorize/VectorCombine.h"
+#include "llvm/Transforms/Vectorize/VectorWiden.h"
 #include <optional>
 
 using namespace llvm;
diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def
index df9f14920f29161..2eef2f0a22d95d7 100644
--- a/llvm/lib/Passes/PassRegistry.def
+++ b/llvm/lib/Passes/PassRegistry.def
@@ -428,6 +428,7 @@ FUNCTION_PASS("tailcallelim", TailCallElimPass())
 FUNCTION_PASS("typepromotion", TypePromotionPass(TM))
 FUNCTION_PASS("unify-loop-exits", UnifyLoopExitsPass())
 FUNCTION_PASS("vector-combine", VectorCombinePass())
+FUNCTION_PASS("vector-widen", VectorWidenPass())
 FUNCTION_PASS("verify", VerifierPass())
 FUNCTION_PASS("verify<domtree>", DominatorTreeVerifierPass())
 FUNCTION_PASS("verify<loops>", LoopVerifierPass())
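
Once the pass is registered here, it can be run in isolation through opt. The
RUN lines in the new tests below use essentially this invocation (input.ll is
a placeholder; the +sme2 attribute is what makes the AArch64 considerToWiden
hook fire):

  opt -passes=vector-widen -mtriple aarch64-linux-gnu -mattr=+sme2 -S input.ll
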
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index cded28054f59259..cecf02ee250b0c3 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -2426,6 +2426,14 @@ InstructionCost AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
             CostKind, I));
   }
 
+  static const TypeConversionCostTblEntry SME2Tbl[] = {
+      {ISD::FP_ROUND, MVT::nxv8f16, MVT::nxv8f32, 1}};
+
+  if (ST->hasSME2())
+    if (const auto *Entry = ConvertCostTableLookup(
+            SME2Tbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
+      return AdjustCost(Entry->Cost);
+
   if (const auto *Entry = ConvertCostTableLookup(ConversionTbl, ISD,
                                                  DstTy.getSimpleVT(),
                                                  SrcTy.getSimpleVT()))
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
index a6baade412c77d2..9afba1ec17ab7e6 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -412,6 +412,26 @@ class AArch64TTIImpl : public BasicTTIImplBase<AArch64TTIImpl> {
 
     return BaseT::getStoreMinimumVF(VF, ScalarMemTy, ScalarValTy);
   }
+
+  bool considerToWiden(LLVMContext &Context, ArrayRef<Instruction *> IL) const {
+    unsigned Opcode = IL[0]->getOpcode();
+    Type *Ty = IL[0]->getType();
+    if (!ST->hasSME2())
+      return false;
+    if (llvm::any_of(IL, [Opcode, Ty](Instruction *I) {
+          return (Opcode != I->getOpcode() || Ty != I->getType());
+        }))
+      return false;
+    if (Opcode == Instruction::FPTrunc &&
+        Ty == ScalableVectorType::get(Type::getHalfTy(Context), 4))
+      return true;
+    if (Opcode == Instruction::Add &&
+        Ty == ScalableVectorType::get(Type::getInt32Ty(Context), 4) &&
+        (IL[0]->getOperand(1) == IL[1]->getOperand(1) ||
+         IL[0]->getOperand(0) == IL[1]->getOperand(0)))
+      return true;
+    return false;
+  }
 };
 
 } // end namespace llvm
diff --git a/llvm/lib/Transforms/Vectorize/CMakeLists.txt b/llvm/lib/Transforms/Vectorize/CMakeLists.txt
index 998dfd956575d3c..a1537bb1ffa632e 100644
--- a/llvm/lib/Transforms/Vectorize/CMakeLists.txt
+++ b/llvm/lib/Transforms/Vectorize/CMakeLists.txt
@@ -5,6 +5,7 @@ add_llvm_component_library(LLVMVectorize
   SLPVectorizer.cpp
   Vectorize.cpp
   VectorCombine.cpp
+  VectorWiden.cpp
   VPlan.cpp
   VPlanHCFGBuilder.cpp
   VPlanRecipes.cpp
diff --git a/llvm/lib/Transforms/Vectorize/VectorWiden.cpp b/llvm/lib/Transforms/Vectorize/VectorWiden.cpp
new file mode 100644
index 000000000000000..8a97656ab2a803c
--- /dev/null
+++ b/llvm/lib/Transforms/Vectorize/VectorWiden.cpp
@@ -0,0 +1,349 @@
+///==--- VectorWiden.cpp - Combining Vector Operations to wider types ----==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass tries to widen vector operations to a wider type. It finds
+// operations of a certain vector type that are independent of each other,
+// working bottom-up as SLP does with scalars. It detects consecutive stores
+// that can be combined into a wider vector store and then attempts to
+// construct a vectorizable tree using the use-def chains.
+//
+//==------------------------------------------------------------------------==//
+
+#include "llvm/Transforms/Vectorize/VectorWiden.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Analysis/VectorUtils.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/IR/ConstantRange.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "vector-widen"
+
+// We consider independent operations for widening, with the possibility of
+// merging them into one operation and also widening their stores if we find
+// store instructions later. We have to limit the distance between those
+// independent operations or we might introduce bad register pressure, etc.
+
+static cl::opt<unsigned>
+    MaxInstDistance("vw-max-instr-distance", cl::init(30), cl::Hidden,
+                    cl::desc("Maximum distance between instructions to "
+                             "consider for widening"));
+
+namespace {
+class VectorWiden {
+public:
+  using InstrList = SmallVector<Instruction *, 2>;
+  using ValueList = SmallVector<Value *, 2>;
+  VectorWiden(Function &F, const TargetTransformInfo &TTI, DominatorTree &DT)
+      : F(F), Builder(F.getContext()), TTI(TTI), DT(DT) {}
+
+  bool run();
+
+private:
+  Function &F;
+  IRBuilder<> Builder;
+  const TargetTransformInfo &TTI;
+  DominatorTree &DT;
+  TargetLibraryInfo *TLI;
+
+  DenseSet<Instruction *> DeletedInstructions;
+
+  /// Checks if the instruction is marked for deletion.
+  bool isDeleted(Instruction *I) const { return DeletedInstructions.count(I); }
+
+  /// Removes an instruction from its block and eventually deletes it.
+  void eraseInstruction(Instruction *I) { DeletedInstructions.insert(I); }
+
+  bool processBB(BasicBlock &BB, LLVMContext &Context);
+
+  bool canWidenNode(ArrayRef<Instruction *> IL, LLVMContext &Context);
+
+  bool widenNode(ArrayRef<Instruction *> IL, LLVMContext &Context);
+
+  void widenCastInst(ArrayRef<Instruction *> IL);
+
+  void widenBinaryOperator(ArrayRef<Instruction *> IL);
+
+  InstructionCost getOpCost(unsigned Opcode, Type *To, Type *From,
+                            Instruction *I);
+};
+} // namespace
+
+void VectorWiden::widenCastInst(ArrayRef<Instruction *> IL) {
+  Instruction *I = IL[0];
+  Instruction *I1 = IL[1];
+  VectorType *RetOrigType = cast<VectorType>(I->getType());
+  VectorType *OrigType = cast<VectorType>(I->getOperand(0)->getType());
+  VectorType *RetType = VectorType::getDoubleElementsVectorType(RetOrigType);
+  VectorType *OpType = VectorType::getDoubleElementsVectorType(OrigType);
+  Value *WideVec = UndefValue::get(OpType);
+  Builder.SetInsertPoint(I);
+  Function *InsertIntr = llvm::Intrinsic::getDeclaration(
+      F.getParent(), Intrinsic::vector_insert, {OpType, OrigType});
+  Value *Insert1 = Builder.CreateCall(
+      InsertIntr, {WideVec, I->getOperand(0), Builder.getInt64(0)});
+  Value *Insert2 = Builder.CreateCall(
+      InsertIntr, {Insert1, I1->getOperand(0), Builder.getInt64(4)});
+  Value *ResCast = Builder.CreateCast(Instruction::CastOps(I->getOpcode()),
+                                      Insert2, RetType);
+  Function *ExtractIntr = llvm::Intrinsic::getDeclaration(
+      F.getParent(), Intrinsic::vector_extract, {RetOrigType, RetType});
+  if (!I->users().empty()) {
+    Value *Res =
+        Builder.CreateCall(ExtractIntr, {ResCast, Builder.getInt64(0)});
+    I->replaceAllUsesWith(Res);
+  }
+  if (!I1->users().empty()) {
+    Value *Res2 =
+        Builder.CreateCall(ExtractIntr, {ResCast, Builder.getInt64(4)});
+    I1->replaceAllUsesWith(Res2);
+  }
+}
+
+void VectorWiden::widenBinaryOperator(ArrayRef<Instruction *> IL) {
+  Instruction *I = IL[0];
+  Instruction *I1 = IL[1];
+
+  Value *XHi = I->getOperand(0);
+  Value *XLo = I1->getOperand(0);
+  Value *YHi = I->getOperand(1);
+  Value *YLo = I1->getOperand(1);
+
+  VectorType *RetOrigType = cast<VectorType>(I->getType());
+  VectorType *OrigType = cast<VectorType>(I->getOperand(0)->getType());
+  VectorType *RetType = VectorType::getDoubleElementsVectorType(RetOrigType);
+  VectorType *OpType = VectorType::getDoubleElementsVectorType(OrigType);
+  Value *WideVec = UndefValue::get(OpType);
+  Builder.SetInsertPoint(I);
+  Function *InsertIntr = llvm::Intrinsic::getDeclaration(
+      F.getParent(), Intrinsic::vector_insert, {OpType, OrigType});
+  Value *X1 =
+      Builder.CreateCall(InsertIntr, {WideVec, XLo, Builder.getInt64(0)});
+  Value *X2 = Builder.CreateCall(InsertIntr, {X1, XHi, Builder.getInt64(4)});
+  Value *Y1 =
+      Builder.CreateCall(InsertIntr, {WideVec, YLo, Builder.getInt64(0)});
+  Value *Y2 = Builder.CreateCall(InsertIntr, {Y1, YHi, Builder.getInt64(4)});
+  Value *ResBinOp =
+      Builder.CreateBinOp((Instruction::BinaryOps)I->getOpcode(), X2, Y2);
+  ValueList VL;
+  for (Instruction *I : IL)
+    VL.push_back(I);
+  propagateIRFlags(ResBinOp, VL);
+  Function *ExtractIntr = llvm::Intrinsic::getDeclaration(
+      F.getParent(), Intrinsic::vector_extract, {RetOrigType, RetType});
+  if (!I->users().empty()) {
+    Value *Res =
+        Builder.CreateCall(ExtractIntr, {ResBinOp, Builder.getInt64(4)});
+    I->replaceAllUsesWith(Res);
+  }
+  if (!I1->users().empty()) {
+    Value *Res2 =
+        Builder.CreateCall(ExtractIntr, {ResBinOp, Builder.getInt64(0)});
+    I1->replaceAllUsesWith(Res2);
+  }
+}
+
+bool VectorWiden::canWidenNode(ArrayRef<Instruction *> IL,
+                               LLVMContext &Context) {
+  if (!TTI.considerToWiden(Context, IL))
+    return false;
+
+  for (int X = 0, E = IL.size(); X < E; X++) {
+    for (int Y = 0, E = IL.size(); Y < E; Y++) {
+      if (X == Y)
+        continue;
+      if ((IL[X] == IL[Y]) || (IL[X]->getOpcode() != IL[Y]->getOpcode()) ||
+          // Ignore if any of them live in a different basic block.
+          (IL[X]->getParent() != IL[Y]->getParent()) ||
+          (IL[X]->getType() != IL[Y]->getType()) ||
+          (IL[X]->getOperand(0)->getType() !=
+           IL[Y]->getOperand(0)->getType()) ||
+          // Ignore if the distance between the two is too large.
+          (abs(std::distance(IL[Y]->getIterator(), IL[X]->getIterator())) >
+           MaxInstDistance) ||
+          (IL[X]->getOperand(0) == IL[Y] ||
+           (IL[X]->getNumOperands() > 1 && IL[X]->getOperand(1) == IL[Y])))
+        return false;
+    }
+    if (isDeleted(IL[X]) || !IL[X]->hasOneUse())
+      return false;
+    if (IL[0]->getParent() == IL[X]->user_back()->getParent() &&
+        DT.dominates(IL[X]->user_back(), IL[0]))
+      return false;
+  }
+  return true;
+}
+
+bool VectorWiden::widenNode(ArrayRef<Instruction *> IL, LLVMContext &Context) {
+  LLVM_DEBUG(dbgs() << "VW: widenNode: " << *IL[0] << " " << *IL[1] << "\n");
+  assert(IL.size() == 2 && "Incorrect instructions list to widen.");
+  if (!canWidenNode(IL, Context))
+    return false;
+  if (dyn_cast<CastInst>(IL[0])) {
+    VectorType *RetOrigType = cast<VectorType>(IL[0]->getType());
+    VectorType *OrigType = cast<VectorType>(IL[0]->getOperand(0)->getType());
+    InstructionCost Cost =
+        getOpCost(Instruction::FPTrunc, RetOrigType, OrigType, IL[0]);
+    VectorType *RetType = VectorType::getDoubleElementsVectorType(RetOrigType);
+    VectorType *OpType = VectorType::getDoubleElementsVectorType(OrigType);
+    InstructionCost CostNew =
+        getOpCost(Instruction::FPTrunc, RetType, OpType, IL[0]);
+    if (2 * Cost < CostNew)
+      return false;
+    LLVM_DEBUG(dbgs() << "VW: Decided to widen CastInst, safe to merge : "
+                      << *IL[0] << " with  " << *IL[1] << "\n");
+    widenCastInst(IL);
+    return true;
+  }
+  if (dyn_cast<BinaryOperator>(IL[0])) {
+    VectorType *OrigType = cast<VectorType>(IL[0]->getOperand(0)->getType());
+    VectorType *OpType = VectorType::getDoubleElementsVectorType(OrigType);
+    InstructionCost Cost =
+        getOpCost(Instruction::Add, OrigType, OrigType, IL[0]);
+    InstructionCost CostNew =
+        getOpCost(Instruction::Add, OpType, OpType, IL[0]);
+    if (2 * Cost < CostNew)
+      return false;
+    LLVM_DEBUG(dbgs() << "VW: Decided to widen BinaryOp, safe to merge : "
+                      << *IL[0] << " with  " << *IL[1] << "\n");
+    widenBinaryOperator(IL);
+    return true;
+  }
+  return false;
+}
+
+InstructionCost VectorWiden::getOpCost(unsigned Opcode, Type *To, Type *From,
+                                       Instruction *I) {
+  InstructionCost Cost = 0;
+  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+  switch (Opcode) {
+  case Instruction::FPTrunc: {
+    Cost = TTI.getCastInstrCost(Opcode, To, From, TTI::getCastContextHint(I),
+                                CostKind, I);
+    break;
+  }
+  case Instruction::Add: {
+    unsigned OpIdx = isa<UnaryOperator>(I) ? 0 : 1;
+    TTI::OperandValueInfo Op1Info = TTI::getOperandInfo(I->getOperand(0));
+    TTI::OperandValueInfo Op2Info = TTI::getOperandInfo(I->getOperand(OpIdx));
+    SmallVector<const Value *> Operands(I->operand_values());
+    Cost = TTI.getArithmeticInstrCost(I->getOpcode(), To, CostKind, Op1Info,
+                                      Op2Info, Operands, I);
+    break;
+  }
+  default:
+    llvm_unreachable("Unknown instruction");
+  }
+  return Cost;
+}
+
+bool VectorWiden::processBB(BasicBlock &BB, LLVMContext &Context) {
+  DenseMap<unsigned, std::pair<InstrList, unsigned>> Operations;
+  unsigned Counter = 0;
+  for (BasicBlock::reverse_iterator IP(BB.rbegin()); IP != BB.rend();
+       *IP++, ++Counter) {
+    Instruction *I = &*IP;
+    unsigned OpFound = 0;
+
+    if (I->isDebugOrPseudoInst() || isDeleted(I))
+      continue;
+
+    unsigned Opcode = I->getOpcode();
+    if ((dyn_cast<BinaryOperator>(I) && I->getNumOperands() == 2) ||
+        (dyn_cast<CastInst>(I) &&
+         (Opcode == Instruction::ZExt || Opcode == Instruction::SExt ||
+          Opcode == Instruction::FPToUI || Opcode == Instruction::FPToSI ||
+          Opcode == Instruction::FPExt || Opcode == Instruction::SIToFP ||
+          Opcode == Instruction::UIToFP || Opcode == Instruction::Trunc ||
+          Opcode == Instruction::FPTrunc || Opcode == Instruction::BitCast) &&
+         I->getNumOperands() == 1)) {
+      if (Operations.find(I->getOpcode()) != Operations.end()) {
+        auto *OpRec = &Operations[I->getOpcode()];
+        // If the instructions are too far apart then remove the old
+        // instruction and reset the position to this instruction.
+        if (Counter - Operations[I->getOpcode()].second > MaxInstDistance) {
+          OpRec->second = Counter;
+          OpRec->first.clear();
+          OpRec->first.push_back(I);
+        } else {
+          OpRec->first.push_back(I);
+          OpFound = I->getOpcode();
+        }
+      } else {
+        Operations[I->getOpcode()] = {{I}, Counter};
+      }
+    }
+
+    if (OpFound && Operations.find(OpFound) != Operations.end()) {
+      auto *OpRec = &Operations[OpFound];
+      for (Instruction *Op : OpRec->first)
+        LLVM_DEBUG(dbgs() << "VW Op to check : " << *Op << "\n");
+      if (!widenNode(OpRec->first, Context)) {
+        LLVM_DEBUG(dbgs() << "VW Unable to construct the tree.\n");
+        OpRec->first.erase(OpRec->first.begin());
+        OpRec->second = Counter;
+      } else {
+        for (Instruction *Instr : OpRec->first)
+          eraseInstruction(Instr);
+        return true;
+      }
+    }
+  }
+  return false;
+}
+
+bool VectorWiden::run() {
+  bool Changed = false;
+  LLVMContext &Context = F.getContext();
+
+  LLVM_DEBUG(dbgs() << "VW Function:" << F.getName() << "\n");
+  for (BasicBlock &BB : F) {
+    LLVM_DEBUG(dbgs() << "VW BB:" << BB.getName() << "\n");
+
+    while (processBB(BB, Context))
+      Changed = true;
+  }
+
+  if (Changed)
+    for (auto *I : DeletedInstructions)
+      RecursivelyDeleteTriviallyDeadInstructions(I);
+
+  return Changed;
+}
+
+PreservedAnalyses VectorWidenPass::run(Function &F,
+                                       FunctionAnalysisManager &FAM) {
+  TargetTransformInfo &TTI = FAM.getResult<TargetIRAnalysis>(F);
+  DominatorTree &DT = FAM.getResult<DominatorTreeAnalysis>(F);
+
+  VectorWiden VecWiden(F, TTI, DT);
+
+  if (!VecWiden.run())
+    return PreservedAnalyses::all();
+
+  PreservedAnalyses PA;
+  PA.preserveSet<CFGAnalyses>();
+  return PA;
+}
diff --git a/llvm/test/Transforms/VectorWiden/add.ll b/llvm/test/Transforms/VectorWiden/add.ll
new file mode 100644
index 000000000000000..4fae437b3963180
--- /dev/null
+++ b/llvm/test/Transforms/VectorWiden/add.ll
@@ -0,0 +1,72 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
+; RUN: opt -passes=vector-widen -mtriple aarch64-linux-gnu -mattr=+sme2 -S %s 2>&1 | FileCheck %s
+
+define void @add(ptr %ptr, ptr %ptr1) {
+; CHECK-LABEL: define void @add(
+; CHECK-SAME: ptr [[PTR:%.*]], ptr [[PTR1:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load <vscale x 4 x i32>, ptr [[PTR]], align 16
+; CHECK-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds <vscale x 4 x i32>, ptr [[PTR]], i64 1
+; CHECK-NEXT:    [[TMP1:%.*]] = load <vscale x 4 x i32>, ptr [[INCDEC_PTR]], align 16
+; CHECK-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds <vscale x 4 x i32>, ptr [[PTR]], i64 2
+; CHECK-NEXT:    [[TMP2:%.*]] = load <vscale x 4 x i32>, ptr [[INCDEC_PTR1]], align 16
+; CHECK-NEXT:    [[TMP3:%.*]] = call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> undef, <vscale x 4 x i32> [[TMP0]], i64 0)
+; CHECK-NEXT:    [[TMP4:%.*]] = call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> [[TMP3]], <vscale x 4 x i32> [[TMP1]], i64 4)
+; CHECK-NEXT:    [[TMP5:%.*]] = call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> undef, <vscale x 4 x i32> [[TMP2]], i64 0)
+; CHECK-NEXT:    [[TMP6:%.*]] = call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> [[TMP5]], <vscale x 4 x i32> [[TMP2]], i64 4)
+; CHECK-NEXT:    [[TMP7:%.*]] = add <vscale x 8 x i32> [[TMP4]], [[TMP6]]
+; CHECK-NEXT:    [[TMP8:%.*]] = call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[TMP7]], i64 4)
+; CHECK-NEXT:    [[TMP9:%.*]] = call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[TMP7]], i64 0)
+; CHECK-NEXT:    store <vscale x 4 x i32> [[TMP9]], ptr [[PTR1]], align 16
+; CHECK-NEXT:    [[INCDEC_PTR3:%.*]] = getelementptr inbounds <vscale x 4 x i32>, ptr [[PTR1]], i64 1
+; CHECK-NEXT:    store <vscale x 4 x i32> [[TMP8]], ptr [[INCDEC_PTR3]], align 16
+; CHECK-NEXT:    ret void
+;
+entry:
+  %0 = load <vscale x 4 x i32>, ptr %ptr, align 16
+  %incdec.ptr = getelementptr inbounds <vscale x 4 x i32>, ptr %ptr, i64 1
+  %1 = load <vscale x 4 x i32>, ptr %incdec.ptr, align 16
+  %incdec.ptr1 = getelementptr inbounds <vscale x 4 x i32>, ptr %ptr, i64 2
+  %2 = load <vscale x 4 x i32>, ptr %incdec.ptr1, align 16
+  %add = add <vscale x 4 x i32> %0, %2
+  %add4 = add <vscale x 4 x i32> %1, %2
+  store <vscale x 4 x i32> %add, ptr %ptr1, align 16
+  %incdec.ptr3 = getelementptr inbounds <vscale x 4 x i32>, ptr %ptr1, i64 1
+  store <vscale x 4 x i32> %add4, ptr %incdec.ptr3, align 16
+  ret void
+}
+
+define void @add_ir_flags(ptr %ptr, ptr %ptr1) {
+; CHECK-LABEL: define void @add_ir_flags(
+; CHECK-SAME: ptr [[PTR:%.*]], ptr [[PTR1:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load <vscale x 4 x i32>, ptr [[PTR]], align 16
+; CHECK-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds <vscale x 4 x i32>, ptr [[PTR]], i64 1
+; CHECK-NEXT:    [[TMP1:%.*]] = load <vscale x 4 x i32>, ptr [[INCDEC_PTR]], align 16
+; CHECK-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds <vscale x 4 x i32>, ptr [[PTR]], i64 2
+; CHECK-NEXT:    [[TMP2:%.*]] = load <vscale x 4 x i32>, ptr [[INCDEC_PTR1]], align 16
+; CHECK-NEXT:    [[TMP3:%.*]] = call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> undef, <vscale x 4 x i32> [[TMP0]], i64 0)
+; CHECK-NEXT:    [[TMP4:%.*]] = call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> [[TMP3]], <vscale x 4 x i32> [[TMP1]], i64 4)
+; CHECK-NEXT:    [[TMP5:%.*]] = call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> undef, <vscale x 4 x i32> [[TMP2]], i64 0)
+; CHECK-NEXT:    [[TMP6:%.*]] = call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> [[TMP5]], <vscale x 4 x i32> [[TMP2]], i64 4)
+; CHECK-NEXT:    [[TMP7:%.*]] = add nuw <vscale x 8 x i32> [[TMP4]], [[TMP6]]
+; CHECK-NEXT:    [[TMP8:%.*]] = call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[TMP7]], i64 4)
+; CHECK-NEXT:    [[TMP9:%.*]] = call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[TMP7]], i64 0)
+; CHECK-NEXT:    store <vscale x 4 x i32> [[TMP9]], ptr [[PTR1]], align 16
+; CHECK-NEXT:    [[INCDEC_PTR3:%.*]] = getelementptr inbounds <vscale x 4 x i32>, ptr [[PTR1]], i64 1
+; CHECK-NEXT:    store <vscale x 4 x i32> [[TMP8]], ptr [[INCDEC_PTR3]], align 16
+; CHECK-NEXT:    ret void
+;
+entry:
+  %0 = load <vscale x 4 x i32>, ptr %ptr, align 16
+  %incdec.ptr = getelementptr inbounds <vscale x 4 x i32>, ptr %ptr, i64 1
+  %1 = load <vscale x 4 x i32>, ptr %incdec.ptr, align 16
+  %incdec.ptr1 = getelementptr inbounds <vscale x 4 x i32>, ptr %ptr, i64 2
+  %2 = load <vscale x 4 x i32>, ptr %incdec.ptr1, align 16
+  %add = add nuw nsw <vscale x 4 x i32> %0, %2
+  %add4 = add nuw <vscale x 4 x i32> %1, %2
+  store <vscale x 4 x i32> %add, ptr %ptr1, align 16
+  %incdec.ptr3 = getelementptr inbounds <vscale x 4 x i32>, ptr %ptr1, i64 1
+  store <vscale x 4 x i32> %add4, ptr %incdec.ptr3, align 16
+  ret void
+}
diff --git a/llvm/test/Transforms/VectorWiden/fptrunc-bad-dep.ll b/llvm/test/Transforms/VectorWiden/fptrunc-bad-dep.ll
new file mode 100644
index 000000000000000..9f00d5ca113f4f0
--- /dev/null
+++ b/llvm/test/Transforms/VectorWiden/fptrunc-bad-dep.ll
@@ -0,0 +1,45 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -passes=vector-widen -mtriple aarch64-linux-gnu -mattr=+sme2 -S 2>&1 | FileCheck %s
+
+define void @fptrunc(ptr %ptr, ptr %ptr1) {
+; CHECK-LABEL: @fptrunc(
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 2
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP4:%.*]] = shl nuw nsw i64 [[TMP3]], 2
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[PTR:%.*]], align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, ptr [[PTR]], i64 [[TMP2]]
+; CHECK-NEXT:    [[WIDE_LOAD9:%.*]] = load <vscale x 4 x float>, ptr [[TMP5]], align 4
+; CHECK-NEXT:    [[TMP6:%.*]] = fptrunc <vscale x 4 x float> [[WIDE_LOAD]] to <vscale x 4 x half>
+; CHECK-NEXT:    [[EXTR:%.*]] = call <vscale x 1 x half> @llvm.vector.extract.nxv1f16.nxv4f16(<vscale x 4 x half> [[TMP6]], i64 0)
+; CHECK-NEXT:    [[EXTEND:%.*]] = fpext <vscale x 1 x half> [[EXTR]] to <vscale x 1 x float>
+; CHECK-NEXT:    [[INS:%.*]] = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.nxv1f32(<vscale x 4 x float> [[WIDE_LOAD9]], <vscale x 1 x float> [[EXTEND]], i64 0)
+; CHECK-NEXT:    [[TMP7:%.*]] = fptrunc <vscale x 4 x float> [[INS]] to <vscale x 4 x half>
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds half, ptr [[PTR1:%.*]], i64 0
+; CHECK-NEXT:    store <vscale x 4 x half> [[TMP6]], ptr [[TMP8]], align 2
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds half, ptr [[TMP8]], i64 [[TMP4]]
+; CHECK-NEXT:    store <vscale x 4 x half> [[TMP7]], ptr [[TMP9]], align 2
+; CHECK-NEXT:    ret void
+;
+  %1 = tail call i64 @llvm.vscale.i64()
+  %2 = shl nuw nsw i64 %1, 2
+  %3 = tail call i64 @llvm.vscale.i64()
+  %4 = shl nuw nsw i64 %3, 2
+  %wide.load = load <vscale x 4 x float>, ptr %ptr, align 4
+  %5 = getelementptr inbounds float, ptr %ptr, i64 %2
+  %wide.load9 = load <vscale x 4 x float>, ptr %5, align 4
+  %6 = fptrunc <vscale x 4 x float> %wide.load to <vscale x 4 x half>
+  %extr = call <vscale x 1 x half> @llvm.vector.extract.nxv1f16.nxv4f16(<vscale x 4 x half> %6, i64 0)
+  %extend = fpext <vscale x 1 x half> %extr to <vscale x 1 x float>
+  %ins = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.nxv1f32(<vscale x 4 x float> %wide.load9, <vscale x 1 x float> %extend, i64 0)
+  %7 = fptrunc <vscale x 4 x float> %ins to <vscale x 4 x half>
+  %8 = getelementptr inbounds half, ptr %ptr1, i64 0
+  store <vscale x 4 x half> %6, ptr %8, align 2
+  %9 = getelementptr inbounds half, ptr %8, i64 %4
+  store <vscale x 4 x half> %7, ptr %9, align 2
+  ret void
+}
+
+declare i64 @llvm.vscale.i64()
+declare <vscale x 1 x half> @llvm.vector.extract.nxv1f16.nxv4f16(<vscale x 4 x half>, i64 immarg)
+declare <vscale x 4 x float> @llvm.vector.insert.nxv4f32.nxv1f32(<vscale x 4 x float>, <vscale x 1 x float>, i64 immarg)
diff --git a/llvm/test/Transforms/VectorWiden/fptrunc.ll b/llvm/test/Transforms/VectorWiden/fptrunc.ll
new file mode 100644
index 000000000000000..838ec0b73e449a8
--- /dev/null
+++ b/llvm/test/Transforms/VectorWiden/fptrunc.ll
@@ -0,0 +1,40 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -passes=vector-widen -mtriple aarch64-linux-gnu -mattr=+sme2 -S 2>&1 | FileCheck %s
+
+define void @fptrunc(ptr %ptr, ptr %ptr1) {
+; CHECK-LABEL: @fptrunc(
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 2
+; CHECK-NEXT:    [[TMP3:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP4:%.*]] = shl nuw nsw i64 [[TMP3]], 2
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[PTR:%.*]], align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, ptr [[PTR]], i64 [[TMP2]]
+; CHECK-NEXT:    [[WIDE_LOAD9:%.*]] = load <vscale x 4 x float>, ptr [[TMP5]], align 4
+; CHECK-NEXT:    [[TMP6:%.*]] = call <vscale x 8 x float> @llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float> undef, <vscale x 4 x float> [[WIDE_LOAD9]], i64 0)
+; CHECK-NEXT:    [[TMP7:%.*]] = call <vscale x 8 x float> @llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float> [[TMP6]], <vscale x 4 x float> [[WIDE_LOAD]], i64 4)
+; CHECK-NEXT:    [[TMP8:%.*]] = fptrunc <vscale x 8 x float> [[TMP7]] to <vscale x 8 x half>
+; CHECK-NEXT:    [[TMP9:%.*]] = call <vscale x 4 x half> @llvm.vector.extract.nxv4f16.nxv8f16(<vscale x 8 x half> [[TMP8]], i64 0)
+; CHECK-NEXT:    [[TMP10:%.*]] = call <vscale x 4 x half> @llvm.vector.extract.nxv4f16.nxv8f16(<vscale x 8 x half> [[TMP8]], i64 4)
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds half, ptr [[PTR1:%.*]], i64 0
+; CHECK-NEXT:    store <vscale x 4 x half> [[TMP10]], ptr [[TMP11]], align 2
+; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds half, ptr [[TMP11]], i64 [[TMP4]]
+; CHECK-NEXT:    store <vscale x 4 x half> [[TMP9]], ptr [[TMP12]], align 2
+; CHECK-NEXT:    ret void
+;
+  %1 = tail call i64 @llvm.vscale.i64()
+  %2 = shl nuw nsw i64 %1, 2
+  %3 = tail call i64 @llvm.vscale.i64()
+  %4 = shl nuw nsw i64 %3, 2
+  %wide.load = load <vscale x 4 x float>, ptr %ptr, align 4
+  %5 = getelementptr inbounds float, ptr %ptr, i64 %2
+  %wide.load9 = load <vscale x 4 x float>, ptr %5, align 4
+  %6 = fptrunc <vscale x 4 x float> %wide.load to <vscale x 4 x half>
+  %7 = fptrunc <vscale x 4 x float> %wide.load9 to <vscale x 4 x half>
+  %8 = getelementptr inbounds half, ptr %ptr1, i64 0
+  store <vscale x 4 x half> %6, ptr %8, align 2
+  %9 = getelementptr inbounds half, ptr %8, i64 %4
+  store <vscale x 4 x half> %7, ptr %9, align 2
+  ret void
+}
+
+declare i64 @llvm.vscale.i64()
diff --git a/llvm/utils/gn/secondary/llvm/lib/Transforms/Vectorize/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Transforms/Vectorize/BUILD.gn
index ca67426e08699ba..f5ef7bbd7106a55 100644
--- a/llvm/utils/gn/secondary/llvm/lib/Transforms/Vectorize/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/lib/Transforms/Vectorize/BUILD.gn
@@ -19,5 +19,6 @@ static_library("Vectorize") {
     "VPlanVerifier.cpp",
     "VectorCombine.cpp",
     "Vectorize.cpp",
+    "VectorWiden.cpp",
   ]
 }

>From 2a6a2abad411df58b3a524d2ec32d56f414fffac Mon Sep 17 00:00:00 2001
From: Dinar Temirbulatov <dinar.temirbulatov at arm.com>
Date: Mon, 16 Oct 2023 06:29:23 +0000
Subject: [PATCH 2/5] Add support for testing by overriding
 TTI::considerToWiden().
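
With the new vw-override-target-consider-to-widen option, the pass can be
exercised in tests without relying on a target hook. Something along these
lines should work (illustrative invocation; the option name comes from the
new cl::opt below):

  opt -passes=vector-widen -vw-override-target-consider-to-widen -S input.ll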

---
 llvm/lib/Transforms/Vectorize/VectorWiden.cpp | 192 +++++++++++-------
 1 file changed, 118 insertions(+), 74 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VectorWiden.cpp b/llvm/lib/Transforms/Vectorize/VectorWiden.cpp
index 8a97656ab2a803c..d1a2a06fe4a648f 100644
--- a/llvm/lib/Transforms/Vectorize/VectorWiden.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorWiden.cpp
@@ -1,4 +1,4 @@
-///==--- VectorWiden.cpp - Combining Vector Operations to wider types ----==//
+//===--- VectorWiden.cpp - Combining Vector Operations to wider types ----===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -51,6 +51,10 @@ static cl::opt<unsigned>
                     cl::desc("Maximum distance between instructions to "
                              "consider for widening"));
 
+static cl::opt<bool> OverrideTargetConsiderToWiden(
+    "vw-override-target-consider-to-widen", cl::init(false), cl::Hidden,
+    cl::desc("Ignore any target information when considering widening"));
+
 namespace {
 class VectorWiden {
 public:
@@ -94,10 +98,14 @@ class VectorWiden {
 void VectorWiden::widenCastInst(ArrayRef<Instruction *> IL) {
   Instruction *I = IL[0];
   Instruction *I1 = IL[1];
-  VectorType *RetOrigType = cast<VectorType>(I->getType());
-  VectorType *OrigType = cast<VectorType>(I->getOperand(0)->getType());
-  VectorType *RetType = VectorType::getDoubleElementsVectorType(RetOrigType);
-  VectorType *OpType = VectorType::getDoubleElementsVectorType(OrigType);
+  auto *RetOrigType = cast<VectorType>(I->getType());
+  auto *OrigType = cast<VectorType>(I->getOperand(0)->getType());
+  auto *RetType = VectorType::getDoubleElementsVectorType(RetOrigType);
+  auto *OpType = VectorType::getDoubleElementsVectorType(OrigType);
+  unsigned Offset =
+      dyn_cast<ScalableVectorType>(OrigType)
+          ? (cast<ScalableVectorType>(OrigType))->getMinNumElements()
+          : (cast<FixedVectorType>(OrigType))->getNumElements();
   Value *WideVec = UndefValue::get(OpType);
   Builder.SetInsertPoint(I);
   Function *InsertIntr = llvm::Intrinsic::getDeclaration(
@@ -105,7 +113,7 @@ void VectorWiden::widenCastInst(ArrayRef<Instruction *> IL) {
   Value *Insert1 = Builder.CreateCall(
       InsertIntr, {WideVec, I->getOperand(0), Builder.getInt64(0)});
   Value *Insert2 = Builder.CreateCall(
-      InsertIntr, {Insert1, I1->getOperand(0), Builder.getInt64(4)});
+      InsertIntr, {Insert1, I1->getOperand(0), Builder.getInt64(Offset)});
   Value *ResCast = Builder.CreateCast(Instruction::CastOps(I->getOpcode()),
                                       Insert2, RetType);
   Function *ExtractIntr = llvm::Intrinsic::getDeclaration(
@@ -117,7 +125,7 @@ void VectorWiden::widenCastInst(ArrayRef<Instruction *> IL) {
   }
   if (!I1->users().empty()) {
     Value *Res2 =
-        Builder.CreateCall(ExtractIntr, {ResCast, Builder.getInt64(4)});
+        Builder.CreateCall(ExtractIntr, {ResCast, Builder.getInt64(Offset)});
     I1->replaceAllUsesWith(Res2);
   }
 }
@@ -131,20 +139,26 @@ void VectorWiden::widenBinaryOperator(ArrayRef<Instruction *> IL) {
   Value *YHi = I->getOperand(1);
   Value *YLo = I1->getOperand(1);
 
-  VectorType *RetOrigType = cast<VectorType>(I->getType());
-  VectorType *OrigType = cast<VectorType>(I->getOperand(0)->getType());
-  VectorType *RetType = VectorType::getDoubleElementsVectorType(RetOrigType);
-  VectorType *OpType = VectorType::getDoubleElementsVectorType(OrigType);
+  auto *RetOrigType = cast<VectorType>(I->getType());
+  auto *OrigType = cast<VectorType>(I->getOperand(0)->getType());
+  auto *RetType = VectorType::getDoubleElementsVectorType(RetOrigType);
+  auto *OpType = VectorType::getDoubleElementsVectorType(OrigType);
+  unsigned Offset =
+      dyn_cast<ScalableVectorType>(OrigType)
+          ? (cast<ScalableVectorType>(OrigType))->getMinNumElements()
+          : (cast<FixedVectorType>(OrigType))->getNumElements();
   Value *WideVec = UndefValue::get(OpType);
   Builder.SetInsertPoint(I);
   Function *InsertIntr = llvm::Intrinsic::getDeclaration(
       F.getParent(), Intrinsic::vector_insert, {OpType, OrigType});
   Value *X1 =
       Builder.CreateCall(InsertIntr, {WideVec, XLo, Builder.getInt64(0)});
-  Value *X2 = Builder.CreateCall(InsertIntr, {X1, XHi, Builder.getInt64(4)});
+  Value *X2 =
+      Builder.CreateCall(InsertIntr, {X1, XHi, Builder.getInt64(Offset)});
   Value *Y1 =
       Builder.CreateCall(InsertIntr, {WideVec, YLo, Builder.getInt64(0)});
-  Value *Y2 = Builder.CreateCall(InsertIntr, {Y1, YHi, Builder.getInt64(4)});
+  Value *Y2 =
+      Builder.CreateCall(InsertIntr, {Y1, YHi, Builder.getInt64(Offset)});
   Value *ResBinOp =
       Builder.CreateBinOp((Instruction::BinaryOps)I->getOpcode(), X2, Y2);
   ValueList VL;
@@ -155,7 +169,7 @@ void VectorWiden::widenBinaryOperator(ArrayRef<Instruction *> IL) {
       F.getParent(), Intrinsic::vector_extract, {RetOrigType, RetType});
   if (!I->users().empty()) {
     Value *Res =
-        Builder.CreateCall(ExtractIntr, {ResBinOp, Builder.getInt64(4)});
+        Builder.CreateCall(ExtractIntr, {ResBinOp, Builder.getInt64(Offset)});
     I->replaceAllUsesWith(Res);
   }
   if (!I1->users().empty()) {
@@ -167,7 +181,10 @@ void VectorWiden::widenBinaryOperator(ArrayRef<Instruction *> IL) {
 
 bool VectorWiden::canWidenNode(ArrayRef<Instruction *> IL,
                                LLVMContext &Context) {
-  if (!TTI.considerToWiden(Context, IL))
+  if (!OverrideTargetConsiderToWiden && !TTI.considerToWiden(Context, IL))
+    return false;
+
+  if (!dyn_cast<VectorType>(IL[0]->getType()))
     return false;
 
   for (int X = 0, E = IL.size(); X < E; X++) {
@@ -176,7 +193,6 @@ bool VectorWiden::canWidenNode(ArrayRef<Instruction *> IL,
         continue;
       if ((IL[X] == IL[Y]) || (IL[X]->getOpcode() != IL[Y]->getOpcode()) ||
           // Ignore if any of them live in a different basic block.
-          (IL[X]->getParent() != IL[Y]->getParent()) ||
           (IL[X]->getType() != IL[Y]->getType()) ||
           (IL[X]->getOperand(0)->getType() !=
            IL[Y]->getOperand(0)->getType()) ||
@@ -190,44 +206,53 @@ bool VectorWiden::canWidenNode(ArrayRef<Instruction *> IL,
     if (isDeleted(IL[X]) || !IL[X]->hasOneUse())
       return false;
     if (IL[0]->getParent() == IL[X]->user_back()->getParent() &&
-        DT.dominates(IL[X]->user_back(), IL[0]))
+        IL[X]->user_back()->comesBefore(IL[0]))
       return false;
   }
   return true;
 }
 
 bool VectorWiden::widenNode(ArrayRef<Instruction *> IL, LLVMContext &Context) {
-  LLVM_DEBUG(dbgs() << "VW: widenNode: " << *IL[0] << " " << *IL[1] << "\n");
-  assert(IL.size() == 2 && "Incorrect instructions list to widen.");
+  // Currently, this pass only supports widening exactly two operations
+  // into a single operation.
+  if (IL.size() != 2)
+    return false;
   if (!canWidenNode(IL, Context))
     return false;
+
+  unsigned Opcode = IL[0]->getOpcode();
+
   if (dyn_cast<CastInst>(IL[0])) {
-    VectorType *RetOrigType = cast<VectorType>(IL[0]->getType());
-    VectorType *OrigType = cast<VectorType>(IL[0]->getOperand(0)->getType());
-    InstructionCost Cost =
-        getOpCost(Instruction::FPTrunc, RetOrigType, OrigType, IL[0]);
-    VectorType *RetType = VectorType::getDoubleElementsVectorType(RetOrigType);
-    VectorType *OpType = VectorType::getDoubleElementsVectorType(OrigType);
-    InstructionCost CostNew =
-        getOpCost(Instruction::FPTrunc, RetType, OpType, IL[0]);
-    if (2 * Cost < CostNew)
-      return false;
-    LLVM_DEBUG(dbgs() << "VW: Decided to widen CastInst, safe to merge : "
-                      << *IL[0] << " with  " << *IL[1] << "\n");
+    if (!OverrideTargetConsiderToWiden) {
+      auto *OrigType = cast<VectorType>(IL[0]->getOperand(0)->getType());
+      auto *RetOrigType = cast<VectorType>(IL[0]->getType());
+      InstructionCost Cost = getOpCost(Opcode, RetOrigType, OrigType, IL[0]);
+      auto *RetType = VectorType::getDoubleElementsVectorType(RetOrigType);
+      auto *OpType = VectorType::getDoubleElementsVectorType(OrigType);
+      InstructionCost CostNew = getOpCost(Opcode, RetType, OpType, IL[0]);
+      if (2 * Cost < CostNew)
+        return false;
+    }
+    LLVM_DEBUG(
+        dbgs()
+        << "VW: Decided to widen CastInst, safe to merge node starting with "
+        << *IL[0] << "\n");
     widenCastInst(IL);
     return true;
   }
   if (dyn_cast<BinaryOperator>(IL[0])) {
-    VectorType *OrigType = cast<VectorType>(IL[0]->getOperand(0)->getType());
-    VectorType *OpType = VectorType::getDoubleElementsVectorType(OrigType);
-    InstructionCost Cost =
-        getOpCost(Instruction::Add, OrigType, OrigType, IL[0]);
-    InstructionCost CostNew =
-        getOpCost(Instruction::Add, OpType, OpType, IL[0]);
-    if (2 * Cost < CostNew)
-      return false;
-    LLVM_DEBUG(dbgs() << "VW: Decided to widen BinaryOp, safe to merge : "
-                      << *IL[0] << " with  " << *IL[1] << "\n");
+    if (!OverrideTargetConsiderToWiden) {
+      auto *OrigType = cast<VectorType>(IL[0]->getOperand(0)->getType());
+      auto *OpType = VectorType::getDoubleElementsVectorType(OrigType);
+      InstructionCost Cost = getOpCost(Opcode, OrigType, OrigType, IL[0]);
+      InstructionCost CostNew = getOpCost(Opcode, OpType, OpType, IL[0]);
+      if (2 * Cost < CostNew)
+        return false;
+    }
+    LLVM_DEBUG(
+        dbgs()
+        << "VW: Decided to widen BinaryOp, safe to merge node starting with "
+        << *IL[0] << "\n");
     widenBinaryOperator(IL);
     return true;
   }
@@ -259,11 +284,26 @@ InstructionCost VectorWiden::getOpCost(unsigned Opcode, Type *To, Type *From,
   return Cost;
 }
 
+static bool isOperationSupported(unsigned Opcode) {
+  if (Opcode == Instruction::FPToUI || Opcode == Instruction::FPToSI ||
+      Opcode == Instruction::FPExt || Opcode == Instruction::SIToFP ||
+      Opcode == Instruction::UIToFP || Opcode == Instruction::Trunc ||
+      Opcode == Instruction::FPTrunc || Opcode == Instruction::BitCast)
+    return true;
+  return false;
+}
+
 bool VectorWiden::processBB(BasicBlock &BB, LLVMContext &Context) {
-  DenseMap<unsigned, std::pair<InstrList, unsigned>> Operations;
-  unsigned Counter = 0;
+  struct Operation {
+    // Distance between the first operation in the list of operations
+    // and the last instruction in the current basic block.
+    unsigned Position;
+    InstrList Ops;
+  };
+  DenseMap<unsigned, Operation> Operations;
+  unsigned InstrDistance = 0;
   for (BasicBlock::reverse_iterator IP(BB.rbegin()); IP != BB.rend();
-       *IP++, ++Counter) {
+       *IP++, ++InstrDistance) {
     Instruction *I = &*IP;
     unsigned OpFound = 0;
 
@@ -271,41 +311,45 @@ bool VectorWiden::processBB(BasicBlock &BB, LLVMContext &Context) {
       continue;
 
     unsigned Opcode = I->getOpcode();
-    if ((dyn_cast<BinaryOperator>(I) && I->getNumOperands() == 2) ||
-        (dyn_cast<CastInst>(I) &&
-         (Opcode == Instruction::ZExt || Opcode == Instruction::SExt ||
-          Opcode == Instruction::FPToUI || Opcode == Instruction::FPToSI ||
-          Opcode == Instruction::FPExt || Opcode == Instruction::SIToFP ||
-          Opcode == Instruction::UIToFP || Opcode == Instruction::Trunc ||
-          Opcode == Instruction::FPTrunc || Opcode == Instruction::BitCast) &&
-         I->getNumOperands() == 1)) {
-      if (Operations.find(I->getOpcode()) != Operations.end()) {
-        auto *OpRec = &Operations[I->getOpcode()];
-        // If the instructions are too far apart then remove the old
-        // instruction and reset the position to this instruction.
-        if (Counter - Operations[I->getOpcode()].second > MaxInstDistance) {
-          OpRec->second = Counter;
-          OpRec->first.clear();
-          OpRec->first.push_back(I);
-        } else {
-          OpRec->first.push_back(I);
-          OpFound = I->getOpcode();
+    if (!dyn_cast<BinaryOperator>(I) && !isOperationSupported(Opcode))
+      continue;
+
+    if (Operations.contains(I->getOpcode())) {
+      Operation *OpRec = &Operations[I->getOpcode()];
+      // If the instructions are too far apart then remove the old
+      // instruction and reset the position to this instruction.
+      if (InstrDistance - OpRec->Position > MaxInstDistance) {
+        OpRec->Ops.erase(OpRec->Ops.begin());
+
+        for (InstrList::iterator It = OpRec->Ops.begin();
+             It != OpRec->Ops.end(); ++It) {
+          Instruction *Instr = *It;
+          if (std::distance(Instr, &(*BB.end())) > MaxInstDistance)
+            OpRec->Ops.erase(It);
         }
+        // If no operations are left in the list, set the position
+        // to the current one.
+        if (!OpRec->Ops.size())
+          OpRec->Position = InstrDistance;
+        OpRec->Ops.push_back(I);
       } else {
-        Operations[I->getOpcode()] = {{I}, Counter};
+        OpRec->Ops.push_back(I);
+        OpFound = I->getOpcode();
       }
+    } else {
+      Operations[I->getOpcode()] = {InstrDistance, {I}};
     }
 
-    if (OpFound && Operations.find(OpFound) != Operations.end()) {
+    if (OpFound && Operations.contains(OpFound)) {
       auto *OpRec = &Operations[OpFound];
-      for (Instruction *Op : OpRec->first)
-        LLVM_DEBUG(dbgs() << "VW Op to check : " << *Op << "\n");
-      if (!widenNode(OpRec->first, Context)) {
-        LLVM_DEBUG(dbgs() << "VW Unable to construct the tree.\n");
-        OpRec->first.erase(OpRec->first.begin());
-        OpRec->second = Counter;
+      for (Instruction *Op : OpRec->Ops)
+        LLVM_DEBUG(dbgs() << "VW: operation to check : " << *Op << "\n");
+      if (!widenNode(OpRec->Ops, Context)) {
+        LLVM_DEBUG(dbgs() << "VW: Unable to construct the tree.\n");
+        OpRec->Ops.erase(OpRec->Ops.begin());
+        OpRec->Position = InstrDistance;
       } else {
-        for (Instruction *Instr : OpRec->first)
+        for (Instruction *Instr : OpRec->Ops)
           eraseInstruction(Instr);
         return true;
       }
@@ -318,9 +362,9 @@ bool VectorWiden::run() {
   bool Changed = false;
   LLVMContext &Context = F.getContext();
 
-  LLVM_DEBUG(dbgs() << "VW Function:" << F.getName() << "\n");
+  LLVM_DEBUG(dbgs() << "VW: Function:" << F.getName() << "\n");
   for (BasicBlock &BB : F) {
-    LLVM_DEBUG(dbgs() << "VW BB:" << BB.getName() << "\n");
+    LLVM_DEBUG(dbgs() << "VW: BB:" << BB.getName() << "\n");
 
     while (processBB(BB, Context))
       Changed = true;

>From c4fec7303e0ebb0108898d3452a9597e4dac4c0b Mon Sep 17 00:00:00 2001
From: Dinar Temirbulatov <dinar.temirbulatov at arm.com>
Date: Wed, 18 Oct 2023 14:01:08 +0000
Subject: [PATCH 3/5] Fix incorrect vector extract/insert offset calculation
 when handling BitCast instructions.
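
A bitcast can change the number of elements between its operand and its
result, so the offset for the vector.extract of the second result must be
derived from the widened result type rather than from the operand type.
Roughly, for two <vscale x 4 x i32> -> <vscale x 8 x i16> bitcasts
(illustrative IR, matching the BitCastOffsetExtract computation below):

  %wide = bitcast <vscale x 8 x i32> %packed to <vscale x 16 x i16>
  %lo = call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> %wide, i64 0)
  %hi = call <vscale x 8 x i16> @llvm.vector.extract.nxv8i16.nxv16i16(<vscale x 16 x i16> %wide, i64 8)

The operands are still inserted at offsets 0 and 4 (in nxv4i32 elements), but
the extracts now use the result element count, 8, instead of 4.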

---
 llvm/lib/Transforms/Vectorize/VectorWiden.cpp | 33 +++++++++++--------
 1 file changed, 20 insertions(+), 13 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VectorWiden.cpp b/llvm/lib/Transforms/Vectorize/VectorWiden.cpp
index d1a2a06fe4a648f..ebfebec3fda005a 100644
--- a/llvm/lib/Transforms/Vectorize/VectorWiden.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorWiden.cpp
@@ -24,7 +24,6 @@
 #include "llvm/Analysis/VectorUtils.h"
 #include "llvm/CodeGen/ValueTypes.h"
 #include "llvm/IR/ConstantRange.h"
-#include "llvm/IR/Dominators.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/InstrTypes.h"
@@ -60,8 +59,8 @@ class VectorWiden {
 public:
   using InstrList = SmallVector<Instruction *, 2>;
   using ValueList = SmallVector<Value *, 2>;
-  VectorWiden(Function &F, const TargetTransformInfo &TTI, DominatorTree &DT)
-      : F(F), Builder(F.getContext()), TTI(TTI), DT(DT) {}
+  VectorWiden(Function &F, const TargetTransformInfo &TTI)
+      : F(F), Builder(F.getContext()), TTI(TTI) {}
 
   bool run();
 
@@ -69,7 +68,6 @@ class VectorWiden {
   Function &F;
   IRBuilder<> Builder;
   const TargetTransformInfo &TTI;
-  DominatorTree &DT;
   TargetLibraryInfo *TLI;
 
   DenseSet<Instruction *> DeletedInstructions;
@@ -102,10 +100,17 @@ void VectorWiden::widenCastInst(ArrayRef<Instruction *> IL) {
   auto *OrigType = cast<VectorType>(I->getOperand(0)->getType());
   auto *RetType = VectorType::getDoubleElementsVectorType(RetOrigType);
   auto *OpType = VectorType::getDoubleElementsVectorType(OrigType);
+
+  bool isBitCast = I->getOpcode() == Instruction::BitCast;
   unsigned Offset =
       dyn_cast<ScalableVectorType>(OrigType)
           ? (cast<ScalableVectorType>(OrigType))->getMinNumElements()
           : (cast<FixedVectorType>(OrigType))->getNumElements();
+  unsigned BitCastOffsetExtract =
+      (dyn_cast<ScalableVectorType>(RetType)
+           ? (cast<ScalableVectorType>(RetType))->getMinNumElements()
+           : (cast<FixedVectorType>(RetType))->getNumElements()) /
+      2;
   Value *WideVec = UndefValue::get(OpType);
   Builder.SetInsertPoint(I);
   Function *InsertIntr = llvm::Intrinsic::getDeclaration(
@@ -116,6 +121,7 @@ void VectorWiden::widenCastInst(ArrayRef<Instruction *> IL) {
       InsertIntr, {Insert1, I1->getOperand(0), Builder.getInt64(Offset)});
   Value *ResCast = Builder.CreateCast(Instruction::CastOps(I->getOpcode()),
                                       Insert2, RetType);
+
   Function *ExtractIntr = llvm::Intrinsic::getDeclaration(
       F.getParent(), Intrinsic::vector_extract, {RetOrigType, RetType});
   if (!I->users().empty()) {
@@ -124,9 +130,10 @@ void VectorWiden::widenCastInst(ArrayRef<Instruction *> IL) {
     I->replaceAllUsesWith(Res);
   }
   if (!I1->users().empty()) {
-    Value *Res2 =
-        Builder.CreateCall(ExtractIntr, {ResCast, Builder.getInt64(Offset)});
-    I1->replaceAllUsesWith(Res2);
+    Value *Res = Builder.CreateCall(
+        ExtractIntr,
+        {ResCast, Builder.getInt64(isBitCast ? BitCastOffsetExtract : Offset)});
+    I1->replaceAllUsesWith(Res);
   }
 }
 
@@ -173,9 +180,9 @@ void VectorWiden::widenBinaryOperator(ArrayRef<Instruction *> IL) {
     I->replaceAllUsesWith(Res);
   }
   if (!I1->users().empty()) {
-    Value *Res2 =
+    Value *Res =
         Builder.CreateCall(ExtractIntr, {ResBinOp, Builder.getInt64(0)});
-    I1->replaceAllUsesWith(Res2);
+    I1->replaceAllUsesWith(Res);
   }
 }
 
@@ -197,8 +204,9 @@ bool VectorWiden::canWidenNode(ArrayRef<Instruction *> IL,
           (IL[X]->getOperand(0)->getType() !=
            IL[Y]->getOperand(0)->getType()) ||
           // Ignore if the distance between the two is too large.
-          (abs(std::distance(IL[Y]->getIterator(), IL[X]->getIterator())) >
-           MaxInstDistance) ||
+          (IL[Y]->comesBefore(IL[X]) &&
+           abs(std::distance(IL[Y]->getIterator(), IL[X]->getIterator())) >
+               MaxInstDistance) ||
           (IL[X]->getOperand(0) == IL[Y] ||
            (IL[X]->getNumOperands() > 1 && IL[X]->getOperand(1) == IL[Y])))
         return false;
@@ -380,9 +388,8 @@ bool VectorWiden::run() {
 PreservedAnalyses VectorWidenPass::run(Function &F,
                                        FunctionAnalysisManager &FAM) {
   TargetTransformInfo &TTI = FAM.getResult<TargetIRAnalysis>(F);
-  DominatorTree &DT = FAM.getResult<DominatorTreeAnalysis>(F);
 
-  VectorWiden VecWiden(F, TTI, DT);
+  VectorWiden VecWiden(F, TTI);
 
   if (!VecWiden.run())
     return PreservedAnalyses::all();

>From 86dfaeb009c8621084f6e28f44cd2a14f6e0779d Mon Sep 17 00:00:00 2001
From: Dinar Temirbulatov <dinar.temirbulatov at arm.com>
Date: Wed, 25 Oct 2023 13:14:34 +0000
Subject: [PATCH 4/5] Addressed review remarks.

---
 llvm/lib/Transforms/Vectorize/VectorWiden.cpp | 126 +++++++++++-------
 llvm/test/Transforms/VectorWiden/add.ll       |  84 +++++-------
 llvm/test/Transforms/VectorWiden/bitcast.ll   |  66 +++++++++
 llvm/test/Transforms/VectorWiden/ext-trunc.ll |  67 ++++++++++
 llvm/test/Transforms/VectorWiden/fadd.ll      |  52 ++++++++
 llvm/test/Transforms/VectorWiden/fp-ext.ll    |  24 ++++
 llvm/test/Transforms/VectorWiden/fp-int.ll    |  86 ++++++++++++
 .../Transforms/VectorWiden/fptrunc-bad-dep.ll |  44 +++---
 llvm/test/Transforms/VectorWiden/fptrunc.ll   |  42 +++---
 .../Transforms/VectorWiden/widen-distance.ll  |  59 ++++++++
 10 files changed, 496 insertions(+), 154 deletions(-)
 create mode 100644 llvm/test/Transforms/VectorWiden/bitcast.ll
 create mode 100644 llvm/test/Transforms/VectorWiden/ext-trunc.ll
 create mode 100644 llvm/test/Transforms/VectorWiden/fadd.ll
 create mode 100644 llvm/test/Transforms/VectorWiden/fp-ext.ll
 create mode 100644 llvm/test/Transforms/VectorWiden/fp-int.ll
 create mode 100644 llvm/test/Transforms/VectorWiden/widen-distance.ll

diff --git a/llvm/lib/Transforms/Vectorize/VectorWiden.cpp b/llvm/lib/Transforms/Vectorize/VectorWiden.cpp
index ebfebec3fda005a..79e0aaad1329a6e 100644
--- a/llvm/lib/Transforms/Vectorize/VectorWiden.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorWiden.cpp
@@ -171,7 +171,9 @@ void VectorWiden::widenBinaryOperator(ArrayRef<Instruction *> IL) {
   ValueList VL;
   for (Instruction *I : IL)
     VL.push_back(I);
+
   propagateIRFlags(ResBinOp, VL);
+
   Function *ExtractIntr = llvm::Intrinsic::getDeclaration(
       F.getParent(), Intrinsic::vector_extract, {RetOrigType, RetType});
   if (!I->users().empty()) {
@@ -191,9 +193,6 @@ bool VectorWiden::canWidenNode(ArrayRef<Instruction *> IL,
   if (!OverrideTargetConsiderToWiden && !TTI.considerToWiden(Context, IL))
     return false;
 
-  if (!dyn_cast<VectorType>(IL[0]->getType()))
-    return false;
-
   for (int X = 0, E = IL.size(); X < E; X++) {
     for (int Y = 0, E = IL.size(); Y < E; Y++) {
       if (X == Y)
@@ -205,7 +204,7 @@ bool VectorWiden::canWidenNode(ArrayRef<Instruction *> IL,
            IL[Y]->getOperand(0)->getType()) ||
           // Ignore if disatance between two are too apart.
           (IL[Y]->comesBefore(IL[X]) &&
-           abs(std::distance(IL[Y]->getIterator(), IL[X]->getIterator())) >
+           std::distance(IL[Y]->getIterator(), IL[X]->getIterator()) >
                MaxInstDistance) ||
           (IL[X]->getOperand(0) == IL[Y] ||
            (IL[X]->getNumOperands() > 1 && IL[X]->getOperand(1) == IL[Y])))
@@ -261,6 +260,8 @@ bool VectorWiden::widenNode(ArrayRef<Instruction *> IL, LLVMContext &Context) {
         dbgs()
         << "VW: Decided to widen BinaryOp, safe to merge node starting with "
         << *IL[0] << "\n");
+    // Propagate IR flags for the whole group of operations, e.g. the "fast"
+    // flag for floating-point operations or "nuw" for integer instructions.
     widenBinaryOperator(IL);
     return true;
   }
@@ -271,81 +272,100 @@ InstructionCost VectorWiden::getOpCost(unsigned Opcode, Type *To, Type *From,
                                        Instruction *I) {
   InstructionCost Cost = 0;
   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
-  switch (Opcode) {
-  case Instruction::FPTrunc: {
-    Cost = TTI.getCastInstrCost(Opcode, To, From, TTI::getCastContextHint(I),
-                                CostKind, I);
-    break;
-  }
-  case Instruction::Add: {
+  if (dyn_cast<BinaryOperator>(I)) {
     unsigned OpIdx = isa<UnaryOperator>(I) ? 0 : 1;
     TTI::OperandValueInfo Op1Info = TTI::getOperandInfo(I->getOperand(0));
     TTI::OperandValueInfo Op2Info = TTI::getOperandInfo(I->getOperand(OpIdx));
     SmallVector<const Value *> Operands(I->operand_values());
     Cost = TTI.getArithmeticInstrCost(I->getOpcode(), To, CostKind, Op1Info,
                                       Op2Info, Operands, I);
-    break;
-  }
-  default:
-    llvm_unreachable("Unknown instruction");
+  } else if (dyn_cast<CastInst>(I)) {
+    Cost = TTI.getCastInstrCost(Opcode, To, From, TTI::getCastContextHint(I),
+                                CostKind, I);
   }
   return Cost;
 }
 
-static bool isOperationSupported(unsigned Opcode) {
-  if (Opcode == Instruction::FPToUI || Opcode == Instruction::FPToSI ||
-      Opcode == Instruction::FPExt || Opcode == Instruction::SIToFP ||
-      Opcode == Instruction::UIToFP || Opcode == Instruction::Trunc ||
-      Opcode == Instruction::FPTrunc || Opcode == Instruction::BitCast)
+static bool isOperationSupported(Instruction *I) {
+  unsigned Opcode = I->getOpcode();
+  // Currently, we support only these operations; more could be added later.
+  if (dyn_cast<VectorType>(I->getType()) &&
+      (I->isBinaryOp() || Opcode == Instruction::SExt ||
+       Opcode == Instruction::ZExt || Opcode == Instruction::FPToUI ||
+       Opcode == Instruction::FPToSI || Opcode == Instruction::FPExt ||
+       Opcode == Instruction::SIToFP || Opcode == Instruction::UIToFP ||
+       Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc ||
+       Opcode == Instruction::BitCast))
     return true;
   return false;
 }
 
 bool VectorWiden::processBB(BasicBlock &BB, LLVMContext &Context) {
   struct Operation {
-    // Distance between the first operation in the list of operations
-    // and the last instruction in the current basic block.
+    // Position at which the first operation in the list was discovered,
+    // measured from the last instruction in the current basic block.
     unsigned Position;
     InstrList Ops;
   };
+  // The key is the operation opcode.
+  // The value is a list of operations together with the position of the
+  // first operation in the basic block.
   DenseMap<unsigned, Operation> Operations;
-  unsigned InstrDistance = 0;
+  Instruction *LastInstr = BB.getTerminator();
+  unsigned CurrentPosition = 0;
   for (BasicBlock::reverse_iterator IP(BB.rbegin()); IP != BB.rend();
-       *IP++, ++InstrDistance) {
+       *IP++, ++CurrentPosition) {
     Instruction *I = &*IP;
     unsigned OpFound = 0;
 
-    if (I->isDebugOrPseudoInst() || isDeleted(I))
+    if (I->isDebugOrPseudoInst() || isDeleted(I) || !isOperationSupported(I))
       continue;
 
     unsigned Opcode = I->getOpcode();
-    if (!dyn_cast<BinaryOperator>(I) && !isOperationSupported(Opcode))
-      continue;
-
-    if (Operations.contains(I->getOpcode())) {
-      Operation *OpRec = &Operations[I->getOpcode()];
-      // If instructions are too apart then remove old instrction
-      // and reset position to this instruction.
-      if (InstrDistance - OpRec->Position > MaxInstDistance) {
-        OpRec->Ops.erase(OpRec->Ops.begin());
-
+    if (Operations.contains(Opcode)) {
+      Operation *OpRec = &Operations[Opcode];
+      // If instructions are too far apart, then remove the old instruction
+      // and reset the position to the next remaining instruction in the list.
+      if (CurrentPosition - OpRec->Position > MaxInstDistance) {
+        unsigned NumToDelete = 0;
         for (InstrList::iterator It = OpRec->Ops.begin();
              It != OpRec->Ops.end(); ++It) {
           Instruction *Instr = *It;
-          if (std::distance(Instr, &(*BB.end())) > MaxInstDistance)
-            OpRec->Ops.erase(It);
+          unsigned NewPosition =
+              std::distance(Instr->getIterator(), LastInstr->getIterator());
+          if (CurrentPosition - NewPosition > MaxInstDistance) {
+            NumToDelete++;
+          } else {
+            // Update Position to the next remaining in-range operation.
+            OpRec->Position = NewPosition;
+            LLVM_DEBUG(dbgs() << "VW: Updating node starting with "
+                              << **(OpRec->Ops.begin())
+                              << " position to : " << NewPosition << "\n");
+            break;
+          }
+        }
+        for (unsigned i = 0; i < NumToDelete; ++i) {
+          LLVM_DEBUG(dbgs()
+                     << "VW: Deleting operation " << **(OpRec->Ops.begin())
+                     << " from node as out of range."
+                     << "\n");
+          OpRec->Ops.erase(OpRec->Ops.begin());
         }
-        // If no operations left in the list set position
-        // to the current.
-        if (!OpRec->Ops.size())
-          OpRec->Position = InstrDistance;
-        OpRec->Ops.push_back(I);
-      } else {
-        OpRec->Ops.push_back(I);
-        OpFound = I->getOpcode();
       }
+      // If no operations left in the list, set position to the current.
+      if (!OpRec->Ops.size())
+        OpRec->Position = CurrentPosition;
+      OpRec->Ops.push_back(I);
+      LLVM_DEBUG(dbgs() << "VW: Found operation " << *I
+                        << " to add to existing node starting at "
+                        << **(OpRec->Ops.begin()) << " at : " << OpRec->Position
+                        << "\n");
+      if (OpRec->Ops.size() > 1)
+        OpFound = Opcode;
     } else {
-      Operations[I->getOpcode()] = {InstrDistance, {I}};
+      LLVM_DEBUG(dbgs() << "VW: Found operation " << *I
+                        << " to form a node at : " << CurrentPosition << "\n");
+      Operations[Opcode] = {CurrentPosition, {I}};
     }
 
     if (OpFound && Operations.contains(OpFound)) {
@@ -353,9 +373,15 @@ bool VectorWiden::processBB(BasicBlock &BB, LLVMContext &Context) {
       for (Instruction *Op : OpRec->Ops)
         LLVM_DEBUG(dbgs() << "VW: operation to check : " << *Op << "\n");
       if (!widenNode(OpRec->Ops, Context)) {
-        LLVM_DEBUG(dbgs() << "VW: Unable to construct the tree.\n");
-        OpRec->Ops.erase(OpRec->Ops.begin());
-        OpRec->Position = InstrDistance;
+        LLVM_DEBUG(dbgs() << "VW: Unable to use a wider vector for vector ops.\n");
+        if (OpRec->Ops.size() > 4) {
+          LLVM_DEBUG(dbgs() << "VW: Deleting operation "
+                            << **(OpRec->Ops.begin()) << " as unable to widen."
+                            << "\n");
+          OpRec->Ops.erase(OpRec->Ops.begin());
+          OpRec->Position = std::distance(
+              (*(OpRec->Ops.begin()))->getIterator(), LastInstr->getIterator());
+        }
       } else {
         for (Instruction *Instr : OpRec->Ops)
           eraseInstruction(Instr);
@@ -374,6 +400,8 @@ bool VectorWiden::run() {
   for (BasicBlock &BB : F) {
     LLVM_DEBUG(dbgs() << "VW: BB:" << BB.getName() << "\n");
 
+    // If any transformation was done, then we have to start all over again,
+    // since we have generated new instructions.
     while (processBB(BB, Context))
       Changed = true;
   }
diff --git a/llvm/test/Transforms/VectorWiden/add.ll b/llvm/test/Transforms/VectorWiden/add.ll
index 4fae437b3963180..05b2eeeb5a9c66a 100644
--- a/llvm/test/Transforms/VectorWiden/add.ll
+++ b/llvm/test/Transforms/VectorWiden/add.ll
@@ -1,72 +1,52 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
 ; RUN: opt -passes=vector-widen -mtriple aarch64-linux-gnu -mattr=+sme2 -S %s 2>&1 | FileCheck %s
 
-define void @add(ptr %ptr, ptr %ptr1) {
+define void @add(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, <vscale x 4 x i32> %c, ptr %ptr) {
 ; CHECK-LABEL: define void @add(
-; CHECK-SAME: ptr [[PTR:%.*]], ptr [[PTR1:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-SAME: <vscale x 4 x i32> [[A:%.*]], <vscale x 4 x i32> [[B:%.*]], <vscale x 4 x i32> [[C:%.*]], ptr [[PTR:%.*]]) #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load <vscale x 4 x i32>, ptr [[PTR]], align 16
-; CHECK-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds <vscale x 4 x i32>, ptr [[PTR]], i64 1
-; CHECK-NEXT:    [[TMP1:%.*]] = load <vscale x 4 x i32>, ptr [[INCDEC_PTR]], align 16
-; CHECK-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds <vscale x 4 x i32>, ptr [[PTR]], i64 2
-; CHECK-NEXT:    [[TMP2:%.*]] = load <vscale x 4 x i32>, ptr [[INCDEC_PTR1]], align 16
-; CHECK-NEXT:    [[TMP3:%.*]] = call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> undef, <vscale x 4 x i32> [[TMP0]], i64 0)
-; CHECK-NEXT:    [[TMP4:%.*]] = call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> [[TMP3]], <vscale x 4 x i32> [[TMP1]], i64 4)
-; CHECK-NEXT:    [[TMP5:%.*]] = call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> undef, <vscale x 4 x i32> [[TMP2]], i64 0)
-; CHECK-NEXT:    [[TMP6:%.*]] = call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> [[TMP5]], <vscale x 4 x i32> [[TMP2]], i64 4)
-; CHECK-NEXT:    [[TMP7:%.*]] = add <vscale x 8 x i32> [[TMP4]], [[TMP6]]
-; CHECK-NEXT:    [[TMP8:%.*]] = call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[TMP7]], i64 4)
-; CHECK-NEXT:    [[TMP9:%.*]] = call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[TMP7]], i64 0)
-; CHECK-NEXT:    store <vscale x 4 x i32> [[TMP9]], ptr [[PTR1]], align 16
-; CHECK-NEXT:    [[INCDEC_PTR3:%.*]] = getelementptr inbounds <vscale x 4 x i32>, ptr [[PTR1]], i64 1
-; CHECK-NEXT:    store <vscale x 4 x i32> [[TMP8]], ptr [[INCDEC_PTR3]], align 16
+; CHECK-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> undef, <vscale x 4 x i32> [[A]], i64 0)
+; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> [[TMP0]], <vscale x 4 x i32> [[B]], i64 4)
+; CHECK-NEXT:    [[TMP2:%.*]] = call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> undef, <vscale x 4 x i32> [[C]], i64 0)
+; CHECK-NEXT:    [[TMP3:%.*]] = call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> [[TMP2]], <vscale x 4 x i32> [[C]], i64 4)
+; CHECK-NEXT:    [[TMP4:%.*]] = add <vscale x 8 x i32> [[TMP1]], [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[TMP4]], i64 4)
+; CHECK-NEXT:    [[TMP6:%.*]] = call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[TMP4]], i64 0)
+; CHECK-NEXT:    store <vscale x 4 x i32> [[TMP6]], ptr [[PTR]], align 16
+; CHECK-NEXT:    [[INCDEC_PTR3:%.*]] = getelementptr inbounds <vscale x 4 x i32>, ptr [[PTR]], i64 1
+; CHECK-NEXT:    store <vscale x 4 x i32> [[TMP5]], ptr [[INCDEC_PTR3]], align 16
 ; CHECK-NEXT:    ret void
 ;
 entry:
-  %0 = load <vscale x 4 x i32>, ptr %ptr, align 16
-  %incdec.ptr = getelementptr inbounds <vscale x 4 x i32>, ptr %ptr, i64 1
-  %1 = load <vscale x 4 x i32>, ptr %incdec.ptr, align 16
-  %incdec.ptr1 = getelementptr inbounds <vscale x 4 x i32>, ptr %ptr, i64 2
-  %2 = load <vscale x 4 x i32>, ptr %incdec.ptr1, align 16
-  %add = add <vscale x 4 x i32> %0, %2
-  %add4 = add <vscale x 4 x i32> %1, %2
-  store <vscale x 4 x i32> %add, ptr %ptr1, align 16
-  %incdec.ptr3 = getelementptr inbounds <vscale x 4 x i32>, ptr %ptr1, i64 1
+  %add = add <vscale x 4 x i32> %a, %c
+  %add4 = add <vscale x 4 x i32> %b, %c
+  store <vscale x 4 x i32> %add, ptr %ptr, align 16
+  %incdec.ptr3 = getelementptr inbounds <vscale x 4 x i32>, ptr %ptr, i64 1
   store <vscale x 4 x i32> %add4, ptr %incdec.ptr3, align 16
   ret void
 }
 
-define void @add_ir_flags(ptr %ptr, ptr %ptr1) {
+define void @add_ir_flags(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, <vscale x 4 x i32> %c, ptr %ptr) {
 ; CHECK-LABEL: define void @add_ir_flags(
-; CHECK-SAME: ptr [[PTR:%.*]], ptr [[PTR1:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <vscale x 4 x i32> [[A:%.*]], <vscale x 4 x i32> [[B:%.*]], <vscale x 4 x i32> [[C:%.*]], ptr [[PTR:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load <vscale x 4 x i32>, ptr [[PTR]], align 16
-; CHECK-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds <vscale x 4 x i32>, ptr [[PTR]], i64 1
-; CHECK-NEXT:    [[TMP1:%.*]] = load <vscale x 4 x i32>, ptr [[INCDEC_PTR]], align 16
-; CHECK-NEXT:    [[INCDEC_PTR1:%.*]] = getelementptr inbounds <vscale x 4 x i32>, ptr [[PTR]], i64 2
-; CHECK-NEXT:    [[TMP2:%.*]] = load <vscale x 4 x i32>, ptr [[INCDEC_PTR1]], align 16
-; CHECK-NEXT:    [[TMP3:%.*]] = call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> undef, <vscale x 4 x i32> [[TMP0]], i64 0)
-; CHECK-NEXT:    [[TMP4:%.*]] = call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> [[TMP3]], <vscale x 4 x i32> [[TMP1]], i64 4)
-; CHECK-NEXT:    [[TMP5:%.*]] = call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> undef, <vscale x 4 x i32> [[TMP2]], i64 0)
-; CHECK-NEXT:    [[TMP6:%.*]] = call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> [[TMP5]], <vscale x 4 x i32> [[TMP2]], i64 4)
-; CHECK-NEXT:    [[TMP7:%.*]] = add nuw <vscale x 8 x i32> [[TMP4]], [[TMP6]]
-; CHECK-NEXT:    [[TMP8:%.*]] = call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[TMP7]], i64 4)
-; CHECK-NEXT:    [[TMP9:%.*]] = call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[TMP7]], i64 0)
-; CHECK-NEXT:    store <vscale x 4 x i32> [[TMP9]], ptr [[PTR1]], align 16
-; CHECK-NEXT:    [[INCDEC_PTR3:%.*]] = getelementptr inbounds <vscale x 4 x i32>, ptr [[PTR1]], i64 1
-; CHECK-NEXT:    store <vscale x 4 x i32> [[TMP8]], ptr [[INCDEC_PTR3]], align 16
+; CHECK-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> undef, <vscale x 4 x i32> [[A]], i64 0)
+; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> [[TMP0]], <vscale x 4 x i32> [[B]], i64 4)
+; CHECK-NEXT:    [[TMP2:%.*]] = call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> undef, <vscale x 4 x i32> [[C]], i64 0)
+; CHECK-NEXT:    [[TMP3:%.*]] = call <vscale x 8 x i32> @llvm.vector.insert.nxv8i32.nxv4i32(<vscale x 8 x i32> [[TMP2]], <vscale x 4 x i32> [[C]], i64 4)
+; CHECK-NEXT:    [[TMP4:%.*]] = add nuw <vscale x 8 x i32> [[TMP1]], [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[TMP4]], i64 4)
+; CHECK-NEXT:    [[TMP6:%.*]] = call <vscale x 4 x i32> @llvm.vector.extract.nxv4i32.nxv8i32(<vscale x 8 x i32> [[TMP4]], i64 0)
+; CHECK-NEXT:    store <vscale x 4 x i32> [[TMP6]], ptr [[PTR]], align 16
+; CHECK-NEXT:    [[INCDEC_PTR3:%.*]] = getelementptr inbounds <vscale x 4 x i32>, ptr [[PTR]], i64 1
+; CHECK-NEXT:    store <vscale x 4 x i32> [[TMP5]], ptr [[INCDEC_PTR3]], align 16
 ; CHECK-NEXT:    ret void
 ;
 entry:
-  %0 = load <vscale x 4 x i32>, ptr %ptr, align 16
-  %incdec.ptr = getelementptr inbounds <vscale x 4 x i32>, ptr %ptr, i64 1
-  %1 = load <vscale x 4 x i32>, ptr %incdec.ptr, align 16
-  %incdec.ptr1 = getelementptr inbounds <vscale x 4 x i32>, ptr %ptr, i64 2
-  %2 = load <vscale x 4 x i32>, ptr %incdec.ptr1, align 16
-  %add = add nuw nsw <vscale x 4 x i32> %0, %2
-  %add4 = add nuw <vscale x 4 x i32> %1, %2
-  store <vscale x 4 x i32> %add, ptr %ptr1, align 16
-  %incdec.ptr3 = getelementptr inbounds <vscale x 4 x i32>, ptr %ptr1, i64 1
+  %add = add nuw nsw <vscale x 4 x i32> %a, %c
+  %add4 = add nuw <vscale x 4 x i32> %b, %c
+  store <vscale x 4 x i32> %add, ptr %ptr, align 16
+  %incdec.ptr3 = getelementptr inbounds <vscale x 4 x i32>, ptr %ptr, i64 1
   store <vscale x 4 x i32> %add4, ptr %incdec.ptr3, align 16
   ret void
 }
diff --git a/llvm/test/Transforms/VectorWiden/bitcast.ll b/llvm/test/Transforms/VectorWiden/bitcast.ll
new file mode 100644
index 000000000000000..c40780653e9810f
--- /dev/null
+++ b/llvm/test/Transforms/VectorWiden/bitcast.ll
@@ -0,0 +1,66 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
+; RUN: opt -passes=vector-widen -vw-override-target-consider-to-widen=1 -S %s 2>&1 | FileCheck %s
+
+
+define void @bitcast1(<8 x i64> %a, <8 x i64> %b, ptr %ptr, ptr %ptr1) {
+; CHECK-LABEL: define void @bitcast1(
+; CHECK-SAME: <8 x i64> [[A:%.*]], <8 x i64> [[B:%.*]], ptr [[PTR:%.*]], ptr [[PTR1:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = call <16 x i64> @llvm.vector.insert.v16i64.v8i64(<16 x i64> undef, <8 x i64> [[B]], i64 0)
+; CHECK-NEXT:    [[TMP1:%.*]] = call <16 x i64> @llvm.vector.insert.v16i64.v8i64(<16 x i64> [[TMP0]], <8 x i64> [[A]], i64 8)
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <16 x i64> [[TMP1]] to <32 x i32>
+; CHECK-NEXT:    [[TMP3:%.*]] = call <16 x i32> @llvm.vector.extract.v16i32.v32i32(<32 x i32> [[TMP2]], i64 0)
+; CHECK-NEXT:    [[TMP4:%.*]] = call <16 x i32> @llvm.vector.extract.v16i32.v32i32(<32 x i32> [[TMP2]], i64 16)
+; CHECK-NEXT:    store <16 x i32> [[TMP4]], ptr [[PTR]], align 16
+; CHECK-NEXT:    store <16 x i32> [[TMP3]], ptr [[PTR1]], align 16
+; CHECK-NEXT:    ret void
+;
+entry:
+  %0 = bitcast <8 x i64> %a to <16 x i32>
+  %1 = bitcast <8 x i64> %b to <16 x i32>
+  store <16 x i32> %0, ptr %ptr, align 16
+  store <16 x i32> %1, ptr %ptr1, align 16
+  ret void
+}
+
+define void @bitcast2(<4 x i64> %a, <4 x i64> %b, ptr %ptr, ptr %ptr1) {
+; CHECK-LABEL: define void @bitcast2(
+; CHECK-SAME: <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], ptr [[PTR:%.*]], ptr [[PTR1:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = call <8 x i64> @llvm.vector.insert.v8i64.v4i64(<8 x i64> undef, <4 x i64> [[B]], i64 0)
+; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i64> @llvm.vector.insert.v8i64.v4i64(<8 x i64> [[TMP0]], <4 x i64> [[A]], i64 4)
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <8 x i64> [[TMP1]] to <64 x i8>
+; CHECK-NEXT:    [[TMP3:%.*]] = call <32 x i8> @llvm.vector.extract.v32i8.v64i8(<64 x i8> [[TMP2]], i64 0)
+; CHECK-NEXT:    [[TMP4:%.*]] = call <32 x i8> @llvm.vector.extract.v32i8.v64i8(<64 x i8> [[TMP2]], i64 32)
+; CHECK-NEXT:    store <32 x i8> [[TMP4]], ptr [[PTR]], align 16
+; CHECK-NEXT:    store <32 x i8> [[TMP3]], ptr [[PTR1]], align 16
+; CHECK-NEXT:    ret void
+;
+entry:
+  %0 = bitcast <4 x i64> %a to <32 x i8>
+  %1 = bitcast <4 x i64> %b to <32 x i8>
+  store <32 x i8> %0, ptr %ptr, align 16
+  store <32 x i8> %1, ptr %ptr1, align 16
+  ret void
+}
+
+define void @bitcast3(<32 x i8> %a, <32 x i8> %b, ptr %ptr, ptr %ptr1) {
+; CHECK-LABEL: define void @bitcast3(
+; CHECK-SAME: <32 x i8> [[A:%.*]], <32 x i8> [[B:%.*]], ptr [[PTR:%.*]], ptr [[PTR1:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = call <64 x i8> @llvm.vector.insert.v64i8.v32i8(<64 x i8> undef, <32 x i8> [[B]], i64 0)
+; CHECK-NEXT:    [[TMP1:%.*]] = call <64 x i8> @llvm.vector.insert.v64i8.v32i8(<64 x i8> [[TMP0]], <32 x i8> [[A]], i64 32)
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast <64 x i8> [[TMP1]] to <8 x i64>
+; CHECK-NEXT:    [[TMP3:%.*]] = call <4 x i64> @llvm.vector.extract.v4i64.v8i64(<8 x i64> [[TMP2]], i64 0)
+; CHECK-NEXT:    [[TMP4:%.*]] = call <4 x i64> @llvm.vector.extract.v4i64.v8i64(<8 x i64> [[TMP2]], i64 4)
+; CHECK-NEXT:    store <4 x i64> [[TMP4]], ptr [[PTR]], align 16
+; CHECK-NEXT:    store <4 x i64> [[TMP3]], ptr [[PTR1]], align 16
+; CHECK-NEXT:    ret void
+;
+entry:
+  %0 = bitcast <32 x i8> %a to <4 x i64>
+  %1 = bitcast <32 x i8> %b to <4 x i64>
+  store <4 x i64> %0, ptr %ptr, align 16
+  store <4 x i64> %1, ptr %ptr1, align 16
+  ret void
+}
diff --git a/llvm/test/Transforms/VectorWiden/ext-trunc.ll b/llvm/test/Transforms/VectorWiden/ext-trunc.ll
new file mode 100644
index 000000000000000..27db8440a499cc4
--- /dev/null
+++ b/llvm/test/Transforms/VectorWiden/ext-trunc.ll
@@ -0,0 +1,67 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
+; RUN: opt -passes=vector-widen -vw-override-target-consider-to-widen=1 -S %s 2>&1 | FileCheck %s
+
+
+define void @sext(<8 x i8> %a, <8 x i8> %b, ptr %ptr, ptr %ptr1) {
+; CHECK-LABEL: define void @sext(
+; CHECK-SAME: <8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], ptr [[PTR:%.*]], ptr [[PTR1:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v8i8(<16 x i8> undef, <8 x i8> [[B]], i64 0)
+; CHECK-NEXT:    [[TMP1:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v8i8(<16 x i8> [[TMP0]], <8 x i8> [[A]], i64 8)
+; CHECK-NEXT:    [[TMP2:%.*]] = sext <16 x i8> [[TMP1]] to <16 x i64>
+; CHECK-NEXT:    [[TMP3:%.*]] = call <8 x i64> @llvm.vector.extract.v8i64.v16i64(<16 x i64> [[TMP2]], i64 0)
+; CHECK-NEXT:    [[TMP4:%.*]] = call <8 x i64> @llvm.vector.extract.v8i64.v16i64(<16 x i64> [[TMP2]], i64 8)
+; CHECK-NEXT:    store <8 x i64> [[TMP4]], ptr [[PTR]], align 16
+; CHECK-NEXT:    store <8 x i64> [[TMP3]], ptr [[PTR1]], align 16
+; CHECK-NEXT:    ret void
+;
+entry:
+  %0 = sext <8 x i8> %a to <8 x i64>
+  %1 = sext <8 x i8> %b to <8 x i64>
+  store <8 x i64> %0, ptr %ptr, align 16
+  store <8 x i64> %1, ptr %ptr1, align 16
+  ret void
+}
+
+define void @zext(<8 x i8> %a, <8 x i8> %b, ptr %ptr, ptr %ptr1) {
+; CHECK-LABEL: define void @zext(
+; CHECK-SAME: <8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], ptr [[PTR:%.*]], ptr [[PTR1:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v8i8(<16 x i8> undef, <8 x i8> [[B]], i64 0)
+; CHECK-NEXT:    [[TMP1:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v8i8(<16 x i8> [[TMP0]], <8 x i8> [[A]], i64 8)
+; CHECK-NEXT:    [[TMP2:%.*]] = zext <16 x i8> [[TMP1]] to <16 x i64>
+; CHECK-NEXT:    [[TMP3:%.*]] = call <8 x i64> @llvm.vector.extract.v8i64.v16i64(<16 x i64> [[TMP2]], i64 0)
+; CHECK-NEXT:    [[TMP4:%.*]] = call <8 x i64> @llvm.vector.extract.v8i64.v16i64(<16 x i64> [[TMP2]], i64 8)
+; CHECK-NEXT:    store <8 x i64> [[TMP4]], ptr [[PTR]], align 16
+; CHECK-NEXT:    store <8 x i64> [[TMP3]], ptr [[PTR1]], align 16
+; CHECK-NEXT:    ret void
+;
+entry:
+  %0 = zext <8 x i8> %a to <8 x i64>
+  %1 = zext <8 x i8> %b to <8 x i64>
+  store <8 x i64> %0, ptr %ptr, align 16
+  store <8 x i64> %1, ptr %ptr1, align 16
+  ret void
+}
+
+define void @trunc(<8 x i64> %a, <8 x i64> %b, ptr %ptr, ptr %ptr1) {
+; CHECK-LABEL: define void @trunc(
+; CHECK-SAME: <8 x i64> [[A:%.*]], <8 x i64> [[B:%.*]], ptr [[PTR:%.*]], ptr [[PTR1:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = call <16 x i64> @llvm.vector.insert.v16i64.v8i64(<16 x i64> undef, <8 x i64> [[B]], i64 0)
+; CHECK-NEXT:    [[TMP1:%.*]] = call <16 x i64> @llvm.vector.insert.v16i64.v8i64(<16 x i64> [[TMP0]], <8 x i64> [[A]], i64 8)
+; CHECK-NEXT:    [[TMP2:%.*]] = trunc <16 x i64> [[TMP1]] to <16 x i8>
+; CHECK-NEXT:    [[TMP3:%.*]] = call <8 x i8> @llvm.vector.extract.v8i8.v16i8(<16 x i8> [[TMP2]], i64 0)
+; CHECK-NEXT:    [[TMP4:%.*]] = call <8 x i8> @llvm.vector.extract.v8i8.v16i8(<16 x i8> [[TMP2]], i64 8)
+; CHECK-NEXT:    store <8 x i8> [[TMP4]], ptr [[PTR]], align 16
+; CHECK-NEXT:    store <8 x i8> [[TMP3]], ptr [[PTR1]], align 16
+; CHECK-NEXT:    ret void
+;
+entry:
+  %0 = trunc <8 x i64> %a to <8 x i8>
+  %1 = trunc <8 x i64> %b to <8 x i8>
+  store <8 x i8> %0, ptr %ptr, align 16
+  store <8 x i8> %1, ptr %ptr1, align 16
+  ret void
+}
+
diff --git a/llvm/test/Transforms/VectorWiden/fadd.ll b/llvm/test/Transforms/VectorWiden/fadd.ll
new file mode 100644
index 000000000000000..01070ba824c3e8c
--- /dev/null
+++ b/llvm/test/Transforms/VectorWiden/fadd.ll
@@ -0,0 +1,52 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
+; RUN: opt -passes=vector-widen -vw-override-target-consider-to-widen=1 -S %s 2>&1 | FileCheck %s
+
+define void @add(<vscale x 4 x float> %a, <vscale x 4 x float> %b, <vscale x 4 x float> %c, ptr %ptr) {
+; CHECK-LABEL: define void @add(
+; CHECK-SAME: <vscale x 4 x float> [[A:%.*]], <vscale x 4 x float> [[B:%.*]], <vscale x 4 x float> [[C:%.*]], ptr [[PTR:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float> undef, <vscale x 4 x float> [[A]], i64 0)
+; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 8 x float> @llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float> [[TMP0]], <vscale x 4 x float> [[B]], i64 4)
+; CHECK-NEXT:    [[TMP2:%.*]] = call <vscale x 8 x float> @llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float> undef, <vscale x 4 x float> [[C]], i64 0)
+; CHECK-NEXT:    [[TMP3:%.*]] = call <vscale x 8 x float> @llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float> [[TMP2]], <vscale x 4 x float> [[C]], i64 4)
+; CHECK-NEXT:    [[TMP4:%.*]] = fadd <vscale x 8 x float> [[TMP1]], [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[TMP4]], i64 4)
+; CHECK-NEXT:    [[TMP6:%.*]] = call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[TMP4]], i64 0)
+; CHECK-NEXT:    store <vscale x 4 x float> [[TMP6]], ptr [[PTR]], align 16
+; CHECK-NEXT:    [[INCDEC_PTR3:%.*]] = getelementptr inbounds <vscale x 4 x float>, ptr [[PTR]], i64 1
+; CHECK-NEXT:    store <vscale x 4 x float> [[TMP5]], ptr [[INCDEC_PTR3]], align 16
+; CHECK-NEXT:    ret void
+;
+entry:
+  %add = fadd <vscale x 4 x float> %a, %c
+  %add4 = fadd <vscale x 4 x float> %b, %c
+  store <vscale x 4 x float> %add, ptr %ptr, align 16
+  %incdec.ptr3 = getelementptr inbounds <vscale x 4 x float>, ptr %ptr, i64 1
+  store <vscale x 4 x float> %add4, ptr %incdec.ptr3, align 16
+  ret void
+}
+
+define void @add_ir_flags(<vscale x 4 x float> %a, <vscale x 4 x float> %b, <vscale x 4 x float> %c, ptr %ptr) {
+; CHECK-LABEL: define void @add_ir_flags(
+; CHECK-SAME: <vscale x 4 x float> [[A:%.*]], <vscale x 4 x float> [[B:%.*]], <vscale x 4 x float> [[C:%.*]], ptr [[PTR:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = call <vscale x 8 x float> @llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float> undef, <vscale x 4 x float> [[A]], i64 0)
+; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 8 x float> @llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float> [[TMP0]], <vscale x 4 x float> [[B]], i64 4)
+; CHECK-NEXT:    [[TMP2:%.*]] = call <vscale x 8 x float> @llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float> undef, <vscale x 4 x float> [[C]], i64 0)
+; CHECK-NEXT:    [[TMP3:%.*]] = call <vscale x 8 x float> @llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float> [[TMP2]], <vscale x 4 x float> [[C]], i64 4)
+; CHECK-NEXT:    [[TMP4:%.*]] = fadd nnan ninf <vscale x 8 x float> [[TMP1]], [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[TMP4]], i64 4)
+; CHECK-NEXT:    [[TMP6:%.*]] = call <vscale x 4 x float> @llvm.vector.extract.nxv4f32.nxv8f32(<vscale x 8 x float> [[TMP4]], i64 0)
+; CHECK-NEXT:    store <vscale x 4 x float> [[TMP6]], ptr [[PTR]], align 16
+; CHECK-NEXT:    [[INCDEC_PTR3:%.*]] = getelementptr inbounds <vscale x 4 x float>, ptr [[PTR]], i64 1
+; CHECK-NEXT:    store <vscale x 4 x float> [[TMP5]], ptr [[INCDEC_PTR3]], align 16
+; CHECK-NEXT:    ret void
+;
+entry:
+  %add = fadd fast nnan <vscale x 4 x float> %a, %c
+  %add4 = fadd nnan ninf <vscale x 4 x float> %b, %c
+  store <vscale x 4 x float> %add, ptr %ptr, align 16
+  %incdec.ptr3 = getelementptr inbounds <vscale x 4 x float>, ptr %ptr, i64 1
+  store <vscale x 4 x float> %add4, ptr %incdec.ptr3, align 16
+  ret void
+}
diff --git a/llvm/test/Transforms/VectorWiden/fp-ext.ll b/llvm/test/Transforms/VectorWiden/fp-ext.ll
new file mode 100644
index 000000000000000..a4fadd8dbaabf30
--- /dev/null
+++ b/llvm/test/Transforms/VectorWiden/fp-ext.ll
@@ -0,0 +1,24 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
+; RUN: opt -passes=vector-widen -vw-override-target-consider-to-widen=1 -S %s 2>&1 | FileCheck %s
+
+
+define void @fp_ext(<8 x half> %a, <8 x half> %b, ptr %ptr, ptr %ptr1) {
+; CHECK-LABEL: define void @fp_ext(
+; CHECK-SAME: <8 x half> [[A:%.*]], <8 x half> [[B:%.*]], ptr [[PTR:%.*]], ptr [[PTR1:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = call <16 x half> @llvm.vector.insert.v16f16.v8f16(<16 x half> undef, <8 x half> [[B]], i64 0)
+; CHECK-NEXT:    [[TMP1:%.*]] = call <16 x half> @llvm.vector.insert.v16f16.v8f16(<16 x half> [[TMP0]], <8 x half> [[A]], i64 8)
+; CHECK-NEXT:    [[TMP2:%.*]] = fpext <16 x half> [[TMP1]] to <16 x double>
+; CHECK-NEXT:    [[TMP3:%.*]] = call <8 x double> @llvm.vector.extract.v8f64.v16f64(<16 x double> [[TMP2]], i64 0)
+; CHECK-NEXT:    [[TMP4:%.*]] = call <8 x double> @llvm.vector.extract.v8f64.v16f64(<16 x double> [[TMP2]], i64 8)
+; CHECK-NEXT:    store <8 x double> [[TMP4]], ptr [[PTR]], align 16
+; CHECK-NEXT:    store <8 x double> [[TMP3]], ptr [[PTR1]], align 16
+; CHECK-NEXT:    ret void
+;
+entry:
+  %0 = fpext <8 x half> %a to <8 x double>
+  %1 = fpext <8 x half> %b to <8 x double>
+  store <8 x double> %0, ptr %ptr, align 16
+  store <8 x double> %1, ptr %ptr1, align 16
+  ret void
+}
diff --git a/llvm/test/Transforms/VectorWiden/fp-int.ll b/llvm/test/Transforms/VectorWiden/fp-int.ll
new file mode 100644
index 000000000000000..1e1ebd2939b36b1
--- /dev/null
+++ b/llvm/test/Transforms/VectorWiden/fp-int.ll
@@ -0,0 +1,86 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3
+; RUN: opt -passes=vector-widen -vw-override-target-consider-to-widen=1 -S %s 2>&1 | FileCheck %s
+
+define void @fptosi(<8 x half> %a, <8 x half> %b, ptr %ptr, ptr %ptr1) {
+; CHECK-LABEL: define void @fptosi(
+; CHECK-SAME: <8 x half> [[A:%.*]], <8 x half> [[B:%.*]], ptr [[PTR:%.*]], ptr [[PTR1:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = call <16 x half> @llvm.vector.insert.v16f16.v8f16(<16 x half> undef, <8 x half> [[B]], i64 0)
+; CHECK-NEXT:    [[TMP1:%.*]] = call <16 x half> @llvm.vector.insert.v16f16.v8f16(<16 x half> [[TMP0]], <8 x half> [[A]], i64 8)
+; CHECK-NEXT:    [[TMP2:%.*]] = fptosi <16 x half> [[TMP1]] to <16 x i16>
+; CHECK-NEXT:    [[TMP3:%.*]] = call <8 x i16> @llvm.vector.extract.v8i16.v16i16(<16 x i16> [[TMP2]], i64 0)
+; CHECK-NEXT:    [[TMP4:%.*]] = call <8 x i16> @llvm.vector.extract.v8i16.v16i16(<16 x i16> [[TMP2]], i64 8)
+; CHECK-NEXT:    store <8 x i16> [[TMP4]], ptr [[PTR]], align 16
+; CHECK-NEXT:    store <8 x i16> [[TMP3]], ptr [[PTR1]], align 16
+; CHECK-NEXT:    ret void
+;
+entry:
+  %0 = fptosi <8 x half> %a to <8 x i16>
+  %1 = fptosi <8 x half> %b to <8 x i16>
+  store <8 x i16> %0, ptr %ptr, align 16
+  store <8 x i16> %1, ptr %ptr1, align 16
+  ret void
+}
+
+define void @sitofp(<8 x i16> %a, <8 x i16> %b, ptr %ptr, ptr %ptr1) {
+; CHECK-LABEL: define void @sitofp(
+; CHECK-SAME: <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], ptr [[PTR:%.*]], ptr [[PTR1:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = call <16 x i16> @llvm.vector.insert.v16i16.v8i16(<16 x i16> undef, <8 x i16> [[B]], i64 0)
+; CHECK-NEXT:    [[TMP1:%.*]] = call <16 x i16> @llvm.vector.insert.v16i16.v8i16(<16 x i16> [[TMP0]], <8 x i16> [[A]], i64 8)
+; CHECK-NEXT:    [[TMP2:%.*]] = sitofp <16 x i16> [[TMP1]] to <16 x half>
+; CHECK-NEXT:    [[TMP3:%.*]] = call <8 x half> @llvm.vector.extract.v8f16.v16f16(<16 x half> [[TMP2]], i64 0)
+; CHECK-NEXT:    [[TMP4:%.*]] = call <8 x half> @llvm.vector.extract.v8f16.v16f16(<16 x half> [[TMP2]], i64 8)
+; CHECK-NEXT:    store <8 x half> [[TMP4]], ptr [[PTR]], align 16
+; CHECK-NEXT:    store <8 x half> [[TMP3]], ptr [[PTR1]], align 16
+; CHECK-NEXT:    ret void
+;
+entry:
+  %0 = sitofp <8 x i16> %a to <8 x half>
+  %1 = sitofp <8 x i16> %b to <8 x half>
+  store <8 x half> %0, ptr %ptr, align 16
+  store <8 x half> %1, ptr %ptr1, align 16
+  ret void
+}
+
+define void @fptoui(<8 x half> %a, <8 x half> %b, ptr %ptr, ptr %ptr1) {
+; CHECK-LABEL: define void @fptoui(
+; CHECK-SAME: <8 x half> [[A:%.*]], <8 x half> [[B:%.*]], ptr [[PTR:%.*]], ptr [[PTR1:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = call <16 x half> @llvm.vector.insert.v16f16.v8f16(<16 x half> undef, <8 x half> [[B]], i64 0)
+; CHECK-NEXT:    [[TMP1:%.*]] = call <16 x half> @llvm.vector.insert.v16f16.v8f16(<16 x half> [[TMP0]], <8 x half> [[A]], i64 8)
+; CHECK-NEXT:    [[TMP2:%.*]] = fptoui <16 x half> [[TMP1]] to <16 x i16>
+; CHECK-NEXT:    [[TMP3:%.*]] = call <8 x i16> @llvm.vector.extract.v8i16.v16i16(<16 x i16> [[TMP2]], i64 0)
+; CHECK-NEXT:    [[TMP4:%.*]] = call <8 x i16> @llvm.vector.extract.v8i16.v16i16(<16 x i16> [[TMP2]], i64 8)
+; CHECK-NEXT:    store <8 x i16> [[TMP4]], ptr [[PTR]], align 16
+; CHECK-NEXT:    store <8 x i16> [[TMP3]], ptr [[PTR1]], align 16
+; CHECK-NEXT:    ret void
+;
+entry:
+  %0 = fptoui <8 x half> %a to <8 x i16>
+  %1 = fptoui <8 x half> %b to <8 x i16>
+  store <8 x i16> %0, ptr %ptr, align 16
+  store <8 x i16> %1, ptr %ptr1, align 16
+  ret void
+}
+
+define void @uitofp(<8 x i16> %a, <8 x i16> %b, ptr %ptr, ptr %ptr1) {
+; CHECK-LABEL: define void @uitofp(
+; CHECK-SAME: <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], ptr [[PTR:%.*]], ptr [[PTR1:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = call <16 x i16> @llvm.vector.insert.v16i16.v8i16(<16 x i16> undef, <8 x i16> [[B]], i64 0)
+; CHECK-NEXT:    [[TMP1:%.*]] = call <16 x i16> @llvm.vector.insert.v16i16.v8i16(<16 x i16> [[TMP0]], <8 x i16> [[A]], i64 8)
+; CHECK-NEXT:    [[TMP2:%.*]] = uitofp <16 x i16> [[TMP1]] to <16 x half>
+; CHECK-NEXT:    [[TMP3:%.*]] = call <8 x half> @llvm.vector.extract.v8f16.v16f16(<16 x half> [[TMP2]], i64 0)
+; CHECK-NEXT:    [[TMP4:%.*]] = call <8 x half> @llvm.vector.extract.v8f16.v16f16(<16 x half> [[TMP2]], i64 8)
+; CHECK-NEXT:    store <8 x half> [[TMP4]], ptr [[PTR]], align 16
+; CHECK-NEXT:    store <8 x half> [[TMP3]], ptr [[PTR1]], align 16
+; CHECK-NEXT:    ret void
+;
+entry:
+  %0 = uitofp <8 x i16> %a to <8 x half>
+  %1 = uitofp <8 x i16> %b to <8 x half>
+  store <8 x half> %0, ptr %ptr, align 16
+  store <8 x half> %1, ptr %ptr1, align 16
+  ret void
+}
diff --git a/llvm/test/Transforms/VectorWiden/fptrunc-bad-dep.ll b/llvm/test/Transforms/VectorWiden/fptrunc-bad-dep.ll
index 9f00d5ca113f4f0..4db4efd676985be 100644
--- a/llvm/test/Transforms/VectorWiden/fptrunc-bad-dep.ll
+++ b/llvm/test/Transforms/VectorWiden/fptrunc-bad-dep.ll
@@ -1,42 +1,32 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -passes=vector-widen -mtriple aarch64-linux-gnu -mattr=+sme2 -S 2>&1 | FileCheck %s
 
-define void @fptrunc(ptr %ptr, ptr %ptr1) {
+define void @fptrunc(<vscale x 4 x float> %a, <vscale x 4 x float> %b, ptr %ptr) {
 ; CHECK-LABEL: @fptrunc(
 ; CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 2
-; CHECK-NEXT:    [[TMP3:%.*]] = tail call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP4:%.*]] = shl nuw nsw i64 [[TMP3]], 2
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[PTR:%.*]], align 4
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, ptr [[PTR]], i64 [[TMP2]]
-; CHECK-NEXT:    [[WIDE_LOAD9:%.*]] = load <vscale x 4 x float>, ptr [[TMP5]], align 4
-; CHECK-NEXT:    [[TMP6:%.*]] = fptrunc <vscale x 4 x float> [[WIDE_LOAD]] to <vscale x 4 x half>
-; CHECK-NEXT:    [[EXTR:%.*]] = call <vscale x 1 x half> @llvm.vector.extract.nxv1f16.nxv4f16(<vscale x 4 x half> [[TMP6]], i64 0)
+; CHECK-NEXT:    [[TMP3:%.*]] = fptrunc <vscale x 4 x float> [[A:%.*]] to <vscale x 4 x half>
+; CHECK-NEXT:    [[EXTR:%.*]] = call <vscale x 1 x half> @llvm.vector.extract.nxv1f16.nxv4f16(<vscale x 4 x half> [[TMP3]], i64 0)
 ; CHECK-NEXT:    [[EXTEND:%.*]] = fpext <vscale x 1 x half> [[EXTR]] to <vscale x 1 x float>
-; CHECK-NEXT:    [[INS:%.*]] = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.nxv1f32(<vscale x 4 x float> [[WIDE_LOAD9]], <vscale x 1 x float> [[EXTEND]], i64 0)
-; CHECK-NEXT:    [[TMP7:%.*]] = fptrunc <vscale x 4 x float> [[INS]] to <vscale x 4 x half>
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds half, ptr [[PTR1:%.*]], i64 0
-; CHECK-NEXT:    store <vscale x 4 x half> [[TMP6]], ptr [[TMP8]], align 2
-; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds half, ptr [[TMP8]], i64 [[TMP4]]
-; CHECK-NEXT:    store <vscale x 4 x half> [[TMP7]], ptr [[TMP9]], align 2
+; CHECK-NEXT:    [[INS:%.*]] = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.nxv1f32(<vscale x 4 x float> [[B:%.*]], <vscale x 1 x float> [[EXTEND]], i64 0)
+; CHECK-NEXT:    [[TMP4:%.*]] = fptrunc <vscale x 4 x float> [[INS]] to <vscale x 4 x half>
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds half, ptr [[PTR:%.*]], i64 0
+; CHECK-NEXT:    store <vscale x 4 x half> [[TMP3]], ptr [[TMP5]], align 2
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds half, ptr [[TMP5]], i64 [[TMP2]]
+; CHECK-NEXT:    store <vscale x 4 x half> [[TMP4]], ptr [[TMP6]], align 2
 ; CHECK-NEXT:    ret void
 ;
   %1 = tail call i64 @llvm.vscale.i64()
   %2 = shl nuw nsw i64 %1, 2
-  %3 = tail call i64 @llvm.vscale.i64()
-  %4 = shl nuw nsw i64 %3, 2
-  %wide.load = load <vscale x 4 x float>, ptr %ptr, align 4
-  %5 = getelementptr inbounds float, ptr %ptr, i64 %2
-  %wide.load9 = load <vscale x 4 x float>, ptr %5, align 4
-  %6 = fptrunc <vscale x 4 x float> %wide.load to <vscale x 4 x half>
-  %extr = call <vscale x 1 x half> @llvm.vector.extract.nxv1f16.nxv4f16(<vscale x 4 x half> %6, i64 0)
+  %3 = fptrunc <vscale x 4 x float> %a to <vscale x 4 x half>
+  %extr = call <vscale x 1 x half> @llvm.vector.extract.nxv1f16.nxv4f16(<vscale x 4 x half> %3, i64 0)
   %extend = fpext <vscale x 1 x half> %extr to <vscale x 1 x float>
-  %ins = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.nxv1f32(<vscale x 4 x float> %wide.load9, <vscale x 1 x float> %extend, i64 0)
-  %7 = fptrunc <vscale x 4 x float> %ins to <vscale x 4 x half>
-  %8 = getelementptr inbounds half, ptr %ptr1, i64 0
-  store <vscale x 4 x half> %6, ptr %8, align 2
-  %9 = getelementptr inbounds half, ptr %8, i64 %4
-  store <vscale x 4 x half> %7, ptr %9, align 2
+  %ins = call <vscale x 4 x float> @llvm.vector.insert.nxv4f32.nxv1f32(<vscale x 4 x float> %b, <vscale x 1 x float> %extend, i64 0)
+  %4 = fptrunc <vscale x 4 x float> %ins to <vscale x 4 x half>
+  %5 = getelementptr inbounds half, ptr %ptr, i64 0
+  store <vscale x 4 x half> %3, ptr %5, align 2
+  %6 = getelementptr inbounds half, ptr %5, i64 %2
+  store <vscale x 4 x half> %4, ptr %6, align 2
   ret void
 }
 
diff --git a/llvm/test/Transforms/VectorWiden/fptrunc.ll b/llvm/test/Transforms/VectorWiden/fptrunc.ll
index 838ec0b73e449a8..4c19abf852eade4 100644
--- a/llvm/test/Transforms/VectorWiden/fptrunc.ll
+++ b/llvm/test/Transforms/VectorWiden/fptrunc.ll
@@ -1,39 +1,29 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -passes=vector-widen -mtriple aarch64-linux-gnu -mattr=+sme2 -S 2>&1 | FileCheck %s
 
-define void @fptrunc(ptr %ptr, ptr %ptr1) {
+define void @fptrunc(<vscale x 4 x float> %a, <vscale x 4 x float> %b, ptr %ptr) {
 ; CHECK-LABEL: @fptrunc(
 ; CHECK-NEXT:    [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 2
-; CHECK-NEXT:    [[TMP3:%.*]] = tail call i64 @llvm.vscale.i64()
-; CHECK-NEXT:    [[TMP4:%.*]] = shl nuw nsw i64 [[TMP3]], 2
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x float>, ptr [[PTR:%.*]], align 4
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds float, ptr [[PTR]], i64 [[TMP2]]
-; CHECK-NEXT:    [[WIDE_LOAD9:%.*]] = load <vscale x 4 x float>, ptr [[TMP5]], align 4
-; CHECK-NEXT:    [[TMP6:%.*]] = call <vscale x 8 x float> @llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float> undef, <vscale x 4 x float> [[WIDE_LOAD9]], i64 0)
-; CHECK-NEXT:    [[TMP7:%.*]] = call <vscale x 8 x float> @llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float> [[TMP6]], <vscale x 4 x float> [[WIDE_LOAD]], i64 4)
-; CHECK-NEXT:    [[TMP8:%.*]] = fptrunc <vscale x 8 x float> [[TMP7]] to <vscale x 8 x half>
-; CHECK-NEXT:    [[TMP9:%.*]] = call <vscale x 4 x half> @llvm.vector.extract.nxv4f16.nxv8f16(<vscale x 8 x half> [[TMP8]], i64 0)
-; CHECK-NEXT:    [[TMP10:%.*]] = call <vscale x 4 x half> @llvm.vector.extract.nxv4f16.nxv8f16(<vscale x 8 x half> [[TMP8]], i64 4)
-; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds half, ptr [[PTR1:%.*]], i64 0
-; CHECK-NEXT:    store <vscale x 4 x half> [[TMP10]], ptr [[TMP11]], align 2
-; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds half, ptr [[TMP11]], i64 [[TMP4]]
-; CHECK-NEXT:    store <vscale x 4 x half> [[TMP9]], ptr [[TMP12]], align 2
+; CHECK-NEXT:    [[TMP3:%.*]] = call <vscale x 8 x float> @llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float> undef, <vscale x 4 x float> [[B:%.*]], i64 0)
+; CHECK-NEXT:    [[TMP4:%.*]] = call <vscale x 8 x float> @llvm.vector.insert.nxv8f32.nxv4f32(<vscale x 8 x float> [[TMP3]], <vscale x 4 x float> [[A:%.*]], i64 4)
+; CHECK-NEXT:    [[TMP5:%.*]] = fptrunc <vscale x 8 x float> [[TMP4]] to <vscale x 8 x half>
+; CHECK-NEXT:    [[TMP6:%.*]] = call <vscale x 4 x half> @llvm.vector.extract.nxv4f16.nxv8f16(<vscale x 8 x half> [[TMP5]], i64 0)
+; CHECK-NEXT:    [[TMP7:%.*]] = call <vscale x 4 x half> @llvm.vector.extract.nxv4f16.nxv8f16(<vscale x 8 x half> [[TMP5]], i64 4)
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds half, ptr [[PTR:%.*]], i64 0
+; CHECK-NEXT:    store <vscale x 4 x half> [[TMP7]], ptr [[TMP8]], align 2
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds half, ptr [[TMP8]], i64 [[TMP2]]
+; CHECK-NEXT:    store <vscale x 4 x half> [[TMP6]], ptr [[TMP9]], align 2
 ; CHECK-NEXT:    ret void
 ;
   %1 = tail call i64 @llvm.vscale.i64()
   %2 = shl nuw nsw i64 %1, 2
-  %3 = tail call i64 @llvm.vscale.i64()
-  %4 = shl nuw nsw i64 %3, 2
-  %wide.load = load <vscale x 4 x float>, ptr %ptr, align 4
-  %5 = getelementptr inbounds float, ptr %ptr, i64 %2
-  %wide.load9 = load <vscale x 4 x float>, ptr %5, align 4
-  %6 = fptrunc <vscale x 4 x float> %wide.load to <vscale x 4 x half>
-  %7 = fptrunc <vscale x 4 x float> %wide.load9 to <vscale x 4 x half>
-  %8 = getelementptr inbounds half, ptr %ptr1, i64 0
-  store <vscale x 4 x half> %6, ptr %8, align 2
-  %9 = getelementptr inbounds half, ptr %8, i64 %4
-  store <vscale x 4 x half> %7, ptr %9, align 2
+  %3 = fptrunc <vscale x 4 x float> %a to <vscale x 4 x half>
+  %4 = fptrunc <vscale x 4 x float> %b to <vscale x 4 x half>
+  %5 = getelementptr inbounds half, ptr %ptr, i64 0
+  store <vscale x 4 x half> %3, ptr %5, align 2
+  %6 = getelementptr inbounds half, ptr %5, i64 %2
+  store <vscale x 4 x half> %4, ptr %6, align 2
   ret void
 }
 
diff --git a/llvm/test/Transforms/VectorWiden/widen-distance.ll b/llvm/test/Transforms/VectorWiden/widen-distance.ll
new file mode 100644
index 000000000000000..bc6d0682f847528
--- /dev/null
+++ b/llvm/test/Transforms/VectorWiden/widen-distance.ll
@@ -0,0 +1,59 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes=vector-widen -S -vw-override-target-consider-to-widen=1 -vw-max-instr-distance=2 < %s | FileCheck %s
+
+define <4 x i32> @foo(float %a0, float %a1, float %a2, float %a3, ptr %ptr1, ptr %ptr2) {
+; CHECK-LABEL: @foo(
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr [[PTR1:%.*]], align 16
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, ptr [[PTR1]], align 16
+; CHECK-NEXT:    [[TMP3:%.*]] = call <8 x float> @llvm.vector.insert.v8f32.v4f32(<8 x float> undef, <4 x float> [[TMP2]], i64 0)
+; CHECK-NEXT:    [[TMP4:%.*]] = call <8 x float> @llvm.vector.insert.v8f32.v4f32(<8 x float> [[TMP3]], <4 x float> [[TMP1]], i64 4)
+; CHECK-NEXT:    [[TMP5:%.*]] = fptosi <8 x float> [[TMP4]] to <8 x i32>
+; CHECK-NEXT:    [[TMP6:%.*]] = call <4 x i32> @llvm.vector.extract.v4i32.v8i32(<8 x i32> [[TMP5]], i64 0)
+; CHECK-NEXT:    [[TMP7:%.*]] = call <4 x i32> @llvm.vector.extract.v4i32.v8i32(<8 x i32> [[TMP5]], i64 4)
+; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <4 x float> poison, float [[A0:%.*]], i32 0
+; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <4 x float> [[TMP8]], float [[A1:%.*]], i32 1
+; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <4 x float> [[TMP9]], float [[A2:%.*]], i32 2
+; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <4 x float> [[TMP10]], float [[A3:%.*]], i32 3
+; CHECK-NEXT:    [[TMP12:%.*]] = fptosi <4 x float> [[TMP11]] to <4 x i32>
+; CHECK-NEXT:    store <4 x i32> [[TMP6]], ptr [[PTR1]], align 2
+; CHECK-NEXT:    store <4 x i32> [[TMP12]], ptr [[PTR2:%.*]], align 2
+; CHECK-NEXT:    ret <4 x i32> [[TMP7]]
+;
+  %1 = load <4 x float>, ptr %ptr1
+  %2 = fptosi <4 x float> %1 to <4 x i32>
+  %3 = load <4 x float>, ptr %ptr1
+  %4 = fptosi <4 x float> %3 to <4 x i32>
+  %5 = insertelement <4 x float> poison, float %a0, i32 0
+  %6 = insertelement <4 x float> %5, float %a1, i32 1
+  %7 = insertelement <4 x float> %6, float %a2, i32 2
+  %8 = insertelement <4 x float> %7, float %a3, i32 3
+  %9 = fptosi <4 x float> %8 to <4 x i32>
+  store <4 x i32> %4, ptr %ptr1, align 2
+  store <4 x i32> %9, ptr %ptr2, align 2
+  ret <4 x i32> %2
+}
+
+define <4 x i32> @bar(<4 x float> %a0, ptr %ptr1, ptr %ptr2) {
+; CHECK-LABEL: @bar(
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr [[PTR1:%.*]], align 16
+; CHECK-NEXT:    [[TMP2:%.*]] = fptosi <4 x float> [[TMP1]] to <4 x i32>
+; CHECK-NEXT:    [[TMP3:%.*]] = load <4 x float>, ptr [[PTR1]], align 16
+; CHECK-NEXT:    [[TMP4:%.*]] = call <8 x float> @llvm.vector.insert.v8f32.v4f32(<8 x float> undef, <4 x float> [[A0:%.*]], i64 0)
+; CHECK-NEXT:    [[TMP5:%.*]] = call <8 x float> @llvm.vector.insert.v8f32.v4f32(<8 x float> [[TMP4]], <4 x float> [[TMP3]], i64 4)
+; CHECK-NEXT:    [[TMP6:%.*]] = fptosi <8 x float> [[TMP5]] to <8 x i32>
+; CHECK-NEXT:    [[TMP7:%.*]] = call <4 x i32> @llvm.vector.extract.v4i32.v8i32(<8 x i32> [[TMP6]], i64 0)
+; CHECK-NEXT:    [[TMP8:%.*]] = call <4 x i32> @llvm.vector.extract.v4i32.v8i32(<8 x i32> [[TMP6]], i64 4)
+; CHECK-NEXT:    store <4 x i32> [[TMP8]], ptr [[PTR1]], align 2
+; CHECK-NEXT:    store <4 x i32> [[TMP7]], ptr [[PTR2:%.*]], align 2
+; CHECK-NEXT:    ret <4 x i32> [[TMP2]]
+;
+
+  %1 = load <4 x float>, ptr %ptr1
+  %2 = fptosi <4 x float> %1 to <4 x i32>
+  %3 = load <4 x float>, ptr %ptr1
+  %4 = fptosi <4 x float> %3 to <4 x i32>
+  %5 = fptosi <4 x float> %a0 to <4 x i32>
+  store <4 x i32> %4, ptr %ptr1, align 2
+  store <4 x i32> %5, ptr %ptr2, align 2
+  ret <4 x i32> %2
+}

>From ed23e6433829c57771f790d9bdc480b8b2027dfc Mon Sep 17 00:00:00 2001
From: Dinar Temirbulatov <dinar.temirbulatov at arm.com>
Date: Wed, 1 Nov 2023 07:37:46 +0000
Subject: [PATCH 5/5] Reduced the double-nested loop in
 VectorWiden::canWidenNode and added a test case for a bad dependence.
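
Below is a minimal, self-contained C++ sketch of the restructuring idea, not
the actual LLVM code: the names Candidate and canWidenGroup are made up for
illustration. The point it shows is that properties which are transitive
across the group (same opcode, same types, program order) only need to be
checked between adjacent candidates, while the operand-dependence check
stays pairwise.

#include <cstddef>
#include <string>
#include <vector>

struct Candidate {
  unsigned Opcode;            // stand-in for Instruction::getOpcode()
  std::string Type;           // stand-in for the result type
  unsigned Position;          // index in program order within the block
  std::vector<int> Operands;  // indices of other candidates this one uses
};

static bool canWidenGroup(const std::vector<Candidate> &IL) {
  // Pairwise: reject the group if any candidate consumes another's result.
  for (std::size_t X = 0; X < IL.size(); ++X)
    for (std::size_t Y = 0; Y < IL.size(); ++Y)
      if (X != Y)
        for (int Op : IL[X].Operands)
          if (Op == static_cast<int>(Y))
            return false;
  // Adjacent-only: the transitive checks collapse to a single linear pass.
  // Candidates are collected bottom-up, so each later list element must
  // appear earlier in the block (strictly decreasing Position).
  for (std::size_t X = 1; X < IL.size(); ++X)
    if (IL[X].Opcode != IL[X - 1].Opcode || IL[X].Type != IL[X - 1].Type ||
        IL[X].Position >= IL[X - 1].Position)
      return false;
  return true;
}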

---
 llvm/lib/Transforms/Vectorize/VectorWiden.cpp | 23 ++++++++++---------
 .../{fptrunc-bad-dep.ll => bad-dependace.ll}  | 19 +++++++++++++++
 2 files changed, 31 insertions(+), 11 deletions(-)
 rename llvm/test/Transforms/VectorWiden/{fptrunc-bad-dep.ll => bad-dependace.ll} (73%)

diff --git a/llvm/lib/Transforms/Vectorize/VectorWiden.cpp b/llvm/lib/Transforms/Vectorize/VectorWiden.cpp
index 79e0aaad1329a6e..2b7e7eaa77840ae 100644
--- a/llvm/lib/Transforms/Vectorize/VectorWiden.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorWiden.cpp
@@ -193,25 +193,26 @@ bool VectorWiden::canWidenNode(ArrayRef<Instruction *> IL,
   if (!OverrideTargetConsiderToWiden && !TTI.considerToWiden(Context, IL))
     return false;
 
+  bool HasSecondOperand = IL[0]->getNumOperands() > 1;
   for (int X = 0, E = IL.size(); X < E; X++) {
     for (int Y = 0, E = IL.size(); Y < E; Y++) {
       if (X == Y)
         continue;
-      if ((IL[X] == IL[Y]) || (IL[X]->getOpcode() != IL[Y]->getOpcode()) ||
-          // Ignore if any live in a diffrent Basic Block
-          (IL[X]->getType() != IL[Y]->getType()) ||
-          (IL[X]->getOperand(0)->getType() !=
-           IL[Y]->getOperand(0)->getType()) ||
-          // Ignore if disatance between two are too apart.
-          (IL[Y]->comesBefore(IL[X]) &&
-           std::distance(IL[Y]->getIterator(), IL[X]->getIterator()) >
-               MaxInstDistance) ||
-          (IL[X]->getOperand(0) == IL[Y] ||
-           (IL[X]->getNumOperands() > 1 && IL[X]->getOperand(1) == IL[Y])))
+      if (IL[X] == IL[Y] || IL[X]->getOperand(0) == IL[Y] ||
+          (HasSecondOperand && IL[X]->getOperand(1) == IL[Y]))
         return false;
     }
     if (isDeleted(IL[X]) || !IL[X]->hasOneUse())
       return false;
+    if (X == 0)
+      continue;
+    if (IL[X]->getOpcode() != IL[X - 1]->getOpcode() ||
+        // Ignore if any types are different.
+        IL[X]->getType() != IL[X - 1]->getType() ||
+        IL[X]->getOperand(0)->getType() !=
+            IL[X - 1]->getOperand(0)->getType() ||
+        IL[X - 1]->comesBefore(IL[X]))
+      return false;
     if (IL[0]->getParent() == IL[X]->user_back()->getParent() &&
         IL[X]->user_back()->comesBefore(IL[0]))
       return false;
diff --git a/llvm/test/Transforms/VectorWiden/fptrunc-bad-dep.ll b/llvm/test/Transforms/VectorWiden/bad-dependace.ll
similarity index 73%
rename from llvm/test/Transforms/VectorWiden/fptrunc-bad-dep.ll
rename to llvm/test/Transforms/VectorWiden/bad-dependace.ll
index 4db4efd676985be..9e901037dcd2fd4 100644
--- a/llvm/test/Transforms/VectorWiden/fptrunc-bad-dep.ll
+++ b/llvm/test/Transforms/VectorWiden/bad-dependace.ll
@@ -30,6 +30,25 @@ define void @fptrunc(<vscale x 4 x float> %a, <vscale x 4 x float> %b, ptr %ptr)
   ret void
 }
 
+define void @add(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, ptr %ptr) {
+; CHECK-LABEL: @add(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[ADD:%.*]] = add <vscale x 4 x i32> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT:    [[ADD4:%.*]] = add <vscale x 4 x i32> [[ADD]], [[B]]
+; CHECK-NEXT:    store <vscale x 4 x i32> [[ADD]], ptr [[PTR:%.*]], align 16
+; CHECK-NEXT:    [[INCDEC_PTR3:%.*]] = getelementptr inbounds <vscale x 4 x i32>, ptr [[PTR]], i64 1
+; CHECK-NEXT:    store <vscale x 4 x i32> [[ADD4]], ptr [[INCDEC_PTR3]], align 16
+; CHECK-NEXT:    ret void
+;
+entry:
+  %add = add <vscale x 4 x i32> %a, %b
+  %add4 = add <vscale x 4 x i32> %add, %b
+  store <vscale x 4 x i32> %add, ptr %ptr, align 16
+  %incdec.ptr3 = getelementptr inbounds <vscale x 4 x i32>, ptr %ptr, i64 1
+  store <vscale x 4 x i32> %add4, ptr %incdec.ptr3, align 16
+  ret void
+}
+
 declare i64 @llvm.vscale.i64()
 declare <vscale x 1 x half> @llvm.vector.extract.nxv1f16.nxv4f16(<vscale x 4 x half>, i64 immarg)
 declare <vscale x 4 x float> @llvm.vector.insert.nxv4f32.nxv1f32(<vscale x 4 x float>, <vscale x 1 x float>, i64 immarg)


