[llvm] [AArch64] Generate DOT instructions from matching IR (PR #69583)

Graham Hunter via llvm-commits llvm-commits at lists.llvm.org
Thu Oct 19 03:14:45 PDT 2023


https://github.com/huntergr-arm created https://github.com/llvm/llvm-project/pull/69583

This pass matches sequences of extend->mul->accumulate and replaces
them with DOT intrinsics.

This currently only supports SVE and scalable vectors.
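
As a rough sketch (value names here are illustrative, not taken from the
patch or tests), the simple non-loop case turns IR like:

  %ext.a = sext <vscale x 16 x i8> %a to <vscale x 16 x i32>
  %ext.b = sext <vscale x 16 x i8> %b to <vscale x 16 x i32>
  %mul = mul <vscale x 16 x i32> %ext.a, %ext.b
  %red = call i32 @llvm.vector.reduce.add.nxv16i32(<vscale x 16 x i32> %mul)

into a partial-sum dot product feeding a reduction over a quarter of the
element count:

  %dot = call <vscale x 4 x i32> @llvm.aarch64.sve.sdot.nxv4i32(
             <vscale x 4 x i32> zeroinitializer,
             <vscale x 16 x i8> %a, <vscale x 16 x i8> %b)
  %red = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> %dot)

The loop form handles the same multiply-of-extends when it feeds an
accumulating PHI inside a vectorized loop (optionally through a select with
zeroinitializer), replacing the accumulator with a narrower dot-product
accumulator. The pass can be exercised directly with
opt -aarch64-dot-product-matcher, as in the included test.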


>From 2b6da683e001ba852674d0f55cc5beb95c14782f Mon Sep 17 00:00:00 2001
From: Graham Hunter <graham.hunter at arm.com>
Date: Fri, 13 Oct 2023 14:09:55 +0100
Subject: [PATCH] [AArch64] Generate DOT instructions from matching IR

This pass matches sequences of extend->mul->accumulate and replaces
them with DOT intrinsics.

This currently only supports SVE and scalable vectors.
---
 llvm/lib/Target/AArch64/AArch64.h             |   2 +
 .../Target/AArch64/AArch64DotProdMatcher.cpp  | 486 +++++++++++++
 .../Target/AArch64/AArch64TargetMachine.cpp   |  11 +
 llvm/lib/Target/AArch64/CMakeLists.txt        |   1 +
 llvm/test/CodeGen/AArch64/O3-pipeline.ll      |   1 +
 llvm/test/CodeGen/AArch64/dotprodmatch.ll     | 684 ++++++++++++++++++
 6 files changed, 1185 insertions(+)
 create mode 100644 llvm/lib/Target/AArch64/AArch64DotProdMatcher.cpp
 create mode 100644 llvm/test/CodeGen/AArch64/dotprodmatch.ll

diff --git a/llvm/lib/Target/AArch64/AArch64.h b/llvm/lib/Target/AArch64/AArch64.h
index 901769c54b6ef59..afdc8e3698b2d99 100644
--- a/llvm/lib/Target/AArch64/AArch64.h
+++ b/llvm/lib/Target/AArch64/AArch64.h
@@ -71,6 +71,7 @@ FunctionPass *createAArch64PostSelectOptimize();
 FunctionPass *createAArch64StackTaggingPass(bool IsOptNone);
 FunctionPass *createAArch64StackTaggingPreRAPass();
 ModulePass *createAArch64GlobalsTaggingPass();
+FunctionPass *createAArch64DotProdMatcherPass();
 
 void initializeAArch64A53Fix835769Pass(PassRegistry&);
 void initializeAArch64A57FPLoadBalancingPass(PassRegistry&);
@@ -108,6 +109,7 @@ void initializeFalkorMarkStridedAccessesLegacyPass(PassRegistry&);
 void initializeLDTLSCleanupPass(PassRegistry&);
 void initializeSMEABIPass(PassRegistry &);
 void initializeSVEIntrinsicOptsPass(PassRegistry &);
+void initializeAArch64DotProdMatcherPass(PassRegistry &);
 } // end namespace llvm
 
 #endif
diff --git a/llvm/lib/Target/AArch64/AArch64DotProdMatcher.cpp b/llvm/lib/Target/AArch64/AArch64DotProdMatcher.cpp
new file mode 100644
index 000000000000000..44215efee75c33c
--- /dev/null
+++ b/llvm/lib/Target/AArch64/AArch64DotProdMatcher.cpp
@@ -0,0 +1,486 @@
+//===- AArch64DotProdMatcher - Matches instruction sequences to *DOT ------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass recognizes two relatively simple IR patterns that can be
+// implemented with the SDOT and UDOT instructions on AArch64, and rewrites
+// them to use those instructions in order to increase vector unit bandwidth.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AArch64.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/IntrinsicsAArch64.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/InstructionCost.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "Utils/AArch64BaseInfo.h"
+#include <deque>
+#include <optional>
+#include <tuple>
+#include <utility>
+
+using namespace llvm;
+using namespace llvm::PatternMatch;
+
+#define DEBUG_TYPE "aarch64-dot-product-matcher"
+
+#define DOT_ACCUMULATOR_DEPTH (4)
+
+STATISTIC(NumDOTInstrs, "Number of DOT instructions generated.");
+STATISTIC(NumSimpleDOTReplacements, "Number of simple dot patterns replaced.");
+STATISTIC(NumLoopDOTReplacements, "Number of loop dot patterns replaced.");
+
+struct LoopAccumulate {
+  Value *RVal;
+  PHINode *Phi;
+  Value *IterVals;
+  Value *Predicate;
+  Value *Mul;
+  Value *ValA;
+  Value *ValB;
+  VectorType *VTy;
+  Type *AccTy;
+  BasicBlock *LoopBlock;
+  BasicBlock *PHBlock;
+  bool IsSExt;
+
+  LoopAccumulate(Value *RVal, PHINode *Phi, Value *IterVals, Value *Predicate,
+                 Value *Mul, Value *ValA, Value *ValB, VectorType *VTy,
+                 Type *AccTy, BasicBlock *LoopBlock, BasicBlock *PHBlock,
+                 bool IsSExt)
+      : RVal(RVal), Phi(Phi), IterVals(IterVals), Predicate(Predicate),
+        Mul(Mul), ValA(ValA), ValB(ValB), VTy(VTy), AccTy(AccTy),
+        LoopBlock(LoopBlock), PHBlock(PHBlock), IsSExt(IsSExt) {}
+};
+
+// Returns true if the instruction in question is a scalable vector integer
+// add reduction intrinsic.
+static bool isScalableIntegerSumReduction(Instruction &I) {
+  auto *II = dyn_cast<IntrinsicInst>(&I);
+  return II &&
+         II->getIntrinsicID() == Intrinsic::vector_reduce_add &&
+         isa<ScalableVectorType>(II->getOperand(0)->getType());
+}
+
+// Returns a vector type for a dot product accumulator if the element type and
+// extended element type are suitable, or a nullptr if not.
+static Type *getAccumulatorType(Type *EltTy, Type *ExtEltTy, ElementCount EC) {
+  Type *AccEltTy = nullptr;
+  if (EltTy->isIntegerTy(8) && ExtEltTy->getPrimitiveSizeInBits() <= 32)
+    AccEltTy = Type::getInt32Ty(EltTy->getContext());
+  else if (EltTy->isIntegerTy(16) && ExtEltTy->getPrimitiveSizeInBits() <= 64)
+    AccEltTy = Type::getInt64Ty(EltTy->getContext());
+
+  if (AccEltTy)
+    return VectorType::get(AccEltTy, EC);
+
+  return nullptr;
+}
+
+// Returns a pair of basic block pointers (loop block, preheader block)
+// corresponding to the two expected incoming values for the phi, or
+// std::nullopt if one of the checks failed.
+static std::optional<std::pair<BasicBlock*, BasicBlock*>>
+getPHIIncomingBlocks(PHINode *Phi) {
+  // Check PHI; we're only expecting the incoming value from within the loop
+  // and one incoming value from a preheader.
+  if (Phi->getNumIncomingValues() != 2)
+    return std::nullopt;
+
+  BasicBlock *PHBlock = Phi->getIncomingBlock(0);
+  BasicBlock *LoopBlock = Phi->getIncomingBlock(1);
+  // If this isn't a loop, or if it's a loop with multiple blocks, we bail
+  // out for now. If needed we can improve this pass later.
+  if (Phi->getParent() != LoopBlock && Phi->getParent() != PHBlock)
+    return std::nullopt;
+
+  // Make sure we know which incoming value belongs to the loop
+  if (PHBlock == Phi->getParent())
+    std::swap(LoopBlock, PHBlock);
+
+  // If there's a non-null incoming value from the preheader, bail out for now.
+  // We may be able to do better in future.
+  Constant *Const = dyn_cast<Constant>(Phi->getIncomingValueForBlock(PHBlock));
+  if (LoopBlock != Phi->getParent() || !Const || !Const->isNullValue())
+    return std::nullopt;
+
+  return std::make_pair(LoopBlock, PHBlock);
+}
+
+static bool checkLoopAcc(Value *RVal, PHINode *OldPHI, Value *IterVals,
+                         SmallVectorImpl<LoopAccumulate> &Accumulators) {
+  // Check a possible loop accumulator.
+  bool IsSExt = false;
+
+  // We only expect the add in the loop to be used by the reduction and by
+  // the PHI node.
+  if (!RVal->hasNUses(2) || !is_contained(OldPHI->incoming_values(), RVal)) {
+    LLVM_DEBUG(dbgs() << "Loop sum operation has more than two uses or isn't "
+                         "used by the accumulating PHI node.\n");
+    return false;
+  }
+
+  // Look through selects with zeroinitializer. Record the predicate so
+  // we can insert selects for the base values later.
+  Value *Predicate = nullptr, *Mul = nullptr;
+  if (!match(IterVals, m_Select(m_Value(Predicate), m_Value(Mul), m_Zero())))
+    Mul = IterVals;
+
+  Value *ValA = nullptr, *ValB = nullptr;
+  // Match the core pattern of element-wise multiplication of extended values.
+  if (match(Mul, m_OneUse(m_Mul(m_SExt(m_OneUse(m_Value(ValA))),
+                                m_SExt(m_OneUse(m_Value(ValB)))))))
+    IsSExt = true;
+  else if (!match(Mul, m_OneUse(m_Mul(m_ZExt(m_OneUse(m_Value(ValA))),
+                                      m_ZExt(m_OneUse(m_Value(ValB))))))) {
+    LLVM_DEBUG(dbgs() << "Couldn't match inner loop multiply: "
+                      << *Mul << "\n");
+    return false;
+  }
+
+  // The same extended value could be used for both operands of the multiply,
+  // so we just need to check that they have a single user.
+  Instruction *I = cast<Instruction>(Mul);
+  if (!I->getOperand(0)->hasOneUser() || !I->getOperand(1)->hasOneUser())
+    return false;
+
+  // Check that the vector type is one packed vector's worth of data.
+  // TODO: Do we want to allow multiples?
+  VectorType *ValTy = cast<VectorType>(ValA->getType());
+  if (ValTy->getPrimitiveSizeInBits().getKnownMinValue() !=
+      AArch64::SVEBitsPerBlock) {
+    LLVM_DEBUG(dbgs() << "Vector base size is not a packed representation.\n");
+    return false;
+  }
+
+  // Find the accumulator element type after extension and check that it isn't
+  // too large; if it is, we might lose data by converting to dot instructions.
+  // The element count needs to be 1/4th that of the input data, since the
+  // dot product instructions take four smaller elements and multiply/accumulate
+  // them into one larger element.
+  Type *AccTy = getAccumulatorType(ValTy->getElementType(),
+      Mul->getType()->getScalarType(),
+      ValTy->getElementCount().divideCoefficientBy(4));
+
+  if (!AccTy) {
+    LLVM_DEBUG(dbgs() << "Accumulator element type too wide.\n");
+    return false;
+  }
+
+  // Validate the phi node and retrieve the incoming basic blocks for the
+  // accumulating loop itself and the preheader.
+  auto PhiBlocks = getPHIIncomingBlocks(OldPHI);
+
+  if (!PhiBlocks) {
+    LLVM_DEBUG(dbgs() << "Unable to match PHI node\n");
+    return false;
+  }
+
+  // Everything looks in order, so add it to the list of accumulators to
+  // transform.
+  Accumulators.emplace_back(RVal, OldPHI, IterVals, Predicate, Mul, ValA,
+                            ValB, ValTy, AccTy, PhiBlocks->first,
+                            PhiBlocks->second, IsSExt);
+  return true;
+}
+
+static bool findDOTAccumulatorsInLoop(Value *RVal,
+                                SmallVectorImpl<LoopAccumulate> &Accumulators,
+                                unsigned Depth = DOT_ACCUMULATOR_DEPTH) {
+  // Don't recurse too far.
+  if (Depth == 0)
+    return false;
+
+  Value *V1 = nullptr, *V2 = nullptr;
+
+  // Try to match the expected pattern from a sum reduction in
+  // a vectorized loop.
+  if (match(RVal, m_Add(m_Value(V1), m_Value(V2)))) {
+    if (isa<PHINode>(V1) && !isa<PHINode>(V2) &&
+        V1->hasOneUse() && V2->hasOneUse())
+      return checkLoopAcc(RVal, cast<PHINode>(V1), V2, Accumulators);
+
+    if (!isa<PHINode>(V1) && isa<PHINode>(V2) &&
+        V1->hasOneUse() && V2->hasOneUse())
+      return checkLoopAcc(RVal, cast<PHINode>(V2), V1, Accumulators);
+
+    // Otherwise assume this is an intermediate multi-register reduction
+    // and recurse to the operands.
+    return findDOTAccumulatorsInLoop(V1, Accumulators, Depth - 1) &&
+           findDOTAccumulatorsInLoop(V2, Accumulators, Depth - 1);
+  }
+
+  return false;
+}
+
+namespace {
+
+class AArch64DotProdMatcher : public FunctionPass {
+public:
+  static char ID;
+  AArch64DotProdMatcher() : FunctionPass(ID) {
+    initializeAArch64DotProdMatcherPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnFunction(Function &F) override {
+    TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+
+    bool Changed = false;
+    SmallVector<Instruction *, 4> Reductions;
+    for (BasicBlock &Block : F)
+      // TODO: Support non-scalable dot instructions too.
+      for (Instruction &I : make_filter_range(Block,
+                                              isScalableIntegerSumReduction))
+        Reductions.push_back(&I);
+
+    for (auto *Rdx : Reductions)
+      Changed |= trySimpleDotReplacement(*Rdx) || tryLoopDotReplacement(*Rdx);
+
+    return Changed;
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<TargetTransformInfoWrapperPass>();
+    AU.setPreservesCFG();
+  }
+
+  TargetTransformInfo *TTI;
+
+private:
+  bool trySimpleDotReplacement(Instruction &I);
+  bool tryLoopDotReplacement(Instruction &I);
+};
+
+} // end anonymous namespace
+
+char AArch64DotProdMatcher::ID = 0;
+INITIALIZE_PASS_BEGIN(AArch64DotProdMatcher, DEBUG_TYPE,
+                "AArch64 Dot Product Instruction Matcher", false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_END(AArch64DotProdMatcher, DEBUG_TYPE,
+                "AArch64 Dot Product Instruction Matcher", false, false)
+
+FunctionPass *llvm::createAArch64DotProdMatcherPass() {
+  return new AArch64DotProdMatcher();
+}
+
+// The following method looks for a simple pattern of two values being either
+// sign or zero extended, multiplied together, then summed. If the types
+// match the ones used by the [s|u]dot instructions (groups of 4x8 -> 32,
+// groups of 4x16 -> 64) then we can replace the extends and multiply with a
+// dot instruction and swap the reduce for one using fewer elements.
+//
+//      +-----------+   +-----------+
+//      |   ValA    |   |   ValB    |
+//      +-----+-----+   +-----+-----+
+//            |               |
+//            |               |
+//      +-----v-----+   +-----v-----+
+//      | [S|Z]Ext  |   | [S|Z]Ext  |
+//      +-----+-----+   +-----+-----+
+//            |               |
+//            +--+         +--+
+//               |         |
+//              +v---------v+
+//              |    Mul    |
+//              +-----+-----+
+//                    |
+//                    |
+//              +-----v-----+
+//              | Reduce(+) |
+//              +-----------+
+bool AArch64DotProdMatcher::trySimpleDotReplacement(Instruction &I) {
+  LLVM_DEBUG(dbgs() << "Looking for simple dot reduction: " << I << "\n");
+  Value *RVal = I.getOperand(0);
+  Value *ValA = nullptr, *ValB = nullptr;
+  bool IsSExt = false;
+
+  if (match(RVal, m_Mul(m_SExt(m_Value(ValA)), m_SExt(m_Value(ValB)))))
+    IsSExt = true;
+  else if (!match(RVal, m_Mul(m_ZExt(m_Value(ValA)), m_ZExt(m_Value(ValB))))) {
+    LLVM_DEBUG(dbgs() << "Unable to match simple dot pattern\n");
+    return false;
+  }
+
+  VectorType *ATy = cast<VectorType>(ValA->getType());
+  VectorType *BTy = cast<VectorType>(ValB->getType());
+  VectorType *MTy = cast<VectorType>(RVal->getType());
+  if (ATy != BTy || !((ATy->getScalarType()->isIntegerTy(8) &&
+                       MTy->getScalarType()->isIntegerTy(32)) ||
+                      (ATy->getScalarType()->isIntegerTy(16) &&
+                       MTy->getScalarType()->isIntegerTy(64)))) {
+    LLVM_DEBUG(dbgs() << "Unable to match types for simple dot pattern\n");
+    return false;
+  }
+
+  if (TTI->getRegisterBitWidth(TargetTransformInfo::RGK_ScalableVector) !=
+      ATy->getPrimitiveSizeInBits())
+    return false;
+
+  // All conditions met, proceed with replacement.
+  IRBuilder<> Builder(cast<Instruction>(RVal));
+
+  // Need a new accumulator type.
+  Type *AccTy = VectorType::get(MTy->getScalarType(),
+                                MTy->getElementCount().divideCoefficientBy(4));
+  Value *Zeroes = ConstantAggregateZero::get(AccTy);
+
+  Intrinsic::ID IntID = IsSExt ? Intrinsic::aarch64_sve_sdot :
+                                 Intrinsic::aarch64_sve_udot;
+  Value *DotProd = Builder.CreateIntrinsic(IntID, {AccTy},
+                                           {Zeroes, ValA, ValB});
+  Builder.SetInsertPoint(&I);
+  Value *Reduce = Builder.CreateAddReduce(DotProd);
+  I.replaceAllUsesWith(Reduce);
+  NumDOTInstrs++;
+  NumSimpleDOTReplacements++;
+  return true;
+}
+
+// This method looks for the following pattern: starting from a sum reduction,
+// it expects to find a vector add operation inside a loop with one of the
+// operands being a PHI. The other operand can either be a select between
+// zeroes and a multiply, or just the multiply directly. The rest of the
+// pattern is the same as in the simpler case -- a multiply of extends of some
+// values.
+//
+// Replacing this is a little tricky, since we need to replace the PHI node
+// and accumulator as well, and potentially add in new selects earlier, but if
+// everything checks out then the extend -> multiply -> inner loop add operation
+// is replaced by the [s|u]dot instruction.
+//
+//                                     +-----------+
+//                                     |   Zero    |
+//                                     +-+---------+
+//  +-------+      +---------------------+   |
+//  |       |      |                         |
+//  |    +--v------v-+                       |
+//  |    |  OldPHI   |                       |
+//  |    +--+--------+                       |
+//  |       |                                |
+//  |       |   +-----------+   +-----------+|
+//  |       |   |   ValA    |   |   ValB    ||
+//  |       |   +-----+-----+   +-----+-----+|
+//  |       |         |               |      |
+//  |       |         |               |      |
+//  |       |   +-----v-----+   +-----v-----+|
+//  |       |   | [S|Z]Ext  |   | [S|Z]Ext  ||
+//  |       |   +-----+-----+   +-----+-----+|
+//  |       |         |               |      |
+//  |       |         +--+         +--+      |
+//  |       |            |         |         |
+//  |       |           +v---------v+        |
+//  |       |           |    Mul    |        |
+//  |       |           +-+---------+        |
+//  |       |             |       +----------+
+//  |       |             |       |
+//  |       |           +-v-------v-+
+//  |       |           |  Select   |
+//  |       |           +--+--------+
+//  |       |              |
+//  |       |              |
+//  |       |              |
+//  |    +--v--------------v---+
+//  |    |         Add         |
+//  |    +--+-------+----------+
+//  |       |       |
+//  +-------+       |
+//                  |
+//            +-----v-----+
+//            | Reduce(+) |
+//            +-----------+
+bool AArch64DotProdMatcher::tryLoopDotReplacement(Instruction &I) {
+  LLVM_DEBUG(dbgs() << "Looking for Loop DOT Reduction: " << I << "\n");
+  Value *RVal = I.getOperand(0);
+  SmallVector<LoopAccumulate, 4> Accumulators;
+  std::deque<Value *> RdxVals;
+  IRBuilder<> Builder(&I);
+
+  // If the loop was interleaved, we may have some intermediate add
+  // instructions to look through before we reach the accumulators inside the
+  // loop. Gather the accumulators first, then process them.
+  if (!findDOTAccumulatorsInLoop(RVal, Accumulators)) {
+    LLVM_DEBUG(dbgs() << "Couldn't find DOT accumulators in the loop\n");
+    return false;
+  }
+
+  // All conditions met, proceed with replacement.
+  for (auto &Acc : Accumulators) {
+    Builder.SetInsertPoint(Acc.Phi);
+
+    // Plant new PHI node.
+    PHINode *DotAcc = Builder.CreatePHI(Acc.AccTy, 2, "dot.accumulate");
+    Value *Zeroes = ConstantAggregateZero::get(Acc.AccTy);
+    DotAcc->addIncoming(Zeroes, Acc.PHBlock);
+
+    // Move to the dot insertion point.
+    Builder.SetInsertPoint(cast<Instruction>(Acc.RVal));
+
+    // Generate selects for ValA and ValB if there was a select before the
+    // accumulate. Hopefully we can fold away some of the extra selects later
+    // (e.g. if the data originally came from masked loads with the same
+    // predicate).
+    if (Acc.Predicate) {
+      Value *Zeroes = ConstantAggregateZero::get(Acc.VTy);
+      Acc.ValA = Builder.CreateSelect(Acc.Predicate, Acc.ValA, Zeroes);
+      Acc.ValB = Builder.CreateSelect(Acc.Predicate, Acc.ValB, Zeroes);
+    }
+
+    // Now plant the dot instruction.
+    Intrinsic::ID IntID = Acc.IsSExt ? Intrinsic::aarch64_sve_sdot :
+                                       Intrinsic::aarch64_sve_udot;
+    Value *DotProd = Builder.CreateIntrinsic(IntID, {Acc.AccTy},
+                                             {DotAcc, Acc.ValA, Acc.ValB});
+    DotAcc->addIncoming(DotProd, Acc.LoopBlock);
+
+    RdxVals.push_back(DotProd);
+
+    NumDOTInstrs++;
+  }
+
+  assert(!RdxVals.empty() &&
+         "We found accumulators but generated no RdxVals");
+
+  Builder.SetInsertPoint(cast<Instruction>(RVal));
+
+  while (RdxVals.size() > 1) {
+    RdxVals.push_back(Builder.CreateAdd(RdxVals[0], RdxVals[1]));
+    // Drop the two RdxVals we just reduced. Sadly, there's no SmallDeque
+    // with a pop_front_val() convenience method yet.
+    RdxVals.pop_front();
+    RdxVals.pop_front();
+  }
+
+  // Plant new reduction.
+  Builder.SetInsertPoint(&I);
+  Value *Reduce = Builder.CreateAddReduce(RdxVals.front());
+  Value *Trunc = Builder.CreateTrunc(Reduce, I.getType(), "dot.trunc");
+  I.replaceAllUsesWith(Trunc);
+
+  // Delete the original reduction, since it's no longer required.
+  RecursivelyDeleteTriviallyDeadInstructions(&I);
+  NumLoopDOTReplacements++;
+  return true;
+}
+
diff --git a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
index 3d818c76bd4b7d7..4a76d2f705a5a13 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
@@ -165,6 +165,11 @@ static cl::opt<bool>
                            cl::desc("Enable SVE intrinsic opts"),
                            cl::init(true));
 
+static cl::opt<bool>
+EnableAArch64DotProdMatch("aarch64-enable-dotprodmatch", cl::Hidden,
+                          cl::desc("Enable matching dot product instructions"),
+                          cl::init(true));
+
 static cl::opt<bool> EnableFalkorHWPFFix("aarch64-enable-falkor-hwpf-fix",
                                          cl::init(true), cl::Hidden);
 
@@ -246,6 +251,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAArch64Target() {
   initializeAArch64LowerHomogeneousPrologEpilogPass(*PR);
   initializeAArch64DAGToDAGISelPass(*PR);
   initializeAArch64GlobalsTaggingPass(*PR);
+  initializeAArch64DotProdMatcherPass(*PR);
 }
 
 //===----------------------------------------------------------------------===//
@@ -553,6 +559,11 @@ void AArch64PassConfig::addIRPasses() {
   // ourselves.
   addPass(createAtomicExpandPass());
 
+  // Make use of SVE intrinsics in place of common vector operations that span
+  // multiple basic blocks.
+  if (TM->getOptLevel() != CodeGenOptLevel::None && EnableAArch64DotProdMatch)
+    addPass(createAArch64DotProdMatcherPass());
+
   // Expand any SVE vector library calls that we can't code generate directly.
   if (EnableSVEIntrinsicOpts &&
       TM->getOptLevel() == CodeGenOptLevel::Aggressive)
diff --git a/llvm/lib/Target/AArch64/CMakeLists.txt b/llvm/lib/Target/AArch64/CMakeLists.txt
index d97342b0829d826..b89ce94b9312277 100644
--- a/llvm/lib/Target/AArch64/CMakeLists.txt
+++ b/llvm/lib/Target/AArch64/CMakeLists.txt
@@ -50,6 +50,7 @@ add_llvm_target(AArch64CodeGen
   AArch64CondBrTuning.cpp
   AArch64ConditionalCompares.cpp
   AArch64DeadRegisterDefinitionsPass.cpp
+  AArch64DotProdMatcher.cpp
   AArch64ExpandImm.cpp
   AArch64ExpandPseudoInsts.cpp
   AArch64FalkorHWPFFix.cpp
diff --git a/llvm/test/CodeGen/AArch64/O3-pipeline.ll b/llvm/test/CodeGen/AArch64/O3-pipeline.ll
index f5c1c3c291cb585..7d196b8579d202b 100644
--- a/llvm/test/CodeGen/AArch64/O3-pipeline.ll
+++ b/llvm/test/CodeGen/AArch64/O3-pipeline.ll
@@ -22,6 +22,7 @@
 ; CHECK-NEXT:       Expand large div/rem
 ; CHECK-NEXT:       Expand large fp convert
 ; CHECK-NEXT:       Expand Atomic instructions
+; CHECK-NEXT:       AArch64 Dot Product Instruction Matcher
 ; CHECK-NEXT:     SVE intrinsics optimizations
 ; CHECK-NEXT:       FunctionPass Manager
 ; CHECK-NEXT:         Dominator Tree Construction
diff --git a/llvm/test/CodeGen/AArch64/dotprodmatch.ll b/llvm/test/CodeGen/AArch64/dotprodmatch.ll
new file mode 100644
index 000000000000000..a75048351b81030
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/dotprodmatch.ll
@@ -0,0 +1,684 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2
+; RUN: opt -S -aarch64-dot-product-matcher -instcombine < %s | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
+
+define i16 @sve_sdot_loop_i16_to_i32(ptr readonly %a, ptr readonly %b, i32 %N) #0 {
+; CHECK-LABEL: define i16 @sve_sdot_loop_i16_to_i32
+; CHECK-SAME: (ptr readonly [[A:%.*]], ptr readonly [[B:%.*]], i32 [[N:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP11:%.*]] = icmp sgt i32 [[N]], 0
+; CHECK-NEXT:    br i1 [[CMP11]], label [[MIN_ITERS_CHECKED:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK:       min.iters.checked:
+; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT:    [[PREDICATE_ENTRY:%.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.whilelo.nxv8i1.i64(i64 0, i64 [[WIDE_TRIP_COUNT]])
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[MIN_ITERS_CHECKED]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[PREDICATE:%.*]] = phi <vscale x 8 x i1> [ [[PREDICATE_ENTRY]], [[MIN_ITERS_CHECKED]] ], [ [[PREDICATE_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[DOT_ACCUMULATE:%.*]] = phi <vscale x 2 x i64> [ zeroinitializer, [[MIN_ITERS_CHECKED]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i16, ptr [[A]], i64 [[INDEX]]
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr [[TMP0]], i32 2, <vscale x 8 x i1> [[PREDICATE]], <vscale x 8 x i16> zeroinitializer)
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i16, ptr [[B]], i64 [[INDEX]]
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD19:%.*]] = call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr [[TMP1]], i32 2, <vscale x 8 x i1> [[PREDICATE]], <vscale x 8 x i16> zeroinitializer)
+; CHECK-NEXT:    [[TMP2]] = call <vscale x 2 x i64> @llvm.aarch64.sve.sdot.nxv2i64(<vscale x 2 x i64> [[DOT_ACCUMULATE]], <vscale x 8 x i16> [[WIDE_MASKED_LOAD19]], <vscale x 8 x i16> [[WIDE_MASKED_LOAD]])
+; CHECK-NEXT:    [[VS:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[VS_SCALED:%.*]] = shl i64 [[VS]], 3
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[VS_SCALED]]
+; CHECK-NEXT:    [[PREDICATE_NEXT]] = call <vscale x 8 x i1> @llvm.aarch64.sve.whilelo.nxv8i1.i64(i64 [[INDEX_NEXT]], i64 [[WIDE_TRIP_COUNT]])
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <vscale x 8 x i1> [[PREDICATE_NEXT]], i64 0
+; CHECK-NEXT:    br i1 [[TMP3]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> [[TMP2]])
+; CHECK-NEXT:    [[PHITMP201:%.*]] = lshr i64 [[TMP4]], 16
+; CHECK-NEXT:    [[PHITMP:%.*]] = trunc i64 [[PHITMP201]] to i16
+; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    [[ACC_0_LCSSA:%.*]] = phi i16 [ 0, [[ENTRY:%.*]] ], [ [[PHITMP]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    ret i16 [[ACC_0_LCSSA]]
+;
+entry:
+  %cmp11 = icmp sgt i32 %N, 0
+  br i1 %cmp11, label %min.iters.checked, label %for.cond.cleanup
+
+min.iters.checked:                                ; preds = %entry
+  %wide.trip.count = zext i32 %N to i64
+  %wide.end.idx.splatinsert = insertelement <vscale x 8 x i64> undef, i64 %wide.trip.count, i32 0
+  %wide.end.idx.splat = shufflevector <vscale x 8 x i64> %wide.end.idx.splatinsert, <vscale x 8 x i64> undef, <vscale x 8 x i32> zeroinitializer
+  %predicate.entry = call <vscale x 8 x i1> @llvm.aarch64.sve.whilelo.nxv8i1.i64(i64 0, i64 %wide.trip.count)
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %min.iters.checked
+  %index = phi i64 [ 0, %min.iters.checked ], [ %index.next, %vector.body ]
+  %predicate = phi <vscale x 8 x i1> [ %predicate.entry, %min.iters.checked ], [ %predicate.next, %vector.body ]
+  %vec.phi = phi <vscale x 8 x i32> [ zeroinitializer, %min.iters.checked ], [ %6, %vector.body ]
+  %0 = getelementptr inbounds i16, ptr %a, i64 %index
+  %wide.masked.load = call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr %0, i32 2, <vscale x 8 x i1> %predicate, <vscale x 8 x i16> undef)
+  %1 = sext <vscale x 8 x i16> %wide.masked.load to <vscale x 8 x i32>
+  %2 = getelementptr inbounds i16, ptr %b, i64 %index
+  %wide.masked.load19 = call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr %2, i32 2, <vscale x 8 x i1> %predicate, <vscale x 8 x i16> undef)
+  %3 = sext <vscale x 8 x i16> %wide.masked.load19 to <vscale x 8 x i32>
+  %4 = mul nsw <vscale x 8 x i32> %3, %1
+  %5 = select <vscale x 8 x i1> %predicate, <vscale x 8 x i32> %4, <vscale x 8 x i32> zeroinitializer
+  %6 = add nsw <vscale x 8 x i32> %vec.phi, %5
+  %vs = call i64 @llvm.vscale.i64()
+  %vs.scaled = mul i64 %vs, 8
+  %index.next = add nuw i64 %index, %vs.scaled
+  %.splatinsert = insertelement <vscale x 8 x i64> undef, i64 %index.next, i32 0
+  %.splat = shufflevector <vscale x 8 x i64> %.splatinsert, <vscale x 8 x i64> undef, <vscale x 8 x i32> zeroinitializer
+  %predicate.next = call <vscale x 8 x i1> @llvm.aarch64.sve.whilelo.nxv8i1.i64(i64 %index.next, i64 %wide.trip.count)
+  %7 = extractelement <vscale x 8 x i1> %predicate.next, i64 0
+  br i1 %7, label %vector.body, label %middle.block
+
+middle.block:                                     ; preds = %vector.body
+  %8 = call i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32> %6)
+  %phitmp20 = lshr i32 %8, 16
+  %phitmp = trunc i32 %phitmp20 to i16
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %middle.block, %entry
+  %acc.0.lcssa = phi i16 [ 0, %entry ], [ %phitmp, %middle.block ]
+  ret i16 %acc.0.lcssa
+}
+
+define dso_local i16 @sve_sdot_loop_i16_to_i32_interleavedx2_scalartail(ptr readonly %a, ptr readonly %b, i32 %N) #0 {
+; CHECK-LABEL: define dso_local i16 @sve_sdot_loop_i16_to_i32_interleavedx2_scalartail
+; CHECK-SAME: (ptr readonly [[A:%.*]], ptr readonly [[B:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP9:%.*]] = icmp sgt i32 [[N]], 0
+; CHECK-NEXT:    br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK:       for.body.preheader:
+; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ugt i64 [[TMP1]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[FOR_BODY_PREHEADER17:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP1]]
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub nuw nsw i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]]
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call i32 @llvm.vscale.i32()
+; CHECK-NEXT:    [[TMP3:%.*]] = shl nuw nsw i32 [[TMP2]], 3
+; CHECK-NEXT:    [[TMP4:%.*]] = zext i32 [[TMP3]] to i64
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[DOT_ACCUMULATE1:%.*]] = phi <vscale x 2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[DOT_ACCUMULATE:%.*]] = phi <vscale x 2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i16, ptr [[A]], i64 [[INDEX]]
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 8 x i16>, ptr [[TMP5]], align 2
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i16, ptr [[TMP5]], i64 [[TMP4]]
+; CHECK-NEXT:    [[WIDE_LOAD14:%.*]] = load <vscale x 8 x i16>, ptr [[TMP6]], align 2
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i16, ptr [[B]], i64 [[INDEX]]
+; CHECK-NEXT:    [[WIDE_LOAD15:%.*]] = load <vscale x 8 x i16>, ptr [[TMP7]], align 2
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i16, ptr [[TMP7]], i64 [[TMP4]]
+; CHECK-NEXT:    [[WIDE_LOAD16:%.*]] = load <vscale x 8 x i16>, ptr [[TMP8]], align 2
+; CHECK-NEXT:    [[TMP9]] = call <vscale x 2 x i64> @llvm.aarch64.sve.sdot.nxv2i64(<vscale x 2 x i64> [[DOT_ACCUMULATE1]], <vscale x 8 x i16> [[WIDE_LOAD15]], <vscale x 8 x i16> [[WIDE_LOAD]])
+; CHECK-NEXT:    [[TMP10]] = call <vscale x 2 x i64> @llvm.aarch64.sve.sdot.nxv2i64(<vscale x 2 x i64> [[DOT_ACCUMULATE]], <vscale x 8 x i16> [[WIDE_LOAD16]], <vscale x 8 x i16> [[WIDE_LOAD14]])
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP1]]
+; CHECK-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[TMP12:%.*]] = add <vscale x 2 x i64> [[TMP10]], [[TMP9]]
+; CHECK-NEXT:    [[TMP13:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> [[TMP12]])
+; CHECK-NEXT:    [[DOT_TRUNC:%.*]] = trunc i64 [[TMP13]] to i32
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
+; CHECK-NEXT:    [[EXTRACT4:%.*]] = lshr i64 [[TMP13]], 16
+; CHECK-NEXT:    [[EXTRACT_T:%.*]] = trunc i64 [[EXTRACT4]] to i16
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[FOR_BODY_PREHEADER17]]
+; CHECK:       for.body.preheader17:
+; CHECK-NEXT:    [[INDVARS_IV_PH:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[N_VEC]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    [[ACC_010_PH:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[DOT_TRUNC]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.cond.cleanup.loopexit:
+; CHECK-NEXT:    [[ADD_LCSSA_OFF16:%.*]] = phi i16 [ [[EXTRACT_T]], [[MIDDLE_BLOCK]] ], [ [[EXTRACT_T3:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    [[ACC_0_LCSSA:%.*]] = phi i16 [ 0, [[ENTRY:%.*]] ], [ [[ADD_LCSSA_OFF16]], [[FOR_COND_CLEANUP_LOOPEXIT]] ]
+; CHECK-NEXT:    ret i16 [[ACC_0_LCSSA]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[INDVARS_IV_PH]], [[FOR_BODY_PREHEADER17]] ]
+; CHECK-NEXT:    [[ACC_010:%.*]] = phi i32 [ [[ADD:%.*]], [[FOR_BODY]] ], [ [[ACC_010_PH]], [[FOR_BODY_PREHEADER17]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[A]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP14:%.*]] = load i16, ptr [[ARRAYIDX]], align 2
+; CHECK-NEXT:    [[CONV:%.*]] = sext i16 [[TMP14]] to i32
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i16, ptr [[B]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP15:%.*]] = load i16, ptr [[ARRAYIDX2]], align 2
+; CHECK-NEXT:    [[CONV3:%.*]] = sext i16 [[TMP15]] to i32
+; CHECK-NEXT:    [[MUL:%.*]] = mul nsw i32 [[CONV3]], [[CONV]]
+; CHECK-NEXT:    [[ADD]] = add nsw i32 [[MUL]], [[ACC_010]]
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT:    [[EXTRACT2:%.*]] = lshr i32 [[ADD]], 16
+; CHECK-NEXT:    [[EXTRACT_T3]] = trunc i32 [[EXTRACT2]] to i16
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]]
+;
+entry:
+  %cmp9 = icmp sgt i32 %N, 0
+  br i1 %cmp9, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %N to i64
+  %0 = tail call i64 @llvm.vscale.i64()
+  %1 = shl nuw nsw i64 %0, 4
+  %min.iters.check = icmp ugt i64 %1, %wide.trip.count
+  br i1 %min.iters.check, label %for.body.preheader17, label %vector.ph
+
+vector.ph:                                        ; preds = %for.body.preheader
+  %n.mod.vf = urem i64 %wide.trip.count, %1
+  %n.vec = sub nuw nsw i64 %wide.trip.count, %n.mod.vf
+  %2 = tail call i32 @llvm.vscale.i32()
+  %3 = shl nuw nsw i32 %2, 3
+  %4 = zext i32 %3 to i64
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %vec.phi = phi <vscale x 8 x i32> [ zeroinitializer, %vector.ph ], [ %15, %vector.body ]
+  %vec.phi13 = phi <vscale x 8 x i32> [ zeroinitializer, %vector.ph ], [ %16, %vector.body ]
+  %5 = getelementptr inbounds i16, ptr %a, i64 %index
+  %wide.load = load <vscale x 8 x i16>, ptr %5, align 2
+  %6 = getelementptr inbounds i16, ptr %5, i64 %4
+  %wide.load14 = load <vscale x 8 x i16>, ptr %6, align 2
+  %7 = sext <vscale x 8 x i16> %wide.load to <vscale x 8 x i32>
+  %8 = sext <vscale x 8 x i16> %wide.load14 to <vscale x 8 x i32>
+  %9 = getelementptr inbounds i16, ptr %b, i64 %index
+  %wide.load15 = load <vscale x 8 x i16>, ptr %9, align 2
+  %10 = getelementptr inbounds i16, ptr %9, i64 %4
+  %wide.load16 = load <vscale x 8 x i16>, ptr %10, align 2
+  %11 = sext <vscale x 8 x i16> %wide.load15 to <vscale x 8 x i32>
+  %12 = sext <vscale x 8 x i16> %wide.load16 to <vscale x 8 x i32>
+  %13 = mul nsw <vscale x 8 x i32> %11, %7
+  %14 = mul nsw <vscale x 8 x i32> %12, %8
+  %15 = add <vscale x 8 x i32> %13, %vec.phi
+  %16 = add <vscale x 8 x i32> %14, %vec.phi13
+  %index.next = add nuw i64 %index, %1
+  %17 = icmp eq i64 %index.next, %n.vec
+  br i1 %17, label %middle.block, label %vector.body
+
+middle.block:                                     ; preds = %vector.body
+  %bin.rdx = add <vscale x 8 x i32> %16, %15
+  %18 = tail call i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32> %bin.rdx)
+  %cmp.n = icmp eq i64 %n.mod.vf, 0
+  br i1 %cmp.n, label %for.cond.cleanup.loopexit, label %for.body.preheader17
+
+for.body.preheader17:                             ; preds = %for.body.preheader, %middle.block
+  %indvars.iv.ph = phi i64 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ]
+  %acc.010.ph = phi i32 [ 0, %for.body.preheader ], [ %18, %middle.block ]
+  br label %for.body
+
+for.cond.cleanup.loopexit:                        ; preds = %for.body, %middle.block
+  %add.lcssa = phi i32 [ %18, %middle.block ], [ %add, %for.body ]
+  %19 = lshr i32 %add.lcssa, 16
+  %20 = trunc i32 %19 to i16
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
+  %acc.0.lcssa = phi i16 [ 0, %entry ], [ %20, %for.cond.cleanup.loopexit ]
+  ret i16 %acc.0.lcssa
+
+for.body:                                         ; preds = %for.body.preheader17, %for.body
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ %indvars.iv.ph, %for.body.preheader17 ]
+  %acc.010 = phi i32 [ %add, %for.body ], [ %acc.010.ph, %for.body.preheader17 ]
+  %arrayidx = getelementptr inbounds i16, ptr %a, i64 %indvars.iv
+  %21 = load i16, ptr %arrayidx, align 2
+  %conv = sext i16 %21 to i32
+  %arrayidx2 = getelementptr inbounds i16, ptr %b, i64 %indvars.iv
+  %22 = load i16, ptr %arrayidx2, align 2
+  %conv3 = sext i16 %22 to i32
+  %mul = mul nsw i32 %conv3, %conv
+  %add = add nsw i32 %mul, %acc.010
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+}
+
+define i16 @sve_udot_loop_i16_to_i32(ptr readonly %a, ptr readonly %b, i32 %N) #0 {
+; CHECK-LABEL: define i16 @sve_udot_loop_i16_to_i32
+; CHECK-SAME: (ptr readonly [[A:%.*]], ptr readonly [[B:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP11_NOT:%.*]] = icmp eq i32 [[N]], 0
+; CHECK-NEXT:    br i1 [[CMP11_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[MIN_ITERS_CHECKED:%.*]]
+; CHECK:       min.iters.checked:
+; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT:    [[PREDICATE_ENTRY:%.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.whilelo.nxv8i1.i64(i64 0, i64 [[WIDE_TRIP_COUNT]])
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[MIN_ITERS_CHECKED]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[PREDICATE:%.*]] = phi <vscale x 8 x i1> [ [[PREDICATE_ENTRY]], [[MIN_ITERS_CHECKED]] ], [ [[PREDICATE_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[DOT_ACCUMULATE:%.*]] = phi <vscale x 2 x i64> [ zeroinitializer, [[MIN_ITERS_CHECKED]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i16, ptr [[A]], i64 [[INDEX]]
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr [[TMP0]], i32 2, <vscale x 8 x i1> [[PREDICATE]], <vscale x 8 x i16> zeroinitializer)
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i16, ptr [[B]], i64 [[INDEX]]
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD19:%.*]] = call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr [[TMP1]], i32 2, <vscale x 8 x i1> [[PREDICATE]], <vscale x 8 x i16> zeroinitializer)
+; CHECK-NEXT:    [[TMP2]] = call <vscale x 2 x i64> @llvm.aarch64.sve.udot.nxv2i64(<vscale x 2 x i64> [[DOT_ACCUMULATE]], <vscale x 8 x i16> [[WIDE_MASKED_LOAD19]], <vscale x 8 x i16> [[WIDE_MASKED_LOAD]])
+; CHECK-NEXT:    [[VS:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[VS_SCALED:%.*]] = shl i64 [[VS]], 3
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[VS_SCALED]]
+; CHECK-NEXT:    [[PREDICATE_NEXT]] = call <vscale x 8 x i1> @llvm.aarch64.sve.whilelo.nxv8i1.i64(i64 [[INDEX_NEXT]], i64 [[WIDE_TRIP_COUNT]])
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <vscale x 8 x i1> [[PREDICATE_NEXT]], i64 0
+; CHECK-NEXT:    br i1 [[TMP3]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> [[TMP2]])
+; CHECK-NEXT:    [[PHITMP201:%.*]] = lshr i64 [[TMP4]], 16
+; CHECK-NEXT:    [[PHITMP:%.*]] = trunc i64 [[PHITMP201]] to i16
+; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    [[ACC_0_LCSSA:%.*]] = phi i16 [ 0, [[ENTRY:%.*]] ], [ [[PHITMP]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    ret i16 [[ACC_0_LCSSA]]
+;
+entry:
+  %cmp11 = icmp ugt i32 %N, 0
+  br i1 %cmp11, label %min.iters.checked, label %for.cond.cleanup
+
+min.iters.checked:                                ; preds = %entry
+  %wide.trip.count = zext i32 %N to i64
+  %wide.end.idx.splatinsert = insertelement <vscale x 8 x i64> undef, i64 %wide.trip.count, i32 0
+  %wide.end.idx.splat = shufflevector <vscale x 8 x i64> %wide.end.idx.splatinsert, <vscale x 8 x i64> undef, <vscale x 8 x i32> zeroinitializer
+  %predicate.entry = call <vscale x 8 x i1> @llvm.aarch64.sve.whilelo.nxv8i1.i64(i64 0, i64 %wide.trip.count)
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %min.iters.checked
+  %index = phi i64 [ 0, %min.iters.checked ], [ %index.next, %vector.body ]
+  %predicate = phi <vscale x 8 x i1> [ %predicate.entry, %min.iters.checked ], [ %predicate.next, %vector.body ]
+  %vec.phi = phi <vscale x 8 x i32> [ zeroinitializer, %min.iters.checked ], [ %6, %vector.body ]
+  %0 = getelementptr inbounds i16, ptr %a, i64 %index
+  %wide.masked.load = call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr %0, i32 2, <vscale x 8 x i1> %predicate, <vscale x 8 x i16> undef)
+  %1 = zext <vscale x 8 x i16> %wide.masked.load to <vscale x 8 x i32>
+  %2 = getelementptr inbounds i16, ptr %b, i64 %index
+  %wide.masked.load19 = call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr %2, i32 2, <vscale x 8 x i1> %predicate, <vscale x 8 x i16> undef)
+  %3 = zext <vscale x 8 x i16> %wide.masked.load19 to <vscale x 8 x i32>
+  %4 = mul nsw <vscale x 8 x i32> %3, %1
+  %5 = select <vscale x 8 x i1> %predicate, <vscale x 8 x i32> %4, <vscale x 8 x i32> zeroinitializer
+  %6 = add nsw <vscale x 8 x i32> %vec.phi, %5
+  %vs = call i64 @llvm.vscale.i64()
+  %vs.scaled = mul i64 %vs, 8
+  %index.next = add nuw i64 %index, %vs.scaled
+  %.splatinsert = insertelement <vscale x 8 x i64> undef, i64 %index.next, i32 0
+  %.splat = shufflevector <vscale x 8 x i64> %.splatinsert, <vscale x 8 x i64> undef, <vscale x 8 x i32> zeroinitializer
+  %predicate.next = call <vscale x 8 x i1> @llvm.aarch64.sve.whilelo.nxv8i1.i64(i64 %index.next, i64 %wide.trip.count)
+  %7 = extractelement <vscale x 8 x i1> %predicate.next, i64 0
+  br i1 %7, label %vector.body, label %middle.block
+
+middle.block:                                     ; preds = %vector.body
+  %8 = call i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32> %6)
+  %phitmp20 = lshr i32 %8, 16
+  %phitmp = trunc i32 %phitmp20 to i16
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %middle.block, %entry
+  %acc.0.lcssa = phi i16 [ 0, %entry ], [ %phitmp, %middle.block ]
+  ret i16 %acc.0.lcssa
+}
+
+define dso_local i16 @sve_udot_loop_i16_to_i32_interleavedx4_foldedtail(ptr readonly %a, ptr readonly %b, i32 %N) #0 {
+; CHECK-LABEL: define dso_local i16 @sve_udot_loop_i16_to_i32_interleavedx4_foldedtail
+; CHECK-SAME: (ptr readonly [[A:%.*]], ptr readonly [[B:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP9:%.*]] = icmp sgt i32 [[N]], 0
+; CHECK-NEXT:    br i1 [[CMP9]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK:       for.body.preheader:
+; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT:    [[TMP0:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 3
+; CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+; CHECK-NEXT:    [[TMP3:%.*]] = mul nuw nsw i64 [[TMP0]], 24
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = tail call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 [[WIDE_TRIP_COUNT]])
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK_ENTRY16:%.*]] = tail call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP2]], i64 [[WIDE_TRIP_COUNT]])
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK_ENTRY15:%.*]] = tail call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP1]], i64 [[WIDE_TRIP_COUNT]])
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK_ENTRY17:%.*]] = tail call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP3]], i64 [[WIDE_TRIP_COUNT]])
+; CHECK-NEXT:    [[TMP4:%.*]] = tail call i32 @llvm.vscale.i32()
+; CHECK-NEXT:    [[TMP5:%.*]] = shl nuw nsw i32 [[TMP4]], 3
+; CHECK-NEXT:    [[TMP6:%.*]] = zext i32 [[TMP5]] to i64
+; CHECK-NEXT:    [[TMP7:%.*]] = shl nuw nsw i32 [[TMP4]], 4
+; CHECK-NEXT:    [[TMP8:%.*]] = zext i32 [[TMP7]] to i64
+; CHECK-NEXT:    [[TMP9:%.*]] = mul nuw nsw i32 [[TMP4]], 24
+; CHECK-NEXT:    [[TMP10:%.*]] = zext i32 [[TMP9]] to i64
+; CHECK-NEXT:    [[TMP11:%.*]] = shl nuw nsw i64 [[TMP0]], 5
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 8 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[FOR_BODY_PREHEADER]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK18:%.*]] = phi <vscale x 8 x i1> [ [[ACTIVE_LANE_MASK_ENTRY15]], [[FOR_BODY_PREHEADER]] ], [ [[ACTIVE_LANE_MASK_NEXT31:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK19:%.*]] = phi <vscale x 8 x i1> [ [[ACTIVE_LANE_MASK_ENTRY16]], [[FOR_BODY_PREHEADER]] ], [ [[ACTIVE_LANE_MASK_NEXT32:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK20:%.*]] = phi <vscale x 8 x i1> [ [[ACTIVE_LANE_MASK_ENTRY17]], [[FOR_BODY_PREHEADER]] ], [ [[ACTIVE_LANE_MASK_NEXT33:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[DOT_ACCUMULATE3:%.*]] = phi <vscale x 2 x i64> [ zeroinitializer, [[FOR_BODY_PREHEADER]] ], [ [[TMP20:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[DOT_ACCUMULATE2:%.*]] = phi <vscale x 2 x i64> [ zeroinitializer, [[FOR_BODY_PREHEADER]] ], [ [[TMP21:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[DOT_ACCUMULATE1:%.*]] = phi <vscale x 2 x i64> [ zeroinitializer, [[FOR_BODY_PREHEADER]] ], [ [[TMP22:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[DOT_ACCUMULATE:%.*]] = phi <vscale x 2 x i64> [ zeroinitializer, [[FOR_BODY_PREHEADER]] ], [ [[TMP23:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i16, ptr [[A]], i64 [[INDEX]]
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = tail call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr [[TMP12]], i32 2, <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], <vscale x 8 x i16> zeroinitializer)
+; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds i16, ptr [[TMP12]], i64 [[TMP6]]
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD24:%.*]] = tail call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr nonnull [[TMP13]], i32 2, <vscale x 8 x i1> [[ACTIVE_LANE_MASK18]], <vscale x 8 x i16> zeroinitializer)
+; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i16, ptr [[TMP12]], i64 [[TMP8]]
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD25:%.*]] = tail call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr nonnull [[TMP14]], i32 2, <vscale x 8 x i1> [[ACTIVE_LANE_MASK19]], <vscale x 8 x i16> zeroinitializer)
+; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds i16, ptr [[TMP12]], i64 [[TMP10]]
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD26:%.*]] = tail call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr nonnull [[TMP15]], i32 2, <vscale x 8 x i1> [[ACTIVE_LANE_MASK20]], <vscale x 8 x i16> zeroinitializer)
+; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds i16, ptr [[B]], i64 [[INDEX]]
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD27:%.*]] = tail call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr [[TMP16]], i32 2, <vscale x 8 x i1> [[ACTIVE_LANE_MASK]], <vscale x 8 x i16> zeroinitializer)
+; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr inbounds i16, ptr [[TMP16]], i64 [[TMP6]]
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD28:%.*]] = tail call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr nonnull [[TMP17]], i32 2, <vscale x 8 x i1> [[ACTIVE_LANE_MASK18]], <vscale x 8 x i16> zeroinitializer)
+; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr inbounds i16, ptr [[TMP16]], i64 [[TMP8]]
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD29:%.*]] = tail call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr nonnull [[TMP18]], i32 2, <vscale x 8 x i1> [[ACTIVE_LANE_MASK19]], <vscale x 8 x i16> zeroinitializer)
+; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr inbounds i16, ptr [[TMP16]], i64 [[TMP10]]
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD30:%.*]] = tail call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr nonnull [[TMP19]], i32 2, <vscale x 8 x i1> [[ACTIVE_LANE_MASK20]], <vscale x 8 x i16> zeroinitializer)
+; CHECK-NEXT:    [[TMP20]] = call <vscale x 2 x i64> @llvm.aarch64.sve.udot.nxv2i64(<vscale x 2 x i64> [[DOT_ACCUMULATE3]], <vscale x 8 x i16> [[WIDE_MASKED_LOAD27]], <vscale x 8 x i16> [[WIDE_MASKED_LOAD]])
+; CHECK-NEXT:    [[TMP21]] = call <vscale x 2 x i64> @llvm.aarch64.sve.udot.nxv2i64(<vscale x 2 x i64> [[DOT_ACCUMULATE2]], <vscale x 8 x i16> [[WIDE_MASKED_LOAD28]], <vscale x 8 x i16> [[WIDE_MASKED_LOAD24]])
+; CHECK-NEXT:    [[TMP22]] = call <vscale x 2 x i64> @llvm.aarch64.sve.udot.nxv2i64(<vscale x 2 x i64> [[DOT_ACCUMULATE1]], <vscale x 8 x i16> [[WIDE_MASKED_LOAD29]], <vscale x 8 x i16> [[WIDE_MASKED_LOAD25]])
+; CHECK-NEXT:    [[TMP23]] = call <vscale x 2 x i64> @llvm.aarch64.sve.udot.nxv2i64(<vscale x 2 x i64> [[DOT_ACCUMULATE]], <vscale x 8 x i16> [[WIDE_MASKED_LOAD30]], <vscale x 8 x i16> [[WIDE_MASKED_LOAD26]])
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP11]]
+; CHECK-NEXT:    [[TMP24:%.*]] = add i64 [[INDEX_NEXT]], [[TMP1]]
+; CHECK-NEXT:    [[TMP25:%.*]] = add i64 [[INDEX_NEXT]], [[TMP2]]
+; CHECK-NEXT:    [[TMP26:%.*]] = add i64 [[INDEX_NEXT]], [[TMP3]]
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = tail call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX_NEXT]], i64 [[WIDE_TRIP_COUNT]])
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK_NEXT31]] = tail call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP24]], i64 [[WIDE_TRIP_COUNT]])
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK_NEXT32]] = tail call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP25]], i64 [[WIDE_TRIP_COUNT]])
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK_NEXT33]] = tail call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[TMP26]], i64 [[WIDE_TRIP_COUNT]])
+; CHECK-NEXT:    [[TMP27:%.*]] = extractelement <vscale x 8 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
+; CHECK-NEXT:    br i1 [[TMP27]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[TMP28:%.*]] = add <vscale x 2 x i64> [[TMP23]], [[TMP22]]
+; CHECK-NEXT:    [[TMP29:%.*]] = add <vscale x 2 x i64> [[TMP21]], [[TMP20]]
+; CHECK-NEXT:    [[TMP30:%.*]] = add <vscale x 2 x i64> [[TMP28]], [[TMP29]]
+; CHECK-NEXT:    [[TMP31:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> [[TMP30]])
+; CHECK-NEXT:    [[TMP32:%.*]] = lshr i64 [[TMP31]], 16
+; CHECK-NEXT:    [[TMP33:%.*]] = trunc i64 [[TMP32]] to i16
+; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    [[ACC_0_LCSSA:%.*]] = phi i16 [ 0, [[ENTRY:%.*]] ], [ [[TMP33]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    ret i16 [[ACC_0_LCSSA]]
+;
+entry:
+  %cmp9 = icmp sgt i32 %N, 0
+  br i1 %cmp9, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %N to i64
+  %0 = tail call i64 @llvm.vscale.i64()
+  %1 = shl nuw nsw i64 %0, 3
+  %2 = shl nuw nsw i64 %0, 4
+  %3 = mul nuw nsw i64 %0, 24
+  %active.lane.mask.entry = tail call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 %wide.trip.count)
+  %active.lane.mask.entry16 = tail call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 %2, i64 %wide.trip.count)
+  %active.lane.mask.entry15 = tail call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 %1, i64 %wide.trip.count)
+  %active.lane.mask.entry17 = tail call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 %3, i64 %wide.trip.count)
+  %4 = tail call i32 @llvm.vscale.i32()
+  %5 = shl nuw nsw i32 %4, 3
+  %6 = zext i32 %5 to i64
+  %7 = shl nuw nsw i32 %4, 4
+  %8 = zext i32 %7 to i64
+  %9 = mul nuw nsw i32 %4, 24
+  %10 = zext i32 %9 to i64
+  %11 = shl nuw nsw i64 %0, 5
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %for.body.preheader
+  %index = phi i64 [ 0, %for.body.preheader ], [ %index.next, %vector.body ]
+  %active.lane.mask = phi <vscale x 8 x i1> [ %active.lane.mask.entry, %for.body.preheader ], [ %active.lane.mask.next, %vector.body ]
+  %active.lane.mask18 = phi <vscale x 8 x i1> [ %active.lane.mask.entry15, %for.body.preheader ], [ %active.lane.mask.next31, %vector.body ]
+  %active.lane.mask19 = phi <vscale x 8 x i1> [ %active.lane.mask.entry16, %for.body.preheader ], [ %active.lane.mask.next32, %vector.body ]
+  %active.lane.mask20 = phi <vscale x 8 x i1> [ %active.lane.mask.entry17, %for.body.preheader ], [ %active.lane.mask.next33, %vector.body ]
+  %vec.phi = phi <vscale x 8 x i32> [ zeroinitializer, %for.body.preheader ], [ %33, %vector.body ]
+  %vec.phi21 = phi <vscale x 8 x i32> [ zeroinitializer, %for.body.preheader ], [ %35, %vector.body ]
+  %vec.phi22 = phi <vscale x 8 x i32> [ zeroinitializer, %for.body.preheader ], [ %37, %vector.body ]
+  %vec.phi23 = phi <vscale x 8 x i32> [ zeroinitializer, %for.body.preheader ], [ %39, %vector.body ]
+  %12 = getelementptr inbounds i16, ptr %a, i64 %index
+  %wide.masked.load = tail call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr %12, i32 2, <vscale x 8 x i1> %active.lane.mask, <vscale x 8 x i16> poison)
+  %13 = getelementptr inbounds i16, ptr %12, i64 %6
+  %wide.masked.load24 = tail call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr nonnull %13, i32 2, <vscale x 8 x i1> %active.lane.mask18, <vscale x 8 x i16> poison)
+  %14 = getelementptr inbounds i16, ptr %12, i64 %8
+  %wide.masked.load25 = tail call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr nonnull %14, i32 2, <vscale x 8 x i1> %active.lane.mask19, <vscale x 8 x i16> poison)
+  %15 = getelementptr inbounds i16, ptr %12, i64 %10
+  %wide.masked.load26 = tail call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr nonnull %15, i32 2, <vscale x 8 x i1> %active.lane.mask20, <vscale x 8 x i16> poison)
+  %16 = zext <vscale x 8 x i16> %wide.masked.load to <vscale x 8 x i32>
+  %17 = zext <vscale x 8 x i16> %wide.masked.load24 to <vscale x 8 x i32>
+  %18 = zext <vscale x 8 x i16> %wide.masked.load25 to <vscale x 8 x i32>
+  %19 = zext <vscale x 8 x i16> %wide.masked.load26 to <vscale x 8 x i32>
+  %20 = getelementptr inbounds i16, ptr %b, i64 %index
+  %wide.masked.load27 = tail call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr %20, i32 2, <vscale x 8 x i1> %active.lane.mask, <vscale x 8 x i16> poison)
+  %21 = getelementptr inbounds i16, ptr %20, i64 %6
+  %wide.masked.load28 = tail call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr nonnull %21, i32 2, <vscale x 8 x i1> %active.lane.mask18, <vscale x 8 x i16> poison)
+  %22 = getelementptr inbounds i16, ptr %20, i64 %8
+  %wide.masked.load29 = tail call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr nonnull %22, i32 2, <vscale x 8 x i1> %active.lane.mask19, <vscale x 8 x i16> poison)
+  %23 = getelementptr inbounds i16, ptr %20, i64 %10
+  %wide.masked.load30 = tail call <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr nonnull %23, i32 2, <vscale x 8 x i1> %active.lane.mask20, <vscale x 8 x i16> poison)
+  %24 = zext <vscale x 8 x i16> %wide.masked.load27 to <vscale x 8 x i32>
+  %25 = zext <vscale x 8 x i16> %wide.masked.load28 to <vscale x 8 x i32>
+  %26 = zext <vscale x 8 x i16> %wide.masked.load29 to <vscale x 8 x i32>
+  %27 = zext <vscale x 8 x i16> %wide.masked.load30 to <vscale x 8 x i32>
+  %28 = mul nuw nsw <vscale x 8 x i32> %24, %16
+  %29 = mul nuw nsw <vscale x 8 x i32> %25, %17
+  %30 = mul nuw nsw <vscale x 8 x i32> %26, %18
+  %31 = mul nuw nsw <vscale x 8 x i32> %27, %19
+  %32 = select <vscale x 8 x i1> %active.lane.mask, <vscale x 8 x i32> %28, <vscale x 8 x i32> zeroinitializer
+  %33 = add <vscale x 8 x i32> %vec.phi, %32
+  %34 = select <vscale x 8 x i1> %active.lane.mask18, <vscale x 8 x i32> %29, <vscale x 8 x i32> zeroinitializer
+  %35 = add <vscale x 8 x i32> %vec.phi21, %34
+  %36 = select <vscale x 8 x i1> %active.lane.mask19, <vscale x 8 x i32> %30, <vscale x 8 x i32> zeroinitializer
+  %37 = add <vscale x 8 x i32> %vec.phi22, %36
+  %38 = select <vscale x 8 x i1> %active.lane.mask20, <vscale x 8 x i32> %31, <vscale x 8 x i32> zeroinitializer
+  %39 = add <vscale x 8 x i32> %vec.phi23, %38
+  %index.next = add i64 %index, %11
+  %40 = add i64 %index.next, %1
+  %41 = add i64 %index.next, %2
+  %42 = add i64 %index.next, %3
+  %active.lane.mask.next = tail call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 %index.next, i64 %wide.trip.count)
+  %active.lane.mask.next31 = tail call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 %40, i64 %wide.trip.count)
+  %active.lane.mask.next32 = tail call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 %41, i64 %wide.trip.count)
+  %active.lane.mask.next33 = tail call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 %42, i64 %wide.trip.count)
+  %43 = extractelement <vscale x 8 x i1> %active.lane.mask.next, i64 0
+  br i1 %43, label %vector.body, label %middle.block
+
+middle.block:                                     ; preds = %vector.body
+  %bin.rdx = add <vscale x 8 x i32> %35, %33
+  %bin.rdx34 = add <vscale x 8 x i32> %37, %bin.rdx
+  %bin.rdx35 = add <vscale x 8 x i32> %39, %bin.rdx34
+  %44 = tail call i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32> %bin.rdx35)
+  %45 = lshr i32 %44, 16
+  %46 = trunc i32 %45 to i16
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %middle.block, %entry
+  %acc.0.lcssa = phi i16 [ 0, %entry ], [ %46, %middle.block ]
+  ret i16 %acc.0.lcssa
+}
+
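+; A predicated loop multiplying sign-extended i8 loads into an i16 vector
+; accumulator; the matcher rewrites it to use a widened <vscale x 4 x i32>
+; accumulator and llvm.aarch64.sve.sdot.nxv4i32, as the checks below show.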
+define i8 @sve_sdot_loop_i8_to_i16(ptr readonly %a, ptr readonly %b, i32 %N) #0 {
+; CHECK-LABEL: define i8 @sve_sdot_loop_i8_to_i16
+; CHECK-SAME: (ptr readonly [[A:%.*]], ptr readonly [[B:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP11:%.*]] = icmp sgt i32 [[N]], 0
+; CHECK-NEXT:    br i1 [[CMP11]], label [[MIN_ITERS_CHECKED:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK:       min.iters.checked:
+; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT:    [[PREDICATE_ENTRY:%.*]] = call <vscale x 16 x i1> @llvm.aarch64.sve.whilelo.nxv16i1.i64(i64 0, i64 [[WIDE_TRIP_COUNT]])
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[MIN_ITERS_CHECKED]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[PREDICATE:%.*]] = phi <vscale x 16 x i1> [ [[PREDICATE_ENTRY]], [[MIN_ITERS_CHECKED]] ], [ [[PREDICATE_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[DOT_ACCUMULATE:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[MIN_ITERS_CHECKED]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX]]
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP0]], i32 1, <vscale x 16 x i1> [[PREDICATE]], <vscale x 16 x i8> zeroinitializer)
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[INDEX]]
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD19:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP1]], i32 1, <vscale x 16 x i1> [[PREDICATE]], <vscale x 16 x i8> zeroinitializer)
+; CHECK-NEXT:    [[TMP2]] = call <vscale x 4 x i32> @llvm.aarch64.sve.sdot.nxv4i32(<vscale x 4 x i32> [[DOT_ACCUMULATE]], <vscale x 16 x i8> [[WIDE_MASKED_LOAD19]], <vscale x 16 x i8> [[WIDE_MASKED_LOAD]])
+; CHECK-NEXT:    [[VS:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[VS_SCALED:%.*]] = shl i64 [[VS]], 4
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[VS_SCALED]]
+; CHECK-NEXT:    [[PREDICATE_NEXT]] = call <vscale x 16 x i1> @llvm.aarch64.sve.whilelo.nxv16i1.i64(i64 [[INDEX_NEXT]], i64 [[WIDE_TRIP_COUNT]])
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <vscale x 16 x i1> [[PREDICATE_NEXT]], i64 0
+; CHECK-NEXT:    br i1 [[TMP3]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP2]])
+; CHECK-NEXT:    [[PHITMP201:%.*]] = lshr i32 [[TMP4]], 8
+; CHECK-NEXT:    [[PHITMP:%.*]] = trunc i32 [[PHITMP201]] to i8
+; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    [[ACC_0_LCSSA:%.*]] = phi i8 [ 0, [[ENTRY:%.*]] ], [ [[PHITMP]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    ret i8 [[ACC_0_LCSSA]]
+;
+entry:
+  %cmp11 = icmp sgt i32 %N, 0
+  br i1 %cmp11, label %min.iters.checked, label %for.cond.cleanup
+
+min.iters.checked:                                ; preds = %entry
+  %wide.trip.count = zext i32 %N to i64
+  %wide.end.idx.splatinsert = insertelement <vscale x 16 x i64> undef, i64 %wide.trip.count, i32 0
+  %wide.end.idx.splat = shufflevector <vscale x 16 x i64> %wide.end.idx.splatinsert, <vscale x 16 x i64> undef, <vscale x 16 x i32> zeroinitializer
+  %predicate.entry = call <vscale x 16 x i1> @llvm.aarch64.sve.whilelo.nxv16i1.i64(i64 0, i64 %wide.trip.count)
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %min.iters.checked
+  %index = phi i64 [ 0, %min.iters.checked ], [ %index.next, %vector.body ]
+  %predicate = phi <vscale x 16 x i1> [ %predicate.entry, %min.iters.checked ], [ %predicate.next, %vector.body ]
+  %vec.phi = phi <vscale x 16 x i16> [ zeroinitializer, %min.iters.checked ], [ %6, %vector.body ]
+  %0 = getelementptr inbounds i8, ptr %a, i64 %index
+  %wide.masked.load = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr %0, i32 1, <vscale x 16 x i1> %predicate, <vscale x 16 x i8> undef)
+  %1 = sext <vscale x 16 x i8> %wide.masked.load to <vscale x 16 x i16>
+  %2 = getelementptr inbounds i8, ptr %b, i64 %index
+  %wide.masked.load19 = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr %2, i32 1, <vscale x 16 x i1> %predicate, <vscale x 16 x i8> undef)
+  %3 = sext <vscale x 16 x i8> %wide.masked.load19 to <vscale x 16 x i16>
+  %4 = mul nsw <vscale x 16 x i16> %3, %1
+  %5 = select <vscale x 16 x i1> %predicate, <vscale x 16 x i16> %4, <vscale x 16 x i16> zeroinitializer
+  %6 = add nsw <vscale x 16 x i16> %vec.phi, %5
+  %vs = call i64 @llvm.vscale.i64()
+  %vs.scaled = mul i64 %vs, 16
+  %index.next = add nuw i64 %index, %vs.scaled
+  %.splatinsert = insertelement <vscale x 16 x i64> undef, i64 %index.next, i32 0
+  %.splat = shufflevector <vscale x 16 x i64> %.splatinsert, <vscale x 16 x i64> undef, <vscale x 16 x i32> zeroinitializer
+  %predicate.next = call <vscale x 16 x i1> @llvm.aarch64.sve.whilelo.nxv16i1.i64(i64 %index.next, i64 %wide.trip.count)
+  %7 = extractelement <vscale x 16 x i1> %predicate.next, i64 0
+  br i1 %7, label %vector.body, label %middle.block
+
+middle.block:                                     ; preds = %vector.body
+  %8 = call i16 @llvm.vector.reduce.add.nxv16i16(<vscale x 16 x i16> %6)
+  %phitmp20 = lshr i16 %8, 8
+  %phitmp = trunc i16 %phitmp20 to i8
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %middle.block, %entry
+  %acc.0.lcssa = phi i8 [ 0, %entry ], [ %phitmp, %middle.block ]
+  ret i8 %acc.0.lcssa
+}
+
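+; The zero-extended variant of the loop above; the matcher emits
+; llvm.aarch64.sve.udot.nxv4i32 with the same widened i32 accumulator.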
+define i8 @sve_udot_loop_i8_to_i16(ptr readonly %a, ptr readonly %b, i32 %N) #0 {
+; CHECK-LABEL: define i8 @sve_udot_loop_i8_to_i16
+; CHECK-SAME: (ptr readonly [[A:%.*]], ptr readonly [[B:%.*]], i32 [[N:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP11:%.*]] = icmp sgt i32 [[N]], 0
+; CHECK-NEXT:    br i1 [[CMP11]], label [[MIN_ITERS_CHECKED:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK:       min.iters.checked:
+; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT:    [[PREDICATE_ENTRY:%.*]] = call <vscale x 16 x i1> @llvm.aarch64.sve.whilelo.nxv16i1.i64(i64 0, i64 [[WIDE_TRIP_COUNT]])
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[MIN_ITERS_CHECKED]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[PREDICATE:%.*]] = phi <vscale x 16 x i1> [ [[PREDICATE_ENTRY]], [[MIN_ITERS_CHECKED]] ], [ [[PREDICATE_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[DOT_ACCUMULATE:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[MIN_ITERS_CHECKED]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX]]
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP0]], i32 1, <vscale x 16 x i1> [[PREDICATE]], <vscale x 16 x i8> undef)
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[INDEX]]
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD19:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP1]], i32 1, <vscale x 16 x i1> [[PREDICATE]], <vscale x 16 x i8> undef)
+; CHECK-NEXT:    [[TMP2]] = call <vscale x 4 x i32> @llvm.aarch64.sve.udot.nxv4i32(<vscale x 4 x i32> [[DOT_ACCUMULATE]], <vscale x 16 x i8> [[WIDE_MASKED_LOAD19]], <vscale x 16 x i8> [[WIDE_MASKED_LOAD]])
+; CHECK-NEXT:    [[VS:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[VS_SCALED:%.*]] = shl i64 [[VS]], 4
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[VS_SCALED]]
+; CHECK-NEXT:    [[PREDICATE_NEXT]] = call <vscale x 16 x i1> @llvm.aarch64.sve.whilelo.nxv16i1.i64(i64 [[INDEX_NEXT]], i64 [[WIDE_TRIP_COUNT]])
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <vscale x 16 x i1> [[PREDICATE_NEXT]], i64 0
+; CHECK-NEXT:    br i1 [[TMP3]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP2]])
+; CHECK-NEXT:    [[PHITMP201:%.*]] = lshr i32 [[TMP4]], 8
+; CHECK-NEXT:    [[PHITMP:%.*]] = trunc i32 [[PHITMP201]] to i8
+; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    [[ACC_0_LCSSA:%.*]] = phi i8 [ 0, [[ENTRY:%.*]] ], [ [[PHITMP]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    ret i8 [[ACC_0_LCSSA]]
+;
+entry:
+  %cmp11 = icmp sgt i32 %N, 0
+  br i1 %cmp11, label %min.iters.checked, label %for.cond.cleanup
+
+min.iters.checked:                                ; preds = %entry
+  %wide.trip.count = zext i32 %N to i64
+  %wide.end.idx.splatinsert = insertelement <vscale x 16 x i64> undef, i64 %wide.trip.count, i32 0
+  %wide.end.idx.splat = shufflevector <vscale x 16 x i64> %wide.end.idx.splatinsert, <vscale x 16 x i64> undef, <vscale x 16 x i32> zeroinitializer
+  %predicate.entry = call <vscale x 16 x i1> @llvm.aarch64.sve.whilelo.nxv16i1.i64(i64 0, i64 %wide.trip.count)
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %min.iters.checked
+  %index = phi i64 [ 0, %min.iters.checked ], [ %index.next, %vector.body ]
+  %predicate = phi <vscale x 16 x i1> [ %predicate.entry, %min.iters.checked ], [ %predicate.next, %vector.body ]
+  %vec.phi = phi <vscale x 16 x i16> [ zeroinitializer, %min.iters.checked ], [ %5, %vector.body ]
+  %0 = getelementptr inbounds i8, ptr %a, i64 %index
+  %wide.masked.load = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr %0, i32 1, <vscale x 16 x i1> %predicate, <vscale x 16 x i8> undef)
+  %1 = zext <vscale x 16 x i8> %wide.masked.load to <vscale x 16 x i16>
+  %2 = getelementptr inbounds i8, ptr %b, i64 %index
+  %wide.masked.load19 = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr %2, i32 1, <vscale x 16 x i1> %predicate, <vscale x 16 x i8> undef)
+  %3 = zext <vscale x 16 x i8> %wide.masked.load19 to <vscale x 16 x i16>
+  %4 = mul nsw <vscale x 16 x i16> %3, %1
+  %5 = add nsw <vscale x 16 x i16> %vec.phi, %4
+  %vs = call i64 @llvm.vscale.i64()
+  %vs.scaled = mul i64 %vs, 16
+  %index.next = add nuw i64 %index, %vs.scaled
+  %.splatinsert = insertelement <vscale x 16 x i64> undef, i64 %index.next, i32 0
+  %.splat = shufflevector <vscale x 16 x i64> %.splatinsert, <vscale x 16 x i64> undef, <vscale x 16 x i32> zeroinitializer
+  %predicate.next = call <vscale x 16 x i1> @llvm.aarch64.sve.whilelo.nxv16i1.i64(i64 %index.next, i64 %wide.trip.count)
+  %6 = extractelement <vscale x 16 x i1> %predicate.next, i64 0
+  br i1 %6, label %vector.body, label %middle.block
+
+middle.block:                                     ; preds = %vector.body
+  %7 = call i16 @llvm.vector.reduce.add.nxv16i16(<vscale x 16 x i16> %5)
+  %phitmp20 = lshr i16 %7, 8
+  %phitmp = trunc i16 %phitmp20 to i8
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %middle.block, %entry
+  %acc.0.lcssa = phi i8 [ 0, %entry ], [ %phitmp, %middle.block ]
+  ret i8 %acc.0.lcssa
+}
+
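+; A straight-line (non-loop) signed reduction extending i16 to i64; matched
+; to llvm.aarch64.sve.sdot.nxv2i64 with a zero accumulator followed by a
+; vector.reduce.add of the dot result.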
+define i64 @sve_sdot_i16_to_i64(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b) #0 {
+; CHECK-LABEL: define i64 @sve_sdot_i16_to_i64
+; CHECK-SAME: (<vscale x 8 x i16> [[A:%.*]], <vscale x 8 x i16> [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x i64> @llvm.aarch64.sve.sdot.nxv2i64(<vscale x 2 x i64> zeroinitializer, <vscale x 8 x i16> [[A]], <vscale x 8 x i16> [[B]])
+; CHECK-NEXT:    [[TMP1:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> [[TMP0]])
+; CHECK-NEXT:    ret i64 [[TMP1]]
+;
+entry:
+  %exta = sext <vscale x 8 x i16> %a to <vscale x 8 x i64>
+  %extb = sext <vscale x 8 x i16> %b to <vscale x 8 x i64>
+  %mul = mul nsw <vscale x 8 x i64> %exta, %extb
+  %acc = call i64 @llvm.vector.reduce.add.nxv8i64(<vscale x 8 x i64> %mul)
+  ret i64 %acc
+}
+
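+; A straight-line unsigned reduction extending i8 to i32; matched to
+; llvm.aarch64.sve.udot.nxv4i32 with a zero accumulator followed by a
+; vector.reduce.add of the dot result.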
+define i32 @sve_udot_i8_to_i32(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) #0 {
+; CHECK-LABEL: define i32 @sve_udot_i8_to_i32
+; CHECK-SAME: (<vscale x 16 x i8> [[A:%.*]], <vscale x 16 x i8> [[B:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.udot.nxv4i32(<vscale x 4 x i32> zeroinitializer, <vscale x 16 x i8> [[A]], <vscale x 16 x i8> [[B]])
+; CHECK-NEXT:    [[TMP1:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP0]])
+; CHECK-NEXT:    ret i32 [[TMP1]]
+;
+entry:
+  %exta = zext <vscale x 16 x i8> %a to <vscale x 16 x i32>
+  %extb = zext <vscale x 16 x i8> %b to <vscale x 16 x i32>
+  %mul = mul nsw <vscale x 16 x i32> %exta, %extb
+  %acc = call i32 @llvm.vector.reduce.add.nxv16i32(<vscale x 16 x i32> %mul)
+  ret i32 %acc
+}
+
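+; Intrinsic declarations used by the tests above.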
+declare <vscale x 8 x i16> @llvm.masked.load.nxv8i16.p0(ptr, i32, <vscale x 8 x i1>, <vscale x 8 x i16>)
+declare <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr, i32, <vscale x 16 x i1>, <vscale x 16 x i8>)
+declare i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32>)
+declare i16 @llvm.vector.reduce.add.nxv16i16(<vscale x 16 x i16>)
+declare i64 @llvm.vector.reduce.add.nxv8i64(<vscale x 8 x i64>)
+declare i32 @llvm.vector.reduce.add.nxv16i32(<vscale x 16 x i32>)
+declare i64 @llvm.vscale.i64()
+declare i32 @llvm.vscale.i32()
+declare <vscale x 8 x i1> @llvm.aarch64.sve.whilelo.nxv8i1.i64(i64, i64)
+declare <vscale x 16 x i1> @llvm.aarch64.sve.whilelo.nxv16i1.i64(i64, i64)
+declare <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64, i64)
+
+attributes #0 = { "target-features"="+sve" }


