[llvm] 0171862 - [Hexagon] Place aligned loads closer to users

Krzysztof Parzyszek via llvm-commits llvm-commits at lists.llvm.org
Fri Nov 11 12:04:52 PST 2022


Author: Krzysztof Parzyszek
Date: 2022-11-11T12:04:33-08:00
New Revision: 017186294d933921d1667ffbad82981dda09dacd

URL: https://github.com/llvm/llvm-project/commit/017186294d933921d1667ffbad82981dda09dacd
DIFF: https://github.com/llvm/llvm-project/commit/017186294d933921d1667ffbad82981dda09dacd.diff

LOG: [Hexagon] Place aligned loads closer to users

The vector alignment code was grouping all aligned loads together. In some
cases the groups could become quite large, causing a lot of spill code to be
generated. This change places the loads closer to where they are used,
reducing register pressure.
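
To illustrate the intent (this sketch is not part of the patch; the pointers
%p0/%p1, the users %u0/%u1, and the surrounding code are made up), the
placement of the realigned loads changes roughly as follows:

; Grouped placement (old behavior): the loads are emitted together at the
; top of the group, so both values stay live across the unrelated code.
%v0 = load <16 x i32>, <16 x i32>* %p0, align 64
%v1 = load <16 x i32>, <16 x i32>* %p1, align 64
; ... long stretch of unrelated code ...
%u0 = add <16 x i32> %v0, %v0
; ... long stretch of unrelated code ...
%u1 = add <16 x i32> %v1, %v1

; Placement near users (new behavior): each load is emitted just before its
; earliest user in the block, keeping the live ranges short.
; ... long stretch of unrelated code ...
%v0 = load <16 x i32>, <16 x i32>* %p0, align 64
%u0 = add <16 x i32> %v0, %v0
; ... long stretch of unrelated code ...
%v1 = load <16 x i32>, <16 x i32>* %p1, align 64
%u1 = add <16 x i32> %v1, %v1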

Added: 
    llvm/test/CodeGen/Hexagon/autohvx/vector-align-interleaved.ll
    llvm/test/CodeGen/Hexagon/autohvx/vector-align-only-phi-use.ll
    llvm/test/CodeGen/Hexagon/autohvx/vector-align-use-in-different-block.ll

Modified: 
    llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp
    llvm/test/CodeGen/Hexagon/autohvx/vector-align-tbaa.ll

Removed: 
    


################################################################################
diff --git a/llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp b/llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp
index 4efea50ea7698..279ce3483ee68 100644
--- a/llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp
@@ -141,7 +141,7 @@ class HexagonVectorCombine {
   template <typename T = std::vector<Instruction *>>
   bool isSafeToMoveBeforeInBB(const Instruction &In,
                               BasicBlock::const_iterator To,
-                              const T &Ignore = {}) const;
+                              const T &IgnoreInsts = {}) const;
 
   // This function is only used for assertions at the moment.
   [[maybe_unused]] bool isByteVecTy(Type *Ty) const;
@@ -271,14 +271,20 @@ class AlignVectors {
   Value *createAlignedStore(IRBuilderBase &Builder, Value *Val, Value *Ptr,
                             int Alignment, Value *Mask) const;
 
+  DepList getUpwardDeps(Instruction *In, Instruction *Base) const;
   bool createAddressGroups();
   MoveList createLoadGroups(const AddrList &Group) const;
   MoveList createStoreGroups(const AddrList &Group) const;
   bool move(const MoveGroup &Move) const;
+  void realignLoadGroup(IRBuilderBase &Builder, const ByteSpan &VSpan,
+                        int ScLen, Value *AlignVal, Value *AlignAddr) const;
+  void realignStoreGroup(IRBuilderBase &Builder, const ByteSpan &VSpan,
+                         int ScLen, Value *AlignVal, Value *AlignAddr) const;
   bool realignGroup(const MoveGroup &Move) const;
 
   friend raw_ostream &operator<<(raw_ostream &OS, const AddrInfo &AI);
   friend raw_ostream &operator<<(raw_ostream &OS, const MoveGroup &MG);
+  friend raw_ostream &operator<<(raw_ostream &OS, const ByteSpan::Block &B);
   friend raw_ostream &operator<<(raw_ostream &OS, const ByteSpan &BS);
 
   std::map<Instruction *, AddrList> AddrGroups;
@@ -307,13 +313,19 @@ raw_ostream &operator<<(raw_ostream &OS, const AlignVectors::MoveGroup &MG) {
   return OS;
 }
 
+LLVM_ATTRIBUTE_UNUSED
+raw_ostream &operator<<(raw_ostream &OS,
+                        const AlignVectors::ByteSpan::Block &B) {
+  OS << "  @" << B.Pos << " [" << B.Seg.Start << ',' << B.Seg.Size << "] "
+     << *B.Seg.Val;
+  return OS;
+}
+
 LLVM_ATTRIBUTE_UNUSED
 raw_ostream &operator<<(raw_ostream &OS, const AlignVectors::ByteSpan &BS) {
   OS << "ByteSpan[size=" << BS.size() << ", extent=" << BS.extent() << '\n';
-  for (const AlignVectors::ByteSpan::Block &B : BS) {
-    OS << "  @" << B.Pos << " [" << B.Seg.Start << ',' << B.Seg.Size << "] "
-       << *B.Seg.Val << '\n';
-  }
+  for (const AlignVectors::ByteSpan::Block &B : BS)
+    OS << B << '\n';
   OS << ']';
   return OS;
 }
@@ -582,6 +594,29 @@ auto AlignVectors::createAlignedStore(IRBuilderBase &Builder, Value *Val,
   return Builder.CreateMaskedStore(Val, Ptr, Align(Alignment), Mask);
 }
 
+auto AlignVectors::getUpwardDeps(Instruction *In, Instruction *Base) const
+    -> DepList {
+  BasicBlock *Parent = Base->getParent();
+  assert(In->getParent() == Parent &&
+         "Base and In should be in the same block");
+  assert(Base->comesBefore(In) && "Base should come before In");
+
+  DepList Deps;
+  std::deque<Instruction *> WorkQ = {In};
+  while (!WorkQ.empty()) {
+    Instruction *D = WorkQ.front();
+    WorkQ.pop_front();
+    Deps.insert(D);
+    for (Value *Op : D->operands()) {
+      if (auto *I = dyn_cast<Instruction>(Op)) {
+        if (I->getParent() == Parent && Base->comesBefore(I))
+          WorkQ.push_back(I);
+      }
+    }
+  }
+  return Deps;
+}
+
 auto AlignVectors::createAddressGroups() -> bool {
   // An address group created here may contain instructions spanning
   // multiple basic blocks.
@@ -641,28 +676,6 @@ auto AlignVectors::createLoadGroups(const AddrList &Group) const -> MoveList {
   // To avoid complications with moving code across basic blocks, only form
   // groups that are contained within a single basic block.
 
-  auto getUpwardDeps = [](Instruction *In, Instruction *Base) {
-    BasicBlock *Parent = Base->getParent();
-    assert(In->getParent() == Parent &&
-           "Base and In should be in the same block");
-    assert(Base->comesBefore(In) && "Base should come before In");
-
-    DepList Deps;
-    std::deque<Instruction *> WorkQ = {In};
-    while (!WorkQ.empty()) {
-      Instruction *D = WorkQ.front();
-      WorkQ.pop_front();
-      Deps.insert(D);
-      for (Value *Op : D->operands()) {
-        if (auto *I = dyn_cast<Instruction>(Op)) {
-          if (I->getParent() == Parent && Base->comesBefore(I))
-            WorkQ.push_back(I);
-        }
-      }
-    }
-    return Deps;
-  };
-
   auto tryAddTo = [&](const AddrInfo &Info, MoveGroup &Move) {
     assert(!Move.Main.empty() && "Move group should have non-empty Main");
     // Don't mix HVX and non-HVX instructions.
@@ -775,6 +788,245 @@ auto AlignVectors::move(const MoveGroup &Move) const -> bool {
   return Move.Main.size() + Move.Deps.size() > 1;
 }
 
+auto AlignVectors::realignLoadGroup(IRBuilderBase &Builder,
+                                    const ByteSpan &VSpan, int ScLen,
+                                    Value *AlignVal, Value *AlignAddr) const
+    -> void {
+  Type *SecTy = HVC.getByteTy(ScLen);
+  int NumSectors = (VSpan.extent() + ScLen - 1) / ScLen;
+  bool DoAlign = !HVC.isZero(AlignVal);
+  BasicBlock::iterator BasePos = Builder.GetInsertPoint();
+  BasicBlock *BaseBlock = Builder.GetInsertBlock();
+
+  ByteSpan ASpan;
+  auto *True = HVC.getFullValue(HVC.getBoolTy(ScLen));
+  auto *Undef = UndefValue::get(SecTy);
+
+  SmallVector<Instruction *> Loads(NumSectors + DoAlign, nullptr);
+
+  // We could create all of the aligned loads, and generate the valigns
+  // at the location of the first load, but for large load groups, this
+  // could create highly suboptimal code (there have been groups of 140+
+  // loads in real code).
+  // Instead, place the loads/valigns as close to the users as possible.
+  // In any case we need to have a mapping from the blocks of VSpan (the
+  // span covered by the pre-existing loads) to ASpan (the span covered
+  // by the aligned loads). There is a small problem, though: ASpan needs
+  // to have pointers to the loads/valigns, but we don't know where to put
+  // them yet. We can't use nullptr, because when we create sections of
+  // ASpan (corresponding to blocks from VSpan), for each block in the
+  // section we need to know which blocks of ASpan they are a part of.
+  // To have 1-1 mapping between blocks of ASpan and the temporary value
+  // pointers, use the addresses of the blocks themselves.
+
+  // Populate the blocks first, to avoid reallocations of the vector
+  // interfering with generating the placeholder addresses.
+  for (int Index = 0; Index != NumSectors; ++Index)
+    ASpan.Blocks.emplace_back(nullptr, ScLen, Index * ScLen);
+  for (int Index = 0; Index != NumSectors; ++Index) {
+    ASpan.Blocks[Index].Seg.Val =
+        reinterpret_cast<Value *>(&ASpan.Blocks[Index]);
+  }
+
+  // Multiple values from VSpan can map to the same value in ASpan. Since we
+  // try to create loads lazily, we need to find the earliest use for each
+  // value from ASpan.
+  DenseMap<void *, Instruction *> EarliestUser;
+  auto isEarlier = [](Instruction *A, Instruction *B) {
+    if (B == nullptr)
+      return true;
+    if (A == nullptr)
+      return false;
+    assert(A->getParent() == B->getParent());
+    return A->comesBefore(B);
+  };
+  auto earliestUser = [&](const auto &Uses) {
+    Instruction *User = nullptr;
+    for (const Use &U : Uses) {
+      auto *I = dyn_cast<Instruction>(U.getUser());
+      assert(I != nullptr && "Load used in a non-instruction?");
+      // Make sure we only consider users in this block, but we need
+      // to remember if there were users outside the block too. This is
+      // because if there are no users, aligned loads will not be created.
+      if (I->getParent() == BaseBlock) {
+        if (!isa<PHINode>(I))
+          User = std::min(User, I, isEarlier);
+      } else {
+        User = std::min(User, BaseBlock->getTerminator(), isEarlier);
+      }
+    }
+    return User;
+  };
+
+  for (const ByteSpan::Block &B : VSpan) {
+    ByteSpan ASection = ASpan.section(B.Pos, B.Seg.Size);
+    for (const ByteSpan::Block &S : ASection) {
+      EarliestUser[S.Seg.Val] = std::min(
+          EarliestUser[S.Seg.Val], earliestUser(B.Seg.Val->uses()), isEarlier);
+    }
+  }
+
+  auto createLoad = [&](IRBuilderBase &Builder, const ByteSpan &VSpan,
+                        int Index) {
+    Value *Ptr =
+        createAdjustedPointer(Builder, AlignAddr, SecTy, Index * ScLen);
+    // FIXME: generate a predicated load?
+    Value *Load = createAlignedLoad(Builder, SecTy, Ptr, ScLen, True, Undef);
+    // If vector shifting is potentially needed, accumulate metadata
+    // from source sections of twice the load width.
+    int Start = (Index - DoAlign) * ScLen;
+    int Width = (1 + DoAlign) * ScLen;
+    propagateMetadata(cast<Instruction>(Load),
+                      VSpan.section(Start, Width).values());
+    return cast<Instruction>(Load);
+  };
+
+  auto moveBefore = [this](Instruction *In, Instruction *To) {
+    // Move In and its upward dependencies to before To.
+    assert(In->getParent() == To->getParent());
+    DepList Deps = getUpwardDeps(In, To);
+    // DepList is sorted with respect to positions in the basic block.
+    for (Instruction *I : Deps)
+      I->moveBefore(To);
+  };
+
+  // Generate necessary loads at appropriate locations.
+  for (int Index = 0; Index != NumSectors + 1; ++Index) {
+    // In ASpan, each block will be either a single aligned load, or a
+    // valign of a pair of loads. In the latter case, an aligned load j
+    // will belong to the current valign, and the one in the previous
+    // block (for j > 0).
+    Instruction *PrevAt =
+        DoAlign && Index > 0 ? EarliestUser[&ASpan[Index - 1]] : nullptr;
+    Instruction *ThisAt =
+        Index < NumSectors ? EarliestUser[&ASpan[Index]] : nullptr;
+    if (auto *Where = std::min(PrevAt, ThisAt, isEarlier)) {
+      Builder.SetInsertPoint(Where);
+      Loads[Index] = createLoad(Builder, VSpan, Index);
+      // We know it's safe to put the load at BasePos, so if it's not safe
+      // to move it from this location to BasePos, then the current location
+      // is not valid.
+      // We can't do this check proactively because we need the load to exist
+      // in order to check legality.
+      if (!HVC.isSafeToMoveBeforeInBB(*Loads[Index], BasePos))
+        moveBefore(Loads[Index], &*BasePos);
+    }
+  }
+  // Generate valigns if needed, and fill in proper values in ASpan
+  for (int Index = 0; Index != NumSectors; ++Index) {
+    ASpan[Index].Seg.Val = nullptr;
+    if (auto *Where = EarliestUser[&ASpan[Index]]) {
+      Builder.SetInsertPoint(Where);
+      Value *Val = Loads[Index];
+      assert(Val != nullptr);
+      if (DoAlign) {
+        Value *NextLoad = Loads[Index + 1];
+        assert(NextLoad != nullptr);
+        Val = HVC.vralignb(Builder, Val, NextLoad, AlignVal);
+      }
+      ASpan[Index].Seg.Val = Val;
+    }
+  }
+
+  for (const ByteSpan::Block &B : VSpan) {
+    ByteSpan ASection = ASpan.section(B.Pos, B.Seg.Size).shift(-B.Pos);
+    Value *Accum = UndefValue::get(HVC.getByteTy(B.Seg.Size));
+    Builder.SetInsertPoint(cast<Instruction>(B.Seg.Val));
+
+    for (ByteSpan::Block &S : ASection) {
+      if (S.Seg.Val == nullptr)
+        continue;
+      // The processing of the data loaded by the aligned loads
+      // needs to be inserted after the data is available.
+      Instruction *SegI = cast<Instruction>(S.Seg.Val);
+      Builder.SetInsertPoint(&*std::next(SegI->getIterator()));
+      Value *Pay = HVC.vbytes(Builder, getPayload(S.Seg.Val));
+      Accum = HVC.insertb(Builder, Accum, Pay, S.Seg.Start, S.Seg.Size, S.Pos);
+    }
+    // Instead of casting everything to bytes for the vselect, cast to the
+    // original value type. This will avoid complications with casting masks.
+    // For example, in cases when the original mask applied to i32, it could
+    // be converted to a mask applicable to i8 via pred_typecast intrinsic,
+    // but if the mask is not exactly of HVX length, extra handling would be
+    // needed to make it work.
+    Type *ValTy = getPayload(B.Seg.Val)->getType();
+    Value *Cast = Builder.CreateBitCast(Accum, ValTy);
+    Value *Sel = Builder.CreateSelect(getMask(B.Seg.Val), Cast,
+                                      getPassThrough(B.Seg.Val));
+    B.Seg.Val->replaceAllUsesWith(Sel);
+  }
+}
+
+auto AlignVectors::realignStoreGroup(IRBuilderBase &Builder,
+                                     const ByteSpan &VSpan, int ScLen,
+                                     Value *AlignVal, Value *AlignAddr) const
+    -> void {
+  Type *SecTy = HVC.getByteTy(ScLen);
+  int NumSectors = (VSpan.extent() + ScLen - 1) / ScLen;
+  bool DoAlign = !HVC.isZero(AlignVal);
+
+  // Stores.
+  ByteSpan ASpanV, ASpanM;
+
+  // Return a vector value corresponding to the input value Val:
+  // either <1 x Val> for scalar Val, or Val itself for vector Val.
+  auto MakeVec = [](IRBuilderBase &Builder, Value *Val) -> Value * {
+    Type *Ty = Val->getType();
+    if (Ty->isVectorTy())
+      return Val;
+    auto *VecTy = VectorType::get(Ty, 1, /*Scalable=*/false);
+    return Builder.CreateBitCast(Val, VecTy);
+  };
+
+  // Create an extra "undef" sector at the beginning and at the end.
+  // They will be used as the left/right filler in the vlalign step.
+  for (int i = (DoAlign ? -1 : 0); i != NumSectors + DoAlign; ++i) {
+    // For stores, the size of each section is an aligned vector length.
+    // Adjust the store offsets relative to the section start offset.
+    ByteSpan VSection = VSpan.section(i * ScLen, ScLen).shift(-i * ScLen);
+    Value *AccumV = UndefValue::get(SecTy);
+    Value *AccumM = HVC.getNullValue(SecTy);
+    for (ByteSpan::Block &S : VSection) {
+      Value *Pay = getPayload(S.Seg.Val);
+      Value *Mask = HVC.rescale(Builder, MakeVec(Builder, getMask(S.Seg.Val)),
+                                Pay->getType(), HVC.getByteTy());
+      AccumM = HVC.insertb(Builder, AccumM, HVC.vbytes(Builder, Mask),
+                           S.Seg.Start, S.Seg.Size, S.Pos);
+      AccumV = HVC.insertb(Builder, AccumV, HVC.vbytes(Builder, Pay),
+                           S.Seg.Start, S.Seg.Size, S.Pos);
+    }
+    ASpanV.Blocks.emplace_back(AccumV, ScLen, i * ScLen);
+    ASpanM.Blocks.emplace_back(AccumM, ScLen, i * ScLen);
+  }
+
+  // vlalign
+  if (DoAlign) {
+    for (int j = 1; j != NumSectors + 2; ++j) {
+      Value *PrevV = ASpanV[j - 1].Seg.Val, *ThisV = ASpanV[j].Seg.Val;
+      Value *PrevM = ASpanM[j - 1].Seg.Val, *ThisM = ASpanM[j].Seg.Val;
+      assert(isSectorTy(PrevV->getType()) && isSectorTy(PrevM->getType()));
+      ASpanV[j - 1].Seg.Val = HVC.vlalignb(Builder, PrevV, ThisV, AlignVal);
+      ASpanM[j - 1].Seg.Val = HVC.vlalignb(Builder, PrevM, ThisM, AlignVal);
+    }
+  }
+
+  for (int i = 0; i != NumSectors + DoAlign; ++i) {
+    Value *Ptr = createAdjustedPointer(Builder, AlignAddr, SecTy, i * ScLen);
+    Value *Val = ASpanV[i].Seg.Val;
+    Value *Mask = ASpanM[i].Seg.Val; // bytes
+    if (!HVC.isUndef(Val) && !HVC.isZero(Mask)) {
+      Value *Store =
+          createAlignedStore(Builder, Val, Ptr, ScLen, HVC.vlsb(Builder, Mask));
+      // If vector shifting is potentially needed, accumulate metadata
+      // from source sections of twice the store width.
+      int Start = (i - DoAlign) * ScLen;
+      int Width = (1 + DoAlign) * ScLen;
+      propagateMetadata(cast<Instruction>(Store),
+                        VSpan.section(Start, Width).values());
+    }
+  }
+}
+
 auto AlignVectors::realignGroup(const MoveGroup &Move) const -> bool {
   // TODO: Needs support for masked loads/stores of "scalar" vectors.
   if (!Move.IsHvx)
@@ -822,9 +1074,18 @@ auto AlignVectors::realignGroup(const MoveGroup &Move) const -> bool {
       getMaxOf(MoveInfos, [](const AddrInfo &AI) { return AI.NeedAlign; });
   Align MinNeeded = WithMaxNeeded.NeedAlign;
 
-  // Set the builder at the top instruction in the move group.
-  Instruction *TopIn = Move.IsLoad ? Move.Main.front() : Move.Main.back();
-  IRBuilder<> Builder(TopIn);
+  // Set the builder's insertion point right before the load group, or
+  // immediately after the store group. (Instructions in a store group are
+  // listed in reverse order.)
+  Instruction *InsertAt = Move.Main.front();
+  if (!Move.IsLoad) {
+    // There should be a terminator (which store isn't, but check anyways).
+    assert(InsertAt->getIterator() != InsertAt->getParent()->end());
+    InsertAt = &*std::next(InsertAt->getIterator());
+  }
+
+  IRBuilder Builder(InsertAt->getParent(), InsertAt->getIterator(),
+                    InstSimplifyFolder(HVC.DL));
   Value *AlignAddr = nullptr; // Actual aligned address.
   Value *AlignVal = nullptr;  // Right-shift amount (for valign).
 
@@ -871,118 +1132,10 @@ auto AlignVectors::realignGroup(const MoveGroup &Move) const -> bool {
   assert(!Move.IsHvx || ScLen == 64 || ScLen == 128);
   assert(Move.IsHvx || ScLen == 4 || ScLen == 8);
 
-  Type *SecTy = HVC.getByteTy(ScLen);
-  int NumSectors = (VSpan.extent() + ScLen - 1) / ScLen;
-  bool DoAlign = !HVC.isZero(AlignVal);
-
-  if (Move.IsLoad) {
-    ByteSpan ASpan;
-    auto *True = HVC.getFullValue(HVC.getBoolTy(ScLen));
-    auto *Undef = UndefValue::get(SecTy);
-
-    for (int i = 0; i != NumSectors + DoAlign; ++i) {
-      Value *Ptr = createAdjustedPointer(Builder, AlignAddr, SecTy, i * ScLen);
-      // FIXME: generate a predicated load?
-      Value *Load = createAlignedLoad(Builder, SecTy, Ptr, ScLen, True, Undef);
-      // If vector shifting is potentially needed, accumulate metadata
-      // from source sections of twice the load width.
-      int Start = (i - DoAlign) * ScLen;
-      int Width = (1 + DoAlign) * ScLen;
-      propagateMetadata(cast<Instruction>(Load),
-                        VSpan.section(Start, Width).values());
-      ASpan.Blocks.emplace_back(Load, ScLen, i * ScLen);
-    }
-
-    if (DoAlign) {
-      for (int j = 0; j != NumSectors; ++j) {
-        assert(isSectorTy(ASpan[j].Seg.Val->getType()));
-        ASpan[j].Seg.Val = HVC.vralignb(Builder, ASpan[j].Seg.Val,
-                                        ASpan[j + 1].Seg.Val, AlignVal);
-      }
-    }
-
-    for (ByteSpan::Block &B : VSpan) {
-      ByteSpan ASection = ASpan.section(B.Pos, B.Seg.Size).shift(-B.Pos);
-      Value *Accum = UndefValue::get(HVC.getByteTy(B.Seg.Size));
-      for (ByteSpan::Block &S : ASection) {
-        Value *Pay = HVC.vbytes(Builder, getPayload(S.Seg.Val));
-        Accum =
-            HVC.insertb(Builder, Accum, Pay, S.Seg.Start, S.Seg.Size, S.Pos);
-      }
-      // Instead of casting everything to bytes for the vselect, cast to the
-      // original value type. This will avoid complications with casting masks.
-      // For example, in cases when the original mask applied to i32, it could
-      // be converted to a mask applicable to i8 via pred_typecast intrinsic,
-      // but if the mask is not exactly of HVX length, extra handling would be
-      // needed to make it work.
-      Type *ValTy = getPayload(B.Seg.Val)->getType();
-      Value *Cast = Builder.CreateBitCast(Accum, ValTy);
-      Value *Sel = Builder.CreateSelect(getMask(B.Seg.Val), Cast,
-                                        getPassThrough(B.Seg.Val));
-      B.Seg.Val->replaceAllUsesWith(Sel);
-    }
-  } else {
-    // Stores.
-    ByteSpan ASpanV, ASpanM;
-
-    // Return a vector value corresponding to the input value Val:
-    // either <1 x Val> for scalar Val, or Val itself for vector Val.
-    auto MakeVec = [](IRBuilderBase &Builder, Value *Val) -> Value * {
-      Type *Ty = Val->getType();
-      if (Ty->isVectorTy())
-        return Val;
-      auto *VecTy = VectorType::get(Ty, 1, /*Scalable=*/false);
-      return Builder.CreateBitCast(Val, VecTy);
-    };
-
-    // Create an extra "undef" sector at the beginning and at the end.
-    // They will be used as the left/right filler in the vlalign step.
-    for (int i = (DoAlign ? -1 : 0); i != NumSectors + DoAlign; ++i) {
-      // For stores, the size of each section is an aligned vector length.
-      // Adjust the store offsets relative to the section start offset.
-      ByteSpan VSection = VSpan.section(i * ScLen, ScLen).shift(-i * ScLen);
-      Value *AccumV = UndefValue::get(SecTy);
-      Value *AccumM = HVC.getNullValue(SecTy);
-      for (ByteSpan::Block &S : VSection) {
-        Value *Pay = getPayload(S.Seg.Val);
-        Value *Mask = HVC.rescale(Builder, MakeVec(Builder, getMask(S.Seg.Val)),
-                                  Pay->getType(), HVC.getByteTy());
-        AccumM = HVC.insertb(Builder, AccumM, HVC.vbytes(Builder, Mask),
-                             S.Seg.Start, S.Seg.Size, S.Pos);
-        AccumV = HVC.insertb(Builder, AccumV, HVC.vbytes(Builder, Pay),
-                             S.Seg.Start, S.Seg.Size, S.Pos);
-      }
-      ASpanV.Blocks.emplace_back(AccumV, ScLen, i * ScLen);
-      ASpanM.Blocks.emplace_back(AccumM, ScLen, i * ScLen);
-    }
-
-    // vlalign
-    if (DoAlign) {
-      for (int j = 1; j != NumSectors + 2; ++j) {
-        Value *PrevV = ASpanV[j - 1].Seg.Val, *ThisV = ASpanV[j].Seg.Val;
-        Value *PrevM = ASpanM[j - 1].Seg.Val, *ThisM = ASpanM[j].Seg.Val;
-        assert(isSectorTy(PrevV->getType()) && isSectorTy(PrevM->getType()));
-        ASpanV[j - 1].Seg.Val = HVC.vlalignb(Builder, PrevV, ThisV, AlignVal);
-        ASpanM[j - 1].Seg.Val = HVC.vlalignb(Builder, PrevM, ThisM, AlignVal);
-      }
-    }
-
-    for (int i = 0; i != NumSectors + DoAlign; ++i) {
-      Value *Ptr = createAdjustedPointer(Builder, AlignAddr, SecTy, i * ScLen);
-      Value *Val = ASpanV[i].Seg.Val;
-      Value *Mask = ASpanM[i].Seg.Val; // bytes
-      if (!HVC.isUndef(Val) && !HVC.isZero(Mask)) {
-        Value *Store = createAlignedStore(Builder, Val, Ptr, ScLen,
-                                          HVC.vlsb(Builder, Mask));
-        // If vector shifting is potentially needed, accumulate metadata
-        // from source sections of twice the store width.
-        int Start = (i - DoAlign) * ScLen;
-        int Width = (1 + DoAlign) * ScLen;
-        propagateMetadata(cast<Instruction>(Store),
-                          VSpan.section(Start, Width).values());
-      }
-    }
-  }
+  if (Move.IsLoad)
+    realignLoadGroup(Builder, VSpan, ScLen, AlignVal, AlignAddr);
+  else
+    realignStoreGroup(Builder, VSpan, ScLen, AlignVal, AlignAddr);
 
   for (auto *Inst : Move.Main)
     Inst->eraseFromParent();
@@ -2064,7 +2217,7 @@ auto HexagonVectorCombine::getKnownBits(const Value *V,
 template <typename T>
 auto HexagonVectorCombine::isSafeToMoveBeforeInBB(const Instruction &In,
                                                   BasicBlock::const_iterator To,
-                                                  const T &Ignore) const
+                                                  const T &IgnoreInsts) const
     -> bool {
   auto getLocOrNone = [this](const Instruction &I) -> Optional<MemoryLocation> {
     if (const auto *II = dyn_cast<IntrinsicInst>(&I)) {
@@ -2098,7 +2251,7 @@ auto HexagonVectorCombine::isSafeToMoveBeforeInBB(const Instruction &In,
       MoveUp ? std::make_pair(To, From) : std::make_pair(std::next(From), To);
   for (auto It = Range.first; It != Range.second; ++It) {
     const Instruction &I = *It;
-    if (llvm::is_contained(Ignore, &I))
+    if (llvm::is_contained(IgnoreInsts, &I))
       continue;
     // assume intrinsic can be ignored
     if (auto *II = dyn_cast<IntrinsicInst>(&I)) {

diff --git a/llvm/test/CodeGen/Hexagon/autohvx/vector-align-interleaved.ll b/llvm/test/CodeGen/Hexagon/autohvx/vector-align-interleaved.ll
new file mode 100644
index 0000000000000..415c717586c76
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/autohvx/vector-align-interleaved.ll
@@ -0,0 +1,78 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -march=hexagon < %s | FileCheck %s
+
+; In this testcase, when loads were moved close to users, they were actually
+; moved right before the consuming stores. This was after the store group
+; was moved, so the loads and stores ended up being interleaved. This violated
+; the assumption in store realigning that all loads were available before the
+; first store, causing some code that depends on the loads to be inserted
+; before the loads themselves.
+; Just make sure that this compiles ok.
+
+; Function Attrs: nounwind
+define void @f0(i16* noalias nocapture readonly %a0, i16* noalias nocapture %a1, i32 %a2) #0 {
+; CHECK-LABEL: f0:
+; CHECK:       // %bb.0: // %b0
+; CHECK-NEXT:    {
+; CHECK-NEXT:     p0 = cmp.eq(r2,#0)
+; CHECK-NEXT:     if (p0.new) jumpr:nt r31
+; CHECK-NEXT:    }
+; CHECK-NEXT:    .p2align 4
+; CHECK-NEXT:  .LBB0_1: // %b2
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    {
+; CHECK-NEXT:     v0.cur = vmem(r0+#0)
+; CHECK-NEXT:     vmem(r1+#0) = v0
+; CHECK-NEXT:    }
+; CHECK-NEXT:    {
+; CHECK-NEXT:     v29.cur = vmem(r0+#1)
+; CHECK-NEXT:     vmem(r1+#1) = v29
+; CHECK-NEXT:    }
+; CHECK-NEXT:    {
+; CHECK-NEXT:     v30.cur = vmem(r0+#2)
+; CHECK-NEXT:     vmem(r1+#2) = v30
+; CHECK-NEXT:    }
+; CHECK-NEXT:    {
+; CHECK-NEXT:     r0 = add(r0,#256)
+; CHECK-NEXT:     r1 = add(r1,#256)
+; CHECK-NEXT:     v31.cur = vmem(r0+#3)
+; CHECK-NEXT:     vmem(r1+#3) = v31
+; CHECK-NEXT:    }
+; CHECK-NEXT:    {
+; CHECK-NEXT:     jump .LBB0_1
+; CHECK-NEXT:    }
+b0:
+  %v0 = icmp eq i32 %a2, 0
+  br i1 %v0, label %b3, label %b1
+
+b1:                                               ; preds = %b0
+  %v1 = bitcast i16* %a1 to <16 x i32>*
+  %v2 = bitcast i16* %a0 to <16 x i32>*
+  br label %b2
+
+b2:                                               ; preds = %b2, %b1
+  %v3 = phi <16 x i32>* [ %v16, %b2 ], [ %v1, %b1 ]
+  %v4 = phi <16 x i32>* [ %v11, %b2 ], [ %v2, %b1 ]
+  %v5 = getelementptr inbounds <16 x i32>, <16 x i32>* %v4, i32 1
+  %v6 = load <16 x i32>, <16 x i32>* %v4, align 64
+  %v7 = getelementptr inbounds <16 x i32>, <16 x i32>* %v4, i32 2
+  %v8 = load <16 x i32>, <16 x i32>* %v5, align 64
+  %v9 = getelementptr inbounds <16 x i32>, <16 x i32>* %v4, i32 3
+  %v10 = load <16 x i32>, <16 x i32>* %v7, align 64
+  %v11 = getelementptr inbounds <16 x i32>, <16 x i32>* %v4, i32 4
+  %v12 = load <16 x i32>, <16 x i32>* %v9, align 64
+  %v13 = getelementptr inbounds <16 x i32>, <16 x i32>* %v3, i32 1
+  store <16 x i32> %v6, <16 x i32>* %v3, align 64
+  %v14 = getelementptr inbounds <16 x i32>, <16 x i32>* %v3, i32 2
+  store <16 x i32> %v8, <16 x i32>* %v13, align 64
+  %v15 = getelementptr inbounds <16 x i32>, <16 x i32>* %v3, i32 3
+  store <16 x i32> %v10, <16 x i32>* %v14, align 64
+  %v16 = getelementptr inbounds <16 x i32>, <16 x i32>* %v3, i32 4
+  store <16 x i32> %v12, <16 x i32>* %v15, align 64
+  br label %b2
+
+b3:                                               ; preds = %b0
+  ret void
+}
+
+attributes #0 = { nounwind "target-features"="+hvxv65,+hvx-length64b" }

diff --git a/llvm/test/CodeGen/Hexagon/autohvx/vector-align-only-phi-use.ll b/llvm/test/CodeGen/Hexagon/autohvx/vector-align-only-phi-use.ll
new file mode 100644
index 0000000000000..dc83fc9be2df2
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/autohvx/vector-align-only-phi-use.ll
@@ -0,0 +1,71 @@
+; RUN: llc -march=hexagon < %s | FileCheck %s
+
+; One of the loads is only used in a PHI instruction. Make sure the PHI use
+; still counts as a user of the load (and that the load is not removed).
+
+; CHECK-LABEL: f0:
+; CHECK: = vmem({{.*}})
+; CHECK: = vmem({{.*}})
+
+target datalayout = "e-m:e-p:32:32:32-a:0-n16:32-i64:64:64-i32:32:32-i16:16:16-i1:8:8-f32:32:32-f64:64:64-v32:32:32-v64:64:64-v512:512:512-v1024:1024:1024-v2048:2048:2048"
+target triple = "hexagon"
+
+declare <16 x i32> @llvm.hexagon.V6.vmux(<64 x i1>, <16 x i32>, <16 x i32>) #0
+declare <16 x i32> @llvm.hexagon.V6.vmpyiewuh.acc(<16 x i32>, <16 x i32>, <16 x i32>) #0
+declare <16 x i32> @llvm.hexagon.V6.vand(<16 x i32>, <16 x i32>) #0
+declare <64 x i1> @llvm.hexagon.V6.vgtuw.and(<64 x i1>, <16 x i32>, <16 x i32>) #0
+declare <64 x i1> @llvm.hexagon.V6.pred.or(<64 x i1>, <64 x i1>) #0
+
+define <16 x i32> @f0(i8* %a0, i32 %a1) local_unnamed_addr #1 {
+b0:
+  %v0 = getelementptr inbounds i8, i8* %a0, i32 576
+  %v1 = bitcast i8* %a0 to <16 x i32>*
+  %v2 = bitcast i8* %v0 to <16 x i32>*
+  br label %b1
+
+b1:                                               ; preds = %b4, %b0
+  %v3 = phi i32 [ 0, %b0 ], [ %v23, %b4 ]
+  %v4 = phi <16 x i32> [ poison, %b0 ], [ %v22, %b4 ]
+  br i1 poison, label %b2, label %b3
+
+b2:                                               ; preds = %b1
+  %v5 = getelementptr inbounds <16 x i32>, <16 x i32>* %v1, i32 %v3
+  %v6 = load <16 x i32>, <16 x i32>* %v5, align 64
+  %v7 = getelementptr inbounds <16 x i32>, <16 x i32>* %v2, i32 %v3
+  %v8 = load <16 x i32>, <16 x i32>* %v7, align 64
+  %v9 = tail call <16 x i32> @llvm.hexagon.V6.vmpyiewuh.acc(<16 x i32> poison, <16 x i32> %v6, <16 x i32> %v6)
+  br label %b4
+
+b3:                                               ; preds = %b1
+  br label %b4
+
+b4:                                               ; preds = %b3, %b2
+  %v10 = phi <16 x i32> [ %v9, %b2 ], [ poison, %b3 ]
+  %v11 = phi <16 x i32> [ %v8, %b2 ], [ poison, %b3 ]
+  %v12 = tail call <16 x i32> @llvm.hexagon.V6.vmux(<64 x i1> poison, <16 x i32> %v10, <16 x i32> %v4)
+  %v13 = tail call <16 x i32> @llvm.hexagon.V6.vmux(<64 x i1> poison, <16 x i32> %v11, <16 x i32> poison)
+  %v14 = or i32 %v3, 1
+  %v15 = getelementptr inbounds <16 x i32>, <16 x i32>* %v2, i32 %v14
+  %v16 = load <16 x i32>, <16 x i32>* %v15, align 64
+  %v17 = tail call <16 x i32> @llvm.hexagon.V6.vmpyiewuh.acc(<16 x i32> poison, <16 x i32> %v13, <16 x i32> poison)
+  %v18 = tail call <16 x i32> @llvm.hexagon.V6.vand(<16 x i32> %v12, <16 x i32> poison)
+  %v19 = tail call <64 x i1> @llvm.hexagon.V6.vgtuw.and(<64 x i1> poison, <16 x i32> %v17, <16 x i32> poison)
+  %v20 = tail call <64 x i1> @llvm.hexagon.V6.pred.or(<64 x i1> %v19, <64 x i1> poison)
+  %v21 = tail call <64 x i1> @llvm.hexagon.V6.pred.or(<64 x i1> %v20, <64 x i1> poison)
+  %v22 = tail call <16 x i32> @llvm.hexagon.V6.vmux(<64 x i1> %v21, <16 x i32> poison, <16 x i32> %v12)
+  %v23 = add nuw nsw i32 %v3, 2
+  %v24 = icmp slt i32 %v23, %a1
+  br i1 %v24, label %b1, label %b5, !llvm.loop !1
+
+b5:                                               ; preds = %b4
+  ret <16 x i32> %v22
+}
+
+attributes #0 = { nocallback nofree nosync nounwind willreturn memory(none) }
+attributes #1 = { "target-features"="+hvx-length64b,+hvxv65,+v65,-long-calls" }
+
+!llvm.linker.options = !{!0}
+
+!0 = !{!"$.str.3", !".rodata.str1.1"}
+!1 = distinct !{!1, !2}
+!2 = !{!"llvm.loop.mustprogress"}

diff --git a/llvm/test/CodeGen/Hexagon/autohvx/vector-align-tbaa.ll b/llvm/test/CodeGen/Hexagon/autohvx/vector-align-tbaa.ll
index 9d8074177a1d3..96bf53e0610cd 100644
--- a/llvm/test/CodeGen/Hexagon/autohvx/vector-align-tbaa.ll
+++ b/llvm/test/CodeGen/Hexagon/autohvx/vector-align-tbaa.ll
@@ -28,11 +28,11 @@ define <64 x i16> @f0(i16* %a0, i32 %a1) #0 {
 ; CHECK-NEXT:    [[TMP11:%.*]] = load <32 x i32>, <32 x i32>* [[TMP10]], align 128, !tbaa [[TBAA0]]
 ; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <128 x i8> [[TMP8]] to <32 x i32>
 ; CHECK-NEXT:    [[TMP13:%.*]] = call <32 x i32> @llvm.hexagon.V6.valignb.128B(<32 x i32> [[TMP12]], <32 x i32> [[TMP5]], i32 [[TMP3]])
-; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <128 x i8> [[TMP8]] to <32 x i32>
-; CHECK-NEXT:    [[TMP15:%.*]] = call <32 x i32> @llvm.hexagon.V6.valignb.128B(<32 x i32> [[TMP11]], <32 x i32> [[TMP14]], i32 [[TMP3]])
-; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <32 x i32> [[TMP13]] to <64 x i16>
-; CHECK-NEXT:    [[TMP17:%.*]] = bitcast <32 x i32> [[TMP15]] to <64 x i16>
-; CHECK-NEXT:    [[V8:%.*]] = add <64 x i16> [[TMP16]], [[TMP17]]
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <32 x i32> [[TMP13]] to <64 x i16>
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast <128 x i8> [[TMP8]] to <32 x i32>
+; CHECK-NEXT:    [[TMP16:%.*]] = call <32 x i32> @llvm.hexagon.V6.valignb.128B(<32 x i32> [[TMP11]], <32 x i32> [[TMP15]], i32 [[TMP3]])
+; CHECK-NEXT:    [[TMP17:%.*]] = bitcast <32 x i32> [[TMP16]] to <64 x i16>
+; CHECK-NEXT:    [[V8:%.*]] = add <64 x i16> [[TMP14]], [[TMP17]]
 ; CHECK-NEXT:    ret <64 x i16> [[V8]]
 ;
 b0:
@@ -69,11 +69,11 @@ define <64 x i16> @f1(i16* %a0, i32 %a1) #0 {
 ; CHECK-NEXT:    [[TMP11:%.*]] = load <32 x i32>, <32 x i32>* [[TMP10]], align 128
 ; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <128 x i8> [[TMP8]] to <32 x i32>
 ; CHECK-NEXT:    [[TMP13:%.*]] = call <32 x i32> @llvm.hexagon.V6.valignb.128B(<32 x i32> [[TMP12]], <32 x i32> [[TMP5]], i32 [[TMP3]])
-; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <128 x i8> [[TMP8]] to <32 x i32>
-; CHECK-NEXT:    [[TMP15:%.*]] = call <32 x i32> @llvm.hexagon.V6.valignb.128B(<32 x i32> [[TMP11]], <32 x i32> [[TMP14]], i32 [[TMP3]])
-; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <32 x i32> [[TMP13]] to <64 x i16>
-; CHECK-NEXT:    [[TMP17:%.*]] = bitcast <32 x i32> [[TMP15]] to <64 x i16>
-; CHECK-NEXT:    [[V8:%.*]] = add <64 x i16> [[TMP16]], [[TMP17]]
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <32 x i32> [[TMP13]] to <64 x i16>
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast <128 x i8> [[TMP8]] to <32 x i32>
+; CHECK-NEXT:    [[TMP16:%.*]] = call <32 x i32> @llvm.hexagon.V6.valignb.128B(<32 x i32> [[TMP11]], <32 x i32> [[TMP15]], i32 [[TMP3]])
+; CHECK-NEXT:    [[TMP17:%.*]] = bitcast <32 x i32> [[TMP16]] to <64 x i16>
+; CHECK-NEXT:    [[V8:%.*]] = add <64 x i16> [[TMP14]], [[TMP17]]
 ; CHECK-NEXT:    ret <64 x i16> [[V8]]
 ;
 b0:
@@ -110,11 +110,11 @@ define <64 x i16> @f2(i16* %a0, i32 %a1) #0 {
 ; CHECK-NEXT:    [[TMP11:%.*]] = load <32 x i32>, <32 x i32>* [[TMP10]], align 128, !tbaa [[TBAA3:![0-9]+]]
 ; CHECK-NEXT:    [[TMP12:%.*]] = bitcast <128 x i8> [[TMP8]] to <32 x i32>
 ; CHECK-NEXT:    [[TMP13:%.*]] = call <32 x i32> @llvm.hexagon.V6.valignb.128B(<32 x i32> [[TMP12]], <32 x i32> [[TMP5]], i32 [[TMP3]])
-; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <128 x i8> [[TMP8]] to <32 x i32>
-; CHECK-NEXT:    [[TMP15:%.*]] = call <32 x i32> @llvm.hexagon.V6.valignb.128B(<32 x i32> [[TMP11]], <32 x i32> [[TMP14]], i32 [[TMP3]])
-; CHECK-NEXT:    [[TMP16:%.*]] = bitcast <32 x i32> [[TMP13]] to <64 x i16>
-; CHECK-NEXT:    [[TMP17:%.*]] = bitcast <32 x i32> [[TMP15]] to <64 x i16>
-; CHECK-NEXT:    [[V8:%.*]] = add <64 x i16> [[TMP16]], [[TMP17]]
+; CHECK-NEXT:    [[TMP14:%.*]] = bitcast <32 x i32> [[TMP13]] to <64 x i16>
+; CHECK-NEXT:    [[TMP15:%.*]] = bitcast <128 x i8> [[TMP8]] to <32 x i32>
+; CHECK-NEXT:    [[TMP16:%.*]] = call <32 x i32> @llvm.hexagon.V6.valignb.128B(<32 x i32> [[TMP11]], <32 x i32> [[TMP15]], i32 [[TMP3]])
+; CHECK-NEXT:    [[TMP17:%.*]] = bitcast <32 x i32> [[TMP16]] to <64 x i16>
+; CHECK-NEXT:    [[V8:%.*]] = add <64 x i16> [[TMP14]], [[TMP17]]
 ; CHECK-NEXT:    ret <64 x i16> [[V8]]
 ;
 b0:

diff --git a/llvm/test/CodeGen/Hexagon/autohvx/vector-align-use-in-different-block.ll b/llvm/test/CodeGen/Hexagon/autohvx/vector-align-use-in-different-block.ll
new file mode 100644
index 0000000000000..74405c0f141d4
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/autohvx/vector-align-use-in-different-block.ll
@@ -0,0 +1,57 @@
+; RUN: llc -march=hexagon < %s | FileCheck %s
+
+; This used to crash because of calling isSafeToMoveBeforeInBB with source
+; and target in different blocks.
+; Check that this compiles successfully, and that two loads are created
+; (for users in a different block).
+
+target datalayout = "e-m:e-p:32:32:32-a:0-n16:32-i64:64:64-i32:32:32-i16:16:16-i1:8:8-f32:32:32-f64:64:64-v32:32:32-v64:64:64-v512:512:512-v1024:1024:1024-v2048:2048:2048"
+target triple = "hexagon"
+
+define dso_local <32 x i32> @f0(i32 %a0, i32 %a1) local_unnamed_addr #0 {
+; CHECK-LABEL: f0:
+; CHECK:     = vmem({{.*}})
+; CHECK:     = vmem({{.*}})
+b0:
+  br label %b1
+
+b1:                                               ; preds = %b0
+  %v0 = mul nsw i32 -4, %a0
+  %v1 = getelementptr inbounds i8, i8* null, i32 %v0
+  %v2 = getelementptr inbounds i8, i8* %v1, i32 -64
+  %v3 = bitcast i8* %v2 to <16 x i32>*
+  %v4 = load <16 x i32>, <16 x i32>* %v3, align 64
+  %v5 = getelementptr inbounds i8, i8* %v1, i32 64
+  %v6 = bitcast i8* %v5 to <16 x i32>*
+  %v7 = load <16 x i32>, <16 x i32>* %v6, align 64
+  br label %b2
+
+b2:                                               ; preds = %b2, %b1
+  %v8 = phi <32 x i32> [ poison, %b1 ], [ %v17, %b2 ]
+  %v9 = phi i32 [ %a1, %b1 ], [ %v18, %b2 ]
+  %v10 = tail call <16 x i32> @llvm.hexagon.V6.vlalignb(<16 x i32> poison, <16 x i32> %v4, i32 poison)
+  %v11 = tail call <16 x i32> @llvm.hexagon.V6.valignb(<16 x i32> %v7, <16 x i32> poison, i32 poison)
+  %v12 = tail call <32 x i32> @llvm.hexagon.V6.vmpyubv(<16 x i32> %v10, <16 x i32> poison)
+  %v13 = tail call <32 x i32> @llvm.hexagon.V6.vmpyubv(<16 x i32> %v11, <16 x i32> poison)
+  %v14 = tail call <16 x i32> @llvm.hexagon.V6.hi(<32 x i32> %v12)
+  %v15 = tail call <16 x i32> @llvm.hexagon.V6.hi(<32 x i32> %v13)
+  %v16 = tail call <32 x i32> @llvm.hexagon.V6.vadduhw(<16 x i32> %v14, <16 x i32> %v15)
+  %v17 = tail call <32 x i32> @llvm.hexagon.V6.vaddw.dv(<32 x i32> %v8, <32 x i32> %v16)
+  %v18 = add nsw i32 %v9, -1
+  %v19 = icmp ugt i32 %v18, 1
+  br i1 %v19, label %b2, label %b3
+
+b3:                                               ; preds = %b2
+  %v20 = tail call <32 x i32> @llvm.hexagon.V6.vaddw.dv(<32 x i32> %v17, <32 x i32> %v16)
+  ret <32 x i32> %v20
+}
+
+declare <16 x i32> @llvm.hexagon.V6.hi(<32 x i32>) #1
+declare <16 x i32> @llvm.hexagon.V6.vlalignb(<16 x i32>, <16 x i32>, i32) #1
+declare <16 x i32> @llvm.hexagon.V6.valignb(<16 x i32>, <16 x i32>, i32) #1
+declare <32 x i32> @llvm.hexagon.V6.vmpyubv(<16 x i32>, <16 x i32>) #1
+declare <32 x i32> @llvm.hexagon.V6.vaddw.dv(<32 x i32>, <32 x i32>) #1
+declare <32 x i32> @llvm.hexagon.V6.vadduhw(<16 x i32>, <16 x i32>) #1
+
+attributes #0 = { "target-features"="+hvx-length64b,+hvxv66,+v66,-long-calls" }
+attributes #1 = { nounwind memory(none) }


        

