[llvm] 0171862 - [Hexagon] Place aligned loads closer to users
Krzysztof Parzyszek via llvm-commits
llvm-commits at lists.llvm.org
Fri Nov 11 12:04:52 PST 2022
Author: Krzysztof Parzyszek
Date: 2022-11-11T12:04:33-08:00
New Revision: 017186294d933921d1667ffbad82981dda09dacd
URL: https://github.com/llvm/llvm-project/commit/017186294d933921d1667ffbad82981dda09dacd
DIFF: https://github.com/llvm/llvm-project/commit/017186294d933921d1667ffbad82981dda09dacd.diff
LOG: [Hexagon] Place aligned loads closer to users
Vector alignment code was grouping all aligned loads together. In some
cases the groups could become quite large, causing a lot of spill code
to be generated. This change places the loads closer to where they are
used, reducing register pressure.
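For illustration only, not part of the patch: a minimal standalone C++ sketch
of the placement rule the pass now follows, i.e. pick the earliest non-PHI
user of a value inside the block, and treat users in other blocks as if they
were at the block's terminator. The helper name findEarliestUser is
hypothetical; it roughly restates the earliestUser lambda added in
realignLoadGroup below.

  // Hypothetical sketch, not the code added by this commit.
  #include "llvm/IR/BasicBlock.h"
  #include "llvm/IR/Instructions.h"
  using namespace llvm;

  // Return the earliest point in BB where a value computed in BB is needed:
  // the earliest non-PHI user inside BB, or the terminator of BB if the
  // value is also (or only) used outside of BB. Returns nullptr if unused.
  static Instruction *findEarliestUser(Value &V, BasicBlock &BB) {
    Instruction *Earliest = nullptr;
    auto Consider = [&](Instruction *Candidate) {
      if (!Earliest || Candidate->comesBefore(Earliest))
        Earliest = Candidate;
    };
    for (Use &U : V.uses()) {
      auto *I = dyn_cast<Instruction>(U.getUser());
      if (!I)
        continue;
      if (I->getParent() == &BB) {
        // PHIs take their value on an incoming edge, so skip them here.
        if (!isa<PHINode>(I))
          Consider(I);
      } else {
        // A user in another block keeps the value live to the end of BB.
        Consider(BB.getTerminator());
      }
    }
    return Earliest;
  }

Placing each aligned load just before its earliest user (instead of emitting
the whole group at the top) is what keeps the live ranges short.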
Added:
llvm/test/CodeGen/Hexagon/autohvx/vector-align-interleaved.ll
llvm/test/CodeGen/Hexagon/autohvx/vector-align-only-phi-use.ll
llvm/test/CodeGen/Hexagon/autohvx/vector-align-use-in-different-block.ll
Modified:
llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp
llvm/test/CodeGen/Hexagon/autohvx/vector-align-tbaa.ll
Removed:
################################################################################
diff --git a/llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp b/llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp
index 4efea50ea7698..279ce3483ee68 100644
--- a/llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp
@@ -141,7 +141,7 @@ class HexagonVectorCombine {
template <typename T = std::vector<Instruction *>>
bool isSafeToMoveBeforeInBB(const Instruction &In,
BasicBlock::const_iterator To,
- const T &Ignore = {}) const;
+ const T &IgnoreInsts = {}) const;
// This function is only used for assertions at the moment.
[[maybe_unused]] bool isByteVecTy(Type *Ty) const;
@@ -271,14 +271,20 @@ class AlignVectors {
Value *createAlignedStore(IRBuilderBase &Builder, Value *Val, Value *Ptr,
int Alignment, Value *Mask) const;
+ DepList getUpwardDeps(Instruction *In, Instruction *Base) const;
bool createAddressGroups();
MoveList createLoadGroups(const AddrList &Group) const;
MoveList createStoreGroups(const AddrList &Group) const;
bool move(const MoveGroup &Move) const;
+ void realignLoadGroup(IRBuilderBase &Builder, const ByteSpan &VSpan,
+ int ScLen, Value *AlignVal, Value *AlignAddr) const;
+ void realignStoreGroup(IRBuilderBase &Builder, const ByteSpan &VSpan,
+ int ScLen, Value *AlignVal, Value *AlignAddr) const;
bool realignGroup(const MoveGroup &Move) const;
friend raw_ostream &operator<<(raw_ostream &OS, const AddrInfo &AI);
friend raw_ostream &operator<<(raw_ostream &OS, const MoveGroup &MG);
+ friend raw_ostream &operator<<(raw_ostream &OS, const ByteSpan::Block &B);
friend raw_ostream &operator<<(raw_ostream &OS, const ByteSpan &BS);
std::map<Instruction *, AddrList> AddrGroups;
@@ -307,13 +313,19 @@ raw_ostream &operator<<(raw_ostream &OS, const AlignVectors::MoveGroup &MG) {
return OS;
}
+LLVM_ATTRIBUTE_UNUSED
+raw_ostream &operator<<(raw_ostream &OS,
+ const AlignVectors::ByteSpan::Block &B) {
+ OS << " @" << B.Pos << " [" << B.Seg.Start << ',' << B.Seg.Size << "] "
+ << *B.Seg.Val;
+ return OS;
+}
+
LLVM_ATTRIBUTE_UNUSED
raw_ostream &operator<<(raw_ostream &OS, const AlignVectors::ByteSpan &BS) {
OS << "ByteSpan[size=" << BS.size() << ", extent=" << BS.extent() << '\n';
- for (const AlignVectors::ByteSpan::Block &B : BS) {
- OS << " @" << B.Pos << " [" << B.Seg.Start << ',' << B.Seg.Size << "] "
- << *B.Seg.Val << '\n';
- }
+ for (const AlignVectors::ByteSpan::Block &B : BS)
+ OS << B << '\n';
OS << ']';
return OS;
}
@@ -582,6 +594,29 @@ auto AlignVectors::createAlignedStore(IRBuilderBase &Builder, Value *Val,
return Builder.CreateMaskedStore(Val, Ptr, Align(Alignment), Mask);
}
+auto AlignVectors::getUpwardDeps(Instruction *In, Instruction *Base) const
+ -> DepList {
+ BasicBlock *Parent = Base->getParent();
+ assert(In->getParent() == Parent &&
+ "Base and In should be in the same block");
+ assert(Base->comesBefore(In) && "Base should come before In");
+
+ DepList Deps;
+ std::deque<Instruction *> WorkQ = {In};
+ while (!WorkQ.empty()) {
+ Instruction *D = WorkQ.front();
+ WorkQ.pop_front();
+ Deps.insert(D);
+ for (Value *Op : D->operands()) {
+ if (auto *I = dyn_cast<Instruction>(Op)) {
+ if (I->getParent() == Parent && Base->comesBefore(I))
+ WorkQ.push_back(I);
+ }
+ }
+ }
+ return Deps;
+}
+
auto AlignVectors::createAddressGroups() -> bool {
// An address group created here may contain instructions spanning
// multiple basic blocks.
@@ -641,28 +676,6 @@ auto AlignVectors::createLoadGroups(const AddrList &Group) const -> MoveList {
// To avoid complications with moving code across basic blocks, only form
// groups that are contained within a single basic block.
- auto getUpwardDeps = [](Instruction *In, Instruction *Base) {
- BasicBlock *Parent = Base->getParent();
- assert(In->getParent() == Parent &&
- "Base and In should be in the same block");
- assert(Base->comesBefore(In) && "Base should come before In");
-
- DepList Deps;
- std::deque<Instruction *> WorkQ = {In};
- while (!WorkQ.empty()) {
- Instruction *D = WorkQ.front();
- WorkQ.pop_front();
- Deps.insert(D);
- for (Value *Op : D->operands()) {
- if (auto *I = dyn_cast<Instruction>(Op)) {
- if (I->getParent() == Parent && Base->comesBefore(I))
- WorkQ.push_back(I);
- }
- }
- }
- return Deps;
- };
-
auto tryAddTo = [&](const AddrInfo &Info, MoveGroup &Move) {
assert(!Move.Main.empty() && "Move group should have non-empty Main");
// Don't mix HVX and non-HVX instructions.
@@ -775,6 +788,245 @@ auto AlignVectors::move(const MoveGroup &Move) const -> bool {
return Move.Main.size() + Move.Deps.size() > 1;
}
+auto AlignVectors::realignLoadGroup(IRBuilderBase &Builder,
+ const ByteSpan &VSpan, int ScLen,
+ Value *AlignVal, Value *AlignAddr) const
+ -> void {
+ Type *SecTy = HVC.getByteTy(ScLen);
+ int NumSectors = (VSpan.extent() + ScLen - 1) / ScLen;
+ bool DoAlign = !HVC.isZero(AlignVal);
+ BasicBlock::iterator BasePos = Builder.GetInsertPoint();
+ BasicBlock *BaseBlock = Builder.GetInsertBlock();
+
+ ByteSpan ASpan;
+ auto *True = HVC.getFullValue(HVC.getBoolTy(ScLen));
+ auto *Undef = UndefValue::get(SecTy);
+
+ SmallVector<Instruction *> Loads(NumSectors + DoAlign, nullptr);
+
+ // We could create all of the aligned loads, and generate the valigns
+ // at the location of the first load, but for large load groups, this
+ // could create highly suboptimal code (there have been groups of 140+
+ // loads in real code).
+ // Instead, place the loads/valigns as close to the users as possible.
+ // In any case we need to have a mapping from the blocks of VSpan (the
+ // span covered by the pre-existing loads) to ASpan (the span covered
+ // by the aligned loads). There is a small problem, though: ASpan needs
+ // to have pointers to the loads/valigns, but we don't know where to put
+ // them yet. We can't use nullptr, because when we create sections of
+ // ASpan (corresponding to blocks from VSpan), for each block in the
+ // section we need to know which blocks of ASpan they are a part of.
+ // To have 1-1 mapping between blocks of ASpan and the temporary value
+ // pointers, use the addresses of the blocks themselves.
+
+ // Populate the blocks first, to avoid reallocations of the vector
+ // interfering with generating the placeholder addresses.
+ for (int Index = 0; Index != NumSectors; ++Index)
+ ASpan.Blocks.emplace_back(nullptr, ScLen, Index * ScLen);
+ for (int Index = 0; Index != NumSectors; ++Index) {
+ ASpan.Blocks[Index].Seg.Val =
+ reinterpret_cast<Value *>(&ASpan.Blocks[Index]);
+ }
+
+ // Multiple values from VSpan can map to the same value in ASpan. Since we
+ // try to create loads lazily, we need to find the earliest use for each
+ // value from ASpan.
+ DenseMap<void *, Instruction *> EarliestUser;
+ auto isEarlier = [](Instruction *A, Instruction *B) {
+ if (B == nullptr)
+ return true;
+ if (A == nullptr)
+ return false;
+ assert(A->getParent() == B->getParent());
+ return A->comesBefore(B);
+ };
+ auto earliestUser = [&](const auto &Uses) {
+ Instruction *User = nullptr;
+ for (const Use &U : Uses) {
+ auto *I = dyn_cast<Instruction>(U.getUser());
+ assert(I != nullptr && "Load used in a non-instruction?");
+ // Make sure we only consider users in this block, but we need
+ // to remember if there were users outside the block too. This is
+ // because if there are no users, aligned loads will not be created.
+ if (I->getParent() == BaseBlock) {
+ if (!isa<PHINode>(I))
+ User = std::min(User, I, isEarlier);
+ } else {
+ User = std::min(User, BaseBlock->getTerminator(), isEarlier);
+ }
+ }
+ return User;
+ };
+
+ for (const ByteSpan::Block &B : VSpan) {
+ ByteSpan ASection = ASpan.section(B.Pos, B.Seg.Size);
+ for (const ByteSpan::Block &S : ASection) {
+ EarliestUser[S.Seg.Val] = std::min(
+ EarliestUser[S.Seg.Val], earliestUser(B.Seg.Val->uses()), isEarlier);
+ }
+ }
+
+ auto createLoad = [&](IRBuilderBase &Builder, const ByteSpan &VSpan,
+ int Index) {
+ Value *Ptr =
+ createAdjustedPointer(Builder, AlignAddr, SecTy, Index * ScLen);
+ // FIXME: generate a predicated load?
+ Value *Load = createAlignedLoad(Builder, SecTy, Ptr, ScLen, True, Undef);
+ // If vector shifting is potentially needed, accumulate metadata
+ // from source sections of twice the load width.
+ int Start = (Index - DoAlign) * ScLen;
+ int Width = (1 + DoAlign) * ScLen;
+ propagateMetadata(cast<Instruction>(Load),
+ VSpan.section(Start, Width).values());
+ return cast<Instruction>(Load);
+ };
+
+ auto moveBefore = [this](Instruction *In, Instruction *To) {
+ // Move In and its upward dependencies to before To.
+ assert(In->getParent() == To->getParent());
+ DepList Deps = getUpwardDeps(In, To);
+ // DepList is sorted with respect to positions in the basic block.
+ for (Instruction *I : Deps)
+ I->moveBefore(To);
+ };
+
+ // Generate necessary loads at appropriate locations.
+ for (int Index = 0; Index != NumSectors + 1; ++Index) {
+ // In ASpan, each block will be either a single aligned load, or a
+ // valign of a pair of loads. In the latter case, an aligned load j
+ // will belong to the current valign, and the one in the previous
+ // block (for j > 0).
+ Instruction *PrevAt =
+ DoAlign && Index > 0 ? EarliestUser[&ASpan[Index - 1]] : nullptr;
+ Instruction *ThisAt =
+ Index < NumSectors ? EarliestUser[&ASpan[Index]] : nullptr;
+ if (auto *Where = std::min(PrevAt, ThisAt, isEarlier)) {
+ Builder.SetInsertPoint(Where);
+ Loads[Index] = createLoad(Builder, VSpan, Index);
+ // We know it's safe to put the load at BasePos, so if it's not safe
+ // to move it from this location to BasePos, then the current location
+ // is not valid.
+ // We can't do this check proactively because we need the load to exist
+ // in order to check legality.
+ if (!HVC.isSafeToMoveBeforeInBB(*Loads[Index], BasePos))
+ moveBefore(Loads[Index], &*BasePos);
+ }
+ }
+ // Generate valigns if needed, and fill in proper values in ASpan
+ for (int Index = 0; Index != NumSectors; ++Index) {
+ ASpan[Index].Seg.Val = nullptr;
+ if (auto *Where = EarliestUser[&ASpan[Index]]) {
+ Builder.SetInsertPoint(Where);
+ Value *Val = Loads[Index];
+ assert(Val != nullptr);
+ if (DoAlign) {
+ Value *NextLoad = Loads[Index + 1];
+ assert(NextLoad != nullptr);
+ Val = HVC.vralignb(Builder, Val, NextLoad, AlignVal);
+ }
+ ASpan[Index].Seg.Val = Val;
+ }
+ }
+
+ for (const ByteSpan::Block &B : VSpan) {
+ ByteSpan ASection = ASpan.section(B.Pos, B.Seg.Size).shift(-B.Pos);
+ Value *Accum = UndefValue::get(HVC.getByteTy(B.Seg.Size));
+ Builder.SetInsertPoint(cast<Instruction>(B.Seg.Val));
+
+ for (ByteSpan::Block &S : ASection) {
+ if (S.Seg.Val == nullptr)
+ continue;
+ // The processing of the data loaded by the aligned loads
+ // needs to be inserted after the data is available.
+ Instruction *SegI = cast<Instruction>(S.Seg.Val);
+ Builder.SetInsertPoint(&*std::next(SegI->getIterator()));
+ Value *Pay = HVC.vbytes(Builder, getPayload(S.Seg.Val));
+ Accum = HVC.insertb(Builder, Accum, Pay, S.Seg.Start, S.Seg.Size, S.Pos);
+ }
+ // Instead of casting everything to bytes for the vselect, cast to the
+ // original value type. This will avoid complications with casting masks.
+ // For example, in cases when the original mask applied to i32, it could
+ // be converted to a mask applicable to i8 via pred_typecast intrinsic,
+ // but if the mask is not exactly of HVX length, extra handling would be
+ // needed to make it work.
+ Type *ValTy = getPayload(B.Seg.Val)->getType();
+ Value *Cast = Builder.CreateBitCast(Accum, ValTy);
+ Value *Sel = Builder.CreateSelect(getMask(B.Seg.Val), Cast,
+ getPassThrough(B.Seg.Val));
+ B.Seg.Val->replaceAllUsesWith(Sel);
+ }
+}
+
+auto AlignVectors::realignStoreGroup(IRBuilderBase &Builder,
+ const ByteSpan &VSpan, int ScLen,
+ Value *AlignVal, Value *AlignAddr) const
+ -> void {
+ Type *SecTy = HVC.getByteTy(ScLen);
+ int NumSectors = (VSpan.extent() + ScLen - 1) / ScLen;
+ bool DoAlign = !HVC.isZero(AlignVal);
+
+ // Stores.
+ ByteSpan ASpanV, ASpanM;
+
+ // Return a vector value corresponding to the input value Val:
+ // either <1 x Val> for scalar Val, or Val itself for vector Val.
+ auto MakeVec = [](IRBuilderBase &Builder, Value *Val) -> Value * {
+ Type *Ty = Val->getType();
+ if (Ty->isVectorTy())
+ return Val;
+ auto *VecTy = VectorType::get(Ty, 1, /*Scalable=*/false);
+ return Builder.CreateBitCast(Val, VecTy);
+ };
+
+ // Create an extra "undef" sector at the beginning and at the end.
+ // They will be used as the left/right filler in the vlalign step.
+ for (int i = (DoAlign ? -1 : 0); i != NumSectors + DoAlign; ++i) {
+ // For stores, the size of each section is an aligned vector length.
+ // Adjust the store offsets relative to the section start offset.
+ ByteSpan VSection = VSpan.section(i * ScLen, ScLen).shift(-i * ScLen);
+ Value *AccumV = UndefValue::get(SecTy);
+ Value *AccumM = HVC.getNullValue(SecTy);
+ for (ByteSpan::Block &S : VSection) {
+ Value *Pay = getPayload(S.Seg.Val);
+ Value *Mask = HVC.rescale(Builder, MakeVec(Builder, getMask(S.Seg.Val)),
+ Pay->getType(), HVC.getByteTy());
+ AccumM = HVC.insertb(Builder, AccumM, HVC.vbytes(Builder, Mask),
+ S.Seg.Start, S.Seg.Size, S.Pos);
+ AccumV = HVC.insertb(Builder, AccumV, HVC.vbytes(Builder, Pay),
+ S.Seg.Start, S.Seg.Size, S.Pos);
+ }
+ ASpanV.Blocks.emplace_back(AccumV, ScLen, i * ScLen);
+ ASpanM.Blocks.emplace_back(AccumM, ScLen, i * ScLen);
+ }
+
+ // vlalign
+ if (DoAlign) {
+ for (int j = 1; j != NumSectors + 2; ++j) {
+ Value *PrevV = ASpanV[j - 1].Seg.Val, *ThisV = ASpanV[j].Seg.Val;
+ Value *PrevM = ASpanM[j - 1].Seg.Val, *ThisM = ASpanM[j].Seg.Val;
+ assert(isSectorTy(PrevV->getType()) && isSectorTy(PrevM->getType()));
+ ASpanV[j - 1].Seg.Val = HVC.vlalignb(Builder, PrevV, ThisV, AlignVal);
+ ASpanM[j - 1].Seg.Val = HVC.vlalignb(Builder, PrevM, ThisM, AlignVal);
+ }
+ }
+
+ for (int i = 0; i != NumSectors + DoAlign; ++i) {
+ Value *Ptr = createAdjustedPointer(Builder, AlignAddr, SecTy, i * ScLen);
+ Value *Val = ASpanV[i].Seg.Val;
+ Value *Mask = ASpanM[i].Seg.Val; // bytes
+ if (!HVC.isUndef(Val) && !HVC.isZero(Mask)) {
+ Value *Store =
+ createAlignedStore(Builder, Val, Ptr, ScLen, HVC.vlsb(Builder, Mask));
+ // If vector shifting is potentially needed, accumulate metadata
+ // from source sections of twice the store width.
+ int Start = (i - DoAlign) * ScLen;
+ int Width = (1 + DoAlign) * ScLen;
+ propagateMetadata(cast<Instruction>(Store),
+ VSpan.section(Start, Width).values());
+ }
+ }
+}
+
auto AlignVectors::realignGroup(const MoveGroup &Move) const -> bool {
// TODO: Needs support for masked loads/stores of "scalar" vectors.
if (!Move.IsHvx)
@@ -822,9 +1074,18 @@ auto AlignVectors::realignGroup(const MoveGroup &Move) const -> bool {
getMaxOf(MoveInfos, [](const AddrInfo &AI) { return AI.NeedAlign; });
Align MinNeeded = WithMaxNeeded.NeedAlign;
- // Set the builder at the top instruction in the move group.
- Instruction *TopIn = Move.IsLoad ? Move.Main.front() : Move.Main.back();
- IRBuilder<> Builder(TopIn);
+ // Set the builder's insertion point right before the load group, or
+ // immediately after the store group. (Instructions in a store group are
+ // listed in reverse order.)
+ Instruction *InsertAt = Move.Main.front();
+ if (!Move.IsLoad) {
+ // There should be a terminator (which a store isn't, but check anyway).
+ assert(InsertAt->getIterator() != InsertAt->getParent()->end());
+ InsertAt = &*std::next(InsertAt->getIterator());
+ }
+
+ IRBuilder Builder(InsertAt->getParent(), InsertAt->getIterator(),
+ InstSimplifyFolder(HVC.DL));
Value *AlignAddr = nullptr; // Actual aligned address.
Value *AlignVal = nullptr; // Right-shift amount (for valign).
@@ -871,118 +1132,10 @@ auto AlignVectors::realignGroup(const MoveGroup &Move) const -> bool {
assert(!Move.IsHvx || ScLen == 64 || ScLen == 128);
assert(Move.IsHvx || ScLen == 4 || ScLen == 8);
- Type *SecTy = HVC.getByteTy(ScLen);
- int NumSectors = (VSpan.extent() + ScLen - 1) / ScLen;
- bool DoAlign = !HVC.isZero(AlignVal);
-
- if (Move.IsLoad) {
- ByteSpan ASpan;
- auto *True = HVC.getFullValue(HVC.getBoolTy(ScLen));
- auto *Undef = UndefValue::get(SecTy);
-
- for (int i = 0; i != NumSectors + DoAlign; ++i) {
- Value *Ptr = createAdjustedPointer(Builder, AlignAddr, SecTy, i * ScLen);
- // FIXME: generate a predicated load?
- Value *Load = createAlignedLoad(Builder, SecTy, Ptr, ScLen, True, Undef);
- // If vector shifting is potentially needed, accumulate metadata
- // from source sections of twice the load width.
- int Start = (i - DoAlign) * ScLen;
- int Width = (1 + DoAlign) * ScLen;
- propagateMetadata(cast<Instruction>(Load),
- VSpan.section(Start, Width).values());
- ASpan.Blocks.emplace_back(Load, ScLen, i * ScLen);
- }
-
- if (DoAlign) {
- for (int j = 0; j != NumSectors; ++j) {
- assert(isSectorTy(ASpan[j].Seg.Val->getType()));
- ASpan[j].Seg.Val = HVC.vralignb(Builder, ASpan[j].Seg.Val,
- ASpan[j + 1].Seg.Val, AlignVal);
- }
- }
-
- for (ByteSpan::Block &B : VSpan) {
- ByteSpan ASection = ASpan.section(B.Pos, B.Seg.Size).shift(-B.Pos);
- Value *Accum = UndefValue::get(HVC.getByteTy(B.Seg.Size));
- for (ByteSpan::Block &S : ASection) {
- Value *Pay = HVC.vbytes(Builder, getPayload(S.Seg.Val));
- Accum =
- HVC.insertb(Builder, Accum, Pay, S.Seg.Start, S.Seg.Size, S.Pos);
- }
- // Instead of casting everything to bytes for the vselect, cast to the
- // original value type. This will avoid complications with casting masks.
- // For example, in cases when the original mask applied to i32, it could
- // be converted to a mask applicable to i8 via pred_typecast intrinsic,
- // but if the mask is not exactly of HVX length, extra handling would be
- // needed to make it work.
- Type *ValTy = getPayload(B.Seg.Val)->getType();
- Value *Cast = Builder.CreateBitCast(Accum, ValTy);
- Value *Sel = Builder.CreateSelect(getMask(B.Seg.Val), Cast,
- getPassThrough(B.Seg.Val));
- B.Seg.Val->replaceAllUsesWith(Sel);
- }
- } else {
- // Stores.
- ByteSpan ASpanV, ASpanM;
-
- // Return a vector value corresponding to the input value Val:
- // either <1 x Val> for scalar Val, or Val itself for vector Val.
- auto MakeVec = [](IRBuilderBase &Builder, Value *Val) -> Value * {
- Type *Ty = Val->getType();
- if (Ty->isVectorTy())
- return Val;
- auto *VecTy = VectorType::get(Ty, 1, /*Scalable=*/false);
- return Builder.CreateBitCast(Val, VecTy);
- };
-
- // Create an extra "undef" sector at the beginning and at the end.
- // They will be used as the left/right filler in the vlalign step.
- for (int i = (DoAlign ? -1 : 0); i != NumSectors + DoAlign; ++i) {
- // For stores, the size of each section is an aligned vector length.
- // Adjust the store offsets relative to the section start offset.
- ByteSpan VSection = VSpan.section(i * ScLen, ScLen).shift(-i * ScLen);
- Value *AccumV = UndefValue::get(SecTy);
- Value *AccumM = HVC.getNullValue(SecTy);
- for (ByteSpan::Block &S : VSection) {
- Value *Pay = getPayload(S.Seg.Val);
- Value *Mask = HVC.rescale(Builder, MakeVec(Builder, getMask(S.Seg.Val)),
- Pay->getType(), HVC.getByteTy());
- AccumM = HVC.insertb(Builder, AccumM, HVC.vbytes(Builder, Mask),
- S.Seg.Start, S.Seg.Size, S.Pos);
- AccumV = HVC.insertb(Builder, AccumV, HVC.vbytes(Builder, Pay),
- S.Seg.Start, S.Seg.Size, S.Pos);
- }
- ASpanV.Blocks.emplace_back(AccumV, ScLen, i * ScLen);
- ASpanM.Blocks.emplace_back(AccumM, ScLen, i * ScLen);
- }
-
- // vlalign
- if (DoAlign) {
- for (int j = 1; j != NumSectors + 2; ++j) {
- Value *PrevV = ASpanV[j - 1].Seg.Val, *ThisV = ASpanV[j].Seg.Val;
- Value *PrevM = ASpanM[j - 1].Seg.Val, *ThisM = ASpanM[j].Seg.Val;
- assert(isSectorTy(PrevV->getType()) && isSectorTy(PrevM->getType()));
- ASpanV[j - 1].Seg.Val = HVC.vlalignb(Builder, PrevV, ThisV, AlignVal);
- ASpanM[j - 1].Seg.Val = HVC.vlalignb(Builder, PrevM, ThisM, AlignVal);
- }
- }
-
- for (int i = 0; i != NumSectors + DoAlign; ++i) {
- Value *Ptr = createAdjustedPointer(Builder, AlignAddr, SecTy, i * ScLen);
- Value *Val = ASpanV[i].Seg.Val;
- Value *Mask = ASpanM[i].Seg.Val; // bytes
- if (!HVC.isUndef(Val) && !HVC.isZero(Mask)) {
- Value *Store = createAlignedStore(Builder, Val, Ptr, ScLen,
- HVC.vlsb(Builder, Mask));
- // If vector shifting is potentially needed, accumulate metadata
- // from source sections of twice the store width.
- int Start = (i - DoAlign) * ScLen;
- int Width = (1 + DoAlign) * ScLen;
- propagateMetadata(cast<Instruction>(Store),
- VSpan.section(Start, Width).values());
- }
- }
- }
+ if (Move.IsLoad)
+ realignLoadGroup(Builder, VSpan, ScLen, AlignVal, AlignAddr);
+ else
+ realignStoreGroup(Builder, VSpan, ScLen, AlignVal, AlignAddr);
for (auto *Inst : Move.Main)
Inst->eraseFromParent();
@@ -2064,7 +2217,7 @@ auto HexagonVectorCombine::getKnownBits(const Value *V,
template <typename T>
auto HexagonVectorCombine::isSafeToMoveBeforeInBB(const Instruction &In,
BasicBlock::const_iterator To,
- const T &Ignore) const
+ const T &IgnoreInsts) const
-> bool {
auto getLocOrNone = [this](const Instruction &I) -> Optional<MemoryLocation> {
if (const auto *II = dyn_cast<IntrinsicInst>(&I)) {
@@ -2098,7 +2251,7 @@ auto HexagonVectorCombine::isSafeToMoveBeforeInBB(const Instruction &In,
MoveUp ? std::make_pair(To, From) : std::make_pair(std::next(From), To);
for (auto It = Range.first; It != Range.second; ++It) {
const Instruction &I = *It;
- if (llvm::is_contained(Ignore, &I))
+ if (llvm::is_contained(IgnoreInsts, &I))
continue;
// assume intrinsic can be ignored
if (auto *II = dyn_cast<IntrinsicInst>(&I)) {
diff --git a/llvm/test/CodeGen/Hexagon/autohvx/vector-align-interleaved.ll b/llvm/test/CodeGen/Hexagon/autohvx/vector-align-interleaved.ll
new file mode 100644
index 0000000000000..415c717586c76
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/autohvx/vector-align-interleaved.ll
@@ -0,0 +1,78 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -march=hexagon < %s | FileCheck %s
+
+; In this testcase, when loads were moved close to their users, they were
+; actually moved right before the consuming stores. This happened after the
+; store group had been moved, so the loads and stores ended up interleaved.
+; That violated the assumption in store realignment that all loads are
+; available before the first store, causing code that depends on the loads
+; to be inserted before the loads themselves.
+; Just make sure that this compiles ok.
+
+; Function Attrs: nounwind
+define void @f0(i16* noalias nocapture readonly %a0, i16* noalias nocapture %a1, i32 %a2) #0 {
+; CHECK-LABEL: f0:
+; CHECK: // %bb.0: // %b0
+; CHECK-NEXT: {
+; CHECK-NEXT: p0 = cmp.eq(r2,#0)
+; CHECK-NEXT: if (p0.new) jumpr:nt r31
+; CHECK-NEXT: }
+; CHECK-NEXT: .p2align 4
+; CHECK-NEXT: .LBB0_1: // %b2
+; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: {
+; CHECK-NEXT: v0.cur = vmem(r0+#0)
+; CHECK-NEXT: vmem(r1+#0) = v0
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v29.cur = vmem(r0+#1)
+; CHECK-NEXT: vmem(r1+#1) = v29
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: v30.cur = vmem(r0+#2)
+; CHECK-NEXT: vmem(r1+#2) = v30
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: r0 = add(r0,#256)
+; CHECK-NEXT: r1 = add(r1,#256)
+; CHECK-NEXT: v31.cur = vmem(r0+#3)
+; CHECK-NEXT: vmem(r1+#3) = v31
+; CHECK-NEXT: }
+; CHECK-NEXT: {
+; CHECK-NEXT: jump .LBB0_1
+; CHECK-NEXT: }
+b0:
+ %v0 = icmp eq i32 %a2, 0
+ br i1 %v0, label %b3, label %b1
+
+b1: ; preds = %b0
+ %v1 = bitcast i16* %a1 to <16 x i32>*
+ %v2 = bitcast i16* %a0 to <16 x i32>*
+ br label %b2
+
+b2: ; preds = %b2, %b1
+ %v3 = phi <16 x i32>* [ %v16, %b2 ], [ %v1, %b1 ]
+ %v4 = phi <16 x i32>* [ %v11, %b2 ], [ %v2, %b1 ]
+ %v5 = getelementptr inbounds <16 x i32>, <16 x i32>* %v4, i32 1
+ %v6 = load <16 x i32>, <16 x i32>* %v4, align 64
+ %v7 = getelementptr inbounds <16 x i32>, <16 x i32>* %v4, i32 2
+ %v8 = load <16 x i32>, <16 x i32>* %v5, align 64
+ %v9 = getelementptr inbounds <16 x i32>, <16 x i32>* %v4, i32 3
+ %v10 = load <16 x i32>, <16 x i32>* %v7, align 64
+ %v11 = getelementptr inbounds <16 x i32>, <16 x i32>* %v4, i32 4
+ %v12 = load <16 x i32>, <16 x i32>* %v9, align 64
+ %v13 = getelementptr inbounds <16 x i32>, <16 x i32>* %v3, i32 1
+ store <16 x i32> %v6, <16 x i32>* %v3, align 64
+ %v14 = getelementptr inbounds <16 x i32>, <16 x i32>* %v3, i32 2
+ store <16 x i32> %v8, <16 x i32>* %v13, align 64
+ %v15 = getelementptr inbounds <16 x i32>, <16 x i32>* %v3, i32 3
+ store <16 x i32> %v10, <16 x i32>* %v14, align 64
+ %v16 = getelementptr inbounds <16 x i32>, <16 x i32>* %v3, i32 4
+ store <16 x i32> %v12, <16 x i32>* %v15, align 64
+ br label %b2
+
+b3: ; preds = %b0
+ ret void
+}
+
+attributes #0 = { nounwind "target-features"="+hvxv65,+hvx-length64b" }
diff --git a/llvm/test/CodeGen/Hexagon/autohvx/vector-align-only-phi-use.ll b/llvm/test/CodeGen/Hexagon/autohvx/vector-align-only-phi-use.ll
new file mode 100644
index 0000000000000..dc83fc9be2df2
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/autohvx/vector-align-only-phi-use.ll
@@ -0,0 +1,71 @@
+; RUN: llc -march=hexagon < %s | FileCheck %s
+
+; One of the loads is only used in a PHI instruction. Make sure the PHI use
+; still counts as a user of the load (and that the load is not removed).
+
+; CHECK-LABEL: f0:
+; CHECK: = vmem({{.*}})
+; CHECK: = vmem({{.*}})
+
+target datalayout = "e-m:e-p:32:32:32-a:0-n16:32-i64:64:64-i32:32:32-i16:16:16-i1:8:8-f32:32:32-f64:64:64-v32:32:32-v64:64:64-v512:512:512-v1024:1024:1024-v2048:2048:2048"
+target triple = "hexagon"
+
+declare <16 x i32> @llvm.hexagon.V6.vmux(<64 x i1>, <16 x i32>, <16 x i32>) #0
+declare <16 x i32> @llvm.hexagon.V6.vmpyiewuh.acc(<16 x i32>, <16 x i32>, <16 x i32>) #0
+declare <16 x i32> @llvm.hexagon.V6.vand(<16 x i32>, <16 x i32>) #0
+declare <64 x i1> @llvm.hexagon.V6.vgtuw.and(<64 x i1>, <16 x i32>, <16 x i32>) #0
+declare <64 x i1> @llvm.hexagon.V6.pred.or(<64 x i1>, <64 x i1>) #0
+
+define <16 x i32> @f0(i8* %a0, i32 %a1) local_unnamed_addr #1 {
+b0:
+ %v0 = getelementptr inbounds i8, i8* %a0, i32 576
+ %v1 = bitcast i8* %a0 to <16 x i32>*
+ %v2 = bitcast i8* %v0 to <16 x i32>*
+ br label %b1
+
+b1: ; preds = %b4, %b0
+ %v3 = phi i32 [ 0, %b0 ], [ %v23, %b4 ]
+ %v4 = phi <16 x i32> [ poison, %b0 ], [ %v22, %b4 ]
+ br i1 poison, label %b2, label %b3
+
+b2: ; preds = %b1
+ %v5 = getelementptr inbounds <16 x i32>, <16 x i32>* %v1, i32 %v3
+ %v6 = load <16 x i32>, <16 x i32>* %v5, align 64
+ %v7 = getelementptr inbounds <16 x i32>, <16 x i32>* %v2, i32 %v3
+ %v8 = load <16 x i32>, <16 x i32>* %v7, align 64
+ %v9 = tail call <16 x i32> @llvm.hexagon.V6.vmpyiewuh.acc(<16 x i32> poison, <16 x i32> %v6, <16 x i32> %v6)
+ br label %b4
+
+b3: ; preds = %b1
+ br label %b4
+
+b4: ; preds = %b3, %b2
+ %v10 = phi <16 x i32> [ %v9, %b2 ], [ poison, %b3 ]
+ %v11 = phi <16 x i32> [ %v8, %b2 ], [ poison, %b3 ]
+ %v12 = tail call <16 x i32> @llvm.hexagon.V6.vmux(<64 x i1> poison, <16 x i32> %v10, <16 x i32> %v4)
+ %v13 = tail call <16 x i32> @llvm.hexagon.V6.vmux(<64 x i1> poison, <16 x i32> %v11, <16 x i32> poison)
+ %v14 = or i32 %v3, 1
+ %v15 = getelementptr inbounds <16 x i32>, <16 x i32>* %v2, i32 %v14
+ %v16 = load <16 x i32>, <16 x i32>* %v15, align 64
+ %v17 = tail call <16 x i32> @llvm.hexagon.V6.vmpyiewuh.acc(<16 x i32> poison, <16 x i32> %v13, <16 x i32> poison)
+ %v18 = tail call <16 x i32> @llvm.hexagon.V6.vand(<16 x i32> %v12, <16 x i32> poison)
+ %v19 = tail call <64 x i1> @llvm.hexagon.V6.vgtuw.and(<64 x i1> poison, <16 x i32> %v17, <16 x i32> poison)
+ %v20 = tail call <64 x i1> @llvm.hexagon.V6.pred.or(<64 x i1> %v19, <64 x i1> poison)
+ %v21 = tail call <64 x i1> @llvm.hexagon.V6.pred.or(<64 x i1> %v20, <64 x i1> poison)
+ %v22 = tail call <16 x i32> @llvm.hexagon.V6.vmux(<64 x i1> %v21, <16 x i32> poison, <16 x i32> %v12)
+ %v23 = add nuw nsw i32 %v3, 2
+ %v24 = icmp slt i32 %v23, %a1
+ br i1 %v24, label %b1, label %b5, !llvm.loop !1
+
+b5: ; preds = %b4
+ ret <16 x i32> %v22
+}
+
+attributes #0 = { nocallback nofree nosync nounwind willreturn memory(none) }
+attributes #1 = { "target-features"="+hvx-length64b,+hvxv65,+v65,-long-calls" }
+
+!llvm.linker.options = !{!0}
+
+!0 = !{!"$.str.3", !".rodata.str1.1"}
+!1 = distinct !{!1, !2}
+!2 = !{!"llvm.loop.mustprogress"}
diff --git a/llvm/test/CodeGen/Hexagon/autohvx/vector-align-tbaa.ll b/llvm/test/CodeGen/Hexagon/autohvx/vector-align-tbaa.ll
index 9d8074177a1d3..96bf53e0610cd 100644
--- a/llvm/test/CodeGen/Hexagon/autohvx/vector-align-tbaa.ll
+++ b/llvm/test/CodeGen/Hexagon/autohvx/vector-align-tbaa.ll
@@ -28,11 +28,11 @@ define <64 x i16> @f0(i16* %a0, i32 %a1) #0 {
; CHECK-NEXT: [[TMP11:%.*]] = load <32 x i32>, <32 x i32>* [[TMP10]], align 128, !tbaa [[TBAA0]]
; CHECK-NEXT: [[TMP12:%.*]] = bitcast <128 x i8> [[TMP8]] to <32 x i32>
; CHECK-NEXT: [[TMP13:%.*]] = call <32 x i32> @llvm.hexagon.V6.valignb.128B(<32 x i32> [[TMP12]], <32 x i32> [[TMP5]], i32 [[TMP3]])
-; CHECK-NEXT: [[TMP14:%.*]] = bitcast <128 x i8> [[TMP8]] to <32 x i32>
-; CHECK-NEXT: [[TMP15:%.*]] = call <32 x i32> @llvm.hexagon.V6.valignb.128B(<32 x i32> [[TMP11]], <32 x i32> [[TMP14]], i32 [[TMP3]])
-; CHECK-NEXT: [[TMP16:%.*]] = bitcast <32 x i32> [[TMP13]] to <64 x i16>
-; CHECK-NEXT: [[TMP17:%.*]] = bitcast <32 x i32> [[TMP15]] to <64 x i16>
-; CHECK-NEXT: [[V8:%.*]] = add <64 x i16> [[TMP16]], [[TMP17]]
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast <32 x i32> [[TMP13]] to <64 x i16>
+; CHECK-NEXT: [[TMP15:%.*]] = bitcast <128 x i8> [[TMP8]] to <32 x i32>
+; CHECK-NEXT: [[TMP16:%.*]] = call <32 x i32> @llvm.hexagon.V6.valignb.128B(<32 x i32> [[TMP11]], <32 x i32> [[TMP15]], i32 [[TMP3]])
+; CHECK-NEXT: [[TMP17:%.*]] = bitcast <32 x i32> [[TMP16]] to <64 x i16>
+; CHECK-NEXT: [[V8:%.*]] = add <64 x i16> [[TMP14]], [[TMP17]]
; CHECK-NEXT: ret <64 x i16> [[V8]]
;
b0:
@@ -69,11 +69,11 @@ define <64 x i16> @f1(i16* %a0, i32 %a1) #0 {
; CHECK-NEXT: [[TMP11:%.*]] = load <32 x i32>, <32 x i32>* [[TMP10]], align 128
; CHECK-NEXT: [[TMP12:%.*]] = bitcast <128 x i8> [[TMP8]] to <32 x i32>
; CHECK-NEXT: [[TMP13:%.*]] = call <32 x i32> @llvm.hexagon.V6.valignb.128B(<32 x i32> [[TMP12]], <32 x i32> [[TMP5]], i32 [[TMP3]])
-; CHECK-NEXT: [[TMP14:%.*]] = bitcast <128 x i8> [[TMP8]] to <32 x i32>
-; CHECK-NEXT: [[TMP15:%.*]] = call <32 x i32> @llvm.hexagon.V6.valignb.128B(<32 x i32> [[TMP11]], <32 x i32> [[TMP14]], i32 [[TMP3]])
-; CHECK-NEXT: [[TMP16:%.*]] = bitcast <32 x i32> [[TMP13]] to <64 x i16>
-; CHECK-NEXT: [[TMP17:%.*]] = bitcast <32 x i32> [[TMP15]] to <64 x i16>
-; CHECK-NEXT: [[V8:%.*]] = add <64 x i16> [[TMP16]], [[TMP17]]
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast <32 x i32> [[TMP13]] to <64 x i16>
+; CHECK-NEXT: [[TMP15:%.*]] = bitcast <128 x i8> [[TMP8]] to <32 x i32>
+; CHECK-NEXT: [[TMP16:%.*]] = call <32 x i32> @llvm.hexagon.V6.valignb.128B(<32 x i32> [[TMP11]], <32 x i32> [[TMP15]], i32 [[TMP3]])
+; CHECK-NEXT: [[TMP17:%.*]] = bitcast <32 x i32> [[TMP16]] to <64 x i16>
+; CHECK-NEXT: [[V8:%.*]] = add <64 x i16> [[TMP14]], [[TMP17]]
; CHECK-NEXT: ret <64 x i16> [[V8]]
;
b0:
@@ -110,11 +110,11 @@ define <64 x i16> @f2(i16* %a0, i32 %a1) #0 {
; CHECK-NEXT: [[TMP11:%.*]] = load <32 x i32>, <32 x i32>* [[TMP10]], align 128, !tbaa [[TBAA3:![0-9]+]]
; CHECK-NEXT: [[TMP12:%.*]] = bitcast <128 x i8> [[TMP8]] to <32 x i32>
; CHECK-NEXT: [[TMP13:%.*]] = call <32 x i32> @llvm.hexagon.V6.valignb.128B(<32 x i32> [[TMP12]], <32 x i32> [[TMP5]], i32 [[TMP3]])
-; CHECK-NEXT: [[TMP14:%.*]] = bitcast <128 x i8> [[TMP8]] to <32 x i32>
-; CHECK-NEXT: [[TMP15:%.*]] = call <32 x i32> @llvm.hexagon.V6.valignb.128B(<32 x i32> [[TMP11]], <32 x i32> [[TMP14]], i32 [[TMP3]])
-; CHECK-NEXT: [[TMP16:%.*]] = bitcast <32 x i32> [[TMP13]] to <64 x i16>
-; CHECK-NEXT: [[TMP17:%.*]] = bitcast <32 x i32> [[TMP15]] to <64 x i16>
-; CHECK-NEXT: [[V8:%.*]] = add <64 x i16> [[TMP16]], [[TMP17]]
+; CHECK-NEXT: [[TMP14:%.*]] = bitcast <32 x i32> [[TMP13]] to <64 x i16>
+; CHECK-NEXT: [[TMP15:%.*]] = bitcast <128 x i8> [[TMP8]] to <32 x i32>
+; CHECK-NEXT: [[TMP16:%.*]] = call <32 x i32> @llvm.hexagon.V6.valignb.128B(<32 x i32> [[TMP11]], <32 x i32> [[TMP15]], i32 [[TMP3]])
+; CHECK-NEXT: [[TMP17:%.*]] = bitcast <32 x i32> [[TMP16]] to <64 x i16>
+; CHECK-NEXT: [[V8:%.*]] = add <64 x i16> [[TMP14]], [[TMP17]]
; CHECK-NEXT: ret <64 x i16> [[V8]]
;
b0:
diff --git a/llvm/test/CodeGen/Hexagon/autohvx/vector-align-use-in-different-block.ll b/llvm/test/CodeGen/Hexagon/autohvx/vector-align-use-in-different-block.ll
new file mode 100644
index 0000000000000..74405c0f141d4
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/autohvx/vector-align-use-in-different-block.ll
@@ -0,0 +1,57 @@
+; RUN: llc -march=hexagon < %s | FileCheck %s
+
+; This used to crash because of calling isSafeToMoveBeforeInBB with source
+; and target in different blocks.
+; Check that this compiles successfully, and that two loads are created
+; (for users in a different block).
+
+target datalayout = "e-m:e-p:32:32:32-a:0-n16:32-i64:64:64-i32:32:32-i16:16:16-i1:8:8-f32:32:32-f64:64:64-v32:32:32-v64:64:64-v512:512:512-v1024:1024:1024-v2048:2048:2048"
+target triple = "hexagon"
+
+define dso_local <32 x i32> @f0(i32 %a0, i32 %a1) local_unnamed_addr #0 {
+; CHECK-LABEL: f0:
+; CHECK: = vmem({{.*}})
+; CHECK: = vmem({{.*}})
+b0:
+ br label %b1
+
+b1: ; preds = %b0
+ %v0 = mul nsw i32 -4, %a0
+ %v1 = getelementptr inbounds i8, i8* null, i32 %v0
+ %v2 = getelementptr inbounds i8, i8* %v1, i32 -64
+ %v3 = bitcast i8* %v2 to <16 x i32>*
+ %v4 = load <16 x i32>, <16 x i32>* %v3, align 64
+ %v5 = getelementptr inbounds i8, i8* %v1, i32 64
+ %v6 = bitcast i8* %v5 to <16 x i32>*
+ %v7 = load <16 x i32>, <16 x i32>* %v6, align 64
+ br label %b2
+
+b2: ; preds = %b2, %b1
+ %v8 = phi <32 x i32> [ poison, %b1 ], [ %v17, %b2 ]
+ %v9 = phi i32 [ %a1, %b1 ], [ %v18, %b2 ]
+ %v10 = tail call <16 x i32> @llvm.hexagon.V6.vlalignb(<16 x i32> poison, <16 x i32> %v4, i32 poison)
+ %v11 = tail call <16 x i32> @llvm.hexagon.V6.valignb(<16 x i32> %v7, <16 x i32> poison, i32 poison)
+ %v12 = tail call <32 x i32> @llvm.hexagon.V6.vmpyubv(<16 x i32> %v10, <16 x i32> poison)
+ %v13 = tail call <32 x i32> @llvm.hexagon.V6.vmpyubv(<16 x i32> %v11, <16 x i32> poison)
+ %v14 = tail call <16 x i32> @llvm.hexagon.V6.hi(<32 x i32> %v12)
+ %v15 = tail call <16 x i32> @llvm.hexagon.V6.hi(<32 x i32> %v13)
+ %v16 = tail call <32 x i32> @llvm.hexagon.V6.vadduhw(<16 x i32> %v14, <16 x i32> %v15)
+ %v17 = tail call <32 x i32> @llvm.hexagon.V6.vaddw.dv(<32 x i32> %v8, <32 x i32> %v16)
+ %v18 = add nsw i32 %v9, -1
+ %v19 = icmp ugt i32 %v18, 1
+ br i1 %v19, label %b2, label %b3
+
+b3: ; preds = %b2
+ %v20 = tail call <32 x i32> @llvm.hexagon.V6.vaddw.dv(<32 x i32> %v17, <32 x i32> %v16)
+ ret <32 x i32> %v20
+}
+
+declare <16 x i32> @llvm.hexagon.V6.hi(<32 x i32>) #1
+declare <16 x i32> @llvm.hexagon.V6.vlalignb(<16 x i32>, <16 x i32>, i32) #1
+declare <16 x i32> @llvm.hexagon.V6.valignb(<16 x i32>, <16 x i32>, i32) #1
+declare <32 x i32> @llvm.hexagon.V6.vmpyubv(<16 x i32>, <16 x i32>) #1
+declare <32 x i32> @llvm.hexagon.V6.vaddw.dv(<32 x i32>, <32 x i32>) #1
+declare <32 x i32> @llvm.hexagon.V6.vadduhw(<16 x i32>, <16 x i32>) #1
+
+attributes #0 = { "target-features"="+hvx-length64b,+hvxv66,+v66,-long-calls" }
+attributes #1 = { nounwind memory(none) }