[llvm-branch-commits] [llvm] cfc6073 - [GlobalISel] Combine (a[0]) | (a[1] << k1) | ...| (a[m] << kn) into a wide load
Jessica Paquette via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Tue Jan 19 10:28:58 PST 2021
Author: Jessica Paquette
Date: 2021-01-19T10:24:27-08:00
New Revision: cfc60730179042a93cb9cb338982e71d20707a24
URL: https://github.com/llvm/llvm-project/commit/cfc60730179042a93cb9cb338982e71d20707a24
DIFF: https://github.com/llvm/llvm-project/commit/cfc60730179042a93cb9cb338982e71d20707a24.diff
LOG: [GlobalISel] Combine (a[0]) | (a[1] << k1) | ...| (a[m] << kn) into a wide load
This is a restricted version of the combine in `DAGCombiner::MatchLoadCombine`.
(See D27861)
This tries to recognize patterns like below (assuming a little-endian target):
```
s8* x = ...
s32 val = a[0] | (a[1] << 8) | (a[2] << 16) | (a[3] << 24)
->
s32 val = *((s32)a)
s8* x = ...
s32 val = a[3] | (a[2] << 8) | (a[1] << 16) | (a[0] << 24)
->
s32 val = BSWAP(*((s32)a))
```
(This patch handles big-endian targets as well; in that case, the first example
above needs a BSWAP and the second one does not.)
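To make the endianness decision concrete: the combine records, for each narrow
load, which byte of the wide value it ends up occupying, and compares that map
against the canonical little- and big-endian layouts; a BSWAP is needed exactly
when the detected layout disagrees with the target's endianness. Below is a
minimal standalone C++ sketch of that check (it mirrors the
littleEndianByteAt/bigEndianByteAt/isBigEndian helpers added in this patch, but
the std::map-based interface here is illustrative only):
```
// Sketch only: decide whether a {byte position -> load index} map is a
// little-endian layout (return false), a big-endian layout (return true),
// or neither (return nullopt). Load indices are assumed to be rebased so
// the lowest index is 0.
#include <cstdint>
#include <map>
#include <optional>

static unsigned littleEndianByteAt(unsigned /*ByteWidth*/, unsigned I) { return I; }
static unsigned bigEndianByteAt(unsigned ByteWidth, unsigned I) {
  return ByteWidth - I - 1;
}

std::optional<bool>
isBigEndianPattern(const std::map<int64_t, int64_t> &MemOffset2Idx) {
  const unsigned Width = static_cast<unsigned>(MemOffset2Idx.size());
  if (Width < 2)
    return std::nullopt; // Need at least two positions to decide.
  bool Big = true, Little = true;
  for (unsigned Off = 0; Off != Width; ++Off) {
    auto It = MemOffset2Idx.find(Off);
    if (It == MemOffset2Idx.end())
      return std::nullopt; // A byte of the wide value is not covered.
    Little &= It->second == littleEndianByteAt(Width, Off);
    Big &= It->second == bigEndianByteAt(Width, Off);
    if (!Big && !Little)
      return std::nullopt; // Matches neither layout.
  }
  return Big;
}
```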
To recognize the pattern, this searches from the last G_OR in the expression
tree.
E.g.
```
Reg Reg
\ /
OR_1 Reg
\ /
OR_2
\ Reg
.. /
Root
```
Each non-OR register in the tree is put in a list. Each register in the list is
then checked to see if it is defined by an appropriate load plus shift.
If every register is a load plus (potentially) a shift, the combine checks
whether those loads and shifts, when OR'd together, are equivalent to a wide
load (possibly with a BSWAP).
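In outline, the match therefore has two phases: flatten the OR tree into its
non-OR leaves, then prove that each leaf is a narrow zero-extending load that
is shifted left by a multiple of the narrow load size. A simplified,
self-contained sketch of that shape (a toy expression type stands in for
MachineInstr; none of these names are the patch's actual API, and the real code
additionally enforces one-use, same-block, and same-base-pointer constraints):
```
#include <cstdint>
#include <optional>
#include <utility>
#include <vector>

// Toy expression node: either an OR of two subtrees, a left shift of a
// subtree by a constant, or a zero-extending load of element a[LoadIdx].
struct Expr {
  enum Kind { Or, Shl, ZExtLoad } K;
  const Expr *LHS = nullptr, *RHS = nullptr; // Or uses both, Shl uses LHS.
  uint64_t ShiftAmt = 0;                     // Shl only.
  int64_t LoadIdx = 0;                       // ZExtLoad only.
};

// Phase 1: collect every non-OR leaf of the OR tree.
static void collectLeaves(const Expr *E, std::vector<const Expr *> &Leaves) {
  if (E->K == Expr::Or) {
    collectLeaves(E->LHS, Leaves);
    collectLeaves(E->RHS, Leaves);
    return;
  }
  Leaves.push_back(E);
}

// Phase 2: a leaf matches if it is a zextload, optionally shifted left by a
// multiple of the narrow load size. Returns {load index, destination slot}.
static std::optional<std::pair<int64_t, uint64_t>>
matchLoadAndSlot(const Expr *Leaf, uint64_t MemSizeInBits) {
  uint64_t Shift = 0;
  if (Leaf->K == Expr::Shl) {
    Shift = Leaf->ShiftAmt;
    Leaf = Leaf->LHS;
  }
  if (Shift % MemSizeInBits != 0 || Leaf->K != Expr::ZExtLoad)
    return std::nullopt;
  return std::make_pair(Leaf->LoadIdx, Shift / MemSizeInBits);
}
```
The destination slots collected in phase 2 feed the endianness check sketched
above.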
To simplify things, this patch
(1) Only handles G_ZEXTLOADs (which appear to be the common case)
(2) Only works in a single MachineBasicBlock
(3) Only handles G_SHL as the bit twiddling to stick the small load into a
specific location
An IR example of this is here: https://godbolt.org/z/4sP9Pj (lifted from
test/CodeGen/AArch64/load-combine.ll)
At -Os on AArch64, this is a 0.5% code size improvement for CTMark/sqlite3,
and a 0.4% improvement for CTMark/7zip-benchmark.
Also fix a bug in `isPredecessor` which caused it to fail whenever `DefMI` was
the first instruction in the block.
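For the `isPredecessor` fix, the corrected logic simply walks the block's
non-debug instructions once and reports whichever of the two instructions is
reached first. A generic sketch of that ordering check over a plain container
(illustrative only; the real code uses the MachineBasicBlock iterator API):
```
#include <algorithm>
#include <cassert>
#include <vector>

// Sketch: does Def appear before Use in the same block? Walk the block once
// and see which of the two is reached first. Works even when Def is the very
// first instruction, which the old loop mishandled.
template <typename Inst>
bool isPredecessor(const std::vector<Inst *> &Block, const Inst *Def,
                   const Inst *Use) {
  assert(Def != Use && "Expected distinct instructions");
  auto It = std::find_if(Block.begin(), Block.end(),
                         [&](const Inst *I) { return I == Def || I == Use; });
  assert(It != Block.end() && "Block must contain both instructions");
  return *It == Def;
}
```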
Differential Revision: https://reviews.llvm.org/D94350
Added:
llvm/test/CodeGen/AArch64/GlobalISel/prelegalizer-combiner-load-or-pattern-align.mir
llvm/test/CodeGen/AArch64/GlobalISel/prelegalizer-combiner-load-or-pattern.mir
Modified:
llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
llvm/include/llvm/CodeGen/TargetLowering.h
llvm/include/llvm/Target/GlobalISel/Combine.td
llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
llvm/lib/CodeGen/TargetLoweringBase.cpp
Removed:
################################################################################
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
index 0d240e90820f..8570f5ca5dd5 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
@@ -18,6 +18,7 @@
#define LLVM_CODEGEN_GLOBALISEL_COMBINER_HELPER_H
#include "llvm/ADT/APFloat.h"
+#include "llvm/ADT/DenseMap.h"
#include "llvm/CodeGen/LowLevelType.h"
#include "llvm/CodeGen/Register.h"
#include "llvm/Support/Alignment.h"
@@ -471,6 +472,20 @@ class CombinerHelper {
bool applyCombineInsertVecElts(MachineInstr &MI,
SmallVectorImpl<Register> &MatchInfo);
+ /// Match expression trees of the form
+ ///
+ /// \code
+ /// sN *a = ...
+ /// sM val = a[0] | (a[1] << N) | (a[2] << 2N) | (a[3] << 3N) ...
+ /// \endcode
+ ///
+  /// And check if the tree can be replaced with an M-bit load + possibly a
+ /// bswap.
+ bool matchLoadOrCombine(MachineInstr &MI,
+ std::function<void(MachineIRBuilder &)> &MatchInfo);
+ bool applyLoadOrCombine(MachineInstr &MI,
+ std::function<void(MachineIRBuilder &)> &MatchInfo);
+
/// Try to transform \p MI by using all of the above
/// combine functions. Returns true if changed.
bool tryCombine(MachineInstr &MI);
@@ -499,6 +514,30 @@ class CombinerHelper {
/// \returns true if a candidate is found.
bool findPreIndexCandidate(MachineInstr &MI, Register &Addr, Register &Base,
Register &Offset);
+
+ /// Helper function for matchLoadOrCombine. Searches for Registers
+ /// which may have been produced by a load instruction + some arithmetic.
+ ///
+ /// \param [in] Root - The search root.
+ ///
+ /// \returns The Registers found during the search.
+ Optional<SmallVector<Register, 8>>
+ findCandidatesForLoadOrCombine(const MachineInstr *Root) const;
+
+ /// Helper function for matchLoadOrCombine.
+ ///
+ /// Checks if every register in \p RegsToVisit is defined by a load
+ /// instruction + some arithmetic.
+ ///
+ /// \param [out] MemOffset2Idx - Maps the byte positions each load ends up
+ /// at to the index of the load.
+ /// \param [in] MemSizeInBits - The number of bits each load should produce.
+ ///
+ /// \returns The lowest-index load found and the lowest index on success.
+ Optional<std::pair<MachineInstr *, int64_t>> findLoadOffsetsForLoadOrCombine(
+ SmallDenseMap<int64_t, int64_t, 8> &MemOffset2Idx,
+ const SmallVector<Register, 8> &RegsToVisit,
+ const unsigned MemSizeInBits);
};
} // namespace llvm
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 305107c48750..5a237074a5a3 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -1658,6 +1658,11 @@ class TargetLoweringBase {
const MachineMemOperand &MMO,
bool *Fast = nullptr) const;
+ /// LLT handling variant.
+ bool allowsMemoryAccess(LLVMContext &Context, const DataLayout &DL, LLT Ty,
+ const MachineMemOperand &MMO,
+ bool *Fast = nullptr) const;
+
/// Returns the target specific optimal type for load and store operations as
/// a result of memset, memcpy, and memmove lowering.
/// It returns EVT::Other if the type should be determined using generic
diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td
index e352e499d47c..e2c7a90a1b16 100644
--- a/llvm/include/llvm/Target/GlobalISel/Combine.td
+++ b/llvm/include/llvm/Target/GlobalISel/Combine.td
@@ -545,6 +545,14 @@ def combine_insert_vec_elts_build_vector : GICombineRule<
[{ return Helper.matchCombineInsertVecElts(*${root}, ${info}); }]),
(apply [{ return Helper.applyCombineInsertVecElts(*${root}, ${info}); }])>;
+def load_or_combine_matchdata :
+GIDefMatchData<"std::function<void(MachineIRBuilder &)>">;
+def load_or_combine : GICombineRule<
+ (defs root:$root, load_or_combine_matchdata:$info),
+ (match (wip_match_opcode G_OR):$root,
+ [{ return Helper.matchLoadOrCombine(*${root}, ${info}); }]),
+ (apply [{ return Helper.applyLoadOrCombine(*${root}, ${info}); }])>;
+
// Currently only the one combine above.
def insert_vec_elt_combines : GICombineGroup<
[combine_insert_vec_elts_build_vector]>;
@@ -587,4 +595,4 @@ def all_combines : GICombineGroup<[trivial_combines, insert_vec_elt_combines,
unmerge_merge, fabs_fabs_fold, unmerge_cst, unmerge_dead_to_trunc,
unmerge_zext_to_zext, trunc_ext_fold, trunc_shl,
const_combines, xor_of_and_with_same_reg, ptr_add_with_zero,
- shift_immed_chain, shift_of_shifted_logic_chain]>;
+ shift_immed_chain, shift_of_shifted_logic_chain, load_or_combine]>;
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
index bbcf32a73fe0..c142c7a70c95 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -48,6 +48,66 @@ const TargetLowering &CombinerHelper::getTargetLowering() const {
return *Builder.getMF().getSubtarget().getTargetLowering();
}
+/// \returns The little endian in-memory byte position of byte \p I in a
+/// \p ByteWidth bytes wide type.
+///
+/// E.g. Given a 4-byte type x, x[0] -> byte 0
+static unsigned littleEndianByteAt(const unsigned ByteWidth, const unsigned I) {
+ assert(I < ByteWidth && "I must be in [0, ByteWidth)");
+ return I;
+}
+
+/// \returns The big endian in-memory byte position of byte \p I in a
+/// \p ByteWidth bytes wide type.
+///
+/// E.g. Given a 4-byte type x, x[0] -> byte 3
+static unsigned bigEndianByteAt(const unsigned ByteWidth, const unsigned I) {
+ assert(I < ByteWidth && "I must be in [0, ByteWidth)");
+ return ByteWidth - I - 1;
+}
+
+/// Given a map from byte offsets in memory to indices in a load/store,
+/// determine if that map corresponds to a little or big endian byte pattern.
+///
+/// \param MemOffset2Idx maps memory offsets to address offsets.
+/// \param LowestIdx is the lowest index in \p MemOffset2Idx.
+///
+/// \returns true if the map corresponds to a big endian byte pattern, false
+/// if it corresponds to a little endian byte pattern, and None otherwise.
+///
+/// E.g. given a 32-bit type x, and x[AddrOffset], the in-memory byte patterns
+/// are as follows:
+///
+/// AddrOffset Little endian Big endian
+/// 0 0 3
+/// 1 1 2
+/// 2 2 1
+/// 3 3 0
+static Optional<bool>
+isBigEndian(const SmallDenseMap<int64_t, int64_t, 8> &MemOffset2Idx,
+ int64_t LowestIdx) {
+ // Need at least two byte positions to decide on endianness.
+ unsigned Width = MemOffset2Idx.size();
+ if (Width < 2)
+ return None;
+ bool BigEndian = true, LittleEndian = true;
+ for (unsigned MemOffset = 0; MemOffset < Width; ++ MemOffset) {
+ auto MemOffsetAndIdx = MemOffset2Idx.find(MemOffset);
+ if (MemOffsetAndIdx == MemOffset2Idx.end())
+ return None;
+ const int64_t Idx = MemOffsetAndIdx->second - LowestIdx;
+ assert(Idx >= 0 && "Expected non-negative byte offset?");
+ LittleEndian &= Idx == littleEndianByteAt(Width, MemOffset);
+ BigEndian &= Idx == bigEndianByteAt(Width, MemOffset);
+ if (!BigEndian && !LittleEndian)
+ return None;
+ }
+
+ assert((BigEndian != LittleEndian) &&
+ "Pattern cannot be both big and little endian!");
+ return BigEndian;
+}
+
bool CombinerHelper::isLegalOrBeforeLegalizer(
const LegalityQuery &Query) const {
return !LI || LI->getAction(Query).Action == LegalizeActions::Legal;
@@ -564,13 +624,16 @@ bool CombinerHelper::isPredecessor(const MachineInstr &DefMI,
assert(DefMI.getParent() == UseMI.getParent());
if (&DefMI == &UseMI)
return false;
-
- // Loop through the basic block until we find one of the instructions.
- MachineBasicBlock::const_iterator I = DefMI.getParent()->begin();
- for (; &*I != &DefMI && &*I != &UseMI; ++I)
- return &*I == &DefMI;
-
- llvm_unreachable("Block must contain instructions");
+ const MachineBasicBlock &MBB = *DefMI.getParent();
+ auto NonDbgInsts =
+ instructionsWithoutDebug(MBB.instr_begin(), MBB.instr_end());
+ auto DefOrUse =
+ find_if(NonDbgInsts, [&DefMI, &UseMI](const MachineInstr &MI) {
+ return &MI == &DefMI || &MI == &UseMI;
+ });
+ if (DefOrUse == NonDbgInsts.end())
+ llvm_unreachable("Block must contain both DefMI and UseMI!");
+ return &*DefOrUse == &DefMI;
}
bool CombinerHelper::dominates(const MachineInstr &DefMI,
@@ -3152,6 +3215,361 @@ bool CombinerHelper::applySimplifyURemByPow2(MachineInstr &MI) {
return true;
}
+Optional<SmallVector<Register, 8>>
+CombinerHelper::findCandidatesForLoadOrCombine(const MachineInstr *Root) const {
+ assert(Root->getOpcode() == TargetOpcode::G_OR && "Expected G_OR only!");
+ // We want to detect if Root is part of a tree which represents a bunch
+ // of loads being merged into a larger load. We'll try to recognize patterns
+ // like, for example:
+ //
+ // Reg Reg
+ // \ /
+ // OR_1 Reg
+ // \ /
+ // OR_2
+ // \ Reg
+ // .. /
+ // Root
+ //
+ // Reg Reg Reg Reg
+ // \ / \ /
+ // OR_1 OR_2
+ // \ /
+ // \ /
+ // ...
+ // Root
+ //
+ // Each "Reg" may have been produced by a load + some arithmetic. This
+ // function will save each of them.
+ SmallVector<Register, 8> RegsToVisit;
+ SmallVector<const MachineInstr *, 7> Ors = {Root};
+
+ // In the "worst" case, we're dealing with a load for each byte. So, there
+ // are at most #bytes - 1 ORs.
+ const unsigned MaxIter =
+ MRI.getType(Root->getOperand(0).getReg()).getSizeInBytes() - 1;
+ for (unsigned Iter = 0; Iter < MaxIter; ++Iter) {
+ if (Ors.empty())
+ break;
+ const MachineInstr *Curr = Ors.pop_back_val();
+ Register OrLHS = Curr->getOperand(1).getReg();
+ Register OrRHS = Curr->getOperand(2).getReg();
+
+    // In the combine, we want to eliminate the entire tree.
+ if (!MRI.hasOneNonDBGUse(OrLHS) || !MRI.hasOneNonDBGUse(OrRHS))
+ return None;
+
+ // If it's a G_OR, save it and continue to walk. If it's not, then it's
+ // something that may be a load + arithmetic.
+ if (const MachineInstr *Or = getOpcodeDef(TargetOpcode::G_OR, OrLHS, MRI))
+ Ors.push_back(Or);
+ else
+ RegsToVisit.push_back(OrLHS);
+ if (const MachineInstr *Or = getOpcodeDef(TargetOpcode::G_OR, OrRHS, MRI))
+ Ors.push_back(Or);
+ else
+ RegsToVisit.push_back(OrRHS);
+ }
+
+ // We're going to try and merge each register into a wider power-of-2 type,
+ // so we ought to have an even number of registers.
+ if (RegsToVisit.empty() || RegsToVisit.size() % 2 != 0)
+ return None;
+ return RegsToVisit;
+}
+
+/// Helper function for findLoadOffsetsForLoadOrCombine.
+///
+/// Check if \p Reg is the result of loading a \p MemSizeInBits wide value,
+/// and then moving that value into a specific byte offset.
+///
+/// e.g. x[i] << 24
+///
+/// \returns The load instruction and the byte offset it is moved into.
+static Optional<std::pair<MachineInstr *, int64_t>>
+matchLoadAndBytePosition(Register Reg, unsigned MemSizeInBits,
+ const MachineRegisterInfo &MRI) {
+ assert(MRI.hasOneNonDBGUse(Reg) &&
+ "Expected Reg to only have one non-debug use?");
+ Register MaybeLoad;
+ int64_t Shift;
+ if (!mi_match(Reg, MRI,
+ m_OneNonDBGUse(m_GShl(m_Reg(MaybeLoad), m_ICst(Shift))))) {
+ Shift = 0;
+ MaybeLoad = Reg;
+ }
+
+ if (Shift % MemSizeInBits != 0)
+ return None;
+
+ // TODO: Handle other types of loads.
+ auto *Load = getOpcodeDef(TargetOpcode::G_ZEXTLOAD, MaybeLoad, MRI);
+ if (!Load)
+ return None;
+
+ const auto &MMO = **Load->memoperands_begin();
+ if (!MMO.isUnordered() || MMO.getSizeInBits() != MemSizeInBits)
+ return None;
+
+ return std::make_pair(Load, Shift / MemSizeInBits);
+}
+
+Optional<std::pair<MachineInstr *, int64_t>>
+CombinerHelper::findLoadOffsetsForLoadOrCombine(
+ SmallDenseMap<int64_t, int64_t, 8> &MemOffset2Idx,
+ const SmallVector<Register, 8> &RegsToVisit, const unsigned MemSizeInBits) {
+
+ // Each load found for the pattern. There should be one for each RegsToVisit.
+ SmallSetVector<const MachineInstr *, 8> Loads;
+
+ // The lowest index used in any load. (The lowest "i" for each x[i].)
+ int64_t LowestIdx = INT64_MAX;
+
+ // The load which uses the lowest index.
+ MachineInstr *LowestIdxLoad = nullptr;
+
+ // Keeps track of the load indices we see. We shouldn't see any indices twice.
+ SmallSet<int64_t, 8> SeenIdx;
+
+ // Ensure each load is in the same MBB.
+ // TODO: Support multiple MachineBasicBlocks.
+ MachineBasicBlock *MBB = nullptr;
+ const MachineMemOperand *MMO = nullptr;
+
+ // Earliest instruction-order load in the pattern.
+ MachineInstr *EarliestLoad = nullptr;
+
+ // Latest instruction-order load in the pattern.
+ MachineInstr *LatestLoad = nullptr;
+
+ // Base pointer which every load should share.
+ Register BasePtr;
+
+ // We want to find a load for each register. Each load should have some
+ // appropriate bit twiddling arithmetic. During this loop, we will also keep
+ // track of the load which uses the lowest index. Later, we will check if we
+ // can use its pointer in the final, combined load.
+ for (auto Reg : RegsToVisit) {
+    // Find the load, and the position its value ends up at in the wide value
+    // (i.e. how far the loaded value is shifted).
+ auto LoadAndPos = matchLoadAndBytePosition(Reg, MemSizeInBits, MRI);
+ if (!LoadAndPos)
+ return None;
+ MachineInstr *Load;
+ int64_t DstPos;
+ std::tie(Load, DstPos) = *LoadAndPos;
+
+ // TODO: Handle multiple MachineBasicBlocks. Currently not handled because
+    // it is difficult to check for stores/calls/etc between loads.
+ MachineBasicBlock *LoadMBB = Load->getParent();
+ if (!MBB)
+ MBB = LoadMBB;
+ if (LoadMBB != MBB)
+ return None;
+
+ // Make sure that the MachineMemOperands of every seen load are compatible.
+ const MachineMemOperand *LoadMMO = *Load->memoperands_begin();
+ if (!MMO)
+ MMO = LoadMMO;
+ if (MMO->getAddrSpace() != LoadMMO->getAddrSpace())
+ return None;
+
+ // Find out what the base pointer and index for the load is.
+ Register LoadPtr;
+ int64_t Idx;
+ if (!mi_match(Load->getOperand(1).getReg(), MRI,
+ m_GPtrAdd(m_Reg(LoadPtr), m_ICst(Idx)))) {
+ LoadPtr = Load->getOperand(1).getReg();
+ Idx = 0;
+ }
+
+ // Don't combine things like a[i], a[i] -> a bigger load.
+ if (!SeenIdx.insert(Idx).second)
+ return None;
+
+ // Every load must share the same base pointer; don't combine things like:
+ //
+ // a[i], b[i + 1] -> a bigger load.
+ if (!BasePtr.isValid())
+ BasePtr = LoadPtr;
+ if (BasePtr != LoadPtr)
+ return None;
+
+ if (Idx < LowestIdx) {
+ LowestIdx = Idx;
+ LowestIdxLoad = Load;
+ }
+
+ // Keep track of the byte offset that this load ends up at. If we have seen
+ // the byte offset, then stop here. We do not want to combine:
+ //
+ // a[i] << 16, a[i + k] << 16 -> a bigger load.
+ if (!MemOffset2Idx.try_emplace(DstPos, Idx).second)
+ return None;
+ Loads.insert(Load);
+
+ // Keep track of the position of the earliest/latest loads in the pattern.
+ // We will check that there are no load fold barriers between them later
+ // on.
+ //
+ // FIXME: Is there a better way to check for load fold barriers?
+ if (!EarliestLoad || dominates(*Load, *EarliestLoad))
+ EarliestLoad = Load;
+ if (!LatestLoad || dominates(*LatestLoad, *Load))
+ LatestLoad = Load;
+ }
+
+ // We found a load for each register. Let's check if each load satisfies the
+ // pattern.
+ assert(Loads.size() == RegsToVisit.size() &&
+ "Expected to find a load for each register?");
+ assert(EarliestLoad != LatestLoad && EarliestLoad &&
+ LatestLoad && "Expected at least two loads?");
+
+ // Check if there are any stores, calls, etc. between any of the loads. If
+ // there are, then we can't safely perform the combine.
+ //
+ // MaxIter is chosen based off the (worst case) number of iterations it
+ // typically takes to succeed in the LLVM test suite plus some padding.
+ //
+ // FIXME: Is there a better way to check for load fold barriers?
+ const unsigned MaxIter = 20;
+ unsigned Iter = 0;
+ for (const auto &MI : instructionsWithoutDebug(EarliestLoad->getIterator(),
+ LatestLoad->getIterator())) {
+ if (Loads.count(&MI))
+ continue;
+ if (MI.isLoadFoldBarrier())
+ return None;
+ if (Iter++ == MaxIter)
+ return None;
+ }
+
+ return std::make_pair(LowestIdxLoad, LowestIdx);
+}
+
+bool CombinerHelper::matchLoadOrCombine(
+ MachineInstr &MI, std::function<void(MachineIRBuilder &)> &MatchInfo) {
+ assert(MI.getOpcode() == TargetOpcode::G_OR);
+ MachineFunction &MF = *MI.getMF();
+ // Assuming a little-endian target, transform:
+ // s8 *a = ...
+ // s32 val = a[0] | (a[1] << 8) | (a[2] << 16) | (a[3] << 24)
+ // =>
+  //      s32 val = *((s32)a)
+ //
+ // s8 *a = ...
+ // s32 val = (a[0] << 24) | (a[1] << 16) | (a[2] << 8) | a[3]
+ // =>
+ // s32 val = BSWAP(*((s32)a))
+ Register Dst = MI.getOperand(0).getReg();
+ LLT Ty = MRI.getType(Dst);
+ if (Ty.isVector())
+ return false;
+
+ // We need to combine at least two loads into this type. Since the smallest
+ // possible load is into a byte, we need at least a 16-bit wide type.
+ const unsigned WideMemSizeInBits = Ty.getSizeInBits();
+ if (WideMemSizeInBits < 16 || WideMemSizeInBits % 8 != 0)
+ return false;
+
+ // Match a collection of non-OR instructions in the pattern.
+ auto RegsToVisit = findCandidatesForLoadOrCombine(&MI);
+ if (!RegsToVisit)
+ return false;
+
+ // We have a collection of non-OR instructions. Figure out how wide each of
+ // the small loads should be based off of the number of potential loads we
+ // found.
+ const unsigned NarrowMemSizeInBits = WideMemSizeInBits / RegsToVisit->size();
+ if (NarrowMemSizeInBits % 8 != 0)
+ return false;
+
+ // Check if each register feeding into each OR is a load from the same
+ // base pointer + some arithmetic.
+ //
+ // e.g. a[0], a[1] << 8, a[2] << 16, etc.
+ //
+ // Also verify that each of these ends up putting a[i] into the same memory
+ // offset as a load into a wide type would.
+ SmallDenseMap<int64_t, int64_t, 8> MemOffset2Idx;
+ MachineInstr *LowestIdxLoad;
+ int64_t LowestIdx;
+ auto MaybeLoadInfo = findLoadOffsetsForLoadOrCombine(
+ MemOffset2Idx, *RegsToVisit, NarrowMemSizeInBits);
+ if (!MaybeLoadInfo)
+ return false;
+ std::tie(LowestIdxLoad, LowestIdx) = *MaybeLoadInfo;
+
+ // We have a bunch of loads being OR'd together. Using the addresses + offsets
+ // we found before, check if this corresponds to a big or little endian byte
+ // pattern. If it does, then we can represent it using a load + possibly a
+ // BSWAP.
+ bool IsBigEndianTarget = MF.getDataLayout().isBigEndian();
+ Optional<bool> IsBigEndian = isBigEndian(MemOffset2Idx, LowestIdx);
+ if (!IsBigEndian.hasValue())
+ return false;
+ bool NeedsBSwap = IsBigEndianTarget != *IsBigEndian;
+ if (NeedsBSwap && !isLegalOrBeforeLegalizer({TargetOpcode::G_BSWAP, {Ty}}))
+ return false;
+
+ // Make sure that the load from the lowest index produces offset 0 in the
+ // final value.
+ //
+ // This ensures that we won't combine something like this:
+ //
+ // load x[i] -> byte 2
+ // load x[i+1] -> byte 0 ---> wide_load x[i]
+ // load x[i+2] -> byte 1
+ const unsigned NumLoadsInTy = WideMemSizeInBits / NarrowMemSizeInBits;
+ const unsigned ZeroByteOffset =
+ *IsBigEndian
+ ? bigEndianByteAt(NumLoadsInTy, 0)
+ : littleEndianByteAt(NumLoadsInTy, 0);
+ auto ZeroOffsetIdx = MemOffset2Idx.find(ZeroByteOffset);
+ if (ZeroOffsetIdx == MemOffset2Idx.end() ||
+ ZeroOffsetIdx->second != LowestIdx)
+ return false;
+
+  // We will reuse the pointer from the load which ends up at byte offset 0. It
+ // may not use index 0.
+ Register Ptr = LowestIdxLoad->getOperand(1).getReg();
+ const MachineMemOperand &MMO = **LowestIdxLoad->memoperands_begin();
+ LegalityQuery::MemDesc MMDesc;
+ MMDesc.SizeInBits = WideMemSizeInBits;
+ MMDesc.AlignInBits = MMO.getAlign().value() * 8;
+ MMDesc.Ordering = MMO.getOrdering();
+ if (!isLegalOrBeforeLegalizer(
+ {TargetOpcode::G_LOAD, {Ty, MRI.getType(Ptr)}, {MMDesc}}))
+ return false;
+ auto PtrInfo = MMO.getPointerInfo();
+ auto *NewMMO = MF.getMachineMemOperand(&MMO, PtrInfo, WideMemSizeInBits / 8);
+
+ // Load must be allowed and fast on the target.
+ LLVMContext &C = MF.getFunction().getContext();
+ auto &DL = MF.getDataLayout();
+ bool Fast = false;
+ if (!getTargetLowering().allowsMemoryAccess(C, DL, Ty, *NewMMO, &Fast) ||
+ !Fast)
+ return false;
+
+ MatchInfo = [=](MachineIRBuilder &MIB) {
+ Register LoadDst = NeedsBSwap ? MRI.cloneVirtualRegister(Dst) : Dst;
+ MIB.buildLoad(LoadDst, Ptr, *NewMMO);
+ if (NeedsBSwap)
+ MIB.buildBSwap(Dst, LoadDst);
+ };
+ return true;
+}
+
+bool CombinerHelper::applyLoadOrCombine(
+ MachineInstr &MI, std::function<void(MachineIRBuilder &)> &MatchInfo) {
+ Builder.setInstrAndDebugLoc(MI);
+ MatchInfo(Builder);
+ MI.eraseFromParent();
+ return true;
+}
+
bool CombinerHelper::tryCombine(MachineInstr &MI) {
if (tryCombineCopy(MI))
return true;
diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp
index 6cacb10ab79f..f639f7295b57 100644
--- a/llvm/lib/CodeGen/TargetLoweringBase.cpp
+++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp
@@ -1756,6 +1756,14 @@ bool TargetLoweringBase::allowsMemoryAccess(LLVMContext &Context,
MMO.getFlags(), Fast);
}
+bool TargetLoweringBase::allowsMemoryAccess(LLVMContext &Context,
+ const DataLayout &DL, LLT Ty,
+ const MachineMemOperand &MMO,
+ bool *Fast) const {
+ return allowsMemoryAccess(Context, DL, getMVTForLLT(Ty), MMO.getAddrSpace(),
+ MMO.getAlign(), MMO.getFlags(), Fast);
+}
+
BranchProbability TargetLoweringBase::getPredictableBranchThreshold() const {
return BranchProbability(MinPercentageForPredictableBranch, 100);
}
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizer-combiner-load-or-pattern-align.mir b/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizer-combiner-load-or-pattern-align.mir
new file mode 100644
index 000000000000..993a6713aaaa
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizer-combiner-load-or-pattern-align.mir
@@ -0,0 +1,79 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -debugify-and-strip-all-safe -mtriple aarch64 -O0 -run-pass=aarch64-prelegalizer-combiner --aarch64prelegalizercombinerhelper-only-enable-rule="load_or_combine" -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=NOT_STRICT
+# RUN: llc -debugify-and-strip-all-safe -mattr=+strict-align -mtriple aarch64 -O0 -run-pass=aarch64-prelegalizer-combiner --aarch64prelegalizercombinerhelper-only-enable-rule="load_or_combine" -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=STRICT
+
+# REQUIRES: asserts
+
+# Check that the load-or combine respects alignment requirements.
+...
+---
+name: misaligned
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $x0, $x1
+ ; NOT_STRICT-LABEL: name: misaligned
+ ; NOT_STRICT: liveins: $x0, $x1
+ ; NOT_STRICT: %ptr:_(p0) = COPY $x1
+ ; NOT_STRICT: %full_load:_(s32) = G_LOAD %ptr(p0) :: (load 4, align 2)
+ ; NOT_STRICT: $w1 = COPY %full_load(s32)
+ ; NOT_STRICT: RET_ReallyLR implicit $w1
+ ; STRICT-LABEL: name: misaligned
+ ; STRICT: liveins: $x0, $x1
+ ; STRICT: %cst_1:_(s64) = G_CONSTANT i64 1
+ ; STRICT: %cst_16:_(s32) = G_CONSTANT i32 16
+ ; STRICT: %ptr:_(p0) = COPY $x1
+ ; STRICT: %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s64)
+ ; STRICT: %low_half:_(s32) = G_ZEXTLOAD %ptr(p0) :: (load 2)
+ ; STRICT: %elt1:_(s32) = G_ZEXTLOAD %ptr_elt_1(p0) :: (load 2)
+ ; STRICT: %high_half:_(s32) = nuw G_SHL %elt1, %cst_16(s32)
+ ; STRICT: %full_load:_(s32) = G_OR %low_half, %high_half
+ ; STRICT: $w1 = COPY %full_load(s32)
+ ; STRICT: RET_ReallyLR implicit $w1
+ %cst_1:_(s64) = G_CONSTANT i64 1
+ %cst_16:_(s32) = G_CONSTANT i32 16
+
+ %ptr:_(p0) = COPY $x1
+ %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s64)
+
+ %low_half:_(s32) = G_ZEXTLOAD %ptr(p0) :: (load 2, align 2)
+ %elt1:_(s32) = G_ZEXTLOAD %ptr_elt_1(p0) :: (load 2, align 2)
+ %high_half:_(s32) = nuw G_SHL %elt1, %cst_16(s32)
+
+ %full_load:_(s32) = G_OR %low_half, %high_half
+ $w1 = COPY %full_load(s32)
+ RET_ReallyLR implicit $w1
+
+...
+---
+name: aligned
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $x0, $x1
+
+ ; NOT_STRICT-LABEL: name: aligned
+ ; NOT_STRICT: liveins: $x0, $x1
+ ; NOT_STRICT: %ptr:_(p0) = COPY $x1
+ ; NOT_STRICT: %full_load:_(s32) = G_LOAD %ptr(p0) :: (load 4)
+ ; NOT_STRICT: $w1 = COPY %full_load(s32)
+ ; NOT_STRICT: RET_ReallyLR implicit $w1
+ ; STRICT-LABEL: name: aligned
+ ; STRICT: liveins: $x0, $x1
+ ; STRICT: %ptr:_(p0) = COPY $x1
+ ; STRICT: %full_load:_(s32) = G_LOAD %ptr(p0) :: (load 4)
+ ; STRICT: $w1 = COPY %full_load(s32)
+ ; STRICT: RET_ReallyLR implicit $w1
+ %cst_1:_(s64) = G_CONSTANT i64 1
+ %cst_16:_(s32) = G_CONSTANT i32 16
+
+ %ptr:_(p0) = COPY $x1
+ %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s64)
+
+ %low_half:_(s32) = G_ZEXTLOAD %ptr(p0) :: (load 2, align 4)
+ %elt1:_(s32) = G_ZEXTLOAD %ptr_elt_1(p0) :: (load 2, align 4)
+ %high_half:_(s32) = nuw G_SHL %elt1, %cst_16(s32)
+
+ %full_load:_(s32) = G_OR %low_half, %high_half
+ $w1 = COPY %full_load(s32)
+ RET_ReallyLR implicit $w1
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizer-combiner-load-or-pattern.mir b/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizer-combiner-load-or-pattern.mir
new file mode 100644
index 000000000000..bb1f5a0d36ac
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizer-combiner-load-or-pattern.mir
@@ -0,0 +1,1571 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -debugify-and-strip-all-safe -mtriple aarch64 -O0 -run-pass=aarch64-prelegalizer-combiner --aarch64prelegalizercombinerhelper-only-enable-rule="load_or_combine" -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=LITTLE
+# RUN: llc -debugify-and-strip-all-safe -mtriple arm64eb -O0 -run-pass=aarch64-prelegalizer-combiner --aarch64prelegalizercombinerhelper-only-enable-rule="load_or_combine" -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=BIG
+
+# REQUIRES: asserts
+
+# Test that we can combine patterns like
+#
+# s8* x = ...
+# s32 y = (x[0] | (x[1] << 8)) | ((x[2] << 16) | (x[3] << 24))
+#
+# Into either a load, or a load with a bswap.
+
+...
+---
+name: s8_loads_to_s32_little_endian_pat
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $x0, $x1
+
+ ; s8* x = ...
+ ; s32 y = (x[0] | (x[1] << 8)) | ((x[2] << 16) | (x[3] << 24))
+ ;
+ ; -> Little endian: Load from x[0]
+ ; -> Big endian: Load from x[0] + BSWAP
+
+ ; LITTLE-LABEL: name: s8_loads_to_s32_little_endian_pat
+ ; LITTLE: liveins: $x0, $x1
+ ; LITTLE: %ptr:_(p0) = COPY $x1
+ ; LITTLE: %full_load:_(s32) = G_LOAD %ptr(p0) :: (load 4, align 1)
+ ; LITTLE: $w1 = COPY %full_load(s32)
+ ; LITTLE: RET_ReallyLR implicit $w1
+ ; BIG-LABEL: name: s8_loads_to_s32_little_endian_pat
+ ; BIG: liveins: $x0, $x1
+ ; BIG: %ptr:_(p0) = COPY $x1
+ ; BIG: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD %ptr(p0) :: (load 4, align 1)
+ ; BIG: %full_load:_(s32) = G_BSWAP [[LOAD]]
+ ; BIG: $w1 = COPY %full_load(s32)
+ ; BIG: RET_ReallyLR implicit $w1
+ %cst_1:_(s32) = G_CONSTANT i32 1
+ %cst_2:_(s32) = G_CONSTANT i32 2
+ %cst_3:_(s32) = G_CONSTANT i32 3
+
+ %cst_8:_(s32) = G_CONSTANT i32 8
+ %cst_16:_(s32) = G_CONSTANT i32 16
+ %cst_24:_(s32) = G_CONSTANT i32 24
+
+ %ptr:_(p0) = COPY $x1
+ %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s32)
+ %ptr_elt_2:_(p0) = G_PTR_ADD %ptr, %cst_2(s32)
+ %ptr_elt_3:_(p0) = G_PTR_ADD %ptr, %cst_3(s32)
+
+ %byte0:_(s32) = G_ZEXTLOAD %ptr(p0) :: (load 1)
+
+ %elt1:_(s32) = G_ZEXTLOAD %ptr_elt_1(p0) :: (load 1)
+ %elt2:_(s32) = G_ZEXTLOAD %ptr_elt_2(p0) :: (load 1)
+ %elt3:_(s32) = G_ZEXTLOAD %ptr_elt_3(p0) :: (load 1)
+
+ %byte1:_(s32) = nuw G_SHL %elt1, %cst_8(s32)
+ %byte2:_(s32) = nuw G_SHL %elt2, %cst_16(s32)
+ %byte3:_(s32) = nuw G_SHL %elt3, %cst_24(s32)
+
+ ; Note the shape of the tree:
+ ;
+ ; byte byte byte byte
+ ; \ / \ /
+ ; OR OR
+ ; \ /
+ ; \ /
+ ; OR
+
+ %or1:_(s32) = G_OR %byte0, %byte1
+ %or2:_(s32) = G_OR %byte2, %byte3
+ %full_load:_(s32) = G_OR %or1, %or2
+
+ $w1 = COPY %full_load(s32)
+ RET_ReallyLR implicit $w1
+
+...
+---
+name: s8_loads_to_s32_big_endian_pat
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $x0, $x1
+
+ ; s8* x = ...
+    ; s32 y = ((x[0] << 24) | (x[1] << 16)) | ((x[2] << 8) | x[3])
+ ;
+ ; -> Little endian: Load from x[0] + BSWAP
+ ; -> Big endian: Load from x[0]
+
+ ; LITTLE-LABEL: name: s8_loads_to_s32_big_endian_pat
+ ; LITTLE: liveins: $x0, $x1
+ ; LITTLE: %ptr:_(p0) = COPY $x1
+ ; LITTLE: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD %ptr(p0) :: (load 4, align 1)
+ ; LITTLE: %full_load:_(s32) = G_BSWAP [[LOAD]]
+ ; LITTLE: $w1 = COPY %full_load(s32)
+ ; LITTLE: RET_ReallyLR implicit $w1
+ ; BIG-LABEL: name: s8_loads_to_s32_big_endian_pat
+ ; BIG: liveins: $x0, $x1
+ ; BIG: %ptr:_(p0) = COPY $x1
+ ; BIG: %full_load:_(s32) = G_LOAD %ptr(p0) :: (load 4, align 1)
+ ; BIG: $w1 = COPY %full_load(s32)
+ ; BIG: RET_ReallyLR implicit $w1
+ %cst_1:_(s32) = G_CONSTANT i32 1
+ %cst_2:_(s32) = G_CONSTANT i32 2
+ %cst_3:_(s32) = G_CONSTANT i32 3
+
+ %cst_8:_(s32) = G_CONSTANT i32 8
+ %cst_16:_(s32) = G_CONSTANT i32 16
+ %cst_24:_(s32) = G_CONSTANT i32 24
+
+ %ptr:_(p0) = COPY $x1
+ %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s32)
+ %ptr_elt_2:_(p0) = G_PTR_ADD %ptr, %cst_2(s32)
+ %ptr_elt_3:_(p0) = G_PTR_ADD %ptr, %cst_3(s32)
+
+ %elt0:_(s32) = G_ZEXTLOAD %ptr(p0) :: (load 1)
+ %elt1:_(s32) = G_ZEXTLOAD %ptr_elt_1(p0) :: (load 1)
+ %elt2:_(s32) = G_ZEXTLOAD %ptr_elt_2(p0) :: (load 1)
+
+ %byte0:_(s32) = nuw G_SHL %elt0, %cst_24(s32)
+ %byte1:_(s32) = nuw G_SHL %elt1, %cst_16(s32)
+ %byte2:_(s32) = nuw G_SHL %elt2, %cst_8(s32)
+ %byte3:_(s32) = G_ZEXTLOAD %ptr_elt_3(p0) :: (load 1)
+
+ %or1:_(s32) = G_OR %byte0, %byte1
+ %or2:_(s32) = G_OR %byte2, %byte3
+ %full_load:_(s32) = G_OR %or1, %or2
+
+ $w1 = COPY %full_load(s32)
+ RET_ReallyLR implicit $w1
+
+...
+---
+name: different_or_pattern
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $x0, $x1
+
+    ; Slightly different OR tree.
+ ;
+ ; s8* x = ...
+ ; s32 y = (((x[0] | (x[1] << 8)) | (x[2] << 16)) | (x[3] << 24))
+ ;
+ ; -> Little endian: Load from x[0]
+ ; -> Big endian: Load from x[0] + BSWAP
+
+    ; LITTLE-LABEL: name: different_or_pattern
+ ; LITTLE: liveins: $x0, $x1
+ ; LITTLE: %ptr:_(p0) = COPY $x1
+ ; LITTLE: %full_load:_(s32) = G_LOAD %ptr(p0) :: (load 4, align 1)
+ ; LITTLE: $w1 = COPY %full_load(s32)
+ ; LITTLE: RET_ReallyLR implicit $w1
+    ; BIG-LABEL: name: different_or_pattern
+ ; BIG: liveins: $x0, $x1
+ ; BIG: %ptr:_(p0) = COPY $x1
+ ; BIG: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD %ptr(p0) :: (load 4, align 1)
+ ; BIG: %full_load:_(s32) = G_BSWAP [[LOAD]]
+ ; BIG: $w1 = COPY %full_load(s32)
+ ; BIG: RET_ReallyLR implicit $w1
+ %cst_1:_(s32) = G_CONSTANT i32 1
+ %cst_2:_(s32) = G_CONSTANT i32 2
+ %cst_3:_(s32) = G_CONSTANT i32 3
+
+ %cst_8:_(s32) = G_CONSTANT i32 8
+ %cst_16:_(s32) = G_CONSTANT i32 16
+ %cst_24:_(s32) = G_CONSTANT i32 24
+
+ %ptr:_(p0) = COPY $x1
+ %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s32)
+ %ptr_elt_2:_(p0) = G_PTR_ADD %ptr, %cst_2(s32)
+ %ptr_elt_3:_(p0) = G_PTR_ADD %ptr, %cst_3(s32)
+
+ %byte0:_(s32) = G_ZEXTLOAD %ptr(p0) :: (load 1)
+
+ %elt1:_(s32) = G_ZEXTLOAD %ptr_elt_1(p0) :: (load 1)
+ %elt2:_(s32) = G_ZEXTLOAD %ptr_elt_2(p0) :: (load 1)
+ %elt3:_(s32) = G_ZEXTLOAD %ptr_elt_3(p0) :: (load 1)
+
+ %byte1:_(s32) = nuw G_SHL %elt1, %cst_8(s32)
+ %byte2:_(s32) = nuw G_SHL %elt2, %cst_16(s32)
+ %byte3:_(s32) = nuw G_SHL %elt3, %cst_24(s32)
+
+ ; Note the shape of the tree:
+ ;
+ ; byte byte
+ ; \ /
+ ; OR_1 byte
+ ; \ /
+ ; OR_2
+ ; \
+ ; ...
+
+ %or1:_(s32) = G_OR %byte0, %byte1
+ %or2:_(s32) = G_OR %or1, %byte2
+ %full_load:_(s32) = G_OR %or2, %byte3
+
+ $w1 = COPY %full_load(s32)
+ RET_ReallyLR implicit $w1
+
+...
+---
+name: s16_loads_to_s32_little_endian_pat
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $x0, $x1
+
+ ; s16* x = ...
+ ; s32 y = x[0] | (x[1] << 16)
+ ;
+ ; -> Little endian: Load from x[0]
+ ; -> Big endian: Load from x[0] + BSWAP
+
+ ; LITTLE-LABEL: name: s16_loads_to_s32_little_endian_pat
+ ; LITTLE: liveins: $x0, $x1
+ ; LITTLE: %ptr:_(p0) = COPY $x1
+ ; LITTLE: %full_load:_(s32) = G_LOAD %ptr(p0) :: (load 4, align 2)
+ ; LITTLE: $w1 = COPY %full_load(s32)
+ ; LITTLE: RET_ReallyLR implicit $w1
+ ; BIG-LABEL: name: s16_loads_to_s32_little_endian_pat
+ ; BIG: liveins: $x0, $x1
+ ; BIG: %ptr:_(p0) = COPY $x1
+ ; BIG: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD %ptr(p0) :: (load 4, align 2)
+ ; BIG: %full_load:_(s32) = G_BSWAP [[LOAD]]
+ ; BIG: $w1 = COPY %full_load(s32)
+ ; BIG: RET_ReallyLR implicit $w1
+ %cst_1:_(s64) = G_CONSTANT i64 1
+ %cst_16:_(s32) = G_CONSTANT i32 16
+
+ %ptr:_(p0) = COPY $x1
+ %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s64)
+
+ %low_half:_(s32) = G_ZEXTLOAD %ptr(p0) :: (load 2)
+ %elt1:_(s32) = G_ZEXTLOAD %ptr_elt_1(p0) :: (load 2)
+ %high_half:_(s32) = nuw G_SHL %elt1, %cst_16(s32)
+
+ %full_load:_(s32) = G_OR %low_half, %high_half
+ $w1 = COPY %full_load(s32)
+ RET_ReallyLR implicit $w1
+
+...
+---
+name: s16_loads_to_s32_big_endian_pat
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $x0, $x1
+
+ ; s16 *x = ...
+ ; s32 y = x[1] | (x[0] << 16)
+ ;
+ ; -> Little endian: Load from x[0] + BSWAP
+ ; -> Big endian: Load from x[0]
+
+ ; LITTLE-LABEL: name: s16_loads_to_s32_big_endian_pat
+ ; LITTLE: liveins: $x0, $x1
+ ; LITTLE: %ptr:_(p0) = COPY $x1
+ ; LITTLE: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD %ptr(p0) :: (load 4, align 2)
+ ; LITTLE: %full_load:_(s32) = G_BSWAP [[LOAD]]
+ ; LITTLE: $w1 = COPY %full_load(s32)
+ ; LITTLE: RET_ReallyLR implicit $w1
+ ; BIG-LABEL: name: s16_loads_to_s32_big_endian_pat
+ ; BIG: liveins: $x0, $x1
+ ; BIG: %ptr:_(p0) = COPY $x1
+ ; BIG: %full_load:_(s32) = G_LOAD %ptr(p0) :: (load 4, align 2)
+ ; BIG: $w1 = COPY %full_load(s32)
+ ; BIG: RET_ReallyLR implicit $w1
+ %cst_1:_(s64) = G_CONSTANT i64 1
+ %cst_16:_(s32) = G_CONSTANT i32 16
+
+ %ptr:_(p0) = COPY $x1
+ %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s64)
+
+ %elt0:_(s32) = G_ZEXTLOAD %ptr(p0) :: (load 2)
+ %high_half:_(s32) = nuw G_SHL %elt0, %cst_16(s32)
+ %low_half:_(s32) = G_ZEXTLOAD %ptr_elt_1(p0) :: (load 2)
+
+ %full_load:_(s32) = G_OR %low_half, %high_half
+ $w1 = COPY %full_load(s32)
+ RET_ReallyLR implicit $w1
+
+...
+---
+name: s16_loads_to_s64_little_endian_pat
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $x0, $x1
+
+ ; s16 *x = ...
+    ; s64 y = (x[0] | (x[1] << 16)) | ((x[2] << 32) | (x[3] << 48))
+ ;
+ ; -> Little endian: Load from x[0]
+ ; -> Big endian: Load from x[0] + BSWAP
+
+ ; LITTLE-LABEL: name: s16_loads_to_s64_little_endian_pat
+ ; LITTLE: liveins: $x0, $x1
+ ; LITTLE: %ptr:_(p0) = COPY $x1
+ ; LITTLE: %full_load:_(s64) = G_LOAD %ptr(p0) :: (load 8, align 2)
+ ; LITTLE: $x1 = COPY %full_load(s64)
+ ; LITTLE: RET_ReallyLR implicit $x1
+ ; BIG-LABEL: name: s16_loads_to_s64_little_endian_pat
+ ; BIG: liveins: $x0, $x1
+ ; BIG: %ptr:_(p0) = COPY $x1
+ ; BIG: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD %ptr(p0) :: (load 8, align 2)
+ ; BIG: %full_load:_(s64) = G_BSWAP [[LOAD]]
+ ; BIG: $x1 = COPY %full_load(s64)
+ ; BIG: RET_ReallyLR implicit $x1
+ %cst_1:_(s64) = G_CONSTANT i64 1
+ %cst_2:_(s64) = G_CONSTANT i64 2
+ %cst_3:_(s64) = G_CONSTANT i64 3
+
+ %cst_16:_(s64) = G_CONSTANT i64 16
+ %cst_32:_(s64) = G_CONSTANT i64 32
+ %cst_48:_(s64) = G_CONSTANT i64 48
+
+ %ptr:_(p0) = COPY $x1
+ %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s64)
+ %ptr_elt_2:_(p0) = G_PTR_ADD %ptr, %cst_2(s64)
+ %ptr_elt_3:_(p0) = G_PTR_ADD %ptr, %cst_3(s64)
+
+ %byte0_byte1:_(s64) = G_ZEXTLOAD %ptr(p0) :: (load 2)
+
+ %elt1:_(s64) = G_ZEXTLOAD %ptr_elt_1(p0) :: (load 2)
+ %elt2:_(s64) = G_ZEXTLOAD %ptr_elt_2(p0) :: (load 2)
+ %elt3:_(s64) = G_ZEXTLOAD %ptr_elt_3(p0) :: (load 2)
+
+ %byte2_byte3:_(s64) = nuw G_SHL %elt1, %cst_16(s64)
+ %byte4_byte5:_(s64) = nuw G_SHL %elt2, %cst_32(s64)
+ %byte6_byte7:_(s64) = nuw G_SHL %elt3, %cst_48(s64)
+
+ %or1:_(s64) = G_OR %byte0_byte1, %byte2_byte3
+ %or2:_(s64) = G_OR %byte4_byte5, %byte6_byte7
+ %full_load:_(s64) = G_OR %or1, %or2
+
+ $x1 = COPY %full_load(s64)
+ RET_ReallyLR implicit $x1
+
+...
+---
+name: s16_loads_to_s64_big_endian_pat
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $x0, $x1
+
+ ; s16 *x = ...
+ ; s64 y = (x[3] | (x[2] << 16)) | ((x[1] << 32) | (x[0] << 48))
+ ;
+ ; -> Little endian: Load from x[0] + BSWAP
+ ; -> Big endian: Load from x[0]
+
+ ; LITTLE-LABEL: name: s16_loads_to_s64_big_endian_pat
+ ; LITTLE: liveins: $x0, $x1
+ ; LITTLE: %ptr:_(p0) = COPY $x1
+ ; LITTLE: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD %ptr(p0) :: (load 8, align 2)
+ ; LITTLE: %full_load:_(s64) = G_BSWAP [[LOAD]]
+ ; LITTLE: $x1 = COPY %full_load(s64)
+ ; LITTLE: RET_ReallyLR implicit $x1
+ ; BIG-LABEL: name: s16_loads_to_s64_big_endian_pat
+ ; BIG: liveins: $x0, $x1
+ ; BIG: %ptr:_(p0) = COPY $x1
+ ; BIG: %full_load:_(s64) = G_LOAD %ptr(p0) :: (load 8, align 2)
+ ; BIG: $x1 = COPY %full_load(s64)
+ ; BIG: RET_ReallyLR implicit $x1
+ %cst_1:_(s64) = G_CONSTANT i64 1
+ %cst_2:_(s64) = G_CONSTANT i64 2
+ %cst_3:_(s64) = G_CONSTANT i64 3
+
+ %cst_16:_(s64) = G_CONSTANT i64 16
+ %cst_32:_(s64) = G_CONSTANT i64 32
+ %cst_48:_(s64) = G_CONSTANT i64 48
+
+ %ptr:_(p0) = COPY $x1
+ %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s64)
+ %ptr_elt_2:_(p0) = G_PTR_ADD %ptr, %cst_2(s64)
+ %ptr_elt_3:_(p0) = G_PTR_ADD %ptr, %cst_3(s64)
+
+ %elt0:_(s64) = G_ZEXTLOAD %ptr(p0) :: (load 2)
+ %elt1:_(s64) = G_ZEXTLOAD %ptr_elt_1(p0) :: (load 2)
+ %elt2:_(s64) = G_ZEXTLOAD %ptr_elt_2(p0) :: (load 2)
+
+ %byte0_byte1:_(s64) = nuw G_SHL %elt0, %cst_48(s64)
+ %byte2_byte3:_(s64) = nuw G_SHL %elt1, %cst_32(s64)
+ %byte4_byte5:_(s64) = nuw G_SHL %elt2, %cst_16(s64)
+ %byte6_byte7:_(s64) = G_ZEXTLOAD %ptr_elt_3(p0) :: (load 2)
+
+ %or1:_(s64) = G_OR %byte0_byte1, %byte2_byte3
+ %or2:_(s64) = G_OR %byte4_byte5, %byte6_byte7
+ %full_load:_(s64) = G_OR %or1, %or2
+
+ $x1 = COPY %full_load(s64)
+ RET_ReallyLR implicit $x1
+
+
+...
+---
+name: nonzero_start_idx_positive_little_endian_pat
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $x0, $x1
+
+ ; s8* x = ...
+ ; s32 y = (x[1] | (x[2] << 8)) | ((x[3] << 16) | (x[4] << 24))
+ ;
+ ; -> Little endian: Load from x[1]
+ ; -> Big endian: Load from x[1] + BSWAP
+
+ ; LITTLE-LABEL: name: nonzero_start_idx_positive_little_endian_pat
+ ; LITTLE: liveins: $x0, $x1
+ ; LITTLE: %cst_1:_(s32) = G_CONSTANT i32 1
+ ; LITTLE: %ptr:_(p0) = COPY $x0
+ ; LITTLE: %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s32)
+ ; LITTLE: %full_load:_(s32) = G_LOAD %ptr_elt_1(p0) :: (load 4, align 1)
+ ; LITTLE: $w1 = COPY %full_load(s32)
+ ; LITTLE: RET_ReallyLR implicit $w1
+ ; BIG-LABEL: name: nonzero_start_idx_positive_little_endian_pat
+ ; BIG: liveins: $x0, $x1
+ ; BIG: %cst_1:_(s32) = G_CONSTANT i32 1
+ ; BIG: %ptr:_(p0) = COPY $x0
+ ; BIG: %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s32)
+ ; BIG: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD %ptr_elt_1(p0) :: (load 4, align 1)
+ ; BIG: %full_load:_(s32) = G_BSWAP [[LOAD]]
+ ; BIG: $w1 = COPY %full_load(s32)
+ ; BIG: RET_ReallyLR implicit $w1
+ %cst_1:_(s32) = G_CONSTANT i32 1
+ %cst_2:_(s32) = G_CONSTANT i32 2
+ %cst_3:_(s32) = G_CONSTANT i32 3
+ %cst_4:_(s32) = G_CONSTANT i32 4
+
+ %cst_8:_(s32) = G_CONSTANT i32 8
+ %cst_16:_(s32) = G_CONSTANT i32 16
+ %cst_24:_(s32) = G_CONSTANT i32 24
+
+ %ptr:_(p0) = COPY $x0
+ %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s32)
+ %ptr_elt_2:_(p0) = G_PTR_ADD %ptr, %cst_2(s32)
+ %ptr_elt_3:_(p0) = G_PTR_ADD %ptr, %cst_3(s32)
+ %ptr_elt_4:_(p0) = G_PTR_ADD %ptr, %cst_4(s32)
+
+ %elt2:_(s32) = G_ZEXTLOAD %ptr_elt_2(p0) :: (load 1)
+ %elt3:_(s32) = G_ZEXTLOAD %ptr_elt_3(p0) :: (load 1)
+ %elt4:_(s32) = G_ZEXTLOAD %ptr_elt_4(p0) :: (load 1)
+
+ %byte0:_(s32) = G_ZEXTLOAD %ptr_elt_1(p0) :: (load 1)
+ %byte1:_(s32) = nuw G_SHL %elt2, %cst_8(s32)
+ %byte2:_(s32) = nuw G_SHL %elt3, %cst_16(s32)
+ %byte3:_(s32) = nuw G_SHL %elt4, %cst_24(s32)
+
+ %or1:_(s32) = G_OR %byte0, %byte1
+ %or2:_(s32) = G_OR %byte2, %byte3
+ %full_load:_(s32) = G_OR %or1, %or2
+
+ $w1 = COPY %full_load(s32)
+ RET_ReallyLR implicit $w1
+
+...
+---
+name: nonzero_start_idx_positive_big_endian_pat
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $x0, $x1
+
+ ; s8* x = ...
+ ; s32 y = (x[4] | (x[3] << 8)) | ((x[2] << 16) | (x[1] << 24))
+ ;
+ ; -> Little endian: Load from x[1] + BSWAP
+ ; -> Big endian: Load from x[1]
+
+ ; LITTLE-LABEL: name: nonzero_start_idx_positive_big_endian_pat
+ ; LITTLE: liveins: $x0, $x1
+ ; LITTLE: %cst_1:_(s32) = G_CONSTANT i32 1
+ ; LITTLE: %ptr:_(p0) = COPY $x0
+ ; LITTLE: %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s32)
+ ; LITTLE: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD %ptr_elt_1(p0) :: (load 4, align 1)
+ ; LITTLE: %full_load:_(s32) = G_BSWAP [[LOAD]]
+ ; LITTLE: $w1 = COPY %full_load(s32)
+ ; LITTLE: RET_ReallyLR implicit $w1
+ ; BIG-LABEL: name: nonzero_start_idx_positive_big_endian_pat
+ ; BIG: liveins: $x0, $x1
+ ; BIG: %cst_1:_(s32) = G_CONSTANT i32 1
+ ; BIG: %ptr:_(p0) = COPY $x0
+ ; BIG: %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s32)
+ ; BIG: %full_load:_(s32) = G_LOAD %ptr_elt_1(p0) :: (load 4, align 1)
+ ; BIG: $w1 = COPY %full_load(s32)
+ ; BIG: RET_ReallyLR implicit $w1
+ %cst_1:_(s32) = G_CONSTANT i32 1
+ %cst_2:_(s32) = G_CONSTANT i32 2
+ %cst_3:_(s32) = G_CONSTANT i32 3
+ %cst_4:_(s32) = G_CONSTANT i32 4
+
+ %cst_8:_(s32) = G_CONSTANT i32 8
+ %cst_16:_(s32) = G_CONSTANT i32 16
+ %cst_24:_(s32) = G_CONSTANT i32 24
+
+ %ptr:_(p0) = COPY $x0
+ %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s32)
+ %ptr_elt_2:_(p0) = G_PTR_ADD %ptr, %cst_2(s32)
+ %ptr_elt_3:_(p0) = G_PTR_ADD %ptr, %cst_3(s32)
+ %ptr_elt_4:_(p0) = G_PTR_ADD %ptr, %cst_4(s32)
+
+ %elt1:_(s32) = G_ZEXTLOAD %ptr_elt_1(p0) :: (load 1)
+ %elt2:_(s32) = G_ZEXTLOAD %ptr_elt_2(p0) :: (load 1)
+ %elt3:_(s32) = G_ZEXTLOAD %ptr_elt_3(p0) :: (load 1)
+
+ %byte0:_(s32) = G_ZEXTLOAD %ptr_elt_4(p0) :: (load 1)
+ %byte1:_(s32) = nuw G_SHL %elt3, %cst_8(s32)
+ %byte2:_(s32) = nuw G_SHL %elt2, %cst_16(s32)
+ %byte3:_(s32) = nuw G_SHL %elt1, %cst_24(s32)
+
+ %or1:_(s32) = G_OR %byte0, %byte1
+ %or2:_(s32) = G_OR %byte2, %byte3
+ %full_load:_(s32) = G_OR %or1, %or2
+
+ $w1 = COPY %full_load(s32)
+ RET_ReallyLR implicit $w1
+
+...
+---
+name: nonzero_start_idx_negative_little_endian_pat
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $x0, $x1
+
+ ; s8* x = ...
+ ; s32 y = (x[-3] | (x[-2] << 8)) | ((x[-1] << 16) | (x[0] << 24))
+ ;
+ ; -> Little endian: Load from x[-3]
+ ; -> Big endian: Load from x[-3] + BSWAP
+
+ ; LITTLE-LABEL: name: nonzero_start_idx_negative_little_endian_pat
+ ; LITTLE: liveins: $x0, $x1
+ ; LITTLE: %cst_neg_3:_(s32) = G_CONSTANT i32 -3
+ ; LITTLE: %ptr:_(p0) = COPY $x0
+ ; LITTLE: %ptr_elt_neg_3:_(p0) = G_PTR_ADD %ptr, %cst_neg_3(s32)
+ ; LITTLE: %full_load:_(s32) = G_LOAD %ptr_elt_neg_3(p0) :: (load 4, align 1)
+ ; LITTLE: $w1 = COPY %full_load(s32)
+ ; LITTLE: RET_ReallyLR implicit $w1
+ ; BIG-LABEL: name: nonzero_start_idx_negative_little_endian_pat
+ ; BIG: liveins: $x0, $x1
+ ; BIG: %cst_neg_3:_(s32) = G_CONSTANT i32 -3
+ ; BIG: %ptr:_(p0) = COPY $x0
+ ; BIG: %ptr_elt_neg_3:_(p0) = G_PTR_ADD %ptr, %cst_neg_3(s32)
+ ; BIG: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD %ptr_elt_neg_3(p0) :: (load 4, align 1)
+ ; BIG: %full_load:_(s32) = G_BSWAP [[LOAD]]
+ ; BIG: $w1 = COPY %full_load(s32)
+ ; BIG: RET_ReallyLR implicit $w1
+ %cst_neg_1:_(s32) = G_CONSTANT i32 -1
+ %cst_neg_2:_(s32) = G_CONSTANT i32 -2
+ %cst_neg_3:_(s32) = G_CONSTANT i32 -3
+
+ %cst_8:_(s32) = G_CONSTANT i32 8
+ %cst_16:_(s32) = G_CONSTANT i32 16
+ %cst_24:_(s32) = G_CONSTANT i32 24
+
+ %ptr:_(p0) = COPY $x0
+ %ptr_elt_neg_3:_(p0) = G_PTR_ADD %ptr, %cst_neg_3(s32)
+ %ptr_elt_neg_2:_(p0) = G_PTR_ADD %ptr, %cst_neg_2(s32)
+ %ptr_elt_neg_1:_(p0) = G_PTR_ADD %ptr, %cst_neg_1(s32)
+
+ %elt_neg_2:_(s32) = G_ZEXTLOAD %ptr_elt_neg_2(p0) :: (load 1)
+ %elt_neg_1:_(s32) = G_ZEXTLOAD %ptr_elt_neg_1(p0) :: (load 1)
+ %elt_0:_(s32) = G_ZEXTLOAD %ptr(p0) :: (load 1)
+
+ %byte0:_(s32) = G_ZEXTLOAD %ptr_elt_neg_3(p0) :: (load 1)
+ %byte1:_(s32) = nuw G_SHL %elt_neg_2, %cst_8(s32)
+ %byte2:_(s32) = nuw G_SHL %elt_neg_1, %cst_16(s32)
+ %byte3:_(s32) = nuw G_SHL %elt_0, %cst_24(s32)
+
+ %or1:_(s32) = G_OR %byte0, %byte1
+ %or2:_(s32) = G_OR %byte2, %byte3
+ %full_load:_(s32) = G_OR %or1, %or2
+
+ $w1 = COPY %full_load(s32)
+ RET_ReallyLR implicit $w1
+
+...
+---
+name: nonzero_start_idx_negative_big_endian_pat
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $x0, $x1
+
+ ; s8* x = ...
+ ; s32 y = (x[0] | (x[-1] << 8)) | ((x[-2] << 16) | (x[-3] << 24))
+ ;
+ ; -> Little endian: Load from x[-3] + BSWAP
+ ; -> Big endian: Load from x[-3]
+
+ ; LITTLE-LABEL: name: nonzero_start_idx_negative_big_endian_pat
+ ; LITTLE: liveins: $x0, $x1
+ ; LITTLE: %cst_neg_3:_(s32) = G_CONSTANT i32 -3
+ ; LITTLE: %ptr:_(p0) = COPY $x0
+ ; LITTLE: %ptr_elt_neg_3:_(p0) = G_PTR_ADD %ptr, %cst_neg_3(s32)
+ ; LITTLE: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD %ptr_elt_neg_3(p0) :: (load 4, align 1)
+ ; LITTLE: %full_load:_(s32) = G_BSWAP [[LOAD]]
+ ; LITTLE: $w1 = COPY %full_load(s32)
+ ; LITTLE: RET_ReallyLR implicit $w1
+ ; BIG-LABEL: name: nonzero_start_idx_negative_big_endian_pat
+ ; BIG: liveins: $x0, $x1
+ ; BIG: %cst_neg_3:_(s32) = G_CONSTANT i32 -3
+ ; BIG: %ptr:_(p0) = COPY $x0
+ ; BIG: %ptr_elt_neg_3:_(p0) = G_PTR_ADD %ptr, %cst_neg_3(s32)
+ ; BIG: %full_load:_(s32) = G_LOAD %ptr_elt_neg_3(p0) :: (load 4, align 1)
+ ; BIG: $w1 = COPY %full_load(s32)
+ ; BIG: RET_ReallyLR implicit $w1
+ %cst_neg_1:_(s32) = G_CONSTANT i32 -1
+ %cst_neg_2:_(s32) = G_CONSTANT i32 -2
+ %cst_neg_3:_(s32) = G_CONSTANT i32 -3
+
+ %cst_8:_(s32) = G_CONSTANT i32 8
+ %cst_16:_(s32) = G_CONSTANT i32 16
+ %cst_24:_(s32) = G_CONSTANT i32 24
+
+ %ptr:_(p0) = COPY $x0
+ %ptr_elt_neg_3:_(p0) = G_PTR_ADD %ptr, %cst_neg_3(s32)
+ %ptr_elt_neg_2:_(p0) = G_PTR_ADD %ptr, %cst_neg_2(s32)
+ %ptr_elt_neg_1:_(p0) = G_PTR_ADD %ptr, %cst_neg_1(s32)
+
+ %elt_neg_3:_(s32) = G_ZEXTLOAD %ptr_elt_neg_3(p0) :: (load 1)
+ %elt_neg_2:_(s32) = G_ZEXTLOAD %ptr_elt_neg_2(p0) :: (load 1)
+ %elt_neg_1:_(s32) = G_ZEXTLOAD %ptr_elt_neg_1(p0) :: (load 1)
+ %elt_0:_(s32) = G_ZEXTLOAD %ptr(p0) :: (load 1)
+
+ %byte0:_(s32) = G_ZEXTLOAD %ptr(p0) :: (load 1)
+ %byte1:_(s32) = nuw G_SHL %elt_neg_1, %cst_8(s32)
+ %byte2:_(s32) = nuw G_SHL %elt_neg_2, %cst_16(s32)
+ %byte3:_(s32) = nuw G_SHL %elt_neg_3, %cst_24(s32)
+
+ %or1:_(s32) = G_OR %byte0, %byte1
+ %or2:_(s32) = G_OR %byte2, %byte3
+ %full_load:_(s32) = G_OR %or1, %or2
+
+ $w1 = COPY %full_load(s32)
+ RET_ReallyLR implicit $w1
+
+...
+---
+name: dont_combine_volatile
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $x0, $x1
+
+ ; Combine should only happen with unordered loads.
+
+ ; LITTLE-LABEL: name: dont_combine_volatile
+ ; LITTLE: liveins: $x0, $x1
+ ; LITTLE: %cst_1:_(s64) = G_CONSTANT i64 1
+ ; LITTLE: %cst_16:_(s32) = G_CONSTANT i32 16
+ ; LITTLE: %ptr:_(p0) = COPY $x1
+ ; LITTLE: %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s64)
+ ; LITTLE: %low_half:_(s32) = G_ZEXTLOAD %ptr(p0) :: (load 2)
+ ; LITTLE: %elt1:_(s32) = G_ZEXTLOAD %ptr_elt_1(p0) :: (volatile load 2)
+ ; LITTLE: %high_half:_(s32) = nuw G_SHL %elt1, %cst_16(s32)
+ ; LITTLE: %full_load:_(s32) = G_OR %low_half, %high_half
+ ; LITTLE: $w1 = COPY %full_load(s32)
+ ; LITTLE: RET_ReallyLR implicit $w1
+ ; BIG-LABEL: name: dont_combine_volatile
+ ; BIG: liveins: $x0, $x1
+ ; BIG: %cst_1:_(s64) = G_CONSTANT i64 1
+ ; BIG: %cst_16:_(s32) = G_CONSTANT i32 16
+ ; BIG: %ptr:_(p0) = COPY $x1
+ ; BIG: %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s64)
+ ; BIG: %low_half:_(s32) = G_ZEXTLOAD %ptr(p0) :: (load 2)
+ ; BIG: %elt1:_(s32) = G_ZEXTLOAD %ptr_elt_1(p0) :: (volatile load 2)
+ ; BIG: %high_half:_(s32) = nuw G_SHL %elt1, %cst_16(s32)
+ ; BIG: %full_load:_(s32) = G_OR %low_half, %high_half
+ ; BIG: $w1 = COPY %full_load(s32)
+ ; BIG: RET_ReallyLR implicit $w1
+ %cst_1:_(s64) = G_CONSTANT i64 1
+ %cst_16:_(s32) = G_CONSTANT i32 16
+
+ %ptr:_(p0) = COPY $x1
+ %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s64)
+
+ %low_half:_(s32) = G_ZEXTLOAD %ptr(p0) :: (load 2)
+ %elt1:_(s32) = G_ZEXTLOAD %ptr_elt_1(p0) :: (volatile load 2)
+ %high_half:_(s32) = nuw G_SHL %elt1, %cst_16(s32)
+
+ %full_load:_(s32) = G_OR %low_half, %high_half
+ $w1 = COPY %full_load(s32)
+ RET_ReallyLR implicit $w1
+
+...
+---
+name: dont_wrong_memop_size
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $x0, $x1
+
+ ; Combine should only happen when the loads load the same size.
+
+ ; LITTLE-LABEL: name: dont_wrong_memop_size
+ ; LITTLE: liveins: $x0, $x1
+ ; LITTLE: %cst_1:_(s64) = G_CONSTANT i64 1
+ ; LITTLE: %cst_16:_(s32) = G_CONSTANT i32 16
+ ; LITTLE: %ptr:_(p0) = COPY $x1
+ ; LITTLE: %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s64)
+ ; LITTLE: %low_half:_(s32) = G_ZEXTLOAD %ptr(p0) :: (load 2)
+ ; LITTLE: %wrong_size_load:_(s32) = G_ZEXTLOAD %ptr_elt_1(p0) :: (load 1)
+ ; LITTLE: %high_half:_(s32) = nuw G_SHL %wrong_size_load, %cst_16(s32)
+ ; LITTLE: %full_load:_(s32) = G_OR %low_half, %high_half
+ ; LITTLE: $w1 = COPY %full_load(s32)
+ ; LITTLE: RET_ReallyLR implicit $w1
+ ; BIG-LABEL: name: dont_wrong_memop_size
+ ; BIG: liveins: $x0, $x1
+ ; BIG: %cst_1:_(s64) = G_CONSTANT i64 1
+ ; BIG: %cst_16:_(s32) = G_CONSTANT i32 16
+ ; BIG: %ptr:_(p0) = COPY $x1
+ ; BIG: %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s64)
+ ; BIG: %low_half:_(s32) = G_ZEXTLOAD %ptr(p0) :: (load 2)
+ ; BIG: %wrong_size_load:_(s32) = G_ZEXTLOAD %ptr_elt_1(p0) :: (load 1)
+ ; BIG: %high_half:_(s32) = nuw G_SHL %wrong_size_load, %cst_16(s32)
+ ; BIG: %full_load:_(s32) = G_OR %low_half, %high_half
+ ; BIG: $w1 = COPY %full_load(s32)
+ ; BIG: RET_ReallyLR implicit $w1
+ %cst_1:_(s64) = G_CONSTANT i64 1
+ %cst_16:_(s32) = G_CONSTANT i32 16
+
+ %ptr:_(p0) = COPY $x1
+ %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s64)
+
+ %low_half:_(s32) = G_ZEXTLOAD %ptr(p0) :: (load 2)
+ %wrong_size_load:_(s32) = G_ZEXTLOAD %ptr_elt_1(p0) :: (load 1)
+ %high_half:_(s32) = nuw G_SHL %wrong_size_load, %cst_16(s32)
+
+ %full_load:_(s32) = G_OR %low_half, %high_half
+ $w1 = COPY %full_load(s32)
+ RET_ReallyLR implicit $w1
+
+...
+---
+name: dont_combine_wrong_offset
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $x0, $x1
+
+ ; This is not equivalent to a 32-bit load with/without a BSWAP:
+ ;
+ ; s16 *x = ...
+ ; s32 y = x[0] | (x[1] << 24)
+
+ ; LITTLE-LABEL: name: dont_combine_wrong_offset
+ ; LITTLE: liveins: $x0, $x1
+ ; LITTLE: %cst_1:_(s64) = G_CONSTANT i64 1
+ ; LITTLE: %cst_24:_(s32) = G_CONSTANT i32 24
+ ; LITTLE: %ptr:_(p0) = COPY $x1
+ ; LITTLE: %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s64)
+ ; LITTLE: %low_half:_(s32) = G_ZEXTLOAD %ptr(p0) :: (load 2)
+ ; LITTLE: %elt1:_(s32) = G_ZEXTLOAD %ptr_elt_1(p0) :: (load 2)
+ ; LITTLE: %high_half:_(s32) = nuw G_SHL %elt1, %cst_24(s32)
+ ; LITTLE: %full_load:_(s32) = G_OR %low_half, %high_half
+ ; LITTLE: $w1 = COPY %full_load(s32)
+ ; LITTLE: RET_ReallyLR implicit $w1
+ ; BIG-LABEL: name: dont_combine_wrong_offset
+ ; BIG: liveins: $x0, $x1
+ ; BIG: %cst_1:_(s64) = G_CONSTANT i64 1
+ ; BIG: %cst_24:_(s32) = G_CONSTANT i32 24
+ ; BIG: %ptr:_(p0) = COPY $x1
+ ; BIG: %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s64)
+ ; BIG: %low_half:_(s32) = G_ZEXTLOAD %ptr(p0) :: (load 2)
+ ; BIG: %elt1:_(s32) = G_ZEXTLOAD %ptr_elt_1(p0) :: (load 2)
+ ; BIG: %high_half:_(s32) = nuw G_SHL %elt1, %cst_24(s32)
+ ; BIG: %full_load:_(s32) = G_OR %low_half, %high_half
+ ; BIG: $w1 = COPY %full_load(s32)
+ ; BIG: RET_ReallyLR implicit $w1
+ %cst_1:_(s64) = G_CONSTANT i64 1
+ %cst_24:_(s32) = G_CONSTANT i32 24
+
+ %ptr:_(p0) = COPY $x1
+ %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s64)
+
+ %low_half:_(s32) = G_ZEXTLOAD %ptr(p0) :: (load 2)
+ %elt1:_(s32) = G_ZEXTLOAD %ptr_elt_1(p0) :: (load 2)
+ %high_half:_(s32) = nuw G_SHL %elt1, %cst_24(s32)
+
+ %full_load:_(s32) = G_OR %low_half, %high_half
+ $w1 = COPY %full_load(s32)
+ RET_ReallyLR implicit $w1
+
+...
+---
+name: dont_combine_wrong_offset_2
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $x0, $x1
+
+ ; This does not correspond to a 32-bit load with/without a BSWAP:
+ ;
+ ; s16 *x = ...
+ ; s32 y = x[0] | (x[1] << 8)
+
+ ; LITTLE-LABEL: name: dont_combine_wrong_offset_2
+ ; LITTLE: liveins: $x0, $x1
+ ; LITTLE: %cst_1:_(s64) = G_CONSTANT i64 1
+ ; LITTLE: %cst_8:_(s32) = G_CONSTANT i32 8
+ ; LITTLE: %ptr:_(p0) = COPY $x1
+ ; LITTLE: %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s64)
+ ; LITTLE: %low_half:_(s32) = G_ZEXTLOAD %ptr(p0) :: (load 2)
+ ; LITTLE: %elt1:_(s32) = G_ZEXTLOAD %ptr_elt_1(p0) :: (load 2)
+ ; LITTLE: %high_half:_(s32) = nuw G_SHL %elt1, %cst_8(s32)
+ ; LITTLE: %full_load:_(s32) = G_OR %low_half, %high_half
+ ; LITTLE: $w1 = COPY %full_load(s32)
+ ; LITTLE: RET_ReallyLR implicit $w1
+ ; BIG-LABEL: name: dont_combine_wrong_offset_2
+ ; BIG: liveins: $x0, $x1
+ ; BIG: %cst_1:_(s64) = G_CONSTANT i64 1
+ ; BIG: %cst_8:_(s32) = G_CONSTANT i32 8
+ ; BIG: %ptr:_(p0) = COPY $x1
+ ; BIG: %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s64)
+ ; BIG: %low_half:_(s32) = G_ZEXTLOAD %ptr(p0) :: (load 2)
+ ; BIG: %elt1:_(s32) = G_ZEXTLOAD %ptr_elt_1(p0) :: (load 2)
+ ; BIG: %high_half:_(s32) = nuw G_SHL %elt1, %cst_8(s32)
+ ; BIG: %full_load:_(s32) = G_OR %low_half, %high_half
+ ; BIG: $w1 = COPY %full_load(s32)
+ ; BIG: RET_ReallyLR implicit $w1
+ %cst_1:_(s64) = G_CONSTANT i64 1
+ %cst_8:_(s32) = G_CONSTANT i32 8
+
+ %ptr:_(p0) = COPY $x1
+ %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s64)
+
+ %low_half:_(s32) = G_ZEXTLOAD %ptr(p0) :: (load 2)
+ %elt1:_(s32) = G_ZEXTLOAD %ptr_elt_1(p0) :: (load 2)
+ %high_half:_(s32) = nuw G_SHL %elt1, %cst_8(s32)
+
+ %full_load:_(s32) = G_OR %low_half, %high_half
+ $w1 = COPY %full_load(s32)
+ RET_ReallyLR implicit $w1
+
+...
+---
+name: dont_combine_missing_load
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $x0, $x1
+
+ ; This is missing x[2], so we shouldn't combine:
+ ;
+ ; s16 *x = ...
+ ; s64 y = (x[0] | (x[1] << 16)) | (x[3] << 48)
+
+ ; LITTLE-LABEL: name: dont_combine_missing_load
+ ; LITTLE: liveins: $x0, $x1
+ ; LITTLE: %cst_1:_(s64) = G_CONSTANT i64 1
+ ; LITTLE: %cst_3:_(s64) = G_CONSTANT i64 3
+ ; LITTLE: %cst_16:_(s64) = G_CONSTANT i64 16
+ ; LITTLE: %cst_48:_(s64) = G_CONSTANT i64 48
+ ; LITTLE: %ptr:_(p0) = COPY $x1
+ ; LITTLE: %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s64)
+ ; LITTLE: %ptr_elt_3:_(p0) = G_PTR_ADD %ptr, %cst_3(s64)
+ ; LITTLE: %byte0_byte1:_(s64) = G_ZEXTLOAD %ptr(p0) :: (load 2)
+ ; LITTLE: %elt1:_(s64) = G_ZEXTLOAD %ptr_elt_1(p0) :: (load 2)
+ ; LITTLE: %elt3:_(s64) = G_ZEXTLOAD %ptr_elt_3(p0) :: (load 2)
+ ; LITTLE: %byte2_byte3:_(s64) = nuw G_SHL %elt1, %cst_16(s64)
+ ; LITTLE: %byte6_byte7:_(s64) = nuw G_SHL %elt3, %cst_48(s64)
+ ; LITTLE: %or1:_(s64) = G_OR %byte0_byte1, %byte2_byte3
+ ; LITTLE: %full_load:_(s64) = G_OR %or1, %byte6_byte7
+ ; LITTLE: $x1 = COPY %full_load(s64)
+ ; LITTLE: RET_ReallyLR implicit $x1
+ ; BIG-LABEL: name: dont_combine_missing_load
+ ; BIG: liveins: $x0, $x1
+ ; BIG: %cst_1:_(s64) = G_CONSTANT i64 1
+ ; BIG: %cst_3:_(s64) = G_CONSTANT i64 3
+ ; BIG: %cst_16:_(s64) = G_CONSTANT i64 16
+ ; BIG: %cst_48:_(s64) = G_CONSTANT i64 48
+ ; BIG: %ptr:_(p0) = COPY $x1
+ ; BIG: %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s64)
+ ; BIG: %ptr_elt_3:_(p0) = G_PTR_ADD %ptr, %cst_3(s64)
+ ; BIG: %byte0_byte1:_(s64) = G_ZEXTLOAD %ptr(p0) :: (load 2)
+ ; BIG: %elt1:_(s64) = G_ZEXTLOAD %ptr_elt_1(p0) :: (load 2)
+ ; BIG: %elt3:_(s64) = G_ZEXTLOAD %ptr_elt_3(p0) :: (load 2)
+ ; BIG: %byte2_byte3:_(s64) = nuw G_SHL %elt1, %cst_16(s64)
+ ; BIG: %byte6_byte7:_(s64) = nuw G_SHL %elt3, %cst_48(s64)
+ ; BIG: %or1:_(s64) = G_OR %byte0_byte1, %byte2_byte3
+ ; BIG: %full_load:_(s64) = G_OR %or1, %byte6_byte7
+ ; BIG: $x1 = COPY %full_load(s64)
+ ; BIG: RET_ReallyLR implicit $x1
+ %cst_1:_(s64) = G_CONSTANT i64 1
+ %cst_3:_(s64) = G_CONSTANT i64 3
+
+ %cst_16:_(s64) = G_CONSTANT i64 16
+ %cst_48:_(s64) = G_CONSTANT i64 48
+
+ %ptr:_(p0) = COPY $x1
+ %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s64)
+ %ptr_elt_3:_(p0) = G_PTR_ADD %ptr, %cst_3(s64)
+
+ %byte0_byte1:_(s64) = G_ZEXTLOAD %ptr(p0) :: (load 2)
+
+ %elt1:_(s64) = G_ZEXTLOAD %ptr_elt_1(p0) :: (load 2)
+ %elt3:_(s64) = G_ZEXTLOAD %ptr_elt_3(p0) :: (load 2)
+
+ %byte2_byte3:_(s64) = nuw G_SHL %elt1, %cst_16(s64)
+ %byte6_byte7:_(s64) = nuw G_SHL %elt3, %cst_48(s64)
+
+ %or1:_(s64) = G_OR %byte0_byte1, %byte2_byte3
+ %full_load:_(s64) = G_OR %or1, %byte6_byte7
+
+ $x1 = COPY %full_load(s64)
+ RET_ReallyLR implicit $x1
+
+...
+---
+name: dont_combine_different_addr_spaces
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $x0, $x1
+
+ ; When the loads are from different address spaces, don't combine.
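+ ;
+ ; Illustrative sketch only (names invented; the second load below is in
+ ; addrspace 1 while the first is in addrspace 0):
+ ;
+ ; s16 *x = ...
+ ; s32 y = x[0] | (x[1] << 16)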
+
+ ; LITTLE-LABEL: name: dont_combine_different_addr_spaces
+ ; LITTLE: liveins: $x0, $x1
+ ; LITTLE: %cst_1:_(s64) = G_CONSTANT i64 1
+ ; LITTLE: %cst_16:_(s32) = G_CONSTANT i32 16
+ ; LITTLE: %ptr:_(p0) = COPY $x1
+ ; LITTLE: %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s64)
+ ; LITTLE: %low_half:_(s32) = G_ZEXTLOAD %ptr(p0) :: (load 2)
+ ; LITTLE: %elt1:_(s32) = G_ZEXTLOAD %ptr_elt_1(p0) :: (load 2, addrspace 1)
+ ; LITTLE: %high_half:_(s32) = nuw G_SHL %elt1, %cst_16(s32)
+ ; LITTLE: %full_load:_(s32) = G_OR %low_half, %high_half
+ ; LITTLE: $w1 = COPY %full_load(s32)
+ ; LITTLE: RET_ReallyLR implicit $w1
+ ; BIG-LABEL: name: dont_combine_different_addr_spaces
+ ; BIG: liveins: $x0, $x1
+ ; BIG: %cst_1:_(s64) = G_CONSTANT i64 1
+ ; BIG: %cst_16:_(s32) = G_CONSTANT i32 16
+ ; BIG: %ptr:_(p0) = COPY $x1
+ ; BIG: %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s64)
+ ; BIG: %low_half:_(s32) = G_ZEXTLOAD %ptr(p0) :: (load 2)
+ ; BIG: %elt1:_(s32) = G_ZEXTLOAD %ptr_elt_1(p0) :: (load 2, addrspace 1)
+ ; BIG: %high_half:_(s32) = nuw G_SHL %elt1, %cst_16(s32)
+ ; BIG: %full_load:_(s32) = G_OR %low_half, %high_half
+ ; BIG: $w1 = COPY %full_load(s32)
+ ; BIG: RET_ReallyLR implicit $w1
+ %cst_1:_(s64) = G_CONSTANT i64 1
+ %cst_16:_(s32) = G_CONSTANT i32 16
+
+ %ptr:_(p0) = COPY $x1
+ %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s64)
+
+ %low_half:_(s32) = G_ZEXTLOAD %ptr(p0) :: (load 2, addrspace 0)
+ %elt1:_(s32) = G_ZEXTLOAD %ptr_elt_1(p0) :: (load 2, addrspace 1)
+ %high_half:_(s32) = nuw G_SHL %elt1, %cst_16(s32)
+
+ %full_load:_(s32) = G_OR %low_half, %high_half
+ $w1 = COPY %full_load(s32)
+ RET_ReallyLR implicit $w1
+
+...
+---
+name: dont_combine_duplicate_idx
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $x0, $x1
+
+ ; If two of the G_PTR_ADDs have the same index, then don't combine.
+ ;
+ ; sN *x = ...
+ ; sM y = (x[i] << A) | (x[i] << B) ...
+
+ ; LITTLE-LABEL: name: dont_combine_duplicate_idx
+ ; LITTLE: liveins: $x0, $x1
+ ; LITTLE: %cst_1:_(s32) = G_CONSTANT i32 1
+ ; LITTLE: %reused_idx:_(s32) = G_CONSTANT i32 2
+ ; LITTLE: %cst_8:_(s32) = G_CONSTANT i32 8
+ ; LITTLE: %cst_16:_(s32) = G_CONSTANT i32 16
+ ; LITTLE: %cst_24:_(s32) = G_CONSTANT i32 24
+ ; LITTLE: %ptr:_(p0) = COPY $x1
+ ; LITTLE: %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s32)
+ ; LITTLE: %uses_idx_2:_(p0) = G_PTR_ADD %ptr, %reused_idx(s32)
+ ; LITTLE: %also_uses_idx_2:_(p0) = G_PTR_ADD %ptr, %reused_idx(s32)
+ ; LITTLE: %byte0:_(s32) = G_ZEXTLOAD %ptr(p0) :: (load 1)
+ ; LITTLE: %elt1:_(s32) = G_ZEXTLOAD %ptr_elt_1(p0) :: (load 1)
+ ; LITTLE: %elt2:_(s32) = G_ZEXTLOAD %uses_idx_2(p0) :: (load 1)
+ ; LITTLE: %elt3:_(s32) = G_ZEXTLOAD %also_uses_idx_2(p0) :: (load 1)
+ ; LITTLE: %byte1:_(s32) = nuw G_SHL %elt1, %cst_8(s32)
+ ; LITTLE: %byte2:_(s32) = nuw G_SHL %elt2, %cst_16(s32)
+ ; LITTLE: %byte3:_(s32) = nuw G_SHL %elt3, %cst_24(s32)
+ ; LITTLE: %or1:_(s32) = G_OR %byte0, %byte1
+ ; LITTLE: %or2:_(s32) = G_OR %byte2, %byte3
+ ; LITTLE: %full_load:_(s32) = G_OR %or1, %or2
+ ; LITTLE: $w1 = COPY %full_load(s32)
+ ; LITTLE: RET_ReallyLR implicit $w1
+ ; BIG-LABEL: name: dont_combine_duplicate_idx
+ ; BIG: liveins: $x0, $x1
+ ; BIG: %cst_1:_(s32) = G_CONSTANT i32 1
+ ; BIG: %reused_idx:_(s32) = G_CONSTANT i32 2
+ ; BIG: %cst_8:_(s32) = G_CONSTANT i32 8
+ ; BIG: %cst_16:_(s32) = G_CONSTANT i32 16
+ ; BIG: %cst_24:_(s32) = G_CONSTANT i32 24
+ ; BIG: %ptr:_(p0) = COPY $x1
+ ; BIG: %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s32)
+ ; BIG: %uses_idx_2:_(p0) = G_PTR_ADD %ptr, %reused_idx(s32)
+ ; BIG: %also_uses_idx_2:_(p0) = G_PTR_ADD %ptr, %reused_idx(s32)
+ ; BIG: %byte0:_(s32) = G_ZEXTLOAD %ptr(p0) :: (load 1)
+ ; BIG: %elt1:_(s32) = G_ZEXTLOAD %ptr_elt_1(p0) :: (load 1)
+ ; BIG: %elt2:_(s32) = G_ZEXTLOAD %uses_idx_2(p0) :: (load 1)
+ ; BIG: %elt3:_(s32) = G_ZEXTLOAD %also_uses_idx_2(p0) :: (load 1)
+ ; BIG: %byte1:_(s32) = nuw G_SHL %elt1, %cst_8(s32)
+ ; BIG: %byte2:_(s32) = nuw G_SHL %elt2, %cst_16(s32)
+ ; BIG: %byte3:_(s32) = nuw G_SHL %elt3, %cst_24(s32)
+ ; BIG: %or1:_(s32) = G_OR %byte0, %byte1
+ ; BIG: %or2:_(s32) = G_OR %byte2, %byte3
+ ; BIG: %full_load:_(s32) = G_OR %or1, %or2
+ ; BIG: $w1 = COPY %full_load(s32)
+ ; BIG: RET_ReallyLR implicit $w1
+ %cst_1:_(s32) = G_CONSTANT i32 1
+ %reused_idx:_(s32) = G_CONSTANT i32 2
+
+ %cst_8:_(s32) = G_CONSTANT i32 8
+ %cst_16:_(s32) = G_CONSTANT i32 16
+ %cst_24:_(s32) = G_CONSTANT i32 24
+
+ %ptr:_(p0) = COPY $x1
+ %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s32)
+ %uses_idx_2:_(p0) = G_PTR_ADD %ptr, %reused_idx(s32)
+ %also_uses_idx_2:_(p0) = G_PTR_ADD %ptr, %reused_idx(s32)
+
+ %byte0:_(s32) = G_ZEXTLOAD %ptr(p0) :: (load 1)
+
+ %elt1:_(s32) = G_ZEXTLOAD %ptr_elt_1(p0) :: (load 1)
+ %elt2:_(s32) = G_ZEXTLOAD %uses_idx_2(p0) :: (load 1)
+ %elt3:_(s32) = G_ZEXTLOAD %also_uses_idx_2(p0) :: (load 1)
+
+ %byte1:_(s32) = nuw G_SHL %elt1, %cst_8(s32)
+ %byte2:_(s32) = nuw G_SHL %elt2, %cst_16(s32)
+ %byte3:_(s32) = nuw G_SHL %elt3, %cst_24(s32)
+
+ %or1:_(s32) = G_OR %byte0, %byte1
+ %or2:_(s32) = G_OR %byte2, %byte3
+ %full_load:_(s32) = G_OR %or1, %or2
+
+ $w1 = COPY %full_load(s32)
+ RET_ReallyLR implicit $w1
+...
+---
+name: dont_combine_duplicate_offset
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $x0, $x1
+
+ ; If two of the G_SHLs have the same constant, then we should not combine.
+ ;
+ ; sN *x = ...
+ ; sM y = (x[i] << A) | (x[i+1] << A) ...
+
+ ; LITTLE-LABEL: name: dont_combine_duplicate_offset
+ ; LITTLE: liveins: $x0, $x1
+ ; LITTLE: %cst_1:_(s32) = G_CONSTANT i32 1
+ ; LITTLE: %cst_2:_(s32) = G_CONSTANT i32 2
+ ; LITTLE: %cst_3:_(s32) = G_CONSTANT i32 3
+ ; LITTLE: %cst_8:_(s32) = G_CONSTANT i32 8
+ ; LITTLE: %duplicate_shl_cst:_(s32) = G_CONSTANT i32 16
+ ; LITTLE: %ptr:_(p0) = COPY $x1
+ ; LITTLE: %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s32)
+ ; LITTLE: %ptr_elt_2:_(p0) = G_PTR_ADD %ptr, %cst_2(s32)
+ ; LITTLE: %ptr_elt_3:_(p0) = G_PTR_ADD %ptr, %cst_3(s32)
+ ; LITTLE: %byte0:_(s32) = G_ZEXTLOAD %ptr(p0) :: (load 1)
+ ; LITTLE: %elt1:_(s32) = G_ZEXTLOAD %ptr_elt_1(p0) :: (load 1)
+ ; LITTLE: %elt2:_(s32) = G_ZEXTLOAD %ptr_elt_2(p0) :: (load 1)
+ ; LITTLE: %elt3:_(s32) = G_ZEXTLOAD %ptr_elt_3(p0) :: (load 1)
+ ; LITTLE: %byte1:_(s32) = nuw G_SHL %elt1, %cst_8(s32)
+ ; LITTLE: %duplicate_shl_1:_(s32) = nuw G_SHL %elt2, %duplicate_shl_cst(s32)
+ ; LITTLE: %duplicate_shl_2:_(s32) = nuw G_SHL %elt3, %duplicate_shl_cst(s32)
+ ; LITTLE: %or1:_(s32) = G_OR %byte0, %byte1
+ ; LITTLE: %or2:_(s32) = G_OR %duplicate_shl_1, %duplicate_shl_2
+ ; LITTLE: %full_load:_(s32) = G_OR %or1, %or2
+ ; LITTLE: $w1 = COPY %full_load(s32)
+ ; LITTLE: RET_ReallyLR implicit $w1
+ ; BIG-LABEL: name: dont_combine_duplicate_offset
+ ; BIG: liveins: $x0, $x1
+ ; BIG: %cst_1:_(s32) = G_CONSTANT i32 1
+ ; BIG: %cst_2:_(s32) = G_CONSTANT i32 2
+ ; BIG: %cst_3:_(s32) = G_CONSTANT i32 3
+ ; BIG: %cst_8:_(s32) = G_CONSTANT i32 8
+ ; BIG: %duplicate_shl_cst:_(s32) = G_CONSTANT i32 16
+ ; BIG: %ptr:_(p0) = COPY $x1
+ ; BIG: %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s32)
+ ; BIG: %ptr_elt_2:_(p0) = G_PTR_ADD %ptr, %cst_2(s32)
+ ; BIG: %ptr_elt_3:_(p0) = G_PTR_ADD %ptr, %cst_3(s32)
+ ; BIG: %byte0:_(s32) = G_ZEXTLOAD %ptr(p0) :: (load 1)
+ ; BIG: %elt1:_(s32) = G_ZEXTLOAD %ptr_elt_1(p0) :: (load 1)
+ ; BIG: %elt2:_(s32) = G_ZEXTLOAD %ptr_elt_2(p0) :: (load 1)
+ ; BIG: %elt3:_(s32) = G_ZEXTLOAD %ptr_elt_3(p0) :: (load 1)
+ ; BIG: %byte1:_(s32) = nuw G_SHL %elt1, %cst_8(s32)
+ ; BIG: %duplicate_shl_1:_(s32) = nuw G_SHL %elt2, %duplicate_shl_cst(s32)
+ ; BIG: %duplicate_shl_2:_(s32) = nuw G_SHL %elt3, %duplicate_shl_cst(s32)
+ ; BIG: %or1:_(s32) = G_OR %byte0, %byte1
+ ; BIG: %or2:_(s32) = G_OR %duplicate_shl_1, %duplicate_shl_2
+ ; BIG: %full_load:_(s32) = G_OR %or1, %or2
+ ; BIG: $w1 = COPY %full_load(s32)
+ ; BIG: RET_ReallyLR implicit $w1
+ %cst_1:_(s32) = G_CONSTANT i32 1
+ %cst_2:_(s32) = G_CONSTANT i32 2
+ %cst_3:_(s32) = G_CONSTANT i32 3
+
+ %cst_8:_(s32) = G_CONSTANT i32 8
+ %duplicate_shl_cst:_(s32) = G_CONSTANT i32 16
+
+ %ptr:_(p0) = COPY $x1
+ %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s32)
+ %ptr_elt_2:_(p0) = G_PTR_ADD %ptr, %cst_2(s32)
+ %ptr_elt_3:_(p0) = G_PTR_ADD %ptr, %cst_3(s32)
+
+ %byte0:_(s32) = G_ZEXTLOAD %ptr(p0) :: (load 1)
+
+ %elt1:_(s32) = G_ZEXTLOAD %ptr_elt_1(p0) :: (load 1)
+ %elt2:_(s32) = G_ZEXTLOAD %ptr_elt_2(p0) :: (load 1)
+ %elt3:_(s32) = G_ZEXTLOAD %ptr_elt_3(p0) :: (load 1)
+
+ %byte1:_(s32) = nuw G_SHL %elt1, %cst_8(s32)
+ %duplicate_shl_1:_(s32) = nuw G_SHL %elt2, %duplicate_shl_cst(s32)
+ %duplicate_shl_2:_(s32) = nuw G_SHL %elt3, %duplicate_shl_cst(s32)
+
+ %or1:_(s32) = G_OR %byte0, %byte1
+ %or2:_(s32) = G_OR %duplicate_shl_1, %duplicate_shl_2
+ %full_load:_(s32) = G_OR %or1, %or2
+
+ $w1 = COPY %full_load(s32)
+ RET_ReallyLR implicit $w1
+
+...
+---
+name: dont_combine_lowest_index_not_zero_offset
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $x0, $x1
+
+ ; In this case, the lowest index load (e.g. x[0]) does not end up at byte
+ ; offset 0. We shouldn't combine.
+ ;
+ ; s8 *x = ...
+ ; s32 y = (x[0] << 8) | (x[1]) | (x[2] << 16) ...
+
+ ; LITTLE-LABEL: name: dont_combine_lowest_index_not_zero_offset
+ ; LITTLE: liveins: $x0, $x1
+ ; LITTLE: %cst_1:_(s32) = G_CONSTANT i32 1
+ ; LITTLE: %cst_2:_(s32) = G_CONSTANT i32 2
+ ; LITTLE: %cst_3:_(s32) = G_CONSTANT i32 3
+ ; LITTLE: %cst_8:_(s32) = G_CONSTANT i32 8
+ ; LITTLE: %cst_16:_(s32) = G_CONSTANT i32 16
+ ; LITTLE: %cst_24:_(s32) = G_CONSTANT i32 24
+ ; LITTLE: %ptr:_(p0) = COPY $x1
+ ; LITTLE: %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s32)
+ ; LITTLE: %ptr_elt_2:_(p0) = G_PTR_ADD %ptr, %cst_2(s32)
+ ; LITTLE: %ptr_elt_3:_(p0) = G_PTR_ADD %ptr, %cst_3(s32)
+ ; LITTLE: %lowest_idx_load:_(s32) = G_ZEXTLOAD %ptr(p0) :: (load 1)
+ ; LITTLE: %byte0:_(s32) = G_ZEXTLOAD %ptr_elt_1(p0) :: (load 1)
+ ; LITTLE: %elt2:_(s32) = G_ZEXTLOAD %ptr_elt_2(p0) :: (load 1)
+ ; LITTLE: %elt3:_(s32) = G_ZEXTLOAD %ptr_elt_3(p0) :: (load 1)
+ ; LITTLE: %byte1:_(s32) = nuw G_SHL %lowest_idx_load, %cst_8(s32)
+ ; LITTLE: %byte2:_(s32) = nuw G_SHL %elt2, %cst_16(s32)
+ ; LITTLE: %byte3:_(s32) = nuw G_SHL %elt3, %cst_24(s32)
+ ; LITTLE: %or1:_(s32) = G_OR %byte0, %byte1
+ ; LITTLE: %or2:_(s32) = G_OR %byte2, %byte3
+ ; LITTLE: %full_load:_(s32) = G_OR %or1, %or2
+ ; LITTLE: $w1 = COPY %full_load(s32)
+ ; LITTLE: RET_ReallyLR implicit $w1
+ ; BIG-LABEL: name: dont_combine_lowest_index_not_zero_offset
+ ; BIG: liveins: $x0, $x1
+ ; BIG: %cst_1:_(s32) = G_CONSTANT i32 1
+ ; BIG: %cst_2:_(s32) = G_CONSTANT i32 2
+ ; BIG: %cst_3:_(s32) = G_CONSTANT i32 3
+ ; BIG: %cst_8:_(s32) = G_CONSTANT i32 8
+ ; BIG: %cst_16:_(s32) = G_CONSTANT i32 16
+ ; BIG: %cst_24:_(s32) = G_CONSTANT i32 24
+ ; BIG: %ptr:_(p0) = COPY $x1
+ ; BIG: %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s32)
+ ; BIG: %ptr_elt_2:_(p0) = G_PTR_ADD %ptr, %cst_2(s32)
+ ; BIG: %ptr_elt_3:_(p0) = G_PTR_ADD %ptr, %cst_3(s32)
+ ; BIG: %lowest_idx_load:_(s32) = G_ZEXTLOAD %ptr(p0) :: (load 1)
+ ; BIG: %byte0:_(s32) = G_ZEXTLOAD %ptr_elt_1(p0) :: (load 1)
+ ; BIG: %elt2:_(s32) = G_ZEXTLOAD %ptr_elt_2(p0) :: (load 1)
+ ; BIG: %elt3:_(s32) = G_ZEXTLOAD %ptr_elt_3(p0) :: (load 1)
+ ; BIG: %byte1:_(s32) = nuw G_SHL %lowest_idx_load, %cst_8(s32)
+ ; BIG: %byte2:_(s32) = nuw G_SHL %elt2, %cst_16(s32)
+ ; BIG: %byte3:_(s32) = nuw G_SHL %elt3, %cst_24(s32)
+ ; BIG: %or1:_(s32) = G_OR %byte0, %byte1
+ ; BIG: %or2:_(s32) = G_OR %byte2, %byte3
+ ; BIG: %full_load:_(s32) = G_OR %or1, %or2
+ ; BIG: $w1 = COPY %full_load(s32)
+ ; BIG: RET_ReallyLR implicit $w1
+ %cst_1:_(s32) = G_CONSTANT i32 1
+ %cst_2:_(s32) = G_CONSTANT i32 2
+ %cst_3:_(s32) = G_CONSTANT i32 3
+
+ %cst_8:_(s32) = G_CONSTANT i32 8
+ %cst_16:_(s32) = G_CONSTANT i32 16
+ %cst_24:_(s32) = G_CONSTANT i32 24
+
+ %ptr:_(p0) = COPY $x1
+ %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s32)
+ %ptr_elt_2:_(p0) = G_PTR_ADD %ptr, %cst_2(s32)
+ %ptr_elt_3:_(p0) = G_PTR_ADD %ptr, %cst_3(s32)
+
+ ; This load is index 0
+ %lowest_idx_load:_(s32) = G_ZEXTLOAD %ptr(p0) :: (load 1)
+ %byte0:_(s32) = G_ZEXTLOAD %ptr_elt_1(p0) :: (load 1)
+ %elt2:_(s32) = G_ZEXTLOAD %ptr_elt_2(p0) :: (load 1)
+ %elt3:_(s32) = G_ZEXTLOAD %ptr_elt_3(p0) :: (load 1)
+
+ ; ... But it ends up being shifted, so we shouldn't combine.
+ %byte1:_(s32) = nuw G_SHL %lowest_idx_load, %cst_8(s32)
+ %byte2:_(s32) = nuw G_SHL %elt2, %cst_16(s32)
+ %byte3:_(s32) = nuw G_SHL %elt3, %cst_24(s32)
+
+ %or1:_(s32) = G_OR %byte0, %byte1
+ %or2:_(s32) = G_OR %byte2, %byte3
+ %full_load:_(s32) = G_OR %or1, %or2
+
+ $w1 = COPY %full_load(s32)
+ RET_ReallyLR implicit $w1
+
+...
+---
+name: dont_combine_more_than_one_use_load
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $x0, $x1
+
+ ; If any load is used more than once, don't combine. We want to remove the
+ ; entire tree.
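+ ;
+ ; Illustrative sketch only (names invented; the AND reuses the first load):
+ ;
+ ; s16 *x = ...
+ ; s32 wide = x[0] | (x[1] << 16)
+ ; s32 y = wide & x[0]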
+
+ ; LITTLE-LABEL: name: dont_combine_more_than_one_use_load
+ ; LITTLE: liveins: $x0, $x1
+ ; LITTLE: %cst_1:_(s64) = G_CONSTANT i64 1
+ ; LITTLE: %cst_16:_(s32) = G_CONSTANT i32 16
+ ; LITTLE: %ptr:_(p0) = COPY $x1
+ ; LITTLE: %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s64)
+ ; LITTLE: %low_half:_(s32) = G_ZEXTLOAD %ptr(p0) :: (load 2)
+ ; LITTLE: %elt1:_(s32) = G_ZEXTLOAD %ptr_elt_1(p0) :: (load 2)
+ ; LITTLE: %high_half:_(s32) = nuw G_SHL %elt1, %cst_16(s32)
+ ; LITTLE: %full_load:_(s32) = G_OR %low_half, %high_half
+ ; LITTLE: %extra_use:_(s32) = G_AND %full_load, %low_half
+ ; LITTLE: $w1 = COPY %extra_use(s32)
+ ; LITTLE: RET_ReallyLR implicit $w1
+ ; BIG-LABEL: name: dont_combine_more_than_one_use_load
+ ; BIG: liveins: $x0, $x1
+ ; BIG: %cst_1:_(s64) = G_CONSTANT i64 1
+ ; BIG: %cst_16:_(s32) = G_CONSTANT i32 16
+ ; BIG: %ptr:_(p0) = COPY $x1
+ ; BIG: %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s64)
+ ; BIG: %low_half:_(s32) = G_ZEXTLOAD %ptr(p0) :: (load 2)
+ ; BIG: %elt1:_(s32) = G_ZEXTLOAD %ptr_elt_1(p0) :: (load 2)
+ ; BIG: %high_half:_(s32) = nuw G_SHL %elt1, %cst_16(s32)
+ ; BIG: %full_load:_(s32) = G_OR %low_half, %high_half
+ ; BIG: %extra_use:_(s32) = G_AND %full_load, %low_half
+ ; BIG: $w1 = COPY %extra_use(s32)
+ ; BIG: RET_ReallyLR implicit $w1
+ %cst_1:_(s64) = G_CONSTANT i64 1
+ %cst_16:_(s32) = G_CONSTANT i32 16
+
+ %ptr:_(p0) = COPY $x1
+ %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s64)
+
+ %low_half:_(s32) = G_ZEXTLOAD %ptr(p0) :: (load 2)
+ %elt1:_(s32) = G_ZEXTLOAD %ptr_elt_1(p0) :: (load 2)
+ %high_half:_(s32) = nuw G_SHL %elt1, %cst_16(s32)
+
+ %full_load:_(s32) = G_OR %low_half, %high_half
+ %extra_use:_(s32) = G_AND %full_load, %low_half
+ $w1 = COPY %extra_use(s32)
+ RET_ReallyLR implicit $w1
+
+...
+---
+name: dont_combine_more_than_one_use_shl
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $x0, $x1
+
+ ; If anything feeding into any of the ors is used more than once, don't
+ ; combine.
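+ ;
+ ; Illustrative sketch only (names invented; the AND reuses the shifted
+ ; value):
+ ;
+ ; s16 *x = ...
+ ; s32 hi = x[1] << 16
+ ; s32 wide = x[0] | hi
+ ; s32 y = wide & hi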
+
+ ; LITTLE-LABEL: name: dont_combine_more_than_one_use_shl
+ ; LITTLE: liveins: $x0, $x1
+ ; LITTLE: %cst_1:_(s64) = G_CONSTANT i64 1
+ ; LITTLE: %cst_16:_(s32) = G_CONSTANT i32 16
+ ; LITTLE: %ptr:_(p0) = COPY $x1
+ ; LITTLE: %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s64)
+ ; LITTLE: %low_half:_(s32) = G_ZEXTLOAD %ptr(p0) :: (load 2)
+ ; LITTLE: %elt1:_(s32) = G_ZEXTLOAD %ptr_elt_1(p0) :: (load 2)
+ ; LITTLE: %high_half:_(s32) = nuw G_SHL %elt1, %cst_16(s32)
+ ; LITTLE: %full_load:_(s32) = G_OR %low_half, %high_half
+ ; LITTLE: %extra_use:_(s32) = G_AND %full_load, %high_half
+ ; LITTLE: $w1 = COPY %extra_use(s32)
+ ; LITTLE: RET_ReallyLR implicit $w1
+ ; BIG-LABEL: name: dont_combine_more_than_one_use_shl
+ ; BIG: liveins: $x0, $x1
+ ; BIG: %cst_1:_(s64) = G_CONSTANT i64 1
+ ; BIG: %cst_16:_(s32) = G_CONSTANT i32 16
+ ; BIG: %ptr:_(p0) = COPY $x1
+ ; BIG: %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s64)
+ ; BIG: %low_half:_(s32) = G_ZEXTLOAD %ptr(p0) :: (load 2)
+ ; BIG: %elt1:_(s32) = G_ZEXTLOAD %ptr_elt_1(p0) :: (load 2)
+ ; BIG: %high_half:_(s32) = nuw G_SHL %elt1, %cst_16(s32)
+ ; BIG: %full_load:_(s32) = G_OR %low_half, %high_half
+ ; BIG: %extra_use:_(s32) = G_AND %full_load, %high_half
+ ; BIG: $w1 = COPY %extra_use(s32)
+ ; BIG: RET_ReallyLR implicit $w1
+ %cst_1:_(s64) = G_CONSTANT i64 1
+ %cst_16:_(s32) = G_CONSTANT i32 16
+
+ %ptr:_(p0) = COPY $x1
+ %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s64)
+
+ %low_half:_(s32) = G_ZEXTLOAD %ptr(p0) :: (load 2)
+ %elt1:_(s32) = G_ZEXTLOAD %ptr_elt_1(p0) :: (load 2)
+ %high_half:_(s32) = nuw G_SHL %elt1, %cst_16(s32)
+
+ %full_load:_(s32) = G_OR %low_half, %high_half
+ %extra_use:_(s32) = G_AND %full_load, %high_half
+ $w1 = COPY %extra_use(s32)
+ RET_ReallyLR implicit $w1
+
+...
+---
+name: dont_combine_store_between_same_mbb
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $x0, $x1
+ ; If there is a store between any of the loads, then do not combine.
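+ ;
+ ; Illustrative sketch only (names invented; the store may alias the loads):
+ ;
+ ; s16 *x = ...
+ ; s32 lo = x[0]
+ ; *other = 12
+ ; s32 y = lo | (x[1] << 16)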
+
+ ; LITTLE-LABEL: name: dont_combine_store_between_same_mbb
+ ; LITTLE: liveins: $x0, $x1
+ ; LITTLE: %cst_1:_(s64) = G_CONSTANT i64 1
+ ; LITTLE: %cst_16:_(s32) = G_CONSTANT i32 16
+ ; LITTLE: %ptr:_(p0) = COPY $x1
+ ; LITTLE: %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s64)
+ ; LITTLE: %low_half:_(s32) = G_ZEXTLOAD %ptr(p0) :: (load 2)
+ ; LITTLE: %other_ptr:_(p0) = COPY $x1
+ ; LITTLE: %some_val:_(s32) = G_CONSTANT i32 12
+ ; LITTLE: G_STORE %some_val(s32), %other_ptr(p0) :: (store 2)
+ ; LITTLE: %elt1:_(s32) = G_ZEXTLOAD %ptr_elt_1(p0) :: (load 2)
+ ; LITTLE: %high_half:_(s32) = nuw G_SHL %elt1, %cst_16(s32)
+ ; LITTLE: %full_load:_(s32) = G_OR %low_half, %high_half
+ ; LITTLE: $w1 = COPY %full_load(s32)
+ ; LITTLE: RET_ReallyLR implicit $w1
+ ; BIG-LABEL: name: dont_combine_store_between_same_mbb
+ ; BIG: liveins: $x0, $x1
+ ; BIG: %cst_1:_(s64) = G_CONSTANT i64 1
+ ; BIG: %cst_16:_(s32) = G_CONSTANT i32 16
+ ; BIG: %ptr:_(p0) = COPY $x1
+ ; BIG: %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s64)
+ ; BIG: %low_half:_(s32) = G_ZEXTLOAD %ptr(p0) :: (load 2)
+ ; BIG: %other_ptr:_(p0) = COPY $x1
+ ; BIG: %some_val:_(s32) = G_CONSTANT i32 12
+ ; BIG: G_STORE %some_val(s32), %other_ptr(p0) :: (store 2)
+ ; BIG: %elt1:_(s32) = G_ZEXTLOAD %ptr_elt_1(p0) :: (load 2)
+ ; BIG: %high_half:_(s32) = nuw G_SHL %elt1, %cst_16(s32)
+ ; BIG: %full_load:_(s32) = G_OR %low_half, %high_half
+ ; BIG: $w1 = COPY %full_load(s32)
+ ; BIG: RET_ReallyLR implicit $w1
+ %cst_1:_(s64) = G_CONSTANT i64 1
+ %cst_16:_(s32) = G_CONSTANT i32 16
+
+ %ptr:_(p0) = COPY $x1
+ %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s64)
+ %low_half:_(s32) = G_ZEXTLOAD %ptr(p0) :: (load 2)
+
+ ; Memory could be modified here, so don't combine!
+ %other_ptr:_(p0) = COPY $x1
+ %some_val:_(s32) = G_CONSTANT i32 12
+ G_STORE %some_val, %other_ptr :: (store 2)
+
+ %elt1:_(s32) = G_ZEXTLOAD %ptr_elt_1(p0) :: (load 2)
+ %high_half:_(s32) = nuw G_SHL %elt1, %cst_16(s32)
+
+ %full_load:_(s32) = G_OR %low_half, %high_half
+ $w1 = COPY %full_load(s32)
+ RET_ReallyLR implicit $w1
+
+...
+---
+name: dont_combine_store_between_different_mbb
+tracksRegLiveness: true
+body: |
+ ; There is a store between the two loads, hidden away in a different MBB.
+ ; We should not combine here.
+
+ ; LITTLE-LABEL: name: dont_combine_store_between_different_mbb
+ ; LITTLE: bb.0:
+ ; LITTLE: successors: %bb.1(0x80000000)
+ ; LITTLE: liveins: $x0, $x1
+ ; LITTLE: %cst_1:_(s64) = G_CONSTANT i64 1
+ ; LITTLE: %cst_16:_(s32) = G_CONSTANT i32 16
+ ; LITTLE: %ptr:_(p0) = COPY $x1
+ ; LITTLE: %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s64)
+ ; LITTLE: %low_half:_(s32) = G_ZEXTLOAD %ptr(p0) :: (load 2)
+ ; LITTLE: bb.1:
+ ; LITTLE: successors: %bb.2(0x80000000)
+ ; LITTLE: liveins: $x0, $x1
+ ; LITTLE: %other_ptr:_(p0) = COPY $x1
+ ; LITTLE: %some_val:_(s32) = G_CONSTANT i32 12
+ ; LITTLE: G_STORE %some_val(s32), %other_ptr(p0) :: (store 2)
+ ; LITTLE: bb.2:
+ ; LITTLE: liveins: $x0, $x1
+ ; LITTLE: %elt1:_(s32) = G_ZEXTLOAD %ptr_elt_1(p0) :: (load 2)
+ ; LITTLE: %high_half:_(s32) = nuw G_SHL %elt1, %cst_16(s32)
+ ; LITTLE: %full_load:_(s32) = G_OR %low_half, %high_half
+ ; LITTLE: $w1 = COPY %full_load(s32)
+ ; LITTLE: RET_ReallyLR implicit $w1
+ ; BIG-LABEL: name: dont_combine_store_between_different_mbb
+ ; BIG: bb.0:
+ ; BIG: successors: %bb.1(0x80000000)
+ ; BIG: liveins: $x0, $x1
+ ; BIG: %cst_1:_(s64) = G_CONSTANT i64 1
+ ; BIG: %cst_16:_(s32) = G_CONSTANT i32 16
+ ; BIG: %ptr:_(p0) = COPY $x1
+ ; BIG: %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s64)
+ ; BIG: %low_half:_(s32) = G_ZEXTLOAD %ptr(p0) :: (load 2)
+ ; BIG: bb.1:
+ ; BIG: successors: %bb.2(0x80000000)
+ ; BIG: liveins: $x0, $x1
+ ; BIG: %other_ptr:_(p0) = COPY $x1
+ ; BIG: %some_val:_(s32) = G_CONSTANT i32 12
+ ; BIG: G_STORE %some_val(s32), %other_ptr(p0) :: (store 2)
+ ; BIG: bb.2:
+ ; BIG: liveins: $x0, $x1
+ ; BIG: %elt1:_(s32) = G_ZEXTLOAD %ptr_elt_1(p0) :: (load 2)
+ ; BIG: %high_half:_(s32) = nuw G_SHL %elt1, %cst_16(s32)
+ ; BIG: %full_load:_(s32) = G_OR %low_half, %high_half
+ ; BIG: $w1 = COPY %full_load(s32)
+ ; BIG: RET_ReallyLR implicit $w1
+
+ bb.0:
+ successors: %bb.1(0x80000000)
+ liveins: $x0, $x1
+ ; If there is a store between any of the loads, then do not combine.
+
+ %cst_1:_(s64) = G_CONSTANT i64 1
+ %cst_16:_(s32) = G_CONSTANT i32 16
+
+ %ptr:_(p0) = COPY $x1
+ %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s64)
+ %low_half:_(s32) = G_ZEXTLOAD %ptr(p0) :: (load 2)
+
+ bb.1:
+ liveins: $x0, $x1
+ successors: %bb.2(0x80000000)
+ ; Memory could be modified here, so don't combine!
+ %other_ptr:_(p0) = COPY $x1
+ %some_val:_(s32) = G_CONSTANT i32 12
+ G_STORE %some_val, %other_ptr :: (store 2)
+
+ bb.2:
+ liveins: $x0, $x1
+ %elt1:_(s32) = G_ZEXTLOAD %ptr_elt_1(p0) :: (load 2)
+ %high_half:_(s32) = nuw G_SHL %elt1, %cst_16(s32)
+
+ %full_load:_(s32) = G_OR %low_half, %high_half
+ $w1 = COPY %full_load(s32)
+ RET_ReallyLR implicit $w1
+
+...
+---
+name: different_mbb
+tracksRegLiveness: true
+body: |
+ ; It should be possible to combine here, but it's not supported right now.
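+ ; Both loads feed a single G_OR, but they are split across bb.0 and bb.1
+ ; with nothing clobbering memory in between.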
+
+ ; LITTLE-LABEL: name: different_mbb
+ ; LITTLE: bb.0:
+ ; LITTLE: successors: %bb.1(0x80000000)
+ ; LITTLE: liveins: $x0, $x1
+ ; LITTLE: %cst_1:_(s64) = G_CONSTANT i64 1
+ ; LITTLE: %cst_16:_(s32) = G_CONSTANT i32 16
+ ; LITTLE: %ptr:_(p0) = COPY $x1
+ ; LITTLE: %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s64)
+ ; LITTLE: %low_half:_(s32) = G_ZEXTLOAD %ptr(p0) :: (load 2)
+ ; LITTLE: bb.1:
+ ; LITTLE: liveins: $x0, $x1
+ ; LITTLE: %elt1:_(s32) = G_ZEXTLOAD %ptr_elt_1(p0) :: (load 2)
+ ; LITTLE: %high_half:_(s32) = nuw G_SHL %elt1, %cst_16(s32)
+ ; LITTLE: %full_load:_(s32) = G_OR %low_half, %high_half
+ ; LITTLE: $w1 = COPY %full_load(s32)
+ ; LITTLE: RET_ReallyLR implicit $w1
+ ; BIG-LABEL: name: different_mbb
+ ; BIG: bb.0:
+ ; BIG: successors: %bb.1(0x80000000)
+ ; BIG: liveins: $x0, $x1
+ ; BIG: %cst_1:_(s64) = G_CONSTANT i64 1
+ ; BIG: %cst_16:_(s32) = G_CONSTANT i32 16
+ ; BIG: %ptr:_(p0) = COPY $x1
+ ; BIG: %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s64)
+ ; BIG: %low_half:_(s32) = G_ZEXTLOAD %ptr(p0) :: (load 2)
+ ; BIG: bb.1:
+ ; BIG: liveins: $x0, $x1
+ ; BIG: %elt1:_(s32) = G_ZEXTLOAD %ptr_elt_1(p0) :: (load 2)
+ ; BIG: %high_half:_(s32) = nuw G_SHL %elt1, %cst_16(s32)
+ ; BIG: %full_load:_(s32) = G_OR %low_half, %high_half
+ ; BIG: $w1 = COPY %full_load(s32)
+ ; BIG: RET_ReallyLR implicit $w1
+
+ bb.0:
+ successors: %bb.1(0x80000000)
+ liveins: $x0, $x1
+
+ %cst_1:_(s64) = G_CONSTANT i64 1
+ %cst_16:_(s32) = G_CONSTANT i32 16
+
+ %ptr:_(p0) = COPY $x1
+ %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s64)
+ %low_half:_(s32) = G_ZEXTLOAD %ptr(p0) :: (load 2)
+
+ bb.1:
+ liveins: $x0, $x1
+ %elt1:_(s32) = G_ZEXTLOAD %ptr_elt_1(p0) :: (load 2)
+ %high_half:_(s32) = nuw G_SHL %elt1, %cst_16(s32)
+
+ %full_load:_(s32) = G_OR %low_half, %high_half
+ $w1 = COPY %full_load(s32)
+ RET_ReallyLR implicit $w1
+
+...
+---
+name: load_first
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $x0, $x1
+ ; Test for a bug fix in the predecessor-checking code.
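+ ; Note that %low_half is the very first instruction in the block, and it
+ ; appears before the definition of %ptr.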
+
+ ; LITTLE-LABEL: name: load_first
+ ; LITTLE: liveins: $x0, $x1
+ ; LITTLE: %ptr:_(p0) = COPY $x1
+ ; LITTLE: %full_load:_(s32) = G_LOAD %ptr(p0) :: (load 4, align 2)
+ ; LITTLE: $w1 = COPY %full_load(s32)
+ ; LITTLE: RET_ReallyLR implicit $w1
+ ; BIG-LABEL: name: load_first
+ ; BIG: liveins: $x0, $x1
+ ; BIG: %ptr:_(p0) = COPY $x1
+ ; BIG: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD %ptr(p0) :: (load 4, align 2)
+ ; BIG: %full_load:_(s32) = G_BSWAP [[LOAD]]
+ ; BIG: $w1 = COPY %full_load(s32)
+ ; BIG: RET_ReallyLR implicit $w1
+ %low_half:_(s32) = G_ZEXTLOAD %ptr(p0) :: (load 2)
+ %cst_1:_(s64) = G_CONSTANT i64 1
+ %cst_16:_(s32) = G_CONSTANT i32 16
+
+ %ptr:_(p0) = COPY $x1
+ %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s64)
+
+ %elt1:_(s32) = G_ZEXTLOAD %ptr_elt_1(p0) :: (load 2)
+ %high_half:_(s32) = nuw G_SHL %elt1, %cst_16(s32)
+
+ %full_load:_(s32) = G_OR %low_half, %high_half
+ $w1 = COPY %full_load(s32)
+ RET_ReallyLR implicit $w1