[llvm-branch-commits] [llvm] cfc6073 - [GlobalISel] Combine (a[0]) | (a[1] << k1) | ...| (a[m] << kn) into a wide load
Jessica Paquette via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Tue Jan 19 10:28:58 PST 2021
Author: Jessica Paquette
Date: 2021-01-19T10:24:27-08:00
New Revision: cfc60730179042a93cb9cb338982e71d20707a24
URL: https://github.com/llvm/llvm-project/commit/cfc60730179042a93cb9cb338982e71d20707a24
DIFF: https://github.com/llvm/llvm-project/commit/cfc60730179042a93cb9cb338982e71d20707a24.diff
LOG: [GlobalISel] Combine (a[0]) | (a[1] << k1) | ...| (a[m] << kn) into a wide load
This is a restricted version of the combine in `DAGCombiner::MatchLoadCombine`.
(See D27861)
This tries to recognize patterns like below (assuming a little-endian target):
```
s8* x = ...
s32 val = a[0] | (a[1] << 8) | (a[2] << 16) | (a[3] << 24)
->
s32 val = *((s32)a)
s8* x = ...
s32 val = a[3] | (a[2] << 8) | (a[1] << 16) | (a[0] << 24)
->
s32 val = BSWAP(*((s32)a))
```
(This patch handles big-endian targets as well; in that case, the first example
above needs a BSWAP and the second one does not.)
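To make the endianness decision concrete: the combine records, for each narrow
load, which byte of the wide value it ends up occupying, and compares that map
against the canonical little- and big-endian layouts; a BSWAP is needed exactly
when the detected layout disagrees with the target's endianness. Below is a
minimal standalone C++ sketch of that check (it mirrors the
littleEndianByteAt/bigEndianByteAt/isBigEndian helpers added in this patch, but
the std::map-based interface here is illustrative only):
```
// Sketch only: decide whether a {byte position -> load index} map is a
// little-endian layout (return false), a big-endian layout (return true),
// or neither (return nullopt). Load indices are assumed to be rebased so
// the lowest index is 0.
#include <cstdint>
#include <map>
#include <optional>

static unsigned littleEndianByteAt(unsigned /*ByteWidth*/, unsigned I) { return I; }
static unsigned bigEndianByteAt(unsigned ByteWidth, unsigned I) {
  return ByteWidth - I - 1;
}

std::optional<bool>
isBigEndianPattern(const std::map<int64_t, int64_t> &MemOffset2Idx) {
  const unsigned Width = static_cast<unsigned>(MemOffset2Idx.size());
  if (Width < 2)
    return std::nullopt; // Need at least two positions to decide.
  bool Big = true, Little = true;
  for (unsigned Off = 0; Off != Width; ++Off) {
    auto It = MemOffset2Idx.find(Off);
    if (It == MemOffset2Idx.end())
      return std::nullopt; // A byte of the wide value is not covered.
    Little &= It->second == littleEndianByteAt(Width, Off);
    Big &= It->second == bigEndianByteAt(Width, Off);
    if (!Big && !Little)
      return std::nullopt; // Matches neither layout.
  }
  return Big;
}
```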
To recognize the pattern, this searches from the last G_OR in the expression
tree.
E.g.
```
Reg Reg
\ /
OR_1 Reg
\ /
OR_2
\ Reg
.. /
Root
```
Each non-OR register in the tree is put in a list. Each register in the list is
then checked to see if it is defined by an appropriate load plus shift.
If every register is a load plus (potentially) a shift, the combine checks
whether those loads and shifts, when OR'd together, are equivalent to a wide
load (possibly with a BSWAP).
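In outline, the match therefore has two phases: flatten the OR tree into its
non-OR leaves, then prove that each leaf is a narrow zero-extending load that
is shifted left by a multiple of the narrow load size. A simplified,
self-contained sketch of that shape (a toy expression type stands in for
MachineInstr; none of these names are the patch's actual API, and the real code
additionally enforces one-use, same-block, and same-base-pointer constraints):
```
#include <cstdint>
#include <optional>
#include <utility>
#include <vector>

// Toy expression node: either an OR of two subtrees, a left shift of a
// subtree by a constant, or a zero-extending load of element a[LoadIdx].
struct Expr {
  enum Kind { Or, Shl, ZExtLoad } K;
  const Expr *LHS = nullptr, *RHS = nullptr; // Or uses both, Shl uses LHS.
  uint64_t ShiftAmt = 0;                     // Shl only.
  int64_t LoadIdx = 0;                       // ZExtLoad only.
};

// Phase 1: collect every non-OR leaf of the OR tree.
static void collectLeaves(const Expr *E, std::vector<const Expr *> &Leaves) {
  if (E->K == Expr::Or) {
    collectLeaves(E->LHS, Leaves);
    collectLeaves(E->RHS, Leaves);
    return;
  }
  Leaves.push_back(E);
}

// Phase 2: a leaf matches if it is a zextload, optionally shifted left by a
// multiple of the narrow load size. Returns {load index, destination slot}.
static std::optional<std::pair<int64_t, uint64_t>>
matchLoadAndSlot(const Expr *Leaf, uint64_t MemSizeInBits) {
  uint64_t Shift = 0;
  if (Leaf->K == Expr::Shl) {
    Shift = Leaf->ShiftAmt;
    Leaf = Leaf->LHS;
  }
  if (Shift % MemSizeInBits != 0 || Leaf->K != Expr::ZExtLoad)
    return std::nullopt;
  return std::make_pair(Leaf->LoadIdx, Shift / MemSizeInBits);
}
```
The destination slots collected in phase 2 feed the endianness check sketched
above.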
To simplify things, this patch
(1) Only handles G_ZEXTLOADs (which appear to be the common case)
(2) Only works in a single MachineBasicBlock
(3) Only handles G_SHL as the bit twiddling to stick the small load into a
specific location
An IR example of this is here: https://godbolt.org/z/4sP9Pj (lifted from
test/CodeGen/AArch64/load-combine.ll)
At -Os on AArch64, this is a 0.5% code size improvement for CTMark/sqlite3,
and a 0.4% improvement for CTMark/7zip-benchmark.
Also fix a bug in `isPredecessor` which caused it to fail whenever `DefMI` was
the first instruction in the block.
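For the `isPredecessor` fix, the corrected logic simply walks the block's
non-debug instructions once and reports whichever of the two instructions is
reached first. A generic sketch of that ordering check over a plain container
(illustrative only; the real code uses the MachineBasicBlock iterator API):
```
#include <algorithm>
#include <cassert>
#include <vector>

// Sketch: does Def appear before Use in the same block? Walk the block once
// and see which of the two is reached first. Works even when Def is the very
// first instruction, which the old loop mishandled.
template <typename Inst>
bool isPredecessor(const std::vector<Inst *> &Block, const Inst *Def,
                   const Inst *Use) {
  assert(Def != Use && "Expected distinct instructions");
  auto It = std::find_if(Block.begin(), Block.end(),
                         [&](const Inst *I) { return I == Def || I == Use; });
  assert(It != Block.end() && "Block must contain both instructions");
  return *It == Def;
}
```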
Differential Revision: https://reviews.llvm.org/D94350
Added:
llvm/test/CodeGen/AArch64/GlobalISel/prelegalizer-combiner-load-or-pattern-align.mir
llvm/test/CodeGen/AArch64/GlobalISel/prelegalizer-combiner-load-or-pattern.mir
Modified:
llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
llvm/include/llvm/CodeGen/TargetLowering.h
llvm/include/llvm/Target/GlobalISel/Combine.td
llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
llvm/lib/CodeGen/TargetLoweringBase.cpp
Removed:
################################################################################
diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
index 0d240e90820f..8570f5ca5dd5 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
@@ -18,6 +18,7 @@
#define LLVM_CODEGEN_GLOBALISEL_COMBINER_HELPER_H
#include "llvm/ADT/APFloat.h"
+#include "llvm/ADT/DenseMap.h"
#include "llvm/CodeGen/LowLevelType.h"
#include "llvm/CodeGen/Register.h"
#include "llvm/Support/Alignment.h"
@@ -471,6 +472,20 @@ class CombinerHelper {
bool applyCombineInsertVecElts(MachineInstr &MI,
SmallVectorImpl<Register> &MatchInfo);
+ /// Match expression trees of the form
+ ///
+ /// \code
+ /// sN *a = ...
+ /// sM val = a[0] | (a[1] << N) | (a[2] << 2N) | (a[3] << 3N) ...
+ /// \endcode
+ ///
+  /// And check if the tree can be replaced with an M-bit load + possibly a
+ /// bswap.
+ bool matchLoadOrCombine(MachineInstr &MI,
+ std::function<void(MachineIRBuilder &)> &MatchInfo);
+ bool applyLoadOrCombine(MachineInstr &MI,
+ std::function<void(MachineIRBuilder &)> &MatchInfo);
+
/// Try to transform \p MI by using all of the above
/// combine functions. Returns true if changed.
bool tryCombine(MachineInstr &MI);
@@ -499,6 +514,30 @@ class CombinerHelper {
/// \returns true if a candidate is found.
bool findPreIndexCandidate(MachineInstr &MI, Register &Addr, Register &Base,
Register &Offset);
+
+ /// Helper function for matchLoadOrCombine. Searches for Registers
+ /// which may have been produced by a load instruction + some arithmetic.
+ ///
+ /// \param [in] Root - The search root.
+ ///
+ /// \returns The Registers found during the search.
+ Optional<SmallVector<Register, 8>>
+ findCandidatesForLoadOrCombine(const MachineInstr *Root) const;
+
+ /// Helper function for matchLoadOrCombine.
+ ///
+ /// Checks if every register in \p RegsToVisit is defined by a load
+ /// instruction + some arithmetic.
+ ///
+ /// \param [out] MemOffset2Idx - Maps the byte positions each load ends up
+ /// at to the index of the load.
+ /// \param [in] MemSizeInBits - The number of bits each load should produce.
+ ///
+ /// \returns The lowest-index load found and the lowest index on success.
+ Optional<std::pair<MachineInstr *, int64_t>> findLoadOffsetsForLoadOrCombine(
+ SmallDenseMap<int64_t, int64_t, 8> &MemOffset2Idx,
+ const SmallVector<Register, 8> &RegsToVisit,
+ const unsigned MemSizeInBits);
};
} // namespace llvm
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 305107c48750..5a237074a5a3 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -1658,6 +1658,11 @@ class TargetLoweringBase {
const MachineMemOperand &MMO,
bool *Fast = nullptr) const;
+ /// LLT handling variant.
+ bool allowsMemoryAccess(LLVMContext &Context, const DataLayout &DL, LLT Ty,
+ const MachineMemOperand &MMO,
+ bool *Fast = nullptr) const;
+
/// Returns the target specific optimal type for load and store operations as
/// a result of memset, memcpy, and memmove lowering.
/// It returns EVT::Other if the type should be determined using generic
diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td
index e352e499d47c..e2c7a90a1b16 100644
--- a/llvm/include/llvm/Target/GlobalISel/Combine.td
+++ b/llvm/include/llvm/Target/GlobalISel/Combine.td
@@ -545,6 +545,14 @@ def combine_insert_vec_elts_build_vector : GICombineRule<
[{ return Helper.matchCombineInsertVecElts(*${root}, ${info}); }]),
(apply [{ return Helper.applyCombineInsertVecElts(*${root}, ${info}); }])>;
+def load_or_combine_matchdata :
+GIDefMatchData<"std::function<void(MachineIRBuilder &)>">;
+def load_or_combine : GICombineRule<
+ (defs root:$root, load_or_combine_matchdata:$info),
+ (match (wip_match_opcode G_OR):$root,
+ [{ return Helper.matchLoadOrCombine(*${root}, ${info}); }]),
+ (apply [{ return Helper.applyLoadOrCombine(*${root}, ${info}); }])>;
+
// Currently only the one combine above.
def insert_vec_elt_combines : GICombineGroup<
[combine_insert_vec_elts_build_vector]>;
@@ -587,4 +595,4 @@ def all_combines : GICombineGroup<[trivial_combines, insert_vec_elt_combines,
unmerge_merge, fabs_fabs_fold, unmerge_cst, unmerge_dead_to_trunc,
unmerge_zext_to_zext, trunc_ext_fold, trunc_shl,
const_combines, xor_of_and_with_same_reg, ptr_add_with_zero,
- shift_immed_chain, shift_of_shifted_logic_chain]>;
+ shift_immed_chain, shift_of_shifted_logic_chain, load_or_combine]>;
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
index bbcf32a73fe0..c142c7a70c95 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -48,6 +48,66 @@ const TargetLowering &CombinerHelper::getTargetLowering() const {
return *Builder.getMF().getSubtarget().getTargetLowering();
}
+/// \returns The little endian in-memory byte position of byte \p I in a
+/// \p ByteWidth bytes wide type.
+///
+/// E.g. Given a 4-byte type x, x[0] -> byte 0
+static unsigned littleEndianByteAt(const unsigned ByteWidth, const unsigned I) {
+ assert(I < ByteWidth && "I must be in [0, ByteWidth)");
+ return I;
+}
+
+/// \returns The big endian in-memory byte position of byte \p I in a
+/// \p ByteWidth bytes wide type.
+///
+/// E.g. Given a 4-byte type x, x[0] -> byte 3
+static unsigned bigEndianByteAt(const unsigned ByteWidth, const unsigned I) {
+ assert(I < ByteWidth && "I must be in [0, ByteWidth)");
+ return ByteWidth - I - 1;
+}
+
+/// Given a map from byte offsets in memory to indices in a load/store,
+/// determine if that map corresponds to a little or big endian byte pattern.
+///
+/// \param MemOffset2Idx maps memory offsets to address offsets.
+/// \param LowestIdx is the lowest index in \p MemOffset2Idx.
+///
+/// \returns true if the map corresponds to a big endian byte pattern, false
+/// if it corresponds to a little endian byte pattern, and None otherwise.
+///
+/// E.g. given a 32-bit type x, and x[AddrOffset], the in-memory byte patterns
+/// are as follows:
+///
+/// AddrOffset Little endian Big endian
+/// 0 0 3
+/// 1 1 2
+/// 2 2 1
+/// 3 3 0
+static Optional<bool>
+isBigEndian(const SmallDenseMap<int64_t, int64_t, 8> &MemOffset2Idx,
+ int64_t LowestIdx) {
+ // Need at least two byte positions to decide on endianness.
+ unsigned Width = MemOffset2Idx.size();
+ if (Width < 2)
+ return None;
+ bool BigEndian = true, LittleEndian = true;
+ for (unsigned MemOffset = 0; MemOffset < Width; ++ MemOffset) {
+ auto MemOffsetAndIdx = MemOffset2Idx.find(MemOffset);
+ if (MemOffsetAndIdx == MemOffset2Idx.end())
+ return None;
+ const int64_t Idx = MemOffsetAndIdx->second - LowestIdx;
+ assert(Idx >= 0 && "Expected non-negative byte offset?");
+ LittleEndian &= Idx == littleEndianByteAt(Width, MemOffset);
+ BigEndian &= Idx == bigEndianByteAt(Width, MemOffset);
+ if (!BigEndian && !LittleEndian)
+ return None;
+ }
+
+ assert((BigEndian != LittleEndian) &&
+ "Pattern cannot be both big and little endian!");
+ return BigEndian;
+}
+
bool CombinerHelper::isLegalOrBeforeLegalizer(
const LegalityQuery &Query) const {
return !LI || LI->getAction(Query).Action == LegalizeActions::Legal;
@@ -564,13 +624,16 @@ bool CombinerHelper::isPredecessor(const MachineInstr &DefMI,
assert(DefMI.getParent() == UseMI.getParent());
if (&DefMI == &UseMI)
return false;
-
- // Loop through the basic block until we find one of the instructions.
- MachineBasicBlock::const_iterator I = DefMI.getParent()->begin();
- for (; &*I != &DefMI && &*I != &UseMI; ++I)
- return &*I == &DefMI;
-
- llvm_unreachable("Block must contain instructions");
+ const MachineBasicBlock &MBB = *DefMI.getParent();
+ auto NonDbgInsts =
+ instructionsWithoutDebug(MBB.instr_begin(), MBB.instr_end());
+ auto DefOrUse =
+ find_if(NonDbgInsts, [&DefMI, &UseMI](const MachineInstr &MI) {
+ return &MI == &DefMI || &MI == &UseMI;
+ });
+ if (DefOrUse == NonDbgInsts.end())
+ llvm_unreachable("Block must contain both DefMI and UseMI!");
+ return &*DefOrUse == &DefMI;
}
bool CombinerHelper::dominates(const MachineInstr &DefMI,
@@ -3152,6 +3215,361 @@ bool CombinerHelper::applySimplifyURemByPow2(MachineInstr &MI) {
return true;
}
+Optional<SmallVector<Register, 8>>
+CombinerHelper::findCandidatesForLoadOrCombine(const MachineInstr *Root) const {
+ assert(Root->getOpcode() == TargetOpcode::G_OR && "Expected G_OR only!");
+ // We want to detect if Root is part of a tree which represents a bunch
+ // of loads being merged into a larger load. We'll try to recognize patterns
+ // like, for example:
+ //
+ // Reg Reg
+ // \ /
+ // OR_1 Reg
+ // \ /
+ // OR_2
+ // \ Reg
+ // .. /
+ // Root
+ //
+ // Reg Reg Reg Reg
+ // \ / \ /
+ // OR_1 OR_2
+ // \ /
+ // \ /
+ // ...
+ // Root
+ //
+ // Each "Reg" may have been produced by a load + some arithmetic. This
+ // function will save each of them.
+ SmallVector<Register, 8> RegsToVisit;
+ SmallVector<const MachineInstr *, 7> Ors = {Root};
+
+ // In the "worst" case, we're dealing with a load for each byte. So, there
+ // are at most #bytes - 1 ORs.
+ const unsigned MaxIter =
+ MRI.getType(Root->getOperand(0).getReg()).getSizeInBytes() - 1;
+ for (unsigned Iter = 0; Iter < MaxIter; ++Iter) {
+ if (Ors.empty())
+ break;
+ const MachineInstr *Curr = Ors.pop_back_val();
+ Register OrLHS = Curr->getOperand(1).getReg();
+ Register OrRHS = Curr->getOperand(2).getReg();
+
+    // In the combine, we want to eliminate the entire tree.
+ if (!MRI.hasOneNonDBGUse(OrLHS) || !MRI.hasOneNonDBGUse(OrRHS))
+ return None;
+
+ // If it's a G_OR, save it and continue to walk. If it's not, then it's
+ // something that may be a load + arithmetic.
+ if (const MachineInstr *Or = getOpcodeDef(TargetOpcode::G_OR, OrLHS, MRI))
+ Ors.push_back(Or);
+ else
+ RegsToVisit.push_back(OrLHS);
+ if (const MachineInstr *Or = getOpcodeDef(TargetOpcode::G_OR, OrRHS, MRI))
+ Ors.push_back(Or);
+ else
+ RegsToVisit.push_back(OrRHS);
+ }
+
+ // We're going to try and merge each register into a wider power-of-2 type,
+ // so we ought to have an even number of registers.
+ if (RegsToVisit.empty() || RegsToVisit.size() % 2 != 0)
+ return None;
+ return RegsToVisit;
+}
+
+/// Helper function for findLoadOffsetsForLoadOrCombine.
+///
+/// Check if \p Reg is the result of loading a \p MemSizeInBits wide value,
+/// and then moving that value into a specific byte offset.
+///
+/// e.g. x[i] << 24
+///
+/// \returns The load instruction and the byte offset it is moved into.
+static Optional<std::pair<MachineInstr *, int64_t>>
+matchLoadAndBytePosition(Register Reg, unsigned MemSizeInBits,
+ const MachineRegisterInfo &MRI) {
+ assert(MRI.hasOneNonDBGUse(Reg) &&
+ "Expected Reg to only have one non-debug use?");
+ Register MaybeLoad;
+ int64_t Shift;
+ if (!mi_match(Reg, MRI,
+ m_OneNonDBGUse(m_GShl(m_Reg(MaybeLoad), m_ICst(Shift))))) {
+ Shift = 0;
+ MaybeLoad = Reg;
+ }
+
+ if (Shift % MemSizeInBits != 0)
+ return None;
+
+ // TODO: Handle other types of loads.
+ auto *Load = getOpcodeDef(TargetOpcode::G_ZEXTLOAD, MaybeLoad, MRI);
+ if (!Load)
+ return None;
+
+ const auto &MMO = **Load->memoperands_begin();
+ if (!MMO.isUnordered() || MMO.getSizeInBits() != MemSizeInBits)
+ return None;
+
+ return std::make_pair(Load, Shift / MemSizeInBits);
+}
+
+Optional<std::pair<MachineInstr *, int64_t>>
+CombinerHelper::findLoadOffsetsForLoadOrCombine(
+ SmallDenseMap<int64_t, int64_t, 8> &MemOffset2Idx,
+ const SmallVector<Register, 8> &RegsToVisit, const unsigned MemSizeInBits) {
+
+ // Each load found for the pattern. There should be one for each RegsToVisit.
+ SmallSetVector<const MachineInstr *, 8> Loads;
+
+ // The lowest index used in any load. (The lowest "i" for each x[i].)
+ int64_t LowestIdx = INT64_MAX;
+
+ // The load which uses the lowest index.
+ MachineInstr *LowestIdxLoad = nullptr;
+
+ // Keeps track of the load indices we see. We shouldn't see any indices twice.
+ SmallSet<int64_t, 8> SeenIdx;
+
+ // Ensure each load is in the same MBB.
+ // TODO: Support multiple MachineBasicBlocks.
+ MachineBasicBlock *MBB = nullptr;
+ const MachineMemOperand *MMO = nullptr;
+
+ // Earliest instruction-order load in the pattern.
+ MachineInstr *EarliestLoad = nullptr;
+
+ // Latest instruction-order load in the pattern.
+ MachineInstr *LatestLoad = nullptr;
+
+ // Base pointer which every load should share.
+ Register BasePtr;
+
+ // We want to find a load for each register. Each load should have some
+ // appropriate bit twiddling arithmetic. During this loop, we will also keep
+ // track of the load which uses the lowest index. Later, we will check if we
+ // can use its pointer in the final, combined load.
+ for (auto Reg : RegsToVisit) {
+    // Find the load, and the position its value ends up at in the wide value
+    // (i.e. how far the loaded value is shifted).
+ auto LoadAndPos = matchLoadAndBytePosition(Reg, MemSizeInBits, MRI);
+ if (!LoadAndPos)
+ return None;
+ MachineInstr *Load;
+ int64_t DstPos;
+ std::tie(Load, DstPos) = *LoadAndPos;
+
+ // TODO: Handle multiple MachineBasicBlocks. Currently not handled because
+    // it is difficult to check for stores/calls/etc between loads.
+ MachineBasicBlock *LoadMBB = Load->getParent();
+ if (!MBB)
+ MBB = LoadMBB;
+ if (LoadMBB != MBB)
+ return None;
+
+ // Make sure that the MachineMemOperands of every seen load are compatible.
+ const MachineMemOperand *LoadMMO = *Load->memoperands_begin();
+ if (!MMO)
+ MMO = LoadMMO;
+ if (MMO->getAddrSpace() != LoadMMO->getAddrSpace())
+ return None;
+
+ // Find out what the base pointer and index for the load is.
+ Register LoadPtr;
+ int64_t Idx;
+ if (!mi_match(Load->getOperand(1).getReg(), MRI,
+ m_GPtrAdd(m_Reg(LoadPtr), m_ICst(Idx)))) {
+ LoadPtr = Load->getOperand(1).getReg();
+ Idx = 0;
+ }
+
+ // Don't combine things like a[i], a[i] -> a bigger load.
+ if (!SeenIdx.insert(Idx).second)
+ return None;
+
+ // Every load must share the same base pointer; don't combine things like:
+ //
+ // a[i], b[i + 1] -> a bigger load.
+ if (!BasePtr.isValid())
+ BasePtr = LoadPtr;
+ if (BasePtr != LoadPtr)
+ return None;
+
+ if (Idx < LowestIdx) {
+ LowestIdx = Idx;
+ LowestIdxLoad = Load;
+ }
+
+ // Keep track of the byte offset that this load ends up at. If we have seen
+ // the byte offset, then stop here. We do not want to combine:
+ //
+ // a[i] << 16, a[i + k] << 16 -> a bigger load.
+ if (!MemOffset2Idx.try_emplace(DstPos, Idx).second)
+ return None;
+ Loads.insert(Load);
+
+ // Keep track of the position of the earliest/latest loads in the pattern.
+ // We will check that there are no load fold barriers between them later
+ // on.
+ //
+ // FIXME: Is there a better way to check for load fold barriers?
+ if (!EarliestLoad || dominates(*Load, *EarliestLoad))
+ EarliestLoad = Load;
+ if (!LatestLoad || dominates(*LatestLoad, *Load))
+ LatestLoad = Load;
+ }
+
+ // We found a load for each register. Let's check if each load satisfies the
+ // pattern.
+ assert(Loads.size() == RegsToVisit.size() &&
+ "Expected to find a load for each register?");
+ assert(EarliestLoad != LatestLoad && EarliestLoad &&
+ LatestLoad && "Expected at least two loads?");
+
+ // Check if there are any stores, calls, etc. between any of the loads. If
+ // there are, then we can't safely perform the combine.
+ //
+ // MaxIter is chosen based off the (worst case) number of iterations it
+ // typically takes to succeed in the LLVM test suite plus some padding.
+ //
+ // FIXME: Is there a better way to check for load fold barriers?
+ const unsigned MaxIter = 20;
+ unsigned Iter = 0;
+ for (const auto &MI : instructionsWithoutDebug(EarliestLoad->getIterator(),
+ LatestLoad->getIterator())) {
+ if (Loads.count(&MI))
+ continue;
+ if (MI.isLoadFoldBarrier())
+ return None;
+ if (Iter++ == MaxIter)
+ return None;
+ }
+
+ return std::make_pair(LowestIdxLoad, LowestIdx);
+}
+
+bool CombinerHelper::matchLoadOrCombine(
+ MachineInstr &MI, std::function<void(MachineIRBuilder &)> &MatchInfo) {
+ assert(MI.getOpcode() == TargetOpcode::G_OR);
+ MachineFunction &MF = *MI.getMF();
+ // Assuming a little-endian target, transform:
+ // s8 *a = ...
+ // s32 val = a[0] | (a[1] << 8) | (a[2] << 16) | (a[3] << 24)
+ // =>
+  //      s32 val = *((s32)a)
+ //
+ // s8 *a = ...
+ // s32 val = (a[0] << 24) | (a[1] << 16) | (a[2] << 8) | a[3]
+ // =>
+ // s32 val = BSWAP(*((s32)a))
+ Register Dst = MI.getOperand(0).getReg();
+ LLT Ty = MRI.getType(Dst);
+ if (Ty.isVector())
+ return false;
+
+ // We need to combine at least two loads into this type. Since the smallest
+ // possible load is into a byte, we need at least a 16-bit wide type.
+ const unsigned WideMemSizeInBits = Ty.getSizeInBits();
+ if (WideMemSizeInBits < 16 || WideMemSizeInBits % 8 != 0)
+ return false;
+
+ // Match a collection of non-OR instructions in the pattern.
+ auto RegsToVisit = findCandidatesForLoadOrCombine(&MI);
+ if (!RegsToVisit)
+ return false;
+
+ // We have a collection of non-OR instructions. Figure out how wide each of
+ // the small loads should be based off of the number of potential loads we
+ // found.
+ const unsigned NarrowMemSizeInBits = WideMemSizeInBits / RegsToVisit->size();
+ if (NarrowMemSizeInBits % 8 != 0)
+ return false;
+
+ // Check if each register feeding into each OR is a load from the same
+ // base pointer + some arithmetic.
+ //
+ // e.g. a[0], a[1] << 8, a[2] << 16, etc.
+ //
+ // Also verify that each of these ends up putting a[i] into the same memory
+ // offset as a load into a wide type would.
+ SmallDenseMap<int64_t, int64_t, 8> MemOffset2Idx;
+ MachineInstr *LowestIdxLoad;
+ int64_t LowestIdx;
+ auto MaybeLoadInfo = findLoadOffsetsForLoadOrCombine(
+ MemOffset2Idx, *RegsToVisit, NarrowMemSizeInBits);
+ if (!MaybeLoadInfo)
+ return false;
+ std::tie(LowestIdxLoad, LowestIdx) = *MaybeLoadInfo;
+
+ // We have a bunch of loads being OR'd together. Using the addresses + offsets
+ // we found before, check if this corresponds to a big or little endian byte
+ // pattern. If it does, then we can represent it using a load + possibly a
+ // BSWAP.
+ bool IsBigEndianTarget = MF.getDataLayout().isBigEndian();
+ Optional<bool> IsBigEndian = isBigEndian(MemOffset2Idx, LowestIdx);
+ if (!IsBigEndian.hasValue())
+ return false;
+ bool NeedsBSwap = IsBigEndianTarget != *IsBigEndian;
+ if (NeedsBSwap && !isLegalOrBeforeLegalizer({TargetOpcode::G_BSWAP, {Ty}}))
+ return false;
+
+ // Make sure that the load from the lowest index produces offset 0 in the
+ // final value.
+ //
+ // This ensures that we won't combine something like this:
+ //
+ // load x[i] -> byte 2
+ // load x[i+1] -> byte 0 ---> wide_load x[i]
+ // load x[i+2] -> byte 1
+ const unsigned NumLoadsInTy = WideMemSizeInBits / NarrowMemSizeInBits;
+ const unsigned ZeroByteOffset =
+ *IsBigEndian
+ ? bigEndianByteAt(NumLoadsInTy, 0)
+ : littleEndianByteAt(NumLoadsInTy, 0);
+ auto ZeroOffsetIdx = MemOffset2Idx.find(ZeroByteOffset);
+ if (ZeroOffsetIdx == MemOffset2Idx.end() ||
+ ZeroOffsetIdx->second != LowestIdx)
+ return false;
+
+  // We will reuse the pointer from the load which ends up at byte offset 0. It
+ // may not use index 0.
+ Register Ptr = LowestIdxLoad->getOperand(1).getReg();
+ const MachineMemOperand &MMO = **LowestIdxLoad->memoperands_begin();
+ LegalityQuery::MemDesc MMDesc;
+ MMDesc.SizeInBits = WideMemSizeInBits;
+ MMDesc.AlignInBits = MMO.getAlign().value() * 8;
+ MMDesc.Ordering = MMO.getOrdering();
+ if (!isLegalOrBeforeLegalizer(
+ {TargetOpcode::G_LOAD, {Ty, MRI.getType(Ptr)}, {MMDesc}}))
+ return false;
+ auto PtrInfo = MMO.getPointerInfo();
+ auto *NewMMO = MF.getMachineMemOperand(&MMO, PtrInfo, WideMemSizeInBits / 8);
+
+ // Load must be allowed and fast on the target.
+ LLVMContext &C = MF.getFunction().getContext();
+ auto &DL = MF.getDataLayout();
+ bool Fast = false;
+ if (!getTargetLowering().allowsMemoryAccess(C, DL, Ty, *NewMMO, &Fast) ||
+ !Fast)
+ return false;
+
+ MatchInfo = [=](MachineIRBuilder &MIB) {
+ Register LoadDst = NeedsBSwap ? MRI.cloneVirtualRegister(Dst) : Dst;
+ MIB.buildLoad(LoadDst, Ptr, *NewMMO);
+ if (NeedsBSwap)
+ MIB.buildBSwap(Dst, LoadDst);
+ };
+ return true;
+}
+
+bool CombinerHelper::applyLoadOrCombine(
+ MachineInstr &MI, std::function<void(MachineIRBuilder &)> &MatchInfo) {
+ Builder.setInstrAndDebugLoc(MI);
+ MatchInfo(Builder);
+ MI.eraseFromParent();
+ return true;
+}
+
bool CombinerHelper::tryCombine(MachineInstr &MI) {
if (tryCombineCopy(MI))
return true;
diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp
index 6cacb10ab79f..f639f7295b57 100644
--- a/llvm/lib/CodeGen/TargetLoweringBase.cpp
+++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp
@@ -1756,6 +1756,14 @@ bool TargetLoweringBase::allowsMemoryAccess(LLVMContext &Context,
MMO.getFlags(), Fast);
}
+bool TargetLoweringBase::allowsMemoryAccess(LLVMContext &Context,
+ const DataLayout &DL, LLT Ty,
+ const MachineMemOperand &MMO,
+ bool *Fast) const {
+ return allowsMemoryAccess(Context, DL, getMVTForLLT(Ty), MMO.getAddrSpace(),
+ MMO.getAlign(), MMO.getFlags(), Fast);
+}
+
BranchProbability TargetLoweringBase::getPredictableBranchThreshold() const {
return BranchProbability(MinPercentageForPredictableBranch, 100);
}
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizer-combiner-load-or-pattern-align.mir b/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizer-combiner-load-or-pattern-align.mir
new file mode 100644
index 000000000000..993a6713aaaa
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizer-combiner-load-or-pattern-align.mir
@@ -0,0 +1,79 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -debugify-and-strip-all-safe -mtriple aarch64 -O0 -run-pass=aarch64-prelegalizer-combiner --aarch64prelegalizercombinerhelper-only-enable-rule="load_or_combine" -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=NOT_STRICT
+# RUN: llc -debugify-and-strip-all-safe -mattr=+strict-align -mtriple aarch64 -O0 -run-pass=aarch64-prelegalizer-combiner --aarch64prelegalizercombinerhelper-only-enable-rule="load_or_combine" -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=STRICT
+
+# REQUIRES: asserts
+
+# Check that the load-or combine respects alignment requirements.
+...
+---
+name: misaligned
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $x0, $x1
+ ; NOT_STRICT-LABEL: name: misaligned
+ ; NOT_STRICT: liveins: $x0, $x1
+ ; NOT_STRICT: %ptr:_(p0) = COPY $x1
+ ; NOT_STRICT: %full_load:_(s32) = G_LOAD %ptr(p0) :: (load 4, align 2)
+ ; NOT_STRICT: $w1 = COPY %full_load(s32)
+ ; NOT_STRICT: RET_ReallyLR implicit $w1
+ ; STRICT-LABEL: name: misaligned
+ ; STRICT: liveins: $x0, $x1
+ ; STRICT: %cst_1:_(s64) = G_CONSTANT i64 1
+ ; STRICT: %cst_16:_(s32) = G_CONSTANT i32 16
+ ; STRICT: %ptr:_(p0) = COPY $x1
+ ; STRICT: %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s64)
+ ; STRICT: %low_half:_(s32) = G_ZEXTLOAD %ptr(p0) :: (load 2)
+ ; STRICT: %elt1:_(s32) = G_ZEXTLOAD %ptr_elt_1(p0) :: (load 2)
+ ; STRICT: %high_half:_(s32) = nuw G_SHL %elt1, %cst_16(s32)
+ ; STRICT: %full_load:_(s32) = G_OR %low_half, %high_half
+ ; STRICT: $w1 = COPY %full_load(s32)
+ ; STRICT: RET_ReallyLR implicit $w1
+ %cst_1:_(s64) = G_CONSTANT i64 1
+ %cst_16:_(s32) = G_CONSTANT i32 16
+
+ %ptr:_(p0) = COPY $x1
+ %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s64)
+
+ %low_half:_(s32) = G_ZEXTLOAD %ptr(p0) :: (load 2, align 2)
+ %elt1:_(s32) = G_ZEXTLOAD %ptr_elt_1(p0) :: (load 2, align 2)
+ %high_half:_(s32) = nuw G_SHL %elt1, %cst_16(s32)
+
+ %full_load:_(s32) = G_OR %low_half, %high_half
+ $w1 = COPY %full_load(s32)
+ RET_ReallyLR implicit $w1
+
+...
+---
+name: aligned
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $x0, $x1
+
+ ; NOT_STRICT-LABEL: name: aligned
+ ; NOT_STRICT: liveins: $x0, $x1
+ ; NOT_STRICT: %ptr:_(p0) = COPY $x1
+ ; NOT_STRICT: %full_load:_(s32) = G_LOAD %ptr(p0) :: (load 4)
+ ; NOT_STRICT: $w1 = COPY %full_load(s32)
+ ; NOT_STRICT: RET_ReallyLR implicit $w1
+ ; STRICT-LABEL: name: aligned
+ ; STRICT: liveins: $x0, $x1
+ ; STRICT: %ptr:_(p0) = COPY $x1
+ ; STRICT: %full_load:_(s32) = G_LOAD %ptr(p0) :: (load 4)
+ ; STRICT: $w1 = COPY %full_load(s32)
+ ; STRICT: RET_ReallyLR implicit $w1
+ %cst_1:_(s64) = G_CONSTANT i64 1
+ %cst_16:_(s32) = G_CONSTANT i32 16
+
+ %ptr:_(p0) = COPY $x1
+ %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s64)
+
+ %low_half:_(s32) = G_ZEXTLOAD %ptr(p0) :: (load 2, align 4)
+ %elt1:_(s32) = G_ZEXTLOAD %ptr_elt_1(p0) :: (load 2, align 4)
+ %high_half:_(s32) = nuw G_SHL %elt1, %cst_16(s32)
+
+ %full_load:_(s32) = G_OR %low_half, %high_half
+ $w1 = COPY %full_load(s32)
+ RET_ReallyLR implicit $w1
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizer-combiner-load-or-pattern.mir b/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizer-combiner-load-or-pattern.mir
new file mode 100644
index 000000000000..bb1f5a0d36ac
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizer-combiner-load-or-pattern.mir
@@ -0,0 +1,1571 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -debugify-and-strip-all-safe -mtriple aarch64 -O0 -run-pass=aarch64-prelegalizer-combiner --aarch64prelegalizercombinerhelper-only-enable-rule="load_or_combine" -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=LITTLE
+# RUN: llc -debugify-and-strip-all-safe -mtriple arm64eb -O0 -run-pass=aarch64-prelegalizer-combiner --aarch64prelegalizercombinerhelper-only-enable-rule="load_or_combine" -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=BIG
+
+# REQUIRES: asserts
+
+# Test that we can combine patterns like
+#
+# s8* x = ...
+# s32 y = (x[0] | (x[1] << 8)) | ((x[2] << 16) | (x[3] << 24))
+#
+# Into either a load, or a load with a bswap.
+
+...
+---
+name: s8_loads_to_s32_little_endian_pat
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $x0, $x1
+
+ ; s8* x = ...
+ ; s32 y = (x[0] | (x[1] << 8)) | ((x[2] << 16) | (x[3] << 24))
+ ;
+ ; -> Little endian: Load from x[0]
+ ; -> Big endian: Load from x[0] + BSWAP
+
+ ; LITTLE-LABEL: name: s8_loads_to_s32_little_endian_pat
+ ; LITTLE: liveins: $x0, $x1
+ ; LITTLE: %ptr:_(p0) = COPY $x1
+ ; LITTLE: %full_load:_(s32) = G_LOAD %ptr(p0) :: (load 4, align 1)
+ ; LITTLE: $w1 = COPY %full_load(s32)
+ ; LITTLE: RET_ReallyLR implicit $w1
+ ; BIG-LABEL: name: s8_loads_to_s32_little_endian_pat
+ ; BIG: liveins: $x0, $x1
+ ; BIG: %ptr:_(p0) = COPY $x1
+ ; BIG: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD %ptr(p0) :: (load 4, align 1)
+ ; BIG: %full_load:_(s32) = G_BSWAP [[LOAD]]
+ ; BIG: $w1 = COPY %full_load(s32)
+ ; BIG: RET_ReallyLR implicit $w1
+ %cst_1:_(s32) = G_CONSTANT i32 1
+ %cst_2:_(s32) = G_CONSTANT i32 2
+ %cst_3:_(s32) = G_CONSTANT i32 3
+
+ %cst_8:_(s32) = G_CONSTANT i32 8
+ %cst_16:_(s32) = G_CONSTANT i32 16
+ %cst_24:_(s32) = G_CONSTANT i32 24
+
+ %ptr:_(p0) = COPY $x1
+ %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s32)
+ %ptr_elt_2:_(p0) = G_PTR_ADD %ptr, %cst_2(s32)
+ %ptr_elt_3:_(p0) = G_PTR_ADD %ptr, %cst_3(s32)
+
+ %byte0:_(s32) = G_ZEXTLOAD %ptr(p0) :: (load 1)
+
+ %elt1:_(s32) = G_ZEXTLOAD %ptr_elt_1(p0) :: (load 1)
+ %elt2:_(s32) = G_ZEXTLOAD %ptr_elt_2(p0) :: (load 1)
+ %elt3:_(s32) = G_ZEXTLOAD %ptr_elt_3(p0) :: (load 1)
+
+ %byte1:_(s32) = nuw G_SHL %elt1, %cst_8(s32)
+ %byte2:_(s32) = nuw G_SHL %elt2, %cst_16(s32)
+ %byte3:_(s32) = nuw G_SHL %elt3, %cst_24(s32)
+
+ ; Note the shape of the tree:
+ ;
+ ; byte byte byte byte
+ ; \ / \ /
+ ; OR OR
+ ; \ /
+ ; \ /
+ ; OR
+
+ %or1:_(s32) = G_OR %byte0, %byte1
+ %or2:_(s32) = G_OR %byte2, %byte3
+ %full_load:_(s32) = G_OR %or1, %or2
+
+ $w1 = COPY %full_load(s32)
+ RET_ReallyLR implicit $w1
+
+...
+---
+name: s8_loads_to_s32_big_endian_pat
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $x0, $x1
+
+ ; s8* x = ...
+    ; s32 y = ((x[0] << 24) | (x[1] << 16)) | ((x[2] << 8) | x[3])
+ ;
+ ; -> Little endian: Load from x[0] + BSWAP
+ ; -> Big endian: Load from x[0]
+
+ ; LITTLE-LABEL: name: s8_loads_to_s32_big_endian_pat
+ ; LITTLE: liveins: $x0, $x1
+ ; LITTLE: %ptr:_(p0) = COPY $x1
+ ; LITTLE: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD %ptr(p0) :: (load 4, align 1)
+ ; LITTLE: %full_load:_(s32) = G_BSWAP [[LOAD]]
+ ; LITTLE: $w1 = COPY %full_load(s32)
+ ; LITTLE: RET_ReallyLR implicit $w1
+ ; BIG-LABEL: name: s8_loads_to_s32_big_endian_pat
+ ; BIG: liveins: $x0, $x1
+ ; BIG: %ptr:_(p0) = COPY $x1
+ ; BIG: %full_load:_(s32) = G_LOAD %ptr(p0) :: (load 4, align 1)
+ ; BIG: $w1 = COPY %full_load(s32)
+ ; BIG: RET_ReallyLR implicit $w1
+ %cst_1:_(s32) = G_CONSTANT i32 1
+ %cst_2:_(s32) = G_CONSTANT i32 2
+ %cst_3:_(s32) = G_CONSTANT i32 3
+
+ %cst_8:_(s32) = G_CONSTANT i32 8
+ %cst_16:_(s32) = G_CONSTANT i32 16
+ %cst_24:_(s32) = G_CONSTANT i32 24
+
+ %ptr:_(p0) = COPY $x1
+ %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s32)
+ %ptr_elt_2:_(p0) = G_PTR_ADD %ptr, %cst_2(s32)
+ %ptr_elt_3:_(p0) = G_PTR_ADD %ptr, %cst_3(s32)
+
+ %elt0:_(s32) = G_ZEXTLOAD %ptr(p0) :: (load 1)
+ %elt1:_(s32) = G_ZEXTLOAD %ptr_elt_1(p0) :: (load 1)
+ %elt2:_(s32) = G_ZEXTLOAD %ptr_elt_2(p0) :: (load 1)
+
+ %byte0:_(s32) = nuw G_SHL %elt0, %cst_24(s32)
+ %byte1:_(s32) = nuw G_SHL %elt1, %cst_16(s32)
+ %byte2:_(s32) = nuw G_SHL %elt2, %cst_8(s32)
+ %byte3:_(s32) = G_ZEXTLOAD %ptr_elt_3(p0) :: (load 1)
+
+ %or1:_(s32) = G_OR %byte0, %byte1
+ %or2:_(s32) = G_OR %byte2, %byte3
+ %full_load:_(s32) = G_OR %or1, %or2
+
+ $w1 = COPY %full_load(s32)
+ RET_ReallyLR implicit $w1
+
+...
+---
+name: different_or_pattern
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $x0, $x1
+
+    ; Slightly different OR tree.
+ ;
+ ; s8* x = ...
+ ; s32 y = (((x[0] | (x[1] << 8)) | (x[2] << 16)) | (x[3] << 24))
+ ;
+ ; -> Little endian: Load from x[0]
+ ; -> Big endian: Load from x[0] + BSWAP
+
+    ; LITTLE-LABEL: name: different_or_pattern
+ ; LITTLE: liveins: $x0, $x1
+ ; LITTLE: %ptr:_(p0) = COPY $x1
+ ; LITTLE: %full_load:_(s32) = G_LOAD %ptr(p0) :: (load 4, align 1)
+ ; LITTLE: $w1 = COPY %full_load(s32)
+ ; LITTLE: RET_ReallyLR implicit $w1
+    ; BIG-LABEL: name: different_or_pattern
+ ; BIG: liveins: $x0, $x1
+ ; BIG: %ptr:_(p0) = COPY $x1
+ ; BIG: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD %ptr(p0) :: (load 4, align 1)
+ ; BIG: %full_load:_(s32) = G_BSWAP [[LOAD]]
+ ; BIG: $w1 = COPY %full_load(s32)
+ ; BIG: RET_ReallyLR implicit $w1
+ %cst_1:_(s32) = G_CONSTANT i32 1
+ %cst_2:_(s32) = G_CONSTANT i32 2
+ %cst_3:_(s32) = G_CONSTANT i32 3
+
+ %cst_8:_(s32) = G_CONSTANT i32 8
+ %cst_16:_(s32) = G_CONSTANT i32 16
+ %cst_24:_(s32) = G_CONSTANT i32 24
+
+ %ptr:_(p0) = COPY $x1
+ %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s32)
+ %ptr_elt_2:_(p0) = G_PTR_ADD %ptr, %cst_2(s32)
+ %ptr_elt_3:_(p0) = G_PTR_ADD %ptr, %cst_3(s32)
+
+ %byte0:_(s32) = G_ZEXTLOAD %ptr(p0) :: (load 1)
+
+ %elt1:_(s32) = G_ZEXTLOAD %ptr_elt_1(p0) :: (load 1)
+ %elt2:_(s32) = G_ZEXTLOAD %ptr_elt_2(p0) :: (load 1)
+ %elt3:_(s32) = G_ZEXTLOAD %ptr_elt_3(p0) :: (load 1)
+
+ %byte1:_(s32) = nuw G_SHL %elt1, %cst_8(s32)
+ %byte2:_(s32) = nuw G_SHL %elt2, %cst_16(s32)
+ %byte3:_(s32) = nuw G_SHL %elt3, %cst_24(s32)
+
+ ; Note the shape of the tree:
+ ;
+ ; byte byte
+ ; \ /
+ ; OR_1 byte
+ ; \ /
+ ; OR_2
+ ; \
+ ; ...
+
+ %or1:_(s32) = G_OR %byte0, %byte1
+ %or2:_(s32) = G_OR %or1, %byte2
+ %full_load:_(s32) = G_OR %or2, %byte3
+
+ $w1 = COPY %full_load(s32)
+ RET_ReallyLR implicit $w1
+
+...
+---
+name: s16_loads_to_s32_little_endian_pat
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $x0, $x1
+
+ ; s16* x = ...
+ ; s32 y = x[0] | (x[1] << 16)
+ ;
+ ; -> Little endian: Load from x[0]
+ ; -> Big endian: Load from x[0] + BSWAP
+
+ ; LITTLE-LABEL: name: s16_loads_to_s32_little_endian_pat
+ ; LITTLE: liveins: $x0, $x1
+ ; LITTLE: %ptr:_(p0) = COPY $x1
+ ; LITTLE: %full_load:_(s32) = G_LOAD %ptr(p0) :: (load 4, align 2)
+ ; LITTLE: $w1 = COPY %full_load(s32)
+ ; LITTLE: RET_ReallyLR implicit $w1
+ ; BIG-LABEL: name: s16_loads_to_s32_little_endian_pat
+ ; BIG: liveins: $x0, $x1
+ ; BIG: %ptr:_(p0) = COPY $x1
+ ; BIG: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD %ptr(p0) :: (load 4, align 2)
+ ; BIG: %full_load:_(s32) = G_BSWAP [[LOAD]]
+ ; BIG: $w1 = COPY %full_load(s32)
+ ; BIG: RET_ReallyLR implicit $w1
+ %cst_1:_(s64) = G_CONSTANT i64 1
+ %cst_16:_(s32) = G_CONSTANT i32 16
+
+ %ptr:_(p0) = COPY $x1
+ %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s64)
+
+ %low_half:_(s32) = G_ZEXTLOAD %ptr(p0) :: (load 2)
+ %elt1:_(s32) = G_ZEXTLOAD %ptr_elt_1(p0) :: (load 2)
+ %high_half:_(s32) = nuw G_SHL %elt1, %cst_16(s32)
+
+ %full_load:_(s32) = G_OR %low_half, %high_half
+ $w1 = COPY %full_load(s32)
+ RET_ReallyLR implicit $w1
+
+...
+---
+name: s16_loads_to_s32_big_endian_pat
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $x0, $x1
+
+ ; s16 *x = ...
+ ; s32 y = x[1] | (x[0] << 16)
+ ;
+ ; -> Little endian: Load from x[0] + BSWAP
+ ; -> Big endian: Load from x[0]
+
+ ; LITTLE-LABEL: name: s16_loads_to_s32_big_endian_pat
+ ; LITTLE: liveins: $x0, $x1
+ ; LITTLE: %ptr:_(p0) = COPY $x1
+ ; LITTLE: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD %ptr(p0) :: (load 4, align 2)
+ ; LITTLE: %full_load:_(s32) = G_BSWAP [[LOAD]]
+ ; LITTLE: $w1 = COPY %full_load(s32)
+ ; LITTLE: RET_ReallyLR implicit $w1
+ ; BIG-LABEL: name: s16_loads_to_s32_big_endian_pat
+ ; BIG: liveins: $x0, $x1
+ ; BIG: %ptr:_(p0) = COPY $x1
+ ; BIG: %full_load:_(s32) = G_LOAD %ptr(p0) :: (load 4, align 2)
+ ; BIG: $w1 = COPY %full_load(s32)
+ ; BIG: RET_ReallyLR implicit $w1
+ %cst_1:_(s64) = G_CONSTANT i64 1
+ %cst_16:_(s32) = G_CONSTANT i32 16
+
+ %ptr:_(p0) = COPY $x1
+ %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s64)
+
+ %elt0:_(s32) = G_ZEXTLOAD %ptr(p0) :: (load 2)
+ %high_half:_(s32) = nuw G_SHL %elt0, %cst_16(s32)
+ %low_half:_(s32) = G_ZEXTLOAD %ptr_elt_1(p0) :: (load 2)
+
+ %full_load:_(s32) = G_OR %low_half, %high_half
+ $w1 = COPY %full_load(s32)
+ RET_ReallyLR implicit $w1
+
+...
+---
+name: s16_loads_to_s64_little_endian_pat
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $x0, $x1
+
+ ; s16 *x = ...
+    ; s64 y = (x[0] | (x[1] << 16)) | ((x[2] << 32) | (x[3] << 48))
+ ;
+ ; -> Little endian: Load from x[0]
+ ; -> Big endian: Load from x[0] + BSWAP
+
+ ; LITTLE-LABEL: name: s16_loads_to_s64_little_endian_pat
+ ; LITTLE: liveins: $x0, $x1
+ ; LITTLE: %ptr:_(p0) = COPY $x1
+ ; LITTLE: %full_load:_(s64) = G_LOAD %ptr(p0) :: (load 8, align 2)
+ ; LITTLE: $x1 = COPY %full_load(s64)
+ ; LITTLE: RET_ReallyLR implicit $x1
+ ; BIG-LABEL: name: s16_loads_to_s64_little_endian_pat
+ ; BIG: liveins: $x0, $x1
+ ; BIG: %ptr:_(p0) = COPY $x1
+ ; BIG: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD %ptr(p0) :: (load 8, align 2)
+ ; BIG: %full_load:_(s64) = G_BSWAP [[LOAD]]
+ ; BIG: $x1 = COPY %full_load(s64)
+ ; BIG: RET_ReallyLR implicit $x1
+ %cst_1:_(s64) = G_CONSTANT i64 1
+ %cst_2:_(s64) = G_CONSTANT i64 2
+ %cst_3:_(s64) = G_CONSTANT i64 3
+
+ %cst_16:_(s64) = G_CONSTANT i64 16
+ %cst_32:_(s64) = G_CONSTANT i64 32
+ %cst_48:_(s64) = G_CONSTANT i64 48
+
+ %ptr:_(p0) = COPY $x1
+ %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s64)
+ %ptr_elt_2:_(p0) = G_PTR_ADD %ptr, %cst_2(s64)
+ %ptr_elt_3:_(p0) = G_PTR_ADD %ptr, %cst_3(s64)
+
+ %byte0_byte1:_(s64) = G_ZEXTLOAD %ptr(p0) :: (load 2)
+
+ %elt1:_(s64) = G_ZEXTLOAD %ptr_elt_1(p0) :: (load 2)
+ %elt2:_(s64) = G_ZEXTLOAD %ptr_elt_2(p0) :: (load 2)
+ %elt3:_(s64) = G_ZEXTLOAD %ptr_elt_3(p0) :: (load 2)
+
+ %byte2_byte3:_(s64) = nuw G_SHL %elt1, %cst_16(s64)
+ %byte4_byte5:_(s64) = nuw G_SHL %elt2, %cst_32(s64)
+ %byte6_byte7:_(s64) = nuw G_SHL %elt3, %cst_48(s64)
+
+ %or1:_(s64) = G_OR %byte0_byte1, %byte2_byte3
+ %or2:_(s64) = G_OR %byte4_byte5, %byte6_byte7
+ %full_load:_(s64) = G_OR %or1, %or2
+
+ $x1 = COPY %full_load(s64)
+ RET_ReallyLR implicit $x1
+
+...
+---
+name: s16_loads_to_s64_big_endian_pat
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $x0, $x1
+
+ ; s16 *x = ...
+ ; s64 y = (x[3] | (x[2] << 16)) | ((x[1] << 32) | (x[0] << 48))
+ ;
+ ; -> Little endian: Load from x[0] + BSWAP
+ ; -> Big endian: Load from x[0]
+
+ ; LITTLE-LABEL: name: s16_loads_to_s64_big_endian_pat
+ ; LITTLE: liveins: $x0, $x1
+ ; LITTLE: %ptr:_(p0) = COPY $x1
+ ; LITTLE: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD %ptr(p0) :: (load 8, align 2)
+ ; LITTLE: %full_load:_(s64) = G_BSWAP [[LOAD]]
+ ; LITTLE: $x1 = COPY %full_load(s64)
+ ; LITTLE: RET_ReallyLR implicit $x1
+ ; BIG-LABEL: name: s16_loads_to_s64_big_endian_pat
+ ; BIG: liveins: $x0, $x1
+ ; BIG: %ptr:_(p0) = COPY $x1
+ ; BIG: %full_load:_(s64) = G_LOAD %ptr(p0) :: (load 8, align 2)
+ ; BIG: $x1 = COPY %full_load(s64)
+ ; BIG: RET_ReallyLR implicit $x1
+ %cst_1:_(s64) = G_CONSTANT i64 1
+ %cst_2:_(s64) = G_CONSTANT i64 2
+ %cst_3:_(s64) = G_CONSTANT i64 3
+
+ %cst_16:_(s64) = G_CONSTANT i64 16
+ %cst_32:_(s64) = G_CONSTANT i64 32
+ %cst_48:_(s64) = G_CONSTANT i64 48
+
+ %ptr:_(p0) = COPY $x1
+ %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s64)
+ %ptr_elt_2:_(p0) = G_PTR_ADD %ptr, %cst_2(s64)
+ %ptr_elt_3:_(p0) = G_PTR_ADD %ptr, %cst_3(s64)
+
+ %elt0:_(s64) = G_ZEXTLOAD %ptr(p0) :: (load 2)
+ %elt1:_(s64) = G_ZEXTLOAD %ptr_elt_1(p0) :: (load 2)
+ %elt2:_(s64) = G_ZEXTLOAD %ptr_elt_2(p0) :: (load 2)
+
+ %byte0_byte1:_(s64) = nuw G_SHL %elt0, %cst_48(s64)
+ %byte2_byte3:_(s64) = nuw G_SHL %elt1, %cst_32(s64)
+ %byte4_byte5:_(s64) = nuw G_SHL %elt2, %cst_16(s64)
+ %byte6_byte7:_(s64) = G_ZEXTLOAD %ptr_elt_3(p0) :: (load 2)
+
+ %or1:_(s64) = G_OR %byte0_byte1, %byte2_byte3
+ %or2:_(s64) = G_OR %byte4_byte5, %byte6_byte7
+ %full_load:_(s64) = G_OR %or1, %or2
+
+ $x1 = COPY %full_load(s64)
+ RET_ReallyLR implicit $x1
+
+
+...
+---
+name: nonzero_start_idx_positive_little_endian_pat
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $x0, $x1
+
+ ; s8* x = ...
+ ; s32 y = (x[1] | (x[2] << 8)) | ((x[3] << 16) | (x[4] << 24))
+ ;
+ ; -> Little endian: Load from x[1]
+ ; -> Big endian: Load from x[1] + BSWAP
+
+ ; LITTLE-LABEL: name: nonzero_start_idx_positive_little_endian_pat
+ ; LITTLE: liveins: $x0, $x1
+ ; LITTLE: %cst_1:_(s32) = G_CONSTANT i32 1
+ ; LITTLE: %ptr:_(p0) = COPY $x0
+ ; LITTLE: %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s32)
+ ; LITTLE: %full_load:_(s32) = G_LOAD %ptr_elt_1(p0) :: (load 4, align 1)
+ ; LITTLE: $w1 = COPY %full_load(s32)
+ ; LITTLE: RET_ReallyLR implicit $w1
+ ; BIG-LABEL: name: nonzero_start_idx_positive_little_endian_pat
+ ; BIG: liveins: $x0, $x1
+ ; BIG: %cst_1:_(s32) = G_CONSTANT i32 1
+ ; BIG: %ptr:_(p0) = COPY $x0
+ ; BIG: %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s32)
+ ; BIG: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD %ptr_elt_1(p0) :: (load 4, align 1)
+ ; BIG: %full_load:_(s32) = G_BSWAP [[LOAD]]
+ ; BIG: $w1 = COPY %full_load(s32)
+ ; BIG: RET_ReallyLR implicit $w1
+ %cst_1:_(s32) = G_CONSTANT i32 1
+ %cst_2:_(s32) = G_CONSTANT i32 2
+ %cst_3:_(s32) = G_CONSTANT i32 3
+ %cst_4:_(s32) = G_CONSTANT i32 4
+
+ %cst_8:_(s32) = G_CONSTANT i32 8
+ %cst_16:_(s32) = G_CONSTANT i32 16
+ %cst_24:_(s32) = G_CONSTANT i32 24
+
+ %ptr:_(p0) = COPY $x0
+ %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s32)
+ %ptr_elt_2:_(p0) = G_PTR_ADD %ptr, %cst_2(s32)
+ %ptr_elt_3:_(p0) = G_PTR_ADD %ptr, %cst_3(s32)
+ %ptr_elt_4:_(p0) = G_PTR_ADD %ptr, %cst_4(s32)
+
+ %elt2:_(s32) = G_ZEXTLOAD %ptr_elt_2(p0) :: (load 1)
+ %elt3:_(s32) = G_ZEXTLOAD %ptr_elt_3(p0) :: (load 1)
+ %elt4:_(s32) = G_ZEXTLOAD %ptr_elt_4(p0) :: (load 1)
+
+ %byte0:_(s32) = G_ZEXTLOAD %ptr_elt_1(p0) :: (load 1)
+ %byte1:_(s32) = nuw G_SHL %elt2, %cst_8(s32)
+ %byte2:_(s32) = nuw G_SHL %elt3, %cst_16(s32)
+ %byte3:_(s32) = nuw G_SHL %elt4, %cst_24(s32)
+
+ %or1:_(s32) = G_OR %byte0, %byte1
+ %or2:_(s32) = G_OR %byte2, %byte3
+ %full_load:_(s32) = G_OR %or1, %or2
+
+ $w1 = COPY %full_load(s32)
+ RET_ReallyLR implicit $w1
+
+...
+---
+name: nonzero_start_idx_positive_big_endian_pat
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $x0, $x1
+
+ ; s8* x = ...
+ ; s32 y = (x[4] | (x[3] << 8)) | ((x[2] << 16) | (x[1] << 24))
+ ;
+ ; -> Little endian: Load from x[1] + BSWAP
+ ; -> Big endian: Load from x[1]
+
+ ; LITTLE-LABEL: name: nonzero_start_idx_positive_big_endian_pat
+ ; LITTLE: liveins: $x0, $x1
+ ; LITTLE: %cst_1:_(s32) = G_CONSTANT i32 1
+ ; LITTLE: %ptr:_(p0) = COPY $x0
+ ; LITTLE: %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s32)
+ ; LITTLE: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD %ptr_elt_1(p0) :: (load 4, align 1)
+ ; LITTLE: %full_load:_(s32) = G_BSWAP [[LOAD]]
+ ; LITTLE: $w1 = COPY %full_load(s32)
+ ; LITTLE: RET_ReallyLR implicit $w1
+ ; BIG-LABEL: name: nonzero_start_idx_positive_big_endian_pat
+ ; BIG: liveins: $x0, $x1
+ ; BIG: %cst_1:_(s32) = G_CONSTANT i32 1
+ ; BIG: %ptr:_(p0) = COPY $x0
+ ; BIG: %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s32)
+ ; BIG: %full_load:_(s32) = G_LOAD %ptr_elt_1(p0) :: (load 4, align 1)
+ ; BIG: $w1 = COPY %full_load(s32)
+ ; BIG: RET_ReallyLR implicit $w1
+ %cst_1:_(s32) = G_CONSTANT i32 1
+ %cst_2:_(s32) = G_CONSTANT i32 2
+ %cst_3:_(s32) = G_CONSTANT i32 3
+ %cst_4:_(s32) = G_CONSTANT i32 4
+
+ %cst_8:_(s32) = G_CONSTANT i32 8
+ %cst_16:_(s32) = G_CONSTANT i32 16
+ %cst_24:_(s32) = G_CONSTANT i32 24
+
+ %ptr:_(p0) = COPY $x0
+ %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s32)
+ %ptr_elt_2:_(p0) = G_PTR_ADD %ptr, %cst_2(s32)
+ %ptr_elt_3:_(p0) = G_PTR_ADD %ptr, %cst_3(s32)
+ %ptr_elt_4:_(p0) = G_PTR_ADD %ptr, %cst_4(s32)
+
+ %elt1:_(s32) = G_ZEXTLOAD %ptr_elt_1(p0) :: (load 1)
+ %elt2:_(s32) = G_ZEXTLOAD %ptr_elt_2(p0) :: (load 1)
+ %elt3:_(s32) = G_ZEXTLOAD %ptr_elt_3(p0) :: (load 1)
+
+ %byte0:_(s32) = G_ZEXTLOAD %ptr_elt_4(p0) :: (load 1)
+ %byte1:_(s32) = nuw G_SHL %elt3, %cst_8(s32)
+ %byte2:_(s32) = nuw G_SHL %elt2, %cst_16(s32)
+ %byte3:_(s32) = nuw G_SHL %elt1, %cst_24(s32)
+
+ %or1:_(s32) = G_OR %byte0, %byte1
+ %or2:_(s32) = G_OR %byte2, %byte3
+ %full_load:_(s32) = G_OR %or1, %or2
+
+ $w1 = COPY %full_load(s32)
+ RET_ReallyLR implicit $w1
+
+...
+---
+name: nonzero_start_idx_negative_little_endian_pat
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $x0, $x1
+
+ ; s8* x = ...
+ ; s32 y = (x[-3] | (x[-2] << 8)) | ((x[-1] << 16) | (x[0] << 24))
+ ;
+ ; -> Little endian: Load from x[-3]
+ ; -> Big endian: Load from x[-3] + BSWAP
+
+ ; LITTLE-LABEL: name: nonzero_start_idx_negative_little_endian_pat
+ ; LITTLE: liveins: $x0, $x1
+ ; LITTLE: %cst_neg_3:_(s32) = G_CONSTANT i32 -3
+ ; LITTLE: %ptr:_(p0) = COPY $x0
+ ; LITTLE: %ptr_elt_neg_3:_(p0) = G_PTR_ADD %ptr, %cst_neg_3(s32)
+ ; LITTLE: %full_load:_(s32) = G_LOAD %ptr_elt_neg_3(p0) :: (load 4, align 1)
+ ; LITTLE: $w1 = COPY %full_load(s32)
+ ; LITTLE: RET_ReallyLR implicit $w1
+ ; BIG-LABEL: name: nonzero_start_idx_negative_little_endian_pat
+ ; BIG: liveins: $x0, $x1
+ ; BIG: %cst_neg_3:_(s32) = G_CONSTANT i32 -3
+ ; BIG: %ptr:_(p0) = COPY $x0
+ ; BIG: %ptr_elt_neg_3:_(p0) = G_PTR_ADD %ptr, %cst_neg_3(s32)
+ ; BIG: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD %ptr_elt_neg_3(p0) :: (load 4, align 1)
+ ; BIG: %full_load:_(s32) = G_BSWAP [[LOAD]]
+ ; BIG: $w1 = COPY %full_load(s32)
+ ; BIG: RET_ReallyLR implicit $w1
+ %cst_neg_1:_(s32) = G_CONSTANT i32 -1
+ %cst_neg_2:_(s32) = G_CONSTANT i32 -2
+ %cst_neg_3:_(s32) = G_CONSTANT i32 -3
+
+ %cst_8:_(s32) = G_CONSTANT i32 8
+ %cst_16:_(s32) = G_CONSTANT i32 16
+ %cst_24:_(s32) = G_CONSTANT i32 24
+
+ %ptr:_(p0) = COPY $x0
+ %ptr_elt_neg_3:_(p0) = G_PTR_ADD %ptr, %cst_neg_3(s32)
+ %ptr_elt_neg_2:_(p0) = G_PTR_ADD %ptr, %cst_neg_2(s32)
+ %ptr_elt_neg_1:_(p0) = G_PTR_ADD %ptr, %cst_neg_1(s32)
+
+ %elt_neg_2:_(s32) = G_ZEXTLOAD %ptr_elt_neg_2(p0) :: (load 1)
+ %elt_neg_1:_(s32) = G_ZEXTLOAD %ptr_elt_neg_1(p0) :: (load 1)
+ %elt_0:_(s32) = G_ZEXTLOAD %ptr(p0) :: (load 1)
+
+ %byte0:_(s32) = G_ZEXTLOAD %ptr_elt_neg_3(p0) :: (load 1)
+ %byte1:_(s32) = nuw G_SHL %elt_neg_2, %cst_8(s32)
+ %byte2:_(s32) = nuw G_SHL %elt_neg_1, %cst_16(s32)
+ %byte3:_(s32) = nuw G_SHL %elt_0, %cst_24(s32)
+
+ %or1:_(s32) = G_OR %byte0, %byte1
+ %or2:_(s32) = G_OR %byte2, %byte3
+ %full_load:_(s32) = G_OR %or1, %or2
+
+ $w1 = COPY %full_load(s32)
+ RET_ReallyLR implicit $w1
+
+...
+---
+name: nonzero_start_idx_negative_big_endian_pat
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $x0, $x1
+
+ ; s8* x = ...
+ ; s32 y = (x[0] | (x[-1] << 8)) | ((x[-2] << 16) | (x[-3] << 24))
+ ;
+ ; -> Little endian: Load from x[-3] + BSWAP
+ ; -> Big endian: Load from x[-3]
+
+ ; LITTLE-LABEL: name: nonzero_start_idx_negative_big_endian_pat
+ ; LITTLE: liveins: $x0, $x1
+ ; LITTLE: %cst_neg_3:_(s32) = G_CONSTANT i32 -3
+ ; LITTLE: %ptr:_(p0) = COPY $x0
+ ; LITTLE: %ptr_elt_neg_3:_(p0) = G_PTR_ADD %ptr, %cst_neg_3(s32)
+ ; LITTLE: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD %ptr_elt_neg_3(p0) :: (load 4, align 1)
+ ; LITTLE: %full_load:_(s32) = G_BSWAP [[LOAD]]
+ ; LITTLE: $w1 = COPY %full_load(s32)
+ ; LITTLE: RET_ReallyLR implicit $w1
+ ; BIG-LABEL: name: nonzero_start_idx_negative_big_endian_pat
+ ; BIG: liveins: $x0, $x1
+ ; BIG: %cst_neg_3:_(s32) = G_CONSTANT i32 -3
+ ; BIG: %ptr:_(p0) = COPY $x0
+ ; BIG: %ptr_elt_neg_3:_(p0) = G_PTR_ADD %ptr, %cst_neg_3(s32)
+ ; BIG: %full_load:_(s32) = G_LOAD %ptr_elt_neg_3(p0) :: (load 4, align 1)
+ ; BIG: $w1 = COPY %full_load(s32)
+ ; BIG: RET_ReallyLR implicit $w1
+ %cst_neg_1:_(s32) = G_CONSTANT i32 -1
+ %cst_neg_2:_(s32) = G_CONSTANT i32 -2
+ %cst_neg_3:_(s32) = G_CONSTANT i32 -3
+
+ %cst_8:_(s32) = G_CONSTANT i32 8
+ %cst_16:_(s32) = G_CONSTANT i32 16
+ %cst_24:_(s32) = G_CONSTANT i32 24
+
+ %ptr:_(p0) = COPY $x0
+ %ptr_elt_neg_3:_(p0) = G_PTR_ADD %ptr, %cst_neg_3(s32)
+ %ptr_elt_neg_2:_(p0) = G_PTR_ADD %ptr, %cst_neg_2(s32)
+ %ptr_elt_neg_1:_(p0) = G_PTR_ADD %ptr, %cst_neg_1(s32)
+
+ %elt_neg_3:_(s32) = G_ZEXTLOAD %ptr_elt_neg_3(p0) :: (load 1)
+ %elt_neg_2:_(s32) = G_ZEXTLOAD %ptr_elt_neg_2(p0) :: (load 1)
+ %elt_neg_1:_(s32) = G_ZEXTLOAD %ptr_elt_neg_1(p0) :: (load 1)
+ %elt_0:_(s32) = G_ZEXTLOAD %ptr(p0) :: (load 1)
+
+ %byte0:_(s32) = G_ZEXTLOAD %ptr(p0) :: (load 1)
+ %byte1:_(s32) = nuw G_SHL %elt_neg_1, %cst_8(s32)
+ %byte2:_(s32) = nuw G_SHL %elt_neg_2, %cst_16(s32)
+ %byte3:_(s32) = nuw G_SHL %elt_neg_3, %cst_24(s32)
+
+ %or1:_(s32) = G_OR %byte0, %byte1
+ %or2:_(s32) = G_OR %byte2, %byte3
+ %full_load:_(s32) = G_OR %or1, %or2
+
+ $w1 = COPY %full_load(s32)
+ RET_ReallyLR implicit $w1
+
+...
+---
+name: dont_combine_volatile
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $x0, $x1
+
+ ; Combine should only happen with unordered loads.
+
+ ; LITTLE-LABEL: name: dont_combine_volatile
+ ; LITTLE: liveins: $x0, $x1
+ ; LITTLE: %cst_1:_(s64) = G_CONSTANT i64 1
+ ; LITTLE: %cst_16:_(s32) = G_CONSTANT i32 16
+ ; LITTLE: %ptr:_(p0) = COPY $x1
+ ; LITTLE: %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s64)
+ ; LITTLE: %low_half:_(s32) = G_ZEXTLOAD %ptr(p0) :: (load 2)
+ ; LITTLE: %elt1:_(s32) = G_ZEXTLOAD %ptr_elt_1(p0) :: (volatile load 2)
+ ; LITTLE: %high_half:_(s32) = nuw G_SHL %elt1, %cst_16(s32)
+ ; LITTLE: %full_load:_(s32) = G_OR %low_half, %high_half
+ ; LITTLE: $w1 = COPY %full_load(s32)
+ ; LITTLE: RET_ReallyLR implicit $w1
+ ; BIG-LABEL: name: dont_combine_volatile
+ ; BIG: liveins: $x0, $x1
+ ; BIG: %cst_1:_(s64) = G_CONSTANT i64 1
+ ; BIG: %cst_16:_(s32) = G_CONSTANT i32 16
+ ; BIG: %ptr:_(p0) = COPY $x1
+ ; BIG: %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s64)
+ ; BIG: %low_half:_(s32) = G_ZEXTLOAD %ptr(p0) :: (load 2)
+ ; BIG: %elt1:_(s32) = G_ZEXTLOAD %ptr_elt_1(p0) :: (volatile load 2)
+ ; BIG: %high_half:_(s32) = nuw G_SHL %elt1, %cst_16(s32)
+ ; BIG: %full_load:_(s32) = G_OR %low_half, %high_half
+ ; BIG: $w1 = COPY %full_load(s32)
+ ; BIG: RET_ReallyLR implicit $w1
+ %cst_1:_(s64) = G_CONSTANT i64 1
+ %cst_16:_(s32) = G_CONSTANT i32 16
+
+ %ptr:_(p0) = COPY $x1
+ %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s64)
+
+ %low_half:_(s32) = G_ZEXTLOAD %ptr(p0) :: (load 2)
+ %elt1:_(s32) = G_ZEXTLOAD %ptr_elt_1(p0) :: (volatile load 2)
+ %high_half:_(s32) = nuw G_SHL %elt1, %cst_16(s32)
+
+ %full_load:_(s32) = G_OR %low_half, %high_half
+ $w1 = COPY %full_load(s32)
+ RET_ReallyLR implicit $w1
+
+...
+---
+name: dont_wrong_memop_size
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $x0, $x1
+
+ ; Combine should only happen when the loads load the same size.
+
+ ; LITTLE-LABEL: name: dont_wrong_memop_size
+ ; LITTLE: liveins: $x0, $x1
+ ; LITTLE: %cst_1:_(s64) = G_CONSTANT i64 1
+ ; LITTLE: %cst_16:_(s32) = G_CONSTANT i32 16
+ ; LITTLE: %ptr:_(p0) = COPY $x1
+ ; LITTLE: %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s64)
+ ; LITTLE: %low_half:_(s32) = G_ZEXTLOAD %ptr(p0) :: (load 2)
+ ; LITTLE: %wrong_size_load:_(s32) = G_ZEXTLOAD %ptr_elt_1(p0) :: (load 1)
+ ; LITTLE: %high_half:_(s32) = nuw G_SHL %wrong_size_load, %cst_16(s32)
+ ; LITTLE: %full_load:_(s32) = G_OR %low_half, %high_half
+ ; LITTLE: $w1 = COPY %full_load(s32)
+ ; LITTLE: RET_ReallyLR implicit $w1
+ ; BIG-LABEL: name: dont_wrong_memop_size
+ ; BIG: liveins: $x0, $x1
+ ; BIG: %cst_1:_(s64) = G_CONSTANT i64 1
+ ; BIG: %cst_16:_(s32) = G_CONSTANT i32 16
+ ; BIG: %ptr:_(p0) = COPY $x1
+ ; BIG: %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s64)
+ ; BIG: %low_half:_(s32) = G_ZEXTLOAD %ptr(p0) :: (load 2)
+ ; BIG: %wrong_size_load:_(s32) = G_ZEXTLOAD %ptr_elt_1(p0) :: (load 1)
+ ; BIG: %high_half:_(s32) = nuw G_SHL %wrong_size_load, %cst_16(s32)
+ ; BIG: %full_load:_(s32) = G_OR %low_half, %high_half
+ ; BIG: $w1 = COPY %full_load(s32)
+ ; BIG: RET_ReallyLR implicit $w1
+ %cst_1:_(s64) = G_CONSTANT i64 1
+ %cst_16:_(s32) = G_CONSTANT i32 16
+
+ %ptr:_(p0) = COPY $x1
+ %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s64)
+
+ %low_half:_(s32) = G_ZEXTLOAD %ptr(p0) :: (load 2)
+ %wrong_size_load:_(s32) = G_ZEXTLOAD %ptr_elt_1(p0) :: (load 1)
+ %high_half:_(s32) = nuw G_SHL %wrong_size_load, %cst_16(s32)
+
+ %full_load:_(s32) = G_OR %low_half, %high_half
+ $w1 = COPY %full_load(s32)
+ RET_ReallyLR implicit $w1
+
+...
+---
+name: dont_combine_wrong_offset
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $x0, $x1
+
+ ; This is not equivalent to a 32-bit load with/without a BSWAP:
+ ;
+ ; s16 *x = ...
+ ; s32 y = x[0] | (x[1] << 24)
+
+ ; LITTLE-LABEL: name: dont_combine_wrong_offset
+ ; LITTLE: liveins: $x0, $x1
+ ; LITTLE: %cst_1:_(s64) = G_CONSTANT i64 1
+ ; LITTLE: %cst_24:_(s32) = G_CONSTANT i32 24
+ ; LITTLE: %ptr:_(p0) = COPY $x1
+ ; LITTLE: %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s64)
+ ; LITTLE: %low_half:_(s32) = G_ZEXTLOAD %ptr(p0) :: (load 2)
+ ; LITTLE: %elt1:_(s32) = G_ZEXTLOAD %ptr_elt_1(p0) :: (load 2)
+ ; LITTLE: %high_half:_(s32) = nuw G_SHL %elt1, %cst_24(s32)
+ ; LITTLE: %full_load:_(s32) = G_OR %low_half, %high_half
+ ; LITTLE: $w1 = COPY %full_load(s32)
+ ; LITTLE: RET_ReallyLR implicit $w1
+ ; BIG-LABEL: name: dont_combine_wrong_offset
+ ; BIG: liveins: $x0, $x1
+ ; BIG: %cst_1:_(s64) = G_CONSTANT i64 1
+ ; BIG: %cst_24:_(s32) = G_CONSTANT i32 24
+ ; BIG: %ptr:_(p0) = COPY $x1
+ ; BIG: %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s64)
+ ; BIG: %low_half:_(s32) = G_ZEXTLOAD %ptr(p0) :: (load 2)
+ ; BIG: %elt1:_(s32) = G_ZEXTLOAD %ptr_elt_1(p0) :: (load 2)
+ ; BIG: %high_half:_(s32) = nuw G_SHL %elt1, %cst_24(s32)
+ ; BIG: %full_load:_(s32) = G_OR %low_half, %high_half
+ ; BIG: $w1 = COPY %full_load(s32)
+ ; BIG: RET_ReallyLR implicit $w1
+ %cst_1:_(s64) = G_CONSTANT i64 1
+ %cst_24:_(s32) = G_CONSTANT i32 24
+
+ %ptr:_(p0) = COPY $x1
+ %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s64)
+
+ %low_half:_(s32) = G_ZEXTLOAD %ptr(p0) :: (load 2)
+ %elt1:_(s32) = G_ZEXTLOAD %ptr_elt_1(p0) :: (load 2)
+ %high_half:_(s32) = nuw G_SHL %elt1, %cst_24(s32)
+
+ %full_load:_(s32) = G_OR %low_half, %high_half
+ $w1 = COPY %full_load(s32)
+ RET_ReallyLR implicit $w1
+
+...
+---
+name: dont_combine_wrong_offset_2
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $x0, $x1
+
+ ; This does not correspond to a 32-bit load with/without a BSWAP:
+ ;
+ ; s16 *x = ...
+ ; s32 y = x[0] | (x[1] << 8)
+
+ ; LITTLE-LABEL: name: dont_combine_wrong_offset_2
+ ; LITTLE: liveins: $x0, $x1
+ ; LITTLE: %cst_1:_(s64) = G_CONSTANT i64 1
+ ; LITTLE: %cst_8:_(s32) = G_CONSTANT i32 8
+ ; LITTLE: %ptr:_(p0) = COPY $x1
+ ; LITTLE: %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s64)
+ ; LITTLE: %low_half:_(s32) = G_ZEXTLOAD %ptr(p0) :: (load 2)
+ ; LITTLE: %elt1:_(s32) = G_ZEXTLOAD %ptr_elt_1(p0) :: (load 2)
+ ; LITTLE: %high_half:_(s32) = nuw G_SHL %elt1, %cst_8(s32)
+ ; LITTLE: %full_load:_(s32) = G_OR %low_half, %high_half
+ ; LITTLE: $w1 = COPY %full_load(s32)
+ ; LITTLE: RET_ReallyLR implicit $w1
+ ; BIG-LABEL: name: dont_combine_wrong_offset_2
+ ; BIG: liveins: $x0, $x1
+ ; BIG: %cst_1:_(s64) = G_CONSTANT i64 1
+ ; BIG: %cst_8:_(s32) = G_CONSTANT i32 8
+ ; BIG: %ptr:_(p0) = COPY $x1
+ ; BIG: %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s64)
+ ; BIG: %low_half:_(s32) = G_ZEXTLOAD %ptr(p0) :: (load 2)
+ ; BIG: %elt1:_(s32) = G_ZEXTLOAD %ptr_elt_1(p0) :: (load 2)
+ ; BIG: %high_half:_(s32) = nuw G_SHL %elt1, %cst_8(s32)
+ ; BIG: %full_load:_(s32) = G_OR %low_half, %high_half
+ ; BIG: $w1 = COPY %full_load(s32)
+ ; BIG: RET_ReallyLR implicit $w1
+ %cst_1:_(s64) = G_CONSTANT i64 1
+ %cst_8:_(s32) = G_CONSTANT i32 8
+
+ %ptr:_(p0) = COPY $x1
+ %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s64)
+
+ %low_half:_(s32) = G_ZEXTLOAD %ptr(p0) :: (load 2)
+ %elt1:_(s32) = G_ZEXTLOAD %ptr_elt_1(p0) :: (load 2)
+ %high_half:_(s32) = nuw G_SHL %elt1, %cst_8(s32)
+
+ %full_load:_(s32) = G_OR %low_half, %high_half
+ $w1 = COPY %full_load(s32)
+ RET_ReallyLR implicit $w1
+
+...
+---
+name: dont_combine_missing_load
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $x0, $x1
+
+ ; This is missing x[2], so we shouldn't combine:
+ ;
+ ; s16 *x = ...
+ ; s64 y = (x[0] | (x[1] << 16)) | (x[3] << 48)
+
+ ; LITTLE-LABEL: name: dont_combine_missing_load
+ ; LITTLE: liveins: $x0, $x1
+ ; LITTLE: %cst_1:_(s64) = G_CONSTANT i64 1
+ ; LITTLE: %cst_3:_(s64) = G_CONSTANT i64 3
+ ; LITTLE: %cst_16:_(s64) = G_CONSTANT i64 16
+ ; LITTLE: %cst_48:_(s64) = G_CONSTANT i64 48
+ ; LITTLE: %ptr:_(p0) = COPY $x1
+ ; LITTLE: %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s64)
+ ; LITTLE: %ptr_elt_3:_(p0) = G_PTR_ADD %ptr, %cst_3(s64)
+ ; LITTLE: %byte0_byte1:_(s64) = G_ZEXTLOAD %ptr(p0) :: (load 2)
+ ; LITTLE: %elt1:_(s64) = G_ZEXTLOAD %ptr_elt_1(p0) :: (load 2)
+ ; LITTLE: %elt3:_(s64) = G_ZEXTLOAD %ptr_elt_3(p0) :: (load 2)
+ ; LITTLE: %byte2_byte3:_(s64) = nuw G_SHL %elt1, %cst_16(s64)
+ ; LITTLE: %byte6_byte7:_(s64) = nuw G_SHL %elt3, %cst_48(s64)
+ ; LITTLE: %or1:_(s64) = G_OR %byte0_byte1, %byte2_byte3
+ ; LITTLE: %full_load:_(s64) = G_OR %or1, %byte6_byte7
+ ; LITTLE: $x1 = COPY %full_load(s64)
+ ; LITTLE: RET_ReallyLR implicit $x1
+ ; BIG-LABEL: name: dont_combine_missing_load
+ ; BIG: liveins: $x0, $x1
+ ; BIG: %cst_1:_(s64) = G_CONSTANT i64 1
+ ; BIG: %cst_3:_(s64) = G_CONSTANT i64 3
+ ; BIG: %cst_16:_(s64) = G_CONSTANT i64 16
+ ; BIG: %cst_48:_(s64) = G_CONSTANT i64 48
+ ; BIG: %ptr:_(p0) = COPY $x1
+ ; BIG: %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s64)
+ ; BIG: %ptr_elt_3:_(p0) = G_PTR_ADD %ptr, %cst_3(s64)
+ ; BIG: %byte0_byte1:_(s64) = G_ZEXTLOAD %ptr(p0) :: (load 2)
+ ; BIG: %elt1:_(s64) = G_ZEXTLOAD %ptr_elt_1(p0) :: (load 2)
+ ; BIG: %elt3:_(s64) = G_ZEXTLOAD %ptr_elt_3(p0) :: (load 2)
+ ; BIG: %byte2_byte3:_(s64) = nuw G_SHL %elt1, %cst_16(s64)
+ ; BIG: %byte6_byte7:_(s64) = nuw G_SHL %elt3, %cst_48(s64)
+ ; BIG: %or1:_(s64) = G_OR %byte0_byte1, %byte2_byte3
+ ; BIG: %full_load:_(s64) = G_OR %or1, %byte6_byte7
+ ; BIG: $x1 = COPY %full_load(s64)
+ ; BIG: RET_ReallyLR implicit $x1
+ %cst_1:_(s64) = G_CONSTANT i64 1
+ %cst_3:_(s64) = G_CONSTANT i64 3
+
+ %cst_16:_(s64) = G_CONSTANT i64 16
+ %cst_48:_(s64) = G_CONSTANT i64 48
+
+ %ptr:_(p0) = COPY $x1
+ %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s64)
+ %ptr_elt_3:_(p0) = G_PTR_ADD %ptr, %cst_3(s64)
+
+ %byte0_byte1:_(s64) = G_ZEXTLOAD %ptr(p0) :: (load 2)
+
+ %elt1:_(s64) = G_ZEXTLOAD %ptr_elt_1(p0) :: (load 2)
+ %elt3:_(s64) = G_ZEXTLOAD %ptr_elt_3(p0) :: (load 2)
+
+ %byte2_byte3:_(s64) = nuw G_SHL %elt1, %cst_16(s64)
+ %byte6_byte7:_(s64) = nuw G_SHL %elt3, %cst_48(s64)
+
+ %or1:_(s64) = G_OR %byte0_byte1, %byte2_byte3
+ %full_load:_(s64) = G_OR %or1, %byte6_byte7
+
+ $x1 = COPY %full_load(s64)
+ RET_ReallyLR implicit $x1
+
+...
+---
+name: dont_combine_different_addr_spaces
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $x0, $x1
+
+ ; When the loads are from different address spaces, don't combine.
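+ ;
+ ; Illustrative sketch only (names invented; the second load below is in
+ ; addrspace 1 while the first is in addrspace 0):
+ ;
+ ; s16 *x = ...
+ ; s32 y = x[0] | (x[1] << 16)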
+
+ ; LITTLE-LABEL: name: dont_combine_different_addr_spaces
+ ; LITTLE: liveins: $x0, $x1
+ ; LITTLE: %cst_1:_(s64) = G_CONSTANT i64 1
+ ; LITTLE: %cst_16:_(s32) = G_CONSTANT i32 16
+ ; LITTLE: %ptr:_(p0) = COPY $x1
+ ; LITTLE: %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s64)
+ ; LITTLE: %low_half:_(s32) = G_ZEXTLOAD %ptr(p0) :: (load 2)
+ ; LITTLE: %elt1:_(s32) = G_ZEXTLOAD %ptr_elt_1(p0) :: (load 2, addrspace 1)
+ ; LITTLE: %high_half:_(s32) = nuw G_SHL %elt1, %cst_16(s32)
+ ; LITTLE: %full_load:_(s32) = G_OR %low_half, %high_half
+ ; LITTLE: $w1 = COPY %full_load(s32)
+ ; LITTLE: RET_ReallyLR implicit $w1
+ ; BIG-LABEL: name: dont_combine_different_addr_spaces
+ ; BIG: liveins: $x0, $x1
+ ; BIG: %cst_1:_(s64) = G_CONSTANT i64 1
+ ; BIG: %cst_16:_(s32) = G_CONSTANT i32 16
+ ; BIG: %ptr:_(p0) = COPY $x1
+ ; BIG: %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s64)
+ ; BIG: %low_half:_(s32) = G_ZEXTLOAD %ptr(p0) :: (load 2)
+ ; BIG: %elt1:_(s32) = G_ZEXTLOAD %ptr_elt_1(p0) :: (load 2, addrspace 1)
+ ; BIG: %high_half:_(s32) = nuw G_SHL %elt1, %cst_16(s32)
+ ; BIG: %full_load:_(s32) = G_OR %low_half, %high_half
+ ; BIG: $w1 = COPY %full_load(s32)
+ ; BIG: RET_ReallyLR implicit $w1
+ %cst_1:_(s64) = G_CONSTANT i64 1
+ %cst_16:_(s32) = G_CONSTANT i32 16
+
+ %ptr:_(p0) = COPY $x1
+ %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s64)
+
+ %low_half:_(s32) = G_ZEXTLOAD %ptr(p0) :: (load 2, addrspace 0)
+ %elt1:_(s32) = G_ZEXTLOAD %ptr_elt_1(p0) :: (load 2, addrspace 1)
+ %high_half:_(s32) = nuw G_SHL %elt1, %cst_16(s32)
+
+ %full_load:_(s32) = G_OR %low_half, %high_half
+ $w1 = COPY %full_load(s32)
+ RET_ReallyLR implicit $w1
+
+...
+---
+name: dont_combine_duplicate_idx
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $x0, $x1
+
+ ; If two of the G_PTR_ADDs have the same index, then don't combine.
+ ;
+ ; sN *x = ...
+ ; sM y = (x[i] << A) | (x[i] << B) ...
+
+ ; LITTLE-LABEL: name: dont_combine_duplicate_idx
+ ; LITTLE: liveins: $x0, $x1
+ ; LITTLE: %cst_1:_(s32) = G_CONSTANT i32 1
+ ; LITTLE: %reused_idx:_(s32) = G_CONSTANT i32 2
+ ; LITTLE: %cst_8:_(s32) = G_CONSTANT i32 8
+ ; LITTLE: %cst_16:_(s32) = G_CONSTANT i32 16
+ ; LITTLE: %cst_24:_(s32) = G_CONSTANT i32 24
+ ; LITTLE: %ptr:_(p0) = COPY $x1
+ ; LITTLE: %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s32)
+ ; LITTLE: %uses_idx_2:_(p0) = G_PTR_ADD %ptr, %reused_idx(s32)
+ ; LITTLE: %also_uses_idx_2:_(p0) = G_PTR_ADD %ptr, %reused_idx(s32)
+ ; LITTLE: %byte0:_(s32) = G_ZEXTLOAD %ptr(p0) :: (load 1)
+ ; LITTLE: %elt1:_(s32) = G_ZEXTLOAD %ptr_elt_1(p0) :: (load 1)
+ ; LITTLE: %elt2:_(s32) = G_ZEXTLOAD %uses_idx_2(p0) :: (load 1)
+ ; LITTLE: %elt3:_(s32) = G_ZEXTLOAD %also_uses_idx_2(p0) :: (load 1)
+ ; LITTLE: %byte1:_(s32) = nuw G_SHL %elt1, %cst_8(s32)
+ ; LITTLE: %byte2:_(s32) = nuw G_SHL %elt2, %cst_16(s32)
+ ; LITTLE: %byte3:_(s32) = nuw G_SHL %elt3, %cst_24(s32)
+ ; LITTLE: %or1:_(s32) = G_OR %byte0, %byte1
+ ; LITTLE: %or2:_(s32) = G_OR %byte2, %byte3
+ ; LITTLE: %full_load:_(s32) = G_OR %or1, %or2
+ ; LITTLE: $w1 = COPY %full_load(s32)
+ ; LITTLE: RET_ReallyLR implicit $w1
+ ; BIG-LABEL: name: dont_combine_duplicate_idx
+ ; BIG: liveins: $x0, $x1
+ ; BIG: %cst_1:_(s32) = G_CONSTANT i32 1
+ ; BIG: %reused_idx:_(s32) = G_CONSTANT i32 2
+ ; BIG: %cst_8:_(s32) = G_CONSTANT i32 8
+ ; BIG: %cst_16:_(s32) = G_CONSTANT i32 16
+ ; BIG: %cst_24:_(s32) = G_CONSTANT i32 24
+ ; BIG: %ptr:_(p0) = COPY $x1
+ ; BIG: %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s32)
+ ; BIG: %uses_idx_2:_(p0) = G_PTR_ADD %ptr, %reused_idx(s32)
+ ; BIG: %also_uses_idx_2:_(p0) = G_PTR_ADD %ptr, %reused_idx(s32)
+ ; BIG: %byte0:_(s32) = G_ZEXTLOAD %ptr(p0) :: (load 1)
+ ; BIG: %elt1:_(s32) = G_ZEXTLOAD %ptr_elt_1(p0) :: (load 1)
+ ; BIG: %elt2:_(s32) = G_ZEXTLOAD %uses_idx_2(p0) :: (load 1)
+ ; BIG: %elt3:_(s32) = G_ZEXTLOAD %also_uses_idx_2(p0) :: (load 1)
+ ; BIG: %byte1:_(s32) = nuw G_SHL %elt1, %cst_8(s32)
+ ; BIG: %byte2:_(s32) = nuw G_SHL %elt2, %cst_16(s32)
+ ; BIG: %byte3:_(s32) = nuw G_SHL %elt3, %cst_24(s32)
+ ; BIG: %or1:_(s32) = G_OR %byte0, %byte1
+ ; BIG: %or2:_(s32) = G_OR %byte2, %byte3
+ ; BIG: %full_load:_(s32) = G_OR %or1, %or2
+ ; BIG: $w1 = COPY %full_load(s32)
+ ; BIG: RET_ReallyLR implicit $w1
+ %cst_1:_(s32) = G_CONSTANT i32 1
+ %reused_idx:_(s32) = G_CONSTANT i32 2
+
+ %cst_8:_(s32) = G_CONSTANT i32 8
+ %cst_16:_(s32) = G_CONSTANT i32 16
+ %cst_24:_(s32) = G_CONSTANT i32 24
+
+ %ptr:_(p0) = COPY $x1
+ %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s32)
+ %uses_idx_2:_(p0) = G_PTR_ADD %ptr, %reused_idx(s32)
+ %also_uses_idx_2:_(p0) = G_PTR_ADD %ptr, %reused_idx(s32)
+
+ %byte0:_(s32) = G_ZEXTLOAD %ptr(p0) :: (load 1)
+
+ %elt1:_(s32) = G_ZEXTLOAD %ptr_elt_1(p0) :: (load 1)
+ %elt2:_(s32) = G_ZEXTLOAD %uses_idx_2(p0) :: (load 1)
+ %elt3:_(s32) = G_ZEXTLOAD %also_uses_idx_2(p0) :: (load 1)
+
+ %byte1:_(s32) = nuw G_SHL %elt1, %cst_8(s32)
+ %byte2:_(s32) = nuw G_SHL %elt2, %cst_16(s32)
+ %byte3:_(s32) = nuw G_SHL %elt3, %cst_24(s32)
+
+ %or1:_(s32) = G_OR %byte0, %byte1
+ %or2:_(s32) = G_OR %byte2, %byte3
+ %full_load:_(s32) = G_OR %or1, %or2
+
+ $w1 = COPY %full_load(s32)
+ RET_ReallyLR implicit $w1
+...
+---
+name: dont_combine_duplicate_offset
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $x0, $x1
+
+ ; If two of the G_SHLs have the same constant, then we should not combine.
+ ;
+ ; sN *x = ...
+ ; sM y = (x[i] << A) | (x[i+1] << A) ...
+
+ ; LITTLE-LABEL: name: dont_combine_duplicate_offset
+ ; LITTLE: liveins: $x0, $x1
+ ; LITTLE: %cst_1:_(s32) = G_CONSTANT i32 1
+ ; LITTLE: %cst_2:_(s32) = G_CONSTANT i32 2
+ ; LITTLE: %cst_3:_(s32) = G_CONSTANT i32 3
+ ; LITTLE: %cst_8:_(s32) = G_CONSTANT i32 8
+ ; LITTLE: %duplicate_shl_cst:_(s32) = G_CONSTANT i32 16
+ ; LITTLE: %ptr:_(p0) = COPY $x1
+ ; LITTLE: %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s32)
+ ; LITTLE: %ptr_elt_2:_(p0) = G_PTR_ADD %ptr, %cst_2(s32)
+ ; LITTLE: %ptr_elt_3:_(p0) = G_PTR_ADD %ptr, %cst_3(s32)
+ ; LITTLE: %byte0:_(s32) = G_ZEXTLOAD %ptr(p0) :: (load 1)
+ ; LITTLE: %elt1:_(s32) = G_ZEXTLOAD %ptr_elt_1(p0) :: (load 1)
+ ; LITTLE: %elt2:_(s32) = G_ZEXTLOAD %ptr_elt_2(p0) :: (load 1)
+ ; LITTLE: %elt3:_(s32) = G_ZEXTLOAD %ptr_elt_3(p0) :: (load 1)
+ ; LITTLE: %byte1:_(s32) = nuw G_SHL %elt1, %cst_8(s32)
+ ; LITTLE: %duplicate_shl_1:_(s32) = nuw G_SHL %elt2, %duplicate_shl_cst(s32)
+ ; LITTLE: %duplicate_shl_2:_(s32) = nuw G_SHL %elt3, %duplicate_shl_cst(s32)
+ ; LITTLE: %or1:_(s32) = G_OR %byte0, %byte1
+ ; LITTLE: %or2:_(s32) = G_OR %duplicate_shl_1, %duplicate_shl_2
+ ; LITTLE: %full_load:_(s32) = G_OR %or1, %or2
+ ; LITTLE: $w1 = COPY %full_load(s32)
+ ; LITTLE: RET_ReallyLR implicit $w1
+ ; BIG-LABEL: name: dont_combine_duplicate_offset
+ ; BIG: liveins: $x0, $x1
+ ; BIG: %cst_1:_(s32) = G_CONSTANT i32 1
+ ; BIG: %cst_2:_(s32) = G_CONSTANT i32 2
+ ; BIG: %cst_3:_(s32) = G_CONSTANT i32 3
+ ; BIG: %cst_8:_(s32) = G_CONSTANT i32 8
+ ; BIG: %duplicate_shl_cst:_(s32) = G_CONSTANT i32 16
+ ; BIG: %ptr:_(p0) = COPY $x1
+ ; BIG: %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s32)
+ ; BIG: %ptr_elt_2:_(p0) = G_PTR_ADD %ptr, %cst_2(s32)
+ ; BIG: %ptr_elt_3:_(p0) = G_PTR_ADD %ptr, %cst_3(s32)
+ ; BIG: %byte0:_(s32) = G_ZEXTLOAD %ptr(p0) :: (load 1)
+ ; BIG: %elt1:_(s32) = G_ZEXTLOAD %ptr_elt_1(p0) :: (load 1)
+ ; BIG: %elt2:_(s32) = G_ZEXTLOAD %ptr_elt_2(p0) :: (load 1)
+ ; BIG: %elt3:_(s32) = G_ZEXTLOAD %ptr_elt_3(p0) :: (load 1)
+ ; BIG: %byte1:_(s32) = nuw G_SHL %elt1, %cst_8(s32)
+ ; BIG: %duplicate_shl_1:_(s32) = nuw G_SHL %elt2, %duplicate_shl_cst(s32)
+ ; BIG: %duplicate_shl_2:_(s32) = nuw G_SHL %elt3, %duplicate_shl_cst(s32)
+ ; BIG: %or1:_(s32) = G_OR %byte0, %byte1
+ ; BIG: %or2:_(s32) = G_OR %duplicate_shl_1, %duplicate_shl_2
+ ; BIG: %full_load:_(s32) = G_OR %or1, %or2
+ ; BIG: $w1 = COPY %full_load(s32)
+ ; BIG: RET_ReallyLR implicit $w1
+ %cst_1:_(s32) = G_CONSTANT i32 1
+ %cst_2:_(s32) = G_CONSTANT i32 2
+ %cst_3:_(s32) = G_CONSTANT i32 3
+
+ %cst_8:_(s32) = G_CONSTANT i32 8
+ %duplicate_shl_cst:_(s32) = G_CONSTANT i32 16
+
+ %ptr:_(p0) = COPY $x1
+ %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s32)
+ %ptr_elt_2:_(p0) = G_PTR_ADD %ptr, %cst_2(s32)
+ %ptr_elt_3:_(p0) = G_PTR_ADD %ptr, %cst_3(s32)
+
+ %byte0:_(s32) = G_ZEXTLOAD %ptr(p0) :: (load 1)
+
+ %elt1:_(s32) = G_ZEXTLOAD %ptr_elt_1(p0) :: (load 1)
+ %elt2:_(s32) = G_ZEXTLOAD %ptr_elt_2(p0) :: (load 1)
+ %elt3:_(s32) = G_ZEXTLOAD %ptr_elt_3(p0) :: (load 1)
+
+ %byte1:_(s32) = nuw G_SHL %elt1, %cst_8(s32)
+ %duplicate_shl_1:_(s32) = nuw G_SHL %elt2, %duplicate_shl_cst(s32)
+ %duplicate_shl_2:_(s32) = nuw G_SHL %elt3, %duplicate_shl_cst(s32)
+
+ %or1:_(s32) = G_OR %byte0, %byte1
+ %or2:_(s32) = G_OR %duplicate_shl_1, %duplicate_shl_2
+ %full_load:_(s32) = G_OR %or1, %or2
+
+ $w1 = COPY %full_load(s32)
+ RET_ReallyLR implicit $w1
+
+...
+---
+name: dont_combine_lowest_index_not_zero_offset
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $x0, $x1
+
+ ; In this case, the lowest index load (e.g. x[0]) does not end up at byte
+ ; offset 0. We shouldn't combine.
+ ;
+ ; s8 *x = ...
+ ; s32 y = (x[0] << 8) | (x[1]) | (x[2] << 16) ...
+
+ ; LITTLE-LABEL: name: dont_combine_lowest_index_not_zero_offset
+ ; LITTLE: liveins: $x0, $x1
+ ; LITTLE: %cst_1:_(s32) = G_CONSTANT i32 1
+ ; LITTLE: %cst_2:_(s32) = G_CONSTANT i32 2
+ ; LITTLE: %cst_3:_(s32) = G_CONSTANT i32 3
+ ; LITTLE: %cst_8:_(s32) = G_CONSTANT i32 8
+ ; LITTLE: %cst_16:_(s32) = G_CONSTANT i32 16
+ ; LITTLE: %cst_24:_(s32) = G_CONSTANT i32 24
+ ; LITTLE: %ptr:_(p0) = COPY $x1
+ ; LITTLE: %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s32)
+ ; LITTLE: %ptr_elt_2:_(p0) = G_PTR_ADD %ptr, %cst_2(s32)
+ ; LITTLE: %ptr_elt_3:_(p0) = G_PTR_ADD %ptr, %cst_3(s32)
+ ; LITTLE: %lowest_idx_load:_(s32) = G_ZEXTLOAD %ptr(p0) :: (load 1)
+ ; LITTLE: %byte0:_(s32) = G_ZEXTLOAD %ptr_elt_1(p0) :: (load 1)
+ ; LITTLE: %elt2:_(s32) = G_ZEXTLOAD %ptr_elt_2(p0) :: (load 1)
+ ; LITTLE: %elt3:_(s32) = G_ZEXTLOAD %ptr_elt_3(p0) :: (load 1)
+ ; LITTLE: %byte1:_(s32) = nuw G_SHL %lowest_idx_load, %cst_8(s32)
+ ; LITTLE: %byte2:_(s32) = nuw G_SHL %elt2, %cst_16(s32)
+ ; LITTLE: %byte3:_(s32) = nuw G_SHL %elt3, %cst_24(s32)
+ ; LITTLE: %or1:_(s32) = G_OR %byte0, %byte1
+ ; LITTLE: %or2:_(s32) = G_OR %byte2, %byte3
+ ; LITTLE: %full_load:_(s32) = G_OR %or1, %or2
+ ; LITTLE: $w1 = COPY %full_load(s32)
+ ; LITTLE: RET_ReallyLR implicit $w1
+ ; BIG-LABEL: name: dont_combine_lowest_index_not_zero_offset
+ ; BIG: liveins: $x0, $x1
+ ; BIG: %cst_1:_(s32) = G_CONSTANT i32 1
+ ; BIG: %cst_2:_(s32) = G_CONSTANT i32 2
+ ; BIG: %cst_3:_(s32) = G_CONSTANT i32 3
+ ; BIG: %cst_8:_(s32) = G_CONSTANT i32 8
+ ; BIG: %cst_16:_(s32) = G_CONSTANT i32 16
+ ; BIG: %cst_24:_(s32) = G_CONSTANT i32 24
+ ; BIG: %ptr:_(p0) = COPY $x1
+ ; BIG: %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s32)
+ ; BIG: %ptr_elt_2:_(p0) = G_PTR_ADD %ptr, %cst_2(s32)
+ ; BIG: %ptr_elt_3:_(p0) = G_PTR_ADD %ptr, %cst_3(s32)
+ ; BIG: %lowest_idx_load:_(s32) = G_ZEXTLOAD %ptr(p0) :: (load 1)
+ ; BIG: %byte0:_(s32) = G_ZEXTLOAD %ptr_elt_1(p0) :: (load 1)
+ ; BIG: %elt2:_(s32) = G_ZEXTLOAD %ptr_elt_2(p0) :: (load 1)
+ ; BIG: %elt3:_(s32) = G_ZEXTLOAD %ptr_elt_3(p0) :: (load 1)
+ ; BIG: %byte1:_(s32) = nuw G_SHL %lowest_idx_load, %cst_8(s32)
+ ; BIG: %byte2:_(s32) = nuw G_SHL %elt2, %cst_16(s32)
+ ; BIG: %byte3:_(s32) = nuw G_SHL %elt3, %cst_24(s32)
+ ; BIG: %or1:_(s32) = G_OR %byte0, %byte1
+ ; BIG: %or2:_(s32) = G_OR %byte2, %byte3
+ ; BIG: %full_load:_(s32) = G_OR %or1, %or2
+ ; BIG: $w1 = COPY %full_load(s32)
+ ; BIG: RET_ReallyLR implicit $w1
+ %cst_1:_(s32) = G_CONSTANT i32 1
+ %cst_2:_(s32) = G_CONSTANT i32 2
+ %cst_3:_(s32) = G_CONSTANT i32 3
+
+ %cst_8:_(s32) = G_CONSTANT i32 8
+ %cst_16:_(s32) = G_CONSTANT i32 16
+ %cst_24:_(s32) = G_CONSTANT i32 24
+
+ %ptr:_(p0) = COPY $x1
+ %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s32)
+ %ptr_elt_2:_(p0) = G_PTR_ADD %ptr, %cst_2(s32)
+ %ptr_elt_3:_(p0) = G_PTR_ADD %ptr, %cst_3(s32)
+
+ ; This load is index 0
+ %lowest_idx_load:_(s32) = G_ZEXTLOAD %ptr(p0) :: (load 1)
+ %byte0:_(s32) = G_ZEXTLOAD %ptr_elt_1(p0) :: (load 1)
+ %elt2:_(s32) = G_ZEXTLOAD %ptr_elt_2(p0) :: (load 1)
+ %elt3:_(s32) = G_ZEXTLOAD %ptr_elt_3(p0) :: (load 1)
+
+ ; ... But it ends up being shifted, so we shouldn't combine.
+ %byte1:_(s32) = nuw G_SHL %lowest_idx_load, %cst_8(s32)
+ %byte2:_(s32) = nuw G_SHL %elt2, %cst_16(s32)
+ %byte3:_(s32) = nuw G_SHL %elt3, %cst_24(s32)
+
+ %or1:_(s32) = G_OR %byte0, %byte1
+ %or2:_(s32) = G_OR %byte2, %byte3
+ %full_load:_(s32) = G_OR %or1, %or2
+
+ $w1 = COPY %full_load(s32)
+ RET_ReallyLR implicit $w1
+
+...
+---
+name: dont_combine_more_than_one_use_load
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $x0, $x1
+
+ ; If any load is used more than once, don't combine. We want to remove the
+ ; entire tree.
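+ ;
+ ; Illustrative sketch only (names invented; the AND reuses the first load):
+ ;
+ ; s16 *x = ...
+ ; s32 wide = x[0] | (x[1] << 16)
+ ; s32 y = wide & x[0]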
+
+ ; LITTLE-LABEL: name: dont_combine_more_than_one_use_load
+ ; LITTLE: liveins: $x0, $x1
+ ; LITTLE: %cst_1:_(s64) = G_CONSTANT i64 1
+ ; LITTLE: %cst_16:_(s32) = G_CONSTANT i32 16
+ ; LITTLE: %ptr:_(p0) = COPY $x1
+ ; LITTLE: %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s64)
+ ; LITTLE: %low_half:_(s32) = G_ZEXTLOAD %ptr(p0) :: (load 2)
+ ; LITTLE: %elt1:_(s32) = G_ZEXTLOAD %ptr_elt_1(p0) :: (load 2)
+ ; LITTLE: %high_half:_(s32) = nuw G_SHL %elt1, %cst_16(s32)
+ ; LITTLE: %full_load:_(s32) = G_OR %low_half, %high_half
+ ; LITTLE: %extra_use:_(s32) = G_AND %full_load, %low_half
+ ; LITTLE: $w1 = COPY %extra_use(s32)
+ ; LITTLE: RET_ReallyLR implicit $w1
+ ; BIG-LABEL: name: dont_combine_more_than_one_use_load
+ ; BIG: liveins: $x0, $x1
+ ; BIG: %cst_1:_(s64) = G_CONSTANT i64 1
+ ; BIG: %cst_16:_(s32) = G_CONSTANT i32 16
+ ; BIG: %ptr:_(p0) = COPY $x1
+ ; BIG: %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s64)
+ ; BIG: %low_half:_(s32) = G_ZEXTLOAD %ptr(p0) :: (load 2)
+ ; BIG: %elt1:_(s32) = G_ZEXTLOAD %ptr_elt_1(p0) :: (load 2)
+ ; BIG: %high_half:_(s32) = nuw G_SHL %elt1, %cst_16(s32)
+ ; BIG: %full_load:_(s32) = G_OR %low_half, %high_half
+ ; BIG: %extra_use:_(s32) = G_AND %full_load, %low_half
+ ; BIG: $w1 = COPY %extra_use(s32)
+ ; BIG: RET_ReallyLR implicit $w1
+ %cst_1:_(s64) = G_CONSTANT i64 1
+ %cst_16:_(s32) = G_CONSTANT i32 16
+
+ %ptr:_(p0) = COPY $x1
+ %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s64)
+
+ %low_half:_(s32) = G_ZEXTLOAD %ptr(p0) :: (load 2)
+ %elt1:_(s32) = G_ZEXTLOAD %ptr_elt_1(p0) :: (load 2)
+ %high_half:_(s32) = nuw G_SHL %elt1, %cst_16(s32)
+
+ %full_load:_(s32) = G_OR %low_half, %high_half
+ %extra_use:_(s32) = G_AND %full_load, %low_half
+ $w1 = COPY %extra_use(s32)
+ RET_ReallyLR implicit $w1
+
+...
+---
+name: dont_combine_more_than_one_use_shl
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $x0, $x1
+
+ ; If anything feeding into any of the ors is used more than once, don't
+ ; combine.
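+ ;
+ ; Illustrative sketch only (names invented; the AND reuses the shifted
+ ; value):
+ ;
+ ; s16 *x = ...
+ ; s32 hi = x[1] << 16
+ ; s32 wide = x[0] | hi
+ ; s32 y = wide & hi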
+
+ ; LITTLE-LABEL: name: dont_combine_more_than_one_use_shl
+ ; LITTLE: liveins: $x0, $x1
+ ; LITTLE: %cst_1:_(s64) = G_CONSTANT i64 1
+ ; LITTLE: %cst_16:_(s32) = G_CONSTANT i32 16
+ ; LITTLE: %ptr:_(p0) = COPY $x1
+ ; LITTLE: %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s64)
+ ; LITTLE: %low_half:_(s32) = G_ZEXTLOAD %ptr(p0) :: (load 2)
+ ; LITTLE: %elt1:_(s32) = G_ZEXTLOAD %ptr_elt_1(p0) :: (load 2)
+ ; LITTLE: %high_half:_(s32) = nuw G_SHL %elt1, %cst_16(s32)
+ ; LITTLE: %full_load:_(s32) = G_OR %low_half, %high_half
+ ; LITTLE: %extra_use:_(s32) = G_AND %full_load, %high_half
+ ; LITTLE: $w1 = COPY %extra_use(s32)
+ ; LITTLE: RET_ReallyLR implicit $w1
+ ; BIG-LABEL: name: dont_combine_more_than_one_use_shl
+ ; BIG: liveins: $x0, $x1
+ ; BIG: %cst_1:_(s64) = G_CONSTANT i64 1
+ ; BIG: %cst_16:_(s32) = G_CONSTANT i32 16
+ ; BIG: %ptr:_(p0) = COPY $x1
+ ; BIG: %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s64)
+ ; BIG: %low_half:_(s32) = G_ZEXTLOAD %ptr(p0) :: (load 2)
+ ; BIG: %elt1:_(s32) = G_ZEXTLOAD %ptr_elt_1(p0) :: (load 2)
+ ; BIG: %high_half:_(s32) = nuw G_SHL %elt1, %cst_16(s32)
+ ; BIG: %full_load:_(s32) = G_OR %low_half, %high_half
+ ; BIG: %extra_use:_(s32) = G_AND %full_load, %high_half
+ ; BIG: $w1 = COPY %extra_use(s32)
+ ; BIG: RET_ReallyLR implicit $w1
+ %cst_1:_(s64) = G_CONSTANT i64 1
+ %cst_16:_(s32) = G_CONSTANT i32 16
+
+ %ptr:_(p0) = COPY $x1
+ %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s64)
+
+ %low_half:_(s32) = G_ZEXTLOAD %ptr(p0) :: (load 2)
+ %elt1:_(s32) = G_ZEXTLOAD %ptr_elt_1(p0) :: (load 2)
+ %high_half:_(s32) = nuw G_SHL %elt1, %cst_16(s32)
+
+ %full_load:_(s32) = G_OR %low_half, %high_half
+ %extra_use:_(s32) = G_AND %full_load, %high_half
+ $w1 = COPY %extra_use(s32)
+ RET_ReallyLR implicit $w1
+
+...
+---
+name: dont_combine_store_between_same_mbb
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $x0, $x1
+ ; If there is a store between any of the loads, then do not combine.
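+ ;
+ ; Illustrative sketch only (names invented; the store may alias the loads):
+ ;
+ ; s16 *x = ...
+ ; s32 lo = x[0]
+ ; *other = 12
+ ; s32 y = lo | (x[1] << 16)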
+
+ ; LITTLE-LABEL: name: dont_combine_store_between_same_mbb
+ ; LITTLE: liveins: $x0, $x1
+ ; LITTLE: %cst_1:_(s64) = G_CONSTANT i64 1
+ ; LITTLE: %cst_16:_(s32) = G_CONSTANT i32 16
+ ; LITTLE: %ptr:_(p0) = COPY $x1
+ ; LITTLE: %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s64)
+ ; LITTLE: %low_half:_(s32) = G_ZEXTLOAD %ptr(p0) :: (load 2)
+ ; LITTLE: %other_ptr:_(p0) = COPY $x1
+ ; LITTLE: %some_val:_(s32) = G_CONSTANT i32 12
+ ; LITTLE: G_STORE %some_val(s32), %other_ptr(p0) :: (store 2)
+ ; LITTLE: %elt1:_(s32) = G_ZEXTLOAD %ptr_elt_1(p0) :: (load 2)
+ ; LITTLE: %high_half:_(s32) = nuw G_SHL %elt1, %cst_16(s32)
+ ; LITTLE: %full_load:_(s32) = G_OR %low_half, %high_half
+ ; LITTLE: $w1 = COPY %full_load(s32)
+ ; LITTLE: RET_ReallyLR implicit $w1
+ ; BIG-LABEL: name: dont_combine_store_between_same_mbb
+ ; BIG: liveins: $x0, $x1
+ ; BIG: %cst_1:_(s64) = G_CONSTANT i64 1
+ ; BIG: %cst_16:_(s32) = G_CONSTANT i32 16
+ ; BIG: %ptr:_(p0) = COPY $x1
+ ; BIG: %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s64)
+ ; BIG: %low_half:_(s32) = G_ZEXTLOAD %ptr(p0) :: (load 2)
+ ; BIG: %other_ptr:_(p0) = COPY $x1
+ ; BIG: %some_val:_(s32) = G_CONSTANT i32 12
+ ; BIG: G_STORE %some_val(s32), %other_ptr(p0) :: (store 2)
+ ; BIG: %elt1:_(s32) = G_ZEXTLOAD %ptr_elt_1(p0) :: (load 2)
+ ; BIG: %high_half:_(s32) = nuw G_SHL %elt1, %cst_16(s32)
+ ; BIG: %full_load:_(s32) = G_OR %low_half, %high_half
+ ; BIG: $w1 = COPY %full_load(s32)
+ ; BIG: RET_ReallyLR implicit $w1
+ %cst_1:_(s64) = G_CONSTANT i64 1
+ %cst_16:_(s32) = G_CONSTANT i32 16
+
+ %ptr:_(p0) = COPY $x1
+ %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s64)
+ %low_half:_(s32) = G_ZEXTLOAD %ptr(p0) :: (load 2)
+
+ ; Memory could be modified here, so don't combine!
+ %other_ptr:_(p0) = COPY $x1
+ %some_val:_(s32) = G_CONSTANT i32 12
+ G_STORE %some_val, %other_ptr :: (store 2)
+
+ %elt1:_(s32) = G_ZEXTLOAD %ptr_elt_1(p0) :: (load 2)
+ %high_half:_(s32) = nuw G_SHL %elt1, %cst_16(s32)
+
+ %full_load:_(s32) = G_OR %low_half, %high_half
+ $w1 = COPY %full_load(s32)
+ RET_ReallyLR implicit $w1
+
+...
+---
+name: dont_combine_store_between_different_mbb
+tracksRegLiveness: true
+body: |
+ ; There is a store between the two loads, hidden away in a different MBB.
+ ; We should not combine here.
+
+ ; LITTLE-LABEL: name: dont_combine_store_between_different_mbb
+ ; LITTLE: bb.0:
+ ; LITTLE: successors: %bb.1(0x80000000)
+ ; LITTLE: liveins: $x0, $x1
+ ; LITTLE: %cst_1:_(s64) = G_CONSTANT i64 1
+ ; LITTLE: %cst_16:_(s32) = G_CONSTANT i32 16
+ ; LITTLE: %ptr:_(p0) = COPY $x1
+ ; LITTLE: %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s64)
+ ; LITTLE: %low_half:_(s32) = G_ZEXTLOAD %ptr(p0) :: (load 2)
+ ; LITTLE: bb.1:
+ ; LITTLE: successors: %bb.2(0x80000000)
+ ; LITTLE: liveins: $x0, $x1
+ ; LITTLE: %other_ptr:_(p0) = COPY $x1
+ ; LITTLE: %some_val:_(s32) = G_CONSTANT i32 12
+ ; LITTLE: G_STORE %some_val(s32), %other_ptr(p0) :: (store 2)
+ ; LITTLE: bb.2:
+ ; LITTLE: liveins: $x0, $x1
+ ; LITTLE: %elt1:_(s32) = G_ZEXTLOAD %ptr_elt_1(p0) :: (load 2)
+ ; LITTLE: %high_half:_(s32) = nuw G_SHL %elt1, %cst_16(s32)
+ ; LITTLE: %full_load:_(s32) = G_OR %low_half, %high_half
+ ; LITTLE: $w1 = COPY %full_load(s32)
+ ; LITTLE: RET_ReallyLR implicit $w1
+ ; BIG-LABEL: name: dont_combine_store_between_different_mbb
+ ; BIG: bb.0:
+ ; BIG: successors: %bb.1(0x80000000)
+ ; BIG: liveins: $x0, $x1
+ ; BIG: %cst_1:_(s64) = G_CONSTANT i64 1
+ ; BIG: %cst_16:_(s32) = G_CONSTANT i32 16
+ ; BIG: %ptr:_(p0) = COPY $x1
+ ; BIG: %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s64)
+ ; BIG: %low_half:_(s32) = G_ZEXTLOAD %ptr(p0) :: (load 2)
+ ; BIG: bb.1:
+ ; BIG: successors: %bb.2(0x80000000)
+ ; BIG: liveins: $x0, $x1
+ ; BIG: %other_ptr:_(p0) = COPY $x1
+ ; BIG: %some_val:_(s32) = G_CONSTANT i32 12
+ ; BIG: G_STORE %some_val(s32), %other_ptr(p0) :: (store 2)
+ ; BIG: bb.2:
+ ; BIG: liveins: $x0, $x1
+ ; BIG: %elt1:_(s32) = G_ZEXTLOAD %ptr_elt_1(p0) :: (load 2)
+ ; BIG: %high_half:_(s32) = nuw G_SHL %elt1, %cst_16(s32)
+ ; BIG: %full_load:_(s32) = G_OR %low_half, %high_half
+ ; BIG: $w1 = COPY %full_load(s32)
+ ; BIG: RET_ReallyLR implicit $w1
+
+ bb.0:
+ successors: %bb.1(0x80000000)
+ liveins: $x0, $x1
+ ; If there is a store between any of the loads, then do not combine.
+
+ %cst_1:_(s64) = G_CONSTANT i64 1
+ %cst_16:_(s32) = G_CONSTANT i32 16
+
+ %ptr:_(p0) = COPY $x1
+ %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s64)
+ %low_half:_(s32) = G_ZEXTLOAD %ptr(p0) :: (load 2)
+
+ bb.1:
+ liveins: $x0, $x1
+ successors: %bb.2(0x80000000)
+ ; Memory could be modified here, so don't combine!
+ %other_ptr:_(p0) = COPY $x1
+ %some_val:_(s32) = G_CONSTANT i32 12
+ G_STORE %some_val, %other_ptr :: (store 2)
+
+ bb.2:
+ liveins: $x0, $x1
+ %elt1:_(s32) = G_ZEXTLOAD %ptr_elt_1(p0) :: (load 2)
+ %high_half:_(s32) = nuw G_SHL %elt1, %cst_16(s32)
+
+ %full_load:_(s32) = G_OR %low_half, %high_half
+ $w1 = COPY %full_load(s32)
+ RET_ReallyLR implicit $w1
+
+...
+---
+name: different_mbb
+tracksRegLiveness: true
+body: |
+ ; It should be possible to combine here, but it's not supported right now.
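+ ; Both loads feed a single G_OR, but they are split across bb.0 and bb.1
+ ; with nothing clobbering memory in between.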
+
+ ; LITTLE-LABEL: name: different_mbb
+ ; LITTLE: bb.0:
+ ; LITTLE: successors: %bb.1(0x80000000)
+ ; LITTLE: liveins: $x0, $x1
+ ; LITTLE: %cst_1:_(s64) = G_CONSTANT i64 1
+ ; LITTLE: %cst_16:_(s32) = G_CONSTANT i32 16
+ ; LITTLE: %ptr:_(p0) = COPY $x1
+ ; LITTLE: %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s64)
+ ; LITTLE: %low_half:_(s32) = G_ZEXTLOAD %ptr(p0) :: (load 2)
+ ; LITTLE: bb.1:
+ ; LITTLE: liveins: $x0, $x1
+ ; LITTLE: %elt1:_(s32) = G_ZEXTLOAD %ptr_elt_1(p0) :: (load 2)
+ ; LITTLE: %high_half:_(s32) = nuw G_SHL %elt1, %cst_16(s32)
+ ; LITTLE: %full_load:_(s32) = G_OR %low_half, %high_half
+ ; LITTLE: $w1 = COPY %full_load(s32)
+ ; LITTLE: RET_ReallyLR implicit $w1
+ ; BIG-LABEL: name: different_mbb
+ ; BIG: bb.0:
+ ; BIG: successors: %bb.1(0x80000000)
+ ; BIG: liveins: $x0, $x1
+ ; BIG: %cst_1:_(s64) = G_CONSTANT i64 1
+ ; BIG: %cst_16:_(s32) = G_CONSTANT i32 16
+ ; BIG: %ptr:_(p0) = COPY $x1
+ ; BIG: %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s64)
+ ; BIG: %low_half:_(s32) = G_ZEXTLOAD %ptr(p0) :: (load 2)
+ ; BIG: bb.1:
+ ; BIG: liveins: $x0, $x1
+ ; BIG: %elt1:_(s32) = G_ZEXTLOAD %ptr_elt_1(p0) :: (load 2)
+ ; BIG: %high_half:_(s32) = nuw G_SHL %elt1, %cst_16(s32)
+ ; BIG: %full_load:_(s32) = G_OR %low_half, %high_half
+ ; BIG: $w1 = COPY %full_load(s32)
+ ; BIG: RET_ReallyLR implicit $w1
+
+ bb.0:
+ successors: %bb.1(0x80000000)
+ liveins: $x0, $x1
+
+ %cst_1:_(s64) = G_CONSTANT i64 1
+ %cst_16:_(s32) = G_CONSTANT i32 16
+
+ %ptr:_(p0) = COPY $x1
+ %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s64)
+ %low_half:_(s32) = G_ZEXTLOAD %ptr(p0) :: (load 2)
+
+ bb.1:
+ liveins: $x0, $x1
+ %elt1:_(s32) = G_ZEXTLOAD %ptr_elt_1(p0) :: (load 2)
+ %high_half:_(s32) = nuw G_SHL %elt1, %cst_16(s32)
+
+ %full_load:_(s32) = G_OR %low_half, %high_half
+ $w1 = COPY %full_load(s32)
+ RET_ReallyLR implicit $w1
+
+...
+---
+name: load_first
+tracksRegLiveness: true
+body: |
+ bb.0:
+ liveins: $x0, $x1
+ ; Test for a bug fix in the predecessor-checking code.
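+ ; Note that %low_half is the very first instruction in the block, and it
+ ; appears before the definition of %ptr.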
+
+ ; LITTLE-LABEL: name: load_first
+ ; LITTLE: liveins: $x0, $x1
+ ; LITTLE: %ptr:_(p0) = COPY $x1
+ ; LITTLE: %full_load:_(s32) = G_LOAD %ptr(p0) :: (load 4, align 2)
+ ; LITTLE: $w1 = COPY %full_load(s32)
+ ; LITTLE: RET_ReallyLR implicit $w1
+ ; BIG-LABEL: name: load_first
+ ; BIG: liveins: $x0, $x1
+ ; BIG: %ptr:_(p0) = COPY $x1
+ ; BIG: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD %ptr(p0) :: (load 4, align 2)
+ ; BIG: %full_load:_(s32) = G_BSWAP [[LOAD]]
+ ; BIG: $w1 = COPY %full_load(s32)
+ ; BIG: RET_ReallyLR implicit $w1
+ %low_half:_(s32) = G_ZEXTLOAD %ptr(p0) :: (load 2)
+ %cst_1:_(s64) = G_CONSTANT i64 1
+ %cst_16:_(s32) = G_CONSTANT i32 16
+
+ %ptr:_(p0) = COPY $x1
+ %ptr_elt_1:_(p0) = G_PTR_ADD %ptr, %cst_1(s64)
+
+ %elt1:_(s32) = G_ZEXTLOAD %ptr_elt_1(p0) :: (load 2)
+ %high_half:_(s32) = nuw G_SHL %elt1, %cst_16(s32)
+
+ %full_load:_(s32) = G_OR %low_half, %high_half
+ $w1 = COPY %full_load(s32)
+ RET_ReallyLR implicit $w1