[llvm] r289538 - [DAGCombiner] Match load by bytes idiom and fold it into a single load

Artur Pilipenko via llvm-commits llvm-commits at lists.llvm.org
Tue Dec 13 06:21:15 PST 2016


Author: apilipenko
Date: Tue Dec 13 08:21:14 2016
New Revision: 289538

URL: http://llvm.org/viewvc/llvm-project?rev=289538&view=rev
Log:
[DAGCombiner] Match load by bytes idiom and fold it into a single load

Match a pattern where a wide scalar value is loaded by several narrow loads and combined by shifts and ors. Fold it into a single load, or a load plus a bswap if the target supports it.

Assuming a little endian target:
  i8 *a = ...
  i32 val = a[0] | (a[1] << 8) | (a[2] << 16) | (a[3] << 24)
=>
  i32 val = *((i32)a)

  i8 *a = ...
  i32 val = (a[0] << 24) | (a[1] << 16) | (a[2] << 8) | a[3]
=>
  i32 val = BSWAP(*((i32)a))
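
On a big endian target the two cases are mirrored: the second pattern folds to a plain wide load, and the first one is the case that needs the BSWAP.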

This optimization was discussed on llvm-dev some time ago in the "Load combine pass" thread. We came to the conclusion that we want to do this transformation late in the pipeline because, in the presence of atomic loads, load widening is an irreversible transformation, and performing it early might hinder other optimizations.

Eventually we'd like to support folding patterns like this where the offset has a variable and a constant part:
  i32 val = a[i] | (a[i + 1] << 8) | (a[i + 2] << 16) | (a[i + 3] << 24)

Matching the pattern above is easier at the SelectionDAG level, since address reassociation has already happened and the fact that the loads are adjacent is apparent. Establishing that these loads are adjacent at the IR level would have required looking through geps/zexts/adds while analyzing the addresses.
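
To illustrate, at the IR level the load of a[i + 1] looks roughly like this (the value names are only for illustration):
  %off  = add i32 %i, 1
  %addr = getelementptr inbounds i8, i8* %a, i32 %off
  %byte = load i8, i8* %addr
and relating it to the load of a[i] means reasoning through the gep and the add, whereas in the DAG both addresses have already been reassociated so that BaseIndexOffset sees the same base and index with the constant offsets 0 and 1.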

The general scheme is to match OR expressions by recursively calculating the origin of the individual bytes which constitute the resulting OR value. If all the bytes of the OR come from memory, verify that they are adjacent and match either the little or the big endian encoding of a wider value. If so, and if a load of the wider type (and a bswap, if one is needed) is allowed by the target, generate the load followed by the bswap if needed.
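
To illustrate, for the first little endian example above the byte providers are computed roughly as follows (byte 0 listed first):
  zext(load a[0])        -> [a[0], 0,    0,    0   ]
  zext(load a[1]) << 8   -> [0,    a[1], 0,    0   ]
  or of the two          -> [a[0], a[1], 0,    0   ]
  ...
  the full OR            -> [a[0], a[1], a[2], a[3]]
All four bytes come from adjacent memory locations and appear in little endian order, so the expression folds to a single i32 load; had they matched the big endian order instead, a BSWAP of the wide load would be emitted (when it is legal for the target).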

Reviewed By: hfinkel, RKSimon, filcab

Differential Revision: https://reviews.llvm.org/D26149

Added:
    llvm/trunk/test/CodeGen/ARM/load-combine-big-endian.ll
    llvm/trunk/test/CodeGen/ARM/load-combine.ll
    llvm/trunk/test/CodeGen/X86/load-combine.ll
Modified:
    llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp

Modified: llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp?rev=289538&r1=289537&r2=289538&view=diff
==============================================================================
--- llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp (original)
+++ llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp Tue Dec 13 08:21:14 2016
@@ -20,6 +20,7 @@
 #include "llvm/ADT/SetVector.h"
 #include "llvm/ADT/SmallBitVector.h"
 #include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
@@ -375,6 +376,7 @@ namespace {
                               unsigned PosOpcode, unsigned NegOpcode,
                               const SDLoc &DL);
     SDNode *MatchRotate(SDValue LHS, SDValue RHS, const SDLoc &DL);
+    SDValue MatchLoadCombine(SDNode *N);
     SDValue ReduceLoadWidth(SDNode *N);
     SDValue ReduceLoadOpStoreWidth(SDNode *N);
     SDValue splitMergedValStore(StoreSDNode *ST);
@@ -3969,6 +3971,9 @@ SDValue DAGCombiner::visitOR(SDNode *N)
   if (SDNode *Rot = MatchRotate(N0, N1, SDLoc(N)))
     return SDValue(Rot, 0);
 
+  if (SDValue Load = MatchLoadCombine(N))
+    return Load;
+
   // Simplify the operands using demanded-bits information.
   if (!VT.isVector() &&
       SimplifyDemandedBits(SDValue(N, 0)))
@@ -4340,6 +4345,277 @@ struct BaseIndexOffset {
 };
 } // namespace
 
+namespace {
+/// Represents the origin of an individual byte in a load combine pattern. The
+/// value of a byte is either unknown, a zero constant, or it comes from memory.
+struct ByteProvider {
+  enum ProviderTy {
+    Unknown,
+    ZeroConstant,
+    Memory
+  };
+
+  ProviderTy Kind;
+  // Load and ByteOffset are set for Memory providers only.
+  // Load represents the node which loads the byte from memory.
+  // ByteOffset is the offset of the byte in the value produced by the load.
+  LoadSDNode *Load;
+  unsigned ByteOffset;
+
+  ByteProvider() : Kind(ProviderTy::Unknown), Load(nullptr), ByteOffset(0) {}
+
+  static ByteProvider getUnknown() {
+    return ByteProvider(ProviderTy::Unknown, nullptr, 0);
+  }
+  static ByteProvider getMemory(LoadSDNode *Load, unsigned ByteOffset) {
+    return ByteProvider(ProviderTy::Memory, Load, ByteOffset);
+  }
+  static ByteProvider getZero() {
+    return ByteProvider(ProviderTy::ZeroConstant, nullptr, 0);
+  }
+
+  bool operator==(const ByteProvider &Other) const {
+    return Other.Kind == Kind && Other.Load == Load &&
+           Other.ByteOffset == ByteOffset;
+  }
+
+private:
+  ByteProvider(ProviderTy Kind, LoadSDNode *Load, unsigned ByteOffset)
+      : Kind(Kind), Load(Load), ByteOffset(ByteOffset) {}
+};
+
+/// Recursively traverses the expression collecting the origin of individual
+/// bytes of the given value. For all the values except the root of the
+/// expression verifies that it doesn't have uses outside of the expression.
+const Optional<SmallVector<ByteProvider, 4> >
+collectByteProviders(SDValue Op, bool CheckNumberOfUses = false) {
+  if (CheckNumberOfUses && !Op.hasOneUse())
+    return None;
+
+  unsigned BitWidth = Op.getScalarValueSizeInBits();
+  if (BitWidth % 8 != 0)
+    return None;
+  unsigned ByteWidth = BitWidth / 8;
+
+  switch (Op.getOpcode()) {
+  case ISD::OR: {
+    auto LHS = collectByteProviders(Op->getOperand(0),
+                                    /*CheckNumberOfUses=*/true);
+    auto RHS = collectByteProviders(Op->getOperand(1),
+                                    /*CheckNumberOfUses=*/true);
+    if (!LHS || !RHS)
+      return None;
+
+    auto OR = [](ByteProvider LHS, ByteProvider RHS) {
+      if (LHS == RHS)
+        return LHS;
+      if (LHS.Kind == ByteProvider::Unknown ||
+          RHS.Kind == ByteProvider::Unknown)
+        return ByteProvider::getUnknown();
+      if (LHS.Kind == ByteProvider::Memory && RHS.Kind == ByteProvider::Memory)
+        return ByteProvider::getUnknown();
+
+      if (LHS.Kind == ByteProvider::Memory)
+        return LHS;
+      else
+        return RHS;
+    };
+
+    SmallVector<ByteProvider, 4> Result(ByteWidth);
+    for (unsigned i = 0; i < LHS->size(); i++)
+      Result[i] = OR(LHS.getValue()[i], RHS.getValue()[i]);
+
+    return Result;
+  }
+  case ISD::SHL: {
+    auto ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
+    if (!ShiftOp)
+      return None;
+
+    uint64_t BitShift = ShiftOp->getZExtValue();
+    if (BitShift % 8 != 0)
+      return None;
+    uint64_t ByteShift = BitShift / 8;
+
+    auto Original = collectByteProviders(Op->getOperand(0),
+                                         /*CheckNumberOfUses=*/true);
+    if (!Original)
+      return None;
+
+    SmallVector<ByteProvider, 4> Result;
+    Result.insert(Result.begin(), ByteShift, ByteProvider::getZero());
+    Result.insert(Result.end(), Original->begin(),
+                  std::prev(Original->end(), ByteShift));
+    assert(Result.size() == ByteWidth && "sanity");
+    return Result;
+  }
+  case ISD::ZERO_EXTEND: {
+    auto Original = collectByteProviders(Op->getOperand(0),
+                                         /*CheckNumberOfUses=*/true);
+    if (!Original)
+      return None;
+
+    SmallVector<ByteProvider, 4> Result;
+    unsigned NarrowByteWidth = Original->size();
+    Result.insert(Result.begin(), Original->begin(), Original->end());
+    Result.insert(Result.end(), ByteWidth - NarrowByteWidth,
+                  ByteProvider::getZero());
+    assert(Result.size() == ByteWidth && "sanity");
+    return Result;
+  }
+  case ISD::LOAD: {
+    auto L = cast<LoadSDNode>(Op.getNode());
+    if (L->isVolatile() || L->isIndexed() ||
+        L->getExtensionType() != ISD::NON_EXTLOAD)
+      return None;
+
+    EVT VT = L->getMemoryVT();
+    assert(BitWidth == VT.getSizeInBits() && "sanity");
+
+    SmallVector<ByteProvider, 4> Result(ByteWidth);
+    for (unsigned i = 0; i < ByteWidth; i++)
+      Result[i] = ByteProvider::getMemory(L, i);
+
+    return Result;
+  }
+  }
+
+  return None;
+}
+} // namespace
+
+/// Match a pattern where a wide scalar value is loaded by several narrow
+/// loads and combined by shifts and ors. Fold it into a single load or a load
+/// and a BSWAP if the target supports it.
+///
+/// Assuming a little endian target:
+///  i8 *a = ...
+///  i32 val = a[0] | (a[1] << 8) | (a[2] << 16) | (a[3] << 24)
+/// =>
+///  i32 val = *((i32)a)
+///
+///  i8 *a = ...
+///  i32 val = (a[0] << 24) | (a[1] << 16) | (a[2] << 8) | a[3]
+/// =>
+///  i32 val = BSWAP(*((i32)a))
+SDValue DAGCombiner::MatchLoadCombine(SDNode *N) {
+  assert(N->getOpcode() == ISD::OR &&
+         "Can only match load combining against OR nodes");
+
+  // Handles simple types only
+  EVT VT = N->getValueType(0);
+  if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64)
+    return SDValue();
+
+  // There is nothing to do here if the target can't load a value of this type
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  if (!TLI.isOperationLegal(ISD::LOAD, VT))
+    return SDValue();
+
+  // Calculate byte providers for the OR we are looking at
+  auto Res = collectByteProviders(SDValue(N, 0));
+  if (!Res)
+    return SDValue();
+  auto &Bytes = Res.getValue();
+  unsigned ByteWidth = Bytes.size();
+  assert(VT.getSizeInBits() == ByteWidth * 8 && "sanity");
+
+  auto LittleEndianByteAt = [](unsigned BW, unsigned i) { return i; };
+  auto BigEndianByteAt = [](unsigned BW, unsigned i) { return BW - i - 1; };
+
+  Optional<BaseIndexOffset> Base;
+  SDValue Chain;
+
+  SmallSet<LoadSDNode *, 8> Loads;
+  LoadSDNode *FirstLoad = nullptr;
+
+  // Check if all the bytes of the OR we are looking at are loaded from the same
+  // base address. Collect bytes offsets from Base address in ByteOffsets.
+  SmallVector<int64_t, 4> ByteOffsets(ByteWidth);
+  for (unsigned i = 0; i < ByteWidth; i++) {
+    // All the bytes must be loaded from memory
+    if (Bytes[i].Kind != ByteProvider::Memory)
+      return SDValue();
+
+    LoadSDNode *L = Bytes[i].Load;
+    assert(L->hasNUsesOfValue(1, 0) && !L->isVolatile() && !L->isIndexed() &&
+           (L->getExtensionType() == ISD::NON_EXTLOAD) &&
+           "Must be enforced by collectByteProviders");
+    assert(L->getOffset().isUndef() && "Unindexed load must have undef offset");
+
+    // All loads must share the same chain
+    SDValue LChain = L->getChain();
+    if (!Chain)
+      Chain = LChain;
+    if (Chain != LChain)
+      return SDValue();
+
+    // Loads must share the same base address
+    BaseIndexOffset Ptr = BaseIndexOffset::match(L->getBasePtr(), DAG);
+    if (!Base)
+      Base = Ptr;
+    if (!Base->equalBaseIndex(Ptr))
+      return SDValue();
+
+    // Calculate the offset of the current byte from the base address
+    unsigned LoadByteWidth = L->getMemoryVT().getSizeInBits() / 8;
+    int64_t MemoryByteOffset =
+        DAG.getDataLayout().isBigEndian()
+            ? BigEndianByteAt(LoadByteWidth, Bytes[i].ByteOffset)
+            : LittleEndianByteAt(LoadByteWidth, Bytes[i].ByteOffset);
+    int64_t ByteOffsetFromBase = Ptr.Offset + MemoryByteOffset;
+    ByteOffsets[i] = ByteOffsetFromBase;
+
+    // Remember the first byte load
+    if (ByteOffsetFromBase == 0)
+      FirstLoad = L;
+
+    Loads.insert(L);
+  }
+  assert(Base && "must be set");
+
+  // Check if the bytes of the OR we are looking at match with either big or
+  // little endian value load
+  bool BigEndian = true, LittleEndian = true;
+  for (unsigned i = 0; i < ByteWidth; i++) {
+    LittleEndian &= ByteOffsets[i] == LittleEndianByteAt(ByteWidth, i);
+    BigEndian &= ByteOffsets[i] == BigEndianByteAt(ByteWidth, i);
+    if (!BigEndian && !LittleEndian)
+      return SDValue();
+  }
+  assert((BigEndian != LittleEndian) && "should be either or");
+  assert(FirstLoad && "must be set");
+
+  // The node we are looking at matches with the pattern, check if we can
+  // replace it with a single load and bswap if needed.
+
+  // If the load needs byte swap check if the target supports it
+  bool NeedsBswap = DAG.getDataLayout().isBigEndian() != BigEndian;
+  if (NeedsBswap && !TLI.isOperationLegal(ISD::BSWAP, VT))
+    return SDValue();
+
+  // Check that a load of the wide type is both allowed and fast on the target
+  bool Fast = false;
+  bool Allowed = TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(),
+                                        VT, FirstLoad->getAddressSpace(),
+                                        FirstLoad->getAlignment(), &Fast);
+  if (!Allowed || !Fast)
+    return SDValue();
+
+  SDValue NewLoad =
+      DAG.getLoad(VT, SDLoc(N), Chain, FirstLoad->getBasePtr(),
+                  FirstLoad->getPointerInfo(), FirstLoad->getAlignment());
+
+  // Transfer chain users from old loads to the new load.
+  for (LoadSDNode *L : Loads)
+    DAG.ReplaceAllUsesOfValueWith(SDValue(L, 1), SDValue(NewLoad.getNode(), 1));
+
+  if (NeedsBswap)
+    return DAG.getNode(ISD::BSWAP, SDLoc(N), VT, NewLoad);
+  else
+    return NewLoad;
+}
+
 SDValue DAGCombiner::visitXOR(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);

Added: llvm/trunk/test/CodeGen/ARM/load-combine-big-endian.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM/load-combine-big-endian.ll?rev=289538&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM/load-combine-big-endian.ll (added)
+++ llvm/trunk/test/CodeGen/ARM/load-combine-big-endian.ll Tue Dec 13 08:21:14 2016
@@ -0,0 +1,234 @@
+; RUN: llc < %s -mtriple=armeb-unknown | FileCheck %s
+; RUN: llc < %s -mtriple=arm64eb-unknown | FileCheck %s --check-prefix=CHECK64
+
+; i8* p; // p is 4 byte aligned
+; ((i32) p[0] << 24) | ((i32) p[1] << 16) | ((i32) p[2] << 8) | (i32) p[3]
+define i32 @load_i32_by_i8_big_endian(i32*) {
+; CHECK-LABEL: load_i32_by_i8_big_endian:
+; CHECK: ldr r0, [r0]
+; CHECK-NEXT: mov pc, lr
+
+; CHECK64-LABEL: load_i32_by_i8_big_endian:
+; CHECK64: ldr		w0, [x0]
+; CHECK64-NEXT: ret
+  %2 = bitcast i32* %0 to i8*
+  %3 = load i8, i8* %2, align 4
+  %4 = zext i8 %3 to i32
+  %5 = shl nuw nsw i32 %4, 24
+  %6 = getelementptr inbounds i8, i8* %2, i32 1
+  %7 = load i8, i8* %6, align 1
+  %8 = zext i8 %7 to i32
+  %9 = shl nuw nsw i32 %8, 16
+  %10 = or i32 %9, %5
+  %11 = getelementptr inbounds i8, i8* %2, i32 2
+  %12 = load i8, i8* %11, align 1
+  %13 = zext i8 %12 to i32
+  %14 = shl nuw nsw i32 %13, 8
+  %15 = or i32 %10, %14
+  %16 = getelementptr inbounds i8, i8* %2, i32 3
+  %17 = load i8, i8* %16, align 1
+  %18 = zext i8 %17 to i32
+  %19 = or i32 %15, %18
+  ret i32 %19
+}
+
+; i8* p; // p is 4 byte aligned
+; ((i32) (((i16) p[0] << 8) | (i16) p[1]) << 16) | (i32) (((i16) p[2] << 8) | (i16) p[3])
+define i32 @load_i32_by_i16_by_i8_big_endian(i32*) {
+; CHECK-LABEL: load_i32_by_i16_by_i8_big_endian:
+; CHECK: ldr r0, [r0]
+; CHECK-NEXT: mov pc, lr
+
+; CHECK64-LABEL: load_i32_by_i16_by_i8_big_endian:
+; CHECK64: ldr		w0, [x0]
+; CHECK64-NEXT: ret
+  %2 = bitcast i32* %0 to i8*
+  %3 = load i8, i8* %2, align 4
+  %4 = zext i8 %3 to i16
+  %5 = getelementptr inbounds i8, i8* %2, i32 1
+  %6 = load i8, i8* %5, align 1
+  %7 = zext i8 %6 to i16
+  %8 = shl nuw nsw i16 %4, 8
+  %9 = or i16 %8, %7
+  %10 = getelementptr inbounds i8, i8* %2, i32 2
+  %11 = load i8, i8* %10, align 1
+  %12 = zext i8 %11 to i16
+  %13 = getelementptr inbounds i8, i8* %2, i32 3
+  %14 = load i8, i8* %13, align 1
+  %15 = zext i8 %14 to i16
+  %16 = shl nuw nsw i16 %12, 8
+  %17 = or i16 %16, %15
+  %18 = zext i16 %9 to i32
+  %19 = zext i16 %17 to i32
+  %20 = shl nuw nsw i32 %18, 16
+  %21 = or i32 %20, %19
+  ret i32 %21
+}
+
+; i16* p; // p is 4 byte aligned
+; ((i32) p[0] << 16) | (i32) p[1]
+define i32 @load_i32_by_i16(i32*) {
+; CHECK-LABEL: load_i32_by_i16:
+; CHECK: ldr r0, [r0]
+; CHECK-NEXT: mov pc, lr
+
+; CHECK64-LABEL: load_i32_by_i16:
+; CHECK64: ldr		w0, [x0]
+; CHECK64-NEXT: ret
+  %2 = bitcast i32* %0 to i16*
+  %3 = load i16, i16* %2, align 4
+  %4 = zext i16 %3 to i32
+  %5 = getelementptr inbounds i16, i16* %2, i32 1
+  %6 = load i16, i16* %5, align 1
+  %7 = zext i16 %6 to i32
+  %8 = shl nuw nsw i32 %4, 16
+  %9 = or i32 %8, %7
+  ret i32 %9
+}
+
+; i16* p_16; // p_16 is 4 byte aligned
+; i8* p_8 = (i8*) p_16;
+; ((i32) p_16[0] << 16) | ((i32) p_8[2] << 8) | (i32) p_8[3]
+define i32 @load_i32_by_i16_i8(i32*) {
+; CHECK-LABEL: load_i32_by_i16_i8:
+; CHECK: ldr r0, [r0]
+; CHECK-NEXT: mov pc, lr
+
+; CHECK64-LABEL: load_i32_by_i16_i8:
+; CHECK64: ldr		w0, [x0]
+; CHECK64-NEXT: ret
+  %2 = bitcast i32* %0 to i16*
+  %3 = bitcast i32* %0 to i8*
+  %4 = load i16, i16* %2, align 4
+  %5 = zext i16 %4 to i32
+  %6 = shl nuw nsw i32 %5, 16
+  %7 = getelementptr inbounds i8, i8* %3, i32 2
+  %8 = load i8, i8* %7, align 1
+  %9 = zext i8 %8 to i32
+  %10 = shl nuw nsw i32 %9, 8
+  %11 = getelementptr inbounds i8, i8* %3, i32 3
+  %12 = load i8, i8* %11, align 1
+  %13 = zext i8 %12 to i32
+  %14 = or i32 %10, %13
+  %15 = or i32 %14, %6
+  ret i32 %15
+}
+
+; i8* p; // p is 8 byte aligned
+; (i64) p[0] | ((i64) p[1] << 8) | ((i64) p[2] << 16) | ((i64) p[3] << 24) | ((i64) p[4] << 32) | ((i64) p[5] << 40) | ((i64) p[6] << 48) | ((i64) p[7] << 56)
+define i64 @load_i64_by_i8_bswap(i64*) {
+; CHECK-LABEL: load_i64_by_i8_bswap:
+; CHECK: ldrb{{.*}}r0
+; CHECK: ldrb{{.*}}r0
+; CHECK: ldrb{{.*}}r0
+; CHECK: ldrb{{.*}}r0
+; CHECK: ldrb{{.*}}r0
+; CHECK: ldrb{{.*}}r0
+; CHECK: ldrb{{.*}}r0
+; CHECK: ldrb{{.*}}r0
+; CHECK: orr
+; CHECK: mov pc, lr
+
+; CHECK64-LABEL: load_i64_by_i8_bswap:
+; CHECK64: ldr		x8, [x0]
+; CHECK64-NEXT: rev	x0, x8
+; CHECK64-NEXT: ret
+  %2 = bitcast i64* %0 to i8*
+  %3 = load i8, i8* %2, align 8
+  %4 = zext i8 %3 to i64
+  %5 = getelementptr inbounds i8, i8* %2, i64 1
+  %6 = load i8, i8* %5, align 1
+  %7 = zext i8 %6 to i64
+  %8 = shl nuw nsw i64 %7, 8
+  %9 = or i64 %8, %4
+  %10 = getelementptr inbounds i8, i8* %2, i64 2
+  %11 = load i8, i8* %10, align 1
+  %12 = zext i8 %11 to i64
+  %13 = shl nuw nsw i64 %12, 16
+  %14 = or i64 %9, %13
+  %15 = getelementptr inbounds i8, i8* %2, i64 3
+  %16 = load i8, i8* %15, align 1
+  %17 = zext i8 %16 to i64
+  %18 = shl nuw nsw i64 %17, 24
+  %19 = or i64 %14, %18
+  %20 = getelementptr inbounds i8, i8* %2, i64 4
+  %21 = load i8, i8* %20, align 1
+  %22 = zext i8 %21 to i64
+  %23 = shl nuw nsw i64 %22, 32
+  %24 = or i64 %19, %23
+  %25 = getelementptr inbounds i8, i8* %2, i64 5
+  %26 = load i8, i8* %25, align 1
+  %27 = zext i8 %26 to i64
+  %28 = shl nuw nsw i64 %27, 40
+  %29 = or i64 %24, %28
+  %30 = getelementptr inbounds i8, i8* %2, i64 6
+  %31 = load i8, i8* %30, align 1
+  %32 = zext i8 %31 to i64
+  %33 = shl nuw nsw i64 %32, 48
+  %34 = or i64 %29, %33
+  %35 = getelementptr inbounds i8, i8* %2, i64 7
+  %36 = load i8, i8* %35, align 1
+  %37 = zext i8 %36 to i64
+  %38 = shl nuw i64 %37, 56
+  %39 = or i64 %34, %38
+  ret i64 %39
+}
+
+; i8* p; // p is 8 byte aligned
+; ((i64) p[0] << 56) | ((i64) p[1] << 48) | ((i64) p[2] << 40) | ((i64) p[3] << 32) | ((i64) p[4] << 24) | ((i64) p[5] << 16) | ((i64) p[6] << 8) | (i64) p[7]
+define i64 @load_i64_by_i8(i64*) {
+; CHECK-LABEL: load_i64_by_i8:
+; CHECK: ldrb{{.*}}r0
+; CHECK: ldrb{{.*}}r0
+; CHECK: ldrb{{.*}}r0
+; CHECK: ldrb{{.*}}r0
+; CHECK: ldrb{{.*}}r0
+; CHECK: ldrb{{.*}}r0
+; CHECK: ldrb{{.*}}r0
+; CHECK: ldrb{{.*}}r0
+; CHECK: orr
+; CHECK: mov pc, lr
+
+; CHECK64-LABEL: load_i64_by_i8:
+; CHECK64: ldr		x0, [x0]
+; CHECK64-NEXT: ret
+  %2 = bitcast i64* %0 to i8*
+  %3 = load i8, i8* %2, align 8
+  %4 = zext i8 %3 to i64
+  %5 = shl nuw i64 %4, 56
+  %6 = getelementptr inbounds i8, i8* %2, i64 1
+  %7 = load i8, i8* %6, align 1
+  %8 = zext i8 %7 to i64
+  %9 = shl nuw nsw i64 %8, 48
+  %10 = or i64 %9, %5
+  %11 = getelementptr inbounds i8, i8* %2, i64 2
+  %12 = load i8, i8* %11, align 1
+  %13 = zext i8 %12 to i64
+  %14 = shl nuw nsw i64 %13, 40
+  %15 = or i64 %10, %14
+  %16 = getelementptr inbounds i8, i8* %2, i64 3
+  %17 = load i8, i8* %16, align 1
+  %18 = zext i8 %17 to i64
+  %19 = shl nuw nsw i64 %18, 32
+  %20 = or i64 %15, %19
+  %21 = getelementptr inbounds i8, i8* %2, i64 4
+  %22 = load i8, i8* %21, align 1
+  %23 = zext i8 %22 to i64
+  %24 = shl nuw nsw i64 %23, 24
+  %25 = or i64 %20, %24
+  %26 = getelementptr inbounds i8, i8* %2, i64 5
+  %27 = load i8, i8* %26, align 1
+  %28 = zext i8 %27 to i64
+  %29 = shl nuw nsw i64 %28, 16
+  %30 = or i64 %25, %29
+  %31 = getelementptr inbounds i8, i8* %2, i64 6
+  %32 = load i8, i8* %31, align 1
+  %33 = zext i8 %32 to i64
+  %34 = shl nuw nsw i64 %33, 8
+  %35 = or i64 %30, %34
+  %36 = getelementptr inbounds i8, i8* %2, i64 7
+  %37 = load i8, i8* %36, align 1
+  %38 = zext i8 %37 to i64
+  %39 = or i64 %35, %38
+  ret i64 %39
+}

Added: llvm/trunk/test/CodeGen/ARM/load-combine.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/ARM/load-combine.ll?rev=289538&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/ARM/load-combine.ll (added)
+++ llvm/trunk/test/CodeGen/ARM/load-combine.ll Tue Dec 13 08:21:14 2016
@@ -0,0 +1,226 @@
+; RUN: llc < %s -mtriple=arm-unknown | FileCheck %s
+; RUN: llc < %s -mtriple=arm64-unknown | FileCheck %s --check-prefix=CHECK64
+
+; i8* p; // p is 1 byte aligned
+; (i32) p[0] | ((i32) p[1] << 8) | ((i32) p[2] << 16) | ((i32) p[3] << 24)
+define i32 @load_i32_by_i8_unaligned(i32*) {
+; CHECK-LABEL: load_i32_by_i8_unaligned:
+; CHECK: ldrb{{.*}}r0
+; CHECK: ldrb{{.*}}r0
+; CHECK: ldrb{{.*}}r0
+; CHECK: ldrb{{.*}}r0
+; CHECK: orr
+; CHECK: mov pc, lr
+
+; CHECK64-LABEL: load_i32_by_i8_unaligned:
+; CHECK64: ldr		w0, [x0]
+; CHECK64-NEXT: ret
+  %2 = bitcast i32* %0 to i8*
+  %3 = getelementptr inbounds i8, i8* %2, i32 0
+  %4 = load i8, i8* %2, align 1
+  %5 = zext i8 %4 to i32
+  %6 = getelementptr inbounds i8, i8* %2, i32 1
+  %7 = load i8, i8* %6, align 1
+  %8 = zext i8 %7 to i32
+  %9 = shl nuw nsw i32 %8, 8
+  %10 = or i32 %9, %5
+  %11 = getelementptr inbounds i8, i8* %2, i32 2
+  %12 = load i8, i8* %11, align 1
+  %13 = zext i8 %12 to i32
+  %14 = shl nuw nsw i32 %13, 16
+  %15 = or i32 %10, %14
+  %16 = getelementptr inbounds i8, i8* %2, i32 3
+  %17 = load i8, i8* %16, align 1
+  %18 = zext i8 %17 to i32
+  %19 = shl nuw nsw i32 %18, 24
+  %20 = or i32 %15, %19
+  ret i32 %20
+}
+
+; i8* p; // p is 4 byte aligned
+; (i32) p[0] | ((i32) p[1] << 8) | ((i32) p[2] << 16) | ((i32) p[3] << 24)
+define i32 @load_i32_by_i8_aligned(i32*) {
+; CHECK-LABEL: load_i32_by_i8_aligned:
+; CHECK: ldr  r0, [r0]
+; CHECK: mov pc, lr
+
+; CHECK64-LABEL: load_i32_by_i8_aligned:
+; CHECK64: ldr    w0, [x0]
+; CHECK64-NEXT: ret
+  %2 = bitcast i32* %0 to i8*
+  %3 = getelementptr inbounds i8, i8* %2, i32 0
+  %4 = load i8, i8* %2, align 4
+  %5 = zext i8 %4 to i32
+  %6 = getelementptr inbounds i8, i8* %2, i32 1
+  %7 = load i8, i8* %6, align 1
+  %8 = zext i8 %7 to i32
+  %9 = shl nuw nsw i32 %8, 8
+  %10 = or i32 %9, %5
+  %11 = getelementptr inbounds i8, i8* %2, i32 2
+  %12 = load i8, i8* %11, align 1
+  %13 = zext i8 %12 to i32
+  %14 = shl nuw nsw i32 %13, 16
+  %15 = or i32 %10, %14
+  %16 = getelementptr inbounds i8, i8* %2, i32 3
+  %17 = load i8, i8* %16, align 1
+  %18 = zext i8 %17 to i32
+  %19 = shl nuw nsw i32 %18, 24
+  %20 = or i32 %15, %19
+  ret i32 %20
+}
+
+; i8* p; // p is 4 byte aligned
+; ((i32) p[0] << 24) | ((i32) p[1] << 16) | ((i32) p[2] << 8) | (i32) p[3]
+define i32 @load_i32_by_i8_bswap(i32*) {
+; BSWAP is not supported by the 32 bit target
+; CHECK-LABEL: load_i32_by_i8_bswap:
+; CHECK: ldrb{{.*}}r0
+; CHECK: ldrb{{.*}}r0
+; CHECK: ldrb{{.*}}r0
+; CHECK: ldrb{{.*}}r0
+; CHECK: orr
+; CHECK: mov pc, lr
+
+; CHECK64-LABEL: load_i32_by_i8_bswap:
+; CHECK64: ldr		w8, [x0]
+; CHECK64-NEXT: rev	w0, w8
+; CHECK64-NEXT: ret
+  %2 = bitcast i32* %0 to i8*
+  %3 = load i8, i8* %2, align 4
+  %4 = zext i8 %3 to i32
+  %5 = shl nuw nsw i32 %4, 24
+  %6 = getelementptr inbounds i8, i8* %2, i32 1
+  %7 = load i8, i8* %6, align 1
+  %8 = zext i8 %7 to i32
+  %9 = shl nuw nsw i32 %8, 16
+  %10 = or i32 %9, %5
+  %11 = getelementptr inbounds i8, i8* %2, i32 2
+  %12 = load i8, i8* %11, align 1
+  %13 = zext i8 %12 to i32
+  %14 = shl nuw nsw i32 %13, 8
+  %15 = or i32 %10, %14
+  %16 = getelementptr inbounds i8, i8* %2, i32 3
+  %17 = load i8, i8* %16, align 1
+  %18 = zext i8 %17 to i32
+  %19 = or i32 %15, %18
+  ret i32 %19
+}
+
+; i8* p; // p is 8 byte aligned
+; (i64) p[0] | ((i64) p[1] << 8) | ((i64) p[2] << 16) | ((i64) p[3] << 24) | ((i64) p[4] << 32) | ((i64) p[5] << 40) | ((i64) p[6] << 48) | ((i64) p[7] << 56)
+define i64 @load_i64_by_i8(i64*) {
+; CHECK-LABEL: load_i64_by_i8:
+; CHECK: ldrb{{.*}}r0
+; CHECK: ldrb{{.*}}r0
+; CHECK: ldrb{{.*}}r0
+; CHECK: ldrb{{.*}}r0
+; CHECK: ldrb{{.*}}r0
+; CHECK: ldrb{{.*}}r0
+; CHECK: ldrb{{.*}}r0
+; CHECK: ldrb{{.*}}r0
+; CHECK: orr
+; CHECK: mov pc, lr
+
+; CHECK64-LABEL: load_i64_by_i8:
+; CHECK64: ldr		x0, [x0]
+; CHECK64-NEXT: ret
+  %2 = bitcast i64* %0 to i8*
+  %3 = load i8, i8* %2, align 8
+  %4 = zext i8 %3 to i64
+  %5 = getelementptr inbounds i8, i8* %2, i64 1
+  %6 = load i8, i8* %5, align 1
+  %7 = zext i8 %6 to i64
+  %8 = shl nuw nsw i64 %7, 8
+  %9 = or i64 %8, %4
+  %10 = getelementptr inbounds i8, i8* %2, i64 2
+  %11 = load i8, i8* %10, align 1
+  %12 = zext i8 %11 to i64
+  %13 = shl nuw nsw i64 %12, 16
+  %14 = or i64 %9, %13
+  %15 = getelementptr inbounds i8, i8* %2, i64 3
+  %16 = load i8, i8* %15, align 1
+  %17 = zext i8 %16 to i64
+  %18 = shl nuw nsw i64 %17, 24
+  %19 = or i64 %14, %18
+  %20 = getelementptr inbounds i8, i8* %2, i64 4
+  %21 = load i8, i8* %20, align 1
+  %22 = zext i8 %21 to i64
+  %23 = shl nuw nsw i64 %22, 32
+  %24 = or i64 %19, %23
+  %25 = getelementptr inbounds i8, i8* %2, i64 5
+  %26 = load i8, i8* %25, align 1
+  %27 = zext i8 %26 to i64
+  %28 = shl nuw nsw i64 %27, 40
+  %29 = or i64 %24, %28
+  %30 = getelementptr inbounds i8, i8* %2, i64 6
+  %31 = load i8, i8* %30, align 1
+  %32 = zext i8 %31 to i64
+  %33 = shl nuw nsw i64 %32, 48
+  %34 = or i64 %29, %33
+  %35 = getelementptr inbounds i8, i8* %2, i64 7
+  %36 = load i8, i8* %35, align 1
+  %37 = zext i8 %36 to i64
+  %38 = shl nuw i64 %37, 56
+  %39 = or i64 %34, %38
+  ret i64 %39
+}
+
+; i8* p; // p is 8 byte aligned
+; ((i64) p[0] << 56) | ((i64) p[1] << 48) | ((i64) p[2] << 40) | ((i64) p[3] << 32) | ((i64) p[4] << 24) | ((i64) p[5] << 16) | ((i64) p[6] << 8) | (i64) p[7]
+define i64 @load_i64_by_i8_bswap(i64*) {
+; CHECK-LABEL: load_i64_by_i8_bswap:
+; CHECK: ldrb{{.*}}r0
+; CHECK: ldrb{{.*}}r0
+; CHECK: ldrb{{.*}}r0
+; CHECK: ldrb{{.*}}r0
+; CHECK: ldrb{{.*}}r0
+; CHECK: ldrb{{.*}}r0
+; CHECK: ldrb{{.*}}r0
+; CHECK: ldrb{{.*}}r0
+; CHECK: orr
+; CHECK: mov pc, lr
+
+; CHECK64-LABEL: load_i64_by_i8_bswap:
+; CHECK64: ldr		x8, [x0]
+; CHECK64-NEXT: rev	x0, x8
+; CHECK64-NEXT: ret
+  %2 = bitcast i64* %0 to i8*
+  %3 = load i8, i8* %2, align 8
+  %4 = zext i8 %3 to i64
+  %5 = shl nuw i64 %4, 56
+  %6 = getelementptr inbounds i8, i8* %2, i64 1
+  %7 = load i8, i8* %6, align 1
+  %8 = zext i8 %7 to i64
+  %9 = shl nuw nsw i64 %8, 48
+  %10 = or i64 %9, %5
+  %11 = getelementptr inbounds i8, i8* %2, i64 2
+  %12 = load i8, i8* %11, align 1
+  %13 = zext i8 %12 to i64
+  %14 = shl nuw nsw i64 %13, 40
+  %15 = or i64 %10, %14
+  %16 = getelementptr inbounds i8, i8* %2, i64 3
+  %17 = load i8, i8* %16, align 1
+  %18 = zext i8 %17 to i64
+  %19 = shl nuw nsw i64 %18, 32
+  %20 = or i64 %15, %19
+  %21 = getelementptr inbounds i8, i8* %2, i64 4
+  %22 = load i8, i8* %21, align 1
+  %23 = zext i8 %22 to i64
+  %24 = shl nuw nsw i64 %23, 24
+  %25 = or i64 %20, %24
+  %26 = getelementptr inbounds i8, i8* %2, i64 5
+  %27 = load i8, i8* %26, align 1
+  %28 = zext i8 %27 to i64
+  %29 = shl nuw nsw i64 %28, 16
+  %30 = or i64 %25, %29
+  %31 = getelementptr inbounds i8, i8* %2, i64 6
+  %32 = load i8, i8* %31, align 1
+  %33 = zext i8 %32 to i64
+  %34 = shl nuw nsw i64 %33, 8
+  %35 = or i64 %30, %34
+  %36 = getelementptr inbounds i8, i8* %2, i64 7
+  %37 = load i8, i8* %36, align 1
+  %38 = zext i8 %37 to i64
+  %39 = or i64 %35, %38
+  ret i64 %39
+}

Added: llvm/trunk/test/CodeGen/X86/load-combine.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/X86/load-combine.ll?rev=289538&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/X86/load-combine.ll (added)
+++ llvm/trunk/test/CodeGen/X86/load-combine.ll Tue Dec 13 08:21:14 2016
@@ -0,0 +1,733 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown | FileCheck %s --check-prefix=CHECK64
+
+; i8* p;
+; (i32) p[0] | ((i32) p[1] << 8) | ((i32) p[2] << 16) | ((i32) p[3] << 24)
+define i32 @load_i32_by_i8(i32*) {
+; CHECK-LABEL: load_i32_by_i8:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    movl (%eax), %eax
+; CHECK-NEXT:    retl
+;
+; CHECK64-LABEL: load_i32_by_i8:
+; CHECK64:       # BB#0:
+; CHECK64-NEXT:    movl (%rdi), %eax
+; CHECK64-NEXT:    retq
+
+  %2 = bitcast i32* %0 to i8*
+  %3 = load i8, i8* %2, align 1
+  %4 = zext i8 %3 to i32
+  %5 = getelementptr inbounds i8, i8* %2, i32 1
+  %6 = load i8, i8* %5, align 1
+  %7 = zext i8 %6 to i32
+  %8 = shl nuw nsw i32 %7, 8
+  %9 = or i32 %8, %4
+  %10 = getelementptr inbounds i8, i8* %2, i32 2
+  %11 = load i8, i8* %10, align 1
+  %12 = zext i8 %11 to i32
+  %13 = shl nuw nsw i32 %12, 16
+  %14 = or i32 %9, %13
+  %15 = getelementptr inbounds i8, i8* %2, i32 3
+  %16 = load i8, i8* %15, align 1
+  %17 = zext i8 %16 to i32
+  %18 = shl nuw nsw i32 %17, 24
+  %19 = or i32 %14, %18
+  ret i32 %19
+}
+
+; i8* p;
+; ((i32) p[0] << 24) | ((i32) p[1] << 16) | ((i32) p[2] << 8) | (i32) p[3]
+define i32 @load_i32_by_i8_bswap(i32*) {
+; CHECK-LABEL: load_i32_by_i8_bswap:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    movl (%eax), %eax
+; CHECK-NEXT:    bswapl %eax
+; CHECK-NEXT:    retl
+;
+; CHECK64-LABEL: load_i32_by_i8_bswap:
+; CHECK64:       # BB#0:
+; CHECK64-NEXT:    movl (%rdi), %eax
+; CHECK64-NEXT:    bswapl %eax
+; CHECK64-NEXT:    retq
+
+  %2 = bitcast i32* %0 to i8*
+  %3 = load i8, i8* %2, align 1
+  %4 = zext i8 %3 to i32
+  %5 = shl nuw nsw i32 %4, 24
+  %6 = getelementptr inbounds i8, i8* %2, i32 1
+  %7 = load i8, i8* %6, align 1
+  %8 = zext i8 %7 to i32
+  %9 = shl nuw nsw i32 %8, 16
+  %10 = or i32 %9, %5
+  %11 = getelementptr inbounds i8, i8* %2, i32 2
+  %12 = load i8, i8* %11, align 1
+  %13 = zext i8 %12 to i32
+  %14 = shl nuw nsw i32 %13, 8
+  %15 = or i32 %10, %14
+  %16 = getelementptr inbounds i8, i8* %2, i32 3
+  %17 = load i8, i8* %16, align 1
+  %18 = zext i8 %17 to i32
+  %19 = or i32 %15, %18
+  ret i32 %19
+}
+
+; i16* p;
+; (i32) p[0] | ((i32) p[1] << 16)
+define i32 @load_i32_by_i16(i32*) {
+; CHECK-LABEL: load_i32_by_i16:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    movl (%eax), %eax
+; CHECK-NEXT:    retl
+;
+; CHECK64-LABEL: load_i32_by_i16:
+; CHECK64:       # BB#0:
+; CHECK64-NEXT:    movl (%rdi), %eax
+; CHECK64-NEXT:    retq
+
+  %2 = bitcast i32* %0 to i16*
+  %3 = load i16, i16* %2, align 1
+  %4 = zext i16 %3 to i32
+  %5 = getelementptr inbounds i16, i16* %2, i32 1
+  %6 = load i16, i16* %5, align 1
+  %7 = zext i16 %6 to i32
+  %8 = shl nuw nsw i32 %7, 16
+  %9 = or i32 %8, %4
+  ret i32 %9
+}
+
+; i16* p_16;
+; i8* p_8 = (i8*) p_16;
+; (i32) p_16[0] | ((i32) p_8[2] << 16) | ((i32) p_8[3] << 24)
+define i32 @load_i32_by_i16_i8(i32*) {
+; CHECK-LABEL: load_i32_by_i16_i8:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    movl (%eax), %eax
+; CHECK-NEXT:    retl
+;
+; CHECK64-LABEL: load_i32_by_i16_i8:
+; CHECK64:       # BB#0:
+; CHECK64-NEXT:    movl (%rdi), %eax
+; CHECK64-NEXT:    retq
+
+  %2 = bitcast i32* %0 to i16*
+  %3 = bitcast i32* %0 to i8*
+  %4 = load i16, i16* %2, align 1
+  %5 = zext i16 %4 to i32
+  %6 = getelementptr inbounds i8, i8* %3, i32 2
+  %7 = load i8, i8* %6, align 1
+  %8 = zext i8 %7 to i32
+  %9 = shl nuw nsw i32 %8, 16
+  %10 = getelementptr inbounds i8, i8* %3, i32 3
+  %11 = load i8, i8* %10, align 1
+  %12 = zext i8 %11 to i32
+  %13 = shl nuw nsw i32 %12, 24
+  %14 = or i32 %9, %13
+  %15 = or i32 %14, %5
+  ret i32 %15
+}
+
+
+; i8* p;
+; (i32) ((i16) p[0] | ((i16) p[1] << 8)) | ((i32) ((i16) p[2] | ((i16) p[3] << 8)) << 16)
+define i32 @load_i32_by_i16_by_i8(i32*) {
+; CHECK-LABEL: load_i32_by_i16_by_i8:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    movl (%eax), %eax
+; CHECK-NEXT:    retl
+;
+; CHECK64-LABEL: load_i32_by_i16_by_i8:
+; CHECK64:       # BB#0:
+; CHECK64-NEXT:    movl (%rdi), %eax
+; CHECK64-NEXT:    retq
+
+  %2 = bitcast i32* %0 to i8*
+  %3 = load i8, i8* %2, align 1
+  %4 = zext i8 %3 to i16
+  %5 = getelementptr inbounds i8, i8* %2, i32 1
+  %6 = load i8, i8* %5, align 1
+  %7 = zext i8 %6 to i16
+  %8 = shl nuw nsw i16 %7, 8
+  %9 = or i16 %8, %4
+  %10 = getelementptr inbounds i8, i8* %2, i32 2
+  %11 = load i8, i8* %10, align 1
+  %12 = zext i8 %11 to i16
+  %13 = getelementptr inbounds i8, i8* %2, i32 3
+  %14 = load i8, i8* %13, align 1
+  %15 = zext i8 %14 to i16
+  %16 = shl nuw nsw i16 %15, 8
+  %17 = or i16 %16, %12
+  %18 = zext i16 %9 to i32
+  %19 = zext i16 %17 to i32
+  %20 = shl nuw nsw i32 %19, 16
+  %21 = or i32 %20, %18
+  ret i32 %21
+}
+
+; i8* p;
+; ((i32) (((i16) p[0] << 8) | (i16) p[1]) << 16) | (i32) (((i16) p[2] << 8) | (i16) p[3])
+define i32 @load_i32_by_i16_by_i8_bswap(i32*) {
+; CHECK-LABEL: load_i32_by_i16_by_i8_bswap:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    movl (%eax), %eax
+; CHECK-NEXT:    bswapl %eax
+; CHECK-NEXT:    retl
+;
+; CHECK64-LABEL: load_i32_by_i16_by_i8_bswap:
+; CHECK64:       # BB#0:
+; CHECK64-NEXT:    movl (%rdi), %eax
+; CHECK64-NEXT:    bswapl %eax
+; CHECK64-NEXT:    retq
+
+  %2 = bitcast i32* %0 to i8*
+  %3 = load i8, i8* %2, align 1
+  %4 = zext i8 %3 to i16
+  %5 = getelementptr inbounds i8, i8* %2, i32 1
+  %6 = load i8, i8* %5, align 1
+  %7 = zext i8 %6 to i16
+  %8 = shl nuw nsw i16 %4, 8
+  %9 = or i16 %8, %7
+  %10 = getelementptr inbounds i8, i8* %2, i32 2
+  %11 = load i8, i8* %10, align 1
+  %12 = zext i8 %11 to i16
+  %13 = getelementptr inbounds i8, i8* %2, i32 3
+  %14 = load i8, i8* %13, align 1
+  %15 = zext i8 %14 to i16
+  %16 = shl nuw nsw i16 %12, 8
+  %17 = or i16 %16, %15
+  %18 = zext i16 %9 to i32
+  %19 = zext i16 %17 to i32
+  %20 = shl nuw nsw i32 %18, 16
+  %21 = or i32 %20, %19
+  ret i32 %21
+}
+
+; i8* p;
+; (i64) p[0] | ((i64) p[1] << 8) | ((i64) p[2] << 16) | ((i64) p[3] << 24) | ((i64) p[4] << 32) | ((i64) p[5] << 40) | ((i64) p[6] << 48) | ((i64) p[7] << 56)
+define i64 @load_i64_by_i8(i64*) {
+; CHECK-LABEL: load_i64_by_i8:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    pushl %edi
+; CHECK-NEXT:  .Lcfi0:
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    pushl %esi
+; CHECK-NEXT:  .Lcfi1:
+; CHECK-NEXT:    .cfi_def_cfa_offset 12
+; CHECK-NEXT:  .Lcfi2:
+; CHECK-NEXT:    .cfi_offset %esi, -12
+; CHECK-NEXT:  .Lcfi3:
+; CHECK-NEXT:    .cfi_offset %edi, -8
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT:    movzbl (%ecx), %eax
+; CHECK-NEXT:    movzbl 1(%ecx), %edx
+; CHECK-NEXT:    shll $8, %edx
+; CHECK-NEXT:    orl %eax, %edx
+; CHECK-NEXT:    movzbl 2(%ecx), %esi
+; CHECK-NEXT:    shll $16, %esi
+; CHECK-NEXT:    orl %edx, %esi
+; CHECK-NEXT:    movzbl 3(%ecx), %eax
+; CHECK-NEXT:    shll $24, %eax
+; CHECK-NEXT:    orl %esi, %eax
+; CHECK-NEXT:    movzbl 4(%ecx), %edx
+; CHECK-NEXT:    movzbl 5(%ecx), %esi
+; CHECK-NEXT:    shll $8, %esi
+; CHECK-NEXT:    orl %edx, %esi
+; CHECK-NEXT:    movzbl 6(%ecx), %edi
+; CHECK-NEXT:    shll $16, %edi
+; CHECK-NEXT:    orl %esi, %edi
+; CHECK-NEXT:    movzbl 7(%ecx), %edx
+; CHECK-NEXT:    shll $24, %edx
+; CHECK-NEXT:    orl %edi, %edx
+; CHECK-NEXT:    popl %esi
+; CHECK-NEXT:    popl %edi
+; CHECK-NEXT:    retl
+;
+; CHECK64-LABEL: load_i64_by_i8:
+; CHECK64:       # BB#0:
+; CHECK64-NEXT:    movq (%rdi), %rax
+; CHECK64-NEXT:    retq
+
+  %2 = bitcast i64* %0 to i8*
+  %3 = load i8, i8* %2, align 1
+  %4 = zext i8 %3 to i64
+  %5 = getelementptr inbounds i8, i8* %2, i64 1
+  %6 = load i8, i8* %5, align 1
+  %7 = zext i8 %6 to i64
+  %8 = shl nuw nsw i64 %7, 8
+  %9 = or i64 %8, %4
+  %10 = getelementptr inbounds i8, i8* %2, i64 2
+  %11 = load i8, i8* %10, align 1
+  %12 = zext i8 %11 to i64
+  %13 = shl nuw nsw i64 %12, 16
+  %14 = or i64 %9, %13
+  %15 = getelementptr inbounds i8, i8* %2, i64 3
+  %16 = load i8, i8* %15, align 1
+  %17 = zext i8 %16 to i64
+  %18 = shl nuw nsw i64 %17, 24
+  %19 = or i64 %14, %18
+  %20 = getelementptr inbounds i8, i8* %2, i64 4
+  %21 = load i8, i8* %20, align 1
+  %22 = zext i8 %21 to i64
+  %23 = shl nuw nsw i64 %22, 32
+  %24 = or i64 %19, %23
+  %25 = getelementptr inbounds i8, i8* %2, i64 5
+  %26 = load i8, i8* %25, align 1
+  %27 = zext i8 %26 to i64
+  %28 = shl nuw nsw i64 %27, 40
+  %29 = or i64 %24, %28
+  %30 = getelementptr inbounds i8, i8* %2, i64 6
+  %31 = load i8, i8* %30, align 1
+  %32 = zext i8 %31 to i64
+  %33 = shl nuw nsw i64 %32, 48
+  %34 = or i64 %29, %33
+  %35 = getelementptr inbounds i8, i8* %2, i64 7
+  %36 = load i8, i8* %35, align 1
+  %37 = zext i8 %36 to i64
+  %38 = shl nuw i64 %37, 56
+  %39 = or i64 %34, %38
+  ret i64 %39
+}
+
+; i8* p;
+; ((i64) p[0] << 56) | ((i64) p[1] << 48) | ((i64) p[2] << 40) | ((i64) p[3] << 32) | ((i64) p[4] << 24) | ((i64) p[5] << 16) | ((i64) p[6] << 8) | (i64) p[7]
+define i64 @load_i64_by_i8_bswap(i64*) {
+; CHECK-LABEL: load_i64_by_i8_bswap:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    pushl %esi
+; CHECK-NEXT:  .Lcfi4:
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:  .Lcfi5:
+; CHECK-NEXT:    .cfi_offset %esi, -8
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    movzbl (%eax), %ecx
+; CHECK-NEXT:    shll $24, %ecx
+; CHECK-NEXT:    movzbl 1(%eax), %edx
+; CHECK-NEXT:    shll $16, %edx
+; CHECK-NEXT:    orl %ecx, %edx
+; CHECK-NEXT:    movzbl 2(%eax), %ecx
+; CHECK-NEXT:    shll $8, %ecx
+; CHECK-NEXT:    orl %edx, %ecx
+; CHECK-NEXT:    movzbl 3(%eax), %edx
+; CHECK-NEXT:    orl %ecx, %edx
+; CHECK-NEXT:    movzbl 4(%eax), %ecx
+; CHECK-NEXT:    shll $24, %ecx
+; CHECK-NEXT:    movzbl 5(%eax), %esi
+; CHECK-NEXT:    shll $16, %esi
+; CHECK-NEXT:    orl %ecx, %esi
+; CHECK-NEXT:    movzbl 6(%eax), %ecx
+; CHECK-NEXT:    shll $8, %ecx
+; CHECK-NEXT:    orl %esi, %ecx
+; CHECK-NEXT:    movzbl 7(%eax), %eax
+; CHECK-NEXT:    orl %ecx, %eax
+; CHECK-NEXT:    popl %esi
+; CHECK-NEXT:    retl
+;
+; CHECK64-LABEL: load_i64_by_i8_bswap:
+; CHECK64:       # BB#0:
+; CHECK64-NEXT:    movq (%rdi), %rax
+; CHECK64-NEXT:    bswapq %rax
+; CHECK64-NEXT:    retq
+
+  %2 = bitcast i64* %0 to i8*
+  %3 = load i8, i8* %2, align 1
+  %4 = zext i8 %3 to i64
+  %5 = shl nuw i64 %4, 56
+  %6 = getelementptr inbounds i8, i8* %2, i64 1
+  %7 = load i8, i8* %6, align 1
+  %8 = zext i8 %7 to i64
+  %9 = shl nuw nsw i64 %8, 48
+  %10 = or i64 %9, %5
+  %11 = getelementptr inbounds i8, i8* %2, i64 2
+  %12 = load i8, i8* %11, align 1
+  %13 = zext i8 %12 to i64
+  %14 = shl nuw nsw i64 %13, 40
+  %15 = or i64 %10, %14
+  %16 = getelementptr inbounds i8, i8* %2, i64 3
+  %17 = load i8, i8* %16, align 1
+  %18 = zext i8 %17 to i64
+  %19 = shl nuw nsw i64 %18, 32
+  %20 = or i64 %15, %19
+  %21 = getelementptr inbounds i8, i8* %2, i64 4
+  %22 = load i8, i8* %21, align 1
+  %23 = zext i8 %22 to i64
+  %24 = shl nuw nsw i64 %23, 24
+  %25 = or i64 %20, %24
+  %26 = getelementptr inbounds i8, i8* %2, i64 5
+  %27 = load i8, i8* %26, align 1
+  %28 = zext i8 %27 to i64
+  %29 = shl nuw nsw i64 %28, 16
+  %30 = or i64 %25, %29
+  %31 = getelementptr inbounds i8, i8* %2, i64 6
+  %32 = load i8, i8* %31, align 1
+  %33 = zext i8 %32 to i64
+  %34 = shl nuw nsw i64 %33, 8
+  %35 = or i64 %30, %34
+  %36 = getelementptr inbounds i8, i8* %2, i64 7
+  %37 = load i8, i8* %36, align 1
+  %38 = zext i8 %37 to i64
+  %39 = or i64 %35, %38
+  ret i64 %39
+}
+
+; Part of the load by bytes pattern is used outside of the pattern
+; i8* p;
+; i32 x = (i32) p[1]
+; res = ((i32) p[0] << 24) | (x << 16) | ((i32) p[2] << 8) | (i32) p[3]
+; x | res
+define i32 @load_i32_by_i8_bswap_uses(i32*) {
+; CHECK-LABEL: load_i32_by_i8_bswap_uses:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    pushl %esi
+; CHECK-NEXT:  .Lcfi6:
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:  .Lcfi7:
+; CHECK-NEXT:    .cfi_offset %esi, -8
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    movzbl (%eax), %ecx
+; CHECK-NEXT:    shll $24, %ecx
+; CHECK-NEXT:    movzbl 1(%eax), %edx
+; CHECK-NEXT:    movl %edx, %esi
+; CHECK-NEXT:    shll $16, %esi
+; CHECK-NEXT:    orl %ecx, %esi
+; CHECK-NEXT:    movzbl 2(%eax), %ecx
+; CHECK-NEXT:    shll $8, %ecx
+; CHECK-NEXT:    orl %esi, %ecx
+; CHECK-NEXT:    movzbl 3(%eax), %eax
+; CHECK-NEXT:    orl %ecx, %eax
+; CHECK-NEXT:    orl %edx, %eax
+; CHECK-NEXT:    popl %esi
+; CHECK-NEXT:    retl
+;
+; CHECK64-LABEL: load_i32_by_i8_bswap_uses:
+; CHECK64:       # BB#0:
+; CHECK64-NEXT:    movzbl (%rdi), %eax
+; CHECK64-NEXT:    shll $24, %eax
+; CHECK64-NEXT:    movzbl 1(%rdi), %ecx
+; CHECK64-NEXT:    movl %ecx, %edx
+; CHECK64-NEXT:    shll $16, %edx
+; CHECK64-NEXT:    orl %eax, %edx
+; CHECK64-NEXT:    movzbl 2(%rdi), %esi
+; CHECK64-NEXT:    shll $8, %esi
+; CHECK64-NEXT:    orl %edx, %esi
+; CHECK64-NEXT:    movzbl 3(%rdi), %eax
+; CHECK64-NEXT:    orl %esi, %eax
+; CHECK64-NEXT:    orl %ecx, %eax
+; CHECK64-NEXT:    retq
+
+  %2 = bitcast i32* %0 to i8*
+  %3 = load i8, i8* %2, align 1
+  %4 = zext i8 %3 to i32
+  %5 = shl nuw nsw i32 %4, 24
+  %6 = getelementptr inbounds i8, i8* %2, i32 1
+  %7 = load i8, i8* %6, align 1
+  %8 = zext i8 %7 to i32
+  %9 = shl nuw nsw i32 %8, 16
+  %10 = or i32 %9, %5
+  %11 = getelementptr inbounds i8, i8* %2, i32 2
+  %12 = load i8, i8* %11, align 1
+  %13 = zext i8 %12 to i32
+  %14 = shl nuw nsw i32 %13, 8
+  %15 = or i32 %10, %14
+  %16 = getelementptr inbounds i8, i8* %2, i32 3
+  %17 = load i8, i8* %16, align 1
+  %18 = zext i8 %17 to i32
+  %19 = or i32 %15, %18
+  ; Use individual part of the pattern outside of the pattern
+  %20 = or i32 %8, %19
+  ret i32 %20
+}
+
+; One of the loads is volatile
+; i8* p;
+; p0 = volatile *p;
+; ((i32) p0 << 24) | ((i32) p[1] << 16) | ((i32) p[2] << 8) | (i32) p[3]
+define i32 @load_i32_by_i8_bswap_volatile(i32*) {
+; CHECK-LABEL: load_i32_by_i8_bswap_volatile:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    movzbl (%eax), %ecx
+; CHECK-NEXT:    shll $24, %ecx
+; CHECK-NEXT:    movzbl 1(%eax), %edx
+; CHECK-NEXT:    shll $16, %edx
+; CHECK-NEXT:    orl %ecx, %edx
+; CHECK-NEXT:    movzbl 2(%eax), %ecx
+; CHECK-NEXT:    shll $8, %ecx
+; CHECK-NEXT:    orl %edx, %ecx
+; CHECK-NEXT:    movzbl 3(%eax), %eax
+; CHECK-NEXT:    orl %ecx, %eax
+; CHECK-NEXT:    retl
+;
+; CHECK64-LABEL: load_i32_by_i8_bswap_volatile:
+; CHECK64:       # BB#0:
+; CHECK64-NEXT:    movzbl (%rdi), %eax
+; CHECK64-NEXT:    shll $24, %eax
+; CHECK64-NEXT:    movzbl 1(%rdi), %ecx
+; CHECK64-NEXT:    shll $16, %ecx
+; CHECK64-NEXT:    orl %eax, %ecx
+; CHECK64-NEXT:    movzbl 2(%rdi), %edx
+; CHECK64-NEXT:    shll $8, %edx
+; CHECK64-NEXT:    orl %ecx, %edx
+; CHECK64-NEXT:    movzbl 3(%rdi), %eax
+; CHECK64-NEXT:    orl %edx, %eax
+; CHECK64-NEXT:    retq
+
+  %2 = bitcast i32* %0 to i8*
+  %3 = load volatile i8, i8* %2, align 1
+  %4 = zext i8 %3 to i32
+  %5 = shl nuw nsw i32 %4, 24
+  %6 = getelementptr inbounds i8, i8* %2, i32 1
+  %7 = load i8, i8* %6, align 1
+  %8 = zext i8 %7 to i32
+  %9 = shl nuw nsw i32 %8, 16
+  %10 = or i32 %9, %5
+  %11 = getelementptr inbounds i8, i8* %2, i32 2
+  %12 = load i8, i8* %11, align 1
+  %13 = zext i8 %12 to i32
+  %14 = shl nuw nsw i32 %13, 8
+  %15 = or i32 %10, %14
+  %16 = getelementptr inbounds i8, i8* %2, i32 3
+  %17 = load i8, i8* %16, align 1
+  %18 = zext i8 %17 to i32
+  %19 = or i32 %15, %18
+  ret i32 %19
+}
+
+; There is a store in between individual loads
+; i8* p, q;
+; res1 = ((i32) p[0] << 24) | ((i32) p[1] << 16)
+; *q = 0;
+; res2 = ((i32) p[2] << 8) | (i32) p[3]
+; res1 | res2
+define i32 @load_i32_by_i8_bswap_store_in_between(i32*, i32*) {
+; CHECK-LABEL: load_i32_by_i8_bswap_store_in_between:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    pushl %esi
+; CHECK-NEXT:  .Lcfi8:
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:  .Lcfi9:
+; CHECK-NEXT:    .cfi_offset %esi, -8
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT:    movzbl (%ecx), %edx
+; CHECK-NEXT:    shll $24, %edx
+; CHECK-NEXT:    movzbl 1(%ecx), %esi
+; CHECK-NEXT:    movl $0, (%eax)
+; CHECK-NEXT:    shll $16, %esi
+; CHECK-NEXT:    orl %edx, %esi
+; CHECK-NEXT:    movzbl 2(%ecx), %edx
+; CHECK-NEXT:    shll $8, %edx
+; CHECK-NEXT:    orl %esi, %edx
+; CHECK-NEXT:    movzbl 3(%ecx), %eax
+; CHECK-NEXT:    orl %edx, %eax
+; CHECK-NEXT:    popl %esi
+; CHECK-NEXT:    retl
+;
+; CHECK64-LABEL: load_i32_by_i8_bswap_store_in_between:
+; CHECK64:       # BB#0:
+; CHECK64-NEXT:    movzbl (%rdi), %eax
+; CHECK64-NEXT:    shll $24, %eax
+; CHECK64-NEXT:    movzbl 1(%rdi), %ecx
+; CHECK64-NEXT:    movl $0, (%rsi)
+; CHECK64-NEXT:    shll $16, %ecx
+; CHECK64-NEXT:    orl %eax, %ecx
+; CHECK64-NEXT:    movzbl 2(%rdi), %edx
+; CHECK64-NEXT:    shll $8, %edx
+; CHECK64-NEXT:    orl %ecx, %edx
+; CHECK64-NEXT:    movzbl 3(%rdi), %eax
+; CHECK64-NEXT:    orl %edx, %eax
+; CHECK64-NEXT:    retq
+
+  %3 = bitcast i32* %0 to i8*
+  %4 = load i8, i8* %3, align 1
+  %5 = zext i8 %4 to i32
+  %6 = shl nuw nsw i32 %5, 24
+  %7 = getelementptr inbounds i8, i8* %3, i32 1
+  %8 = load i8, i8* %7, align 1
+  ; This store will prevent folding of the pattern
+  store i32 0, i32* %1
+  %9 = zext i8 %8 to i32
+  %10 = shl nuw nsw i32 %9, 16
+  %11 = or i32 %10, %6
+  %12 = getelementptr inbounds i8, i8* %3, i32 2
+  %13 = load i8, i8* %12, align 1
+  %14 = zext i8 %13 to i32
+  %15 = shl nuw nsw i32 %14, 8
+  %16 = or i32 %11, %15
+  %17 = getelementptr inbounds i8, i8* %3, i32 3
+  %18 = load i8, i8* %17, align 1
+  %19 = zext i8 %18 to i32
+  %20 = or i32 %16, %19
+  ret i32 %20
+}
+
+; One of the loads is from an unrelated location
+; i8* p, q;
+; ((i32) p[0] << 24) | ((i32) q[1] << 16) | ((i32) p[2] << 8) | (i32) p[3]
+define i32 @load_i32_by_i8_bswap_unrelated_load(i32*, i32*) {
+; CHECK-LABEL: load_i32_by_i8_bswap_unrelated_load:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT:    movzbl (%ecx), %edx
+; CHECK-NEXT:    shll $24, %edx
+; CHECK-NEXT:    movzbl 1(%eax), %eax
+; CHECK-NEXT:    shll $16, %eax
+; CHECK-NEXT:    orl %edx, %eax
+; CHECK-NEXT:    movzbl 2(%ecx), %edx
+; CHECK-NEXT:    shll $8, %edx
+; CHECK-NEXT:    orl %eax, %edx
+; CHECK-NEXT:    movzbl 3(%ecx), %eax
+; CHECK-NEXT:    orl %edx, %eax
+; CHECK-NEXT:    retl
+;
+; CHECK64-LABEL: load_i32_by_i8_bswap_unrelated_load:
+; CHECK64:       # BB#0:
+; CHECK64-NEXT:    movzbl (%rdi), %eax
+; CHECK64-NEXT:    shll $24, %eax
+; CHECK64-NEXT:    movzbl 1(%rsi), %ecx
+; CHECK64-NEXT:    shll $16, %ecx
+; CHECK64-NEXT:    orl %eax, %ecx
+; CHECK64-NEXT:    movzbl 2(%rdi), %edx
+; CHECK64-NEXT:    shll $8, %edx
+; CHECK64-NEXT:    orl %ecx, %edx
+; CHECK64-NEXT:    movzbl 3(%rdi), %eax
+; CHECK64-NEXT:    orl %edx, %eax
+; CHECK64-NEXT:    retq
+
+  %3 = bitcast i32* %0 to i8*
+  %4 = bitcast i32* %1 to i8*
+  %5 = load i8, i8* %3, align 1
+  %6 = zext i8 %5 to i32
+  %7 = shl nuw nsw i32 %6, 24
+  ; Load from an unrelated address
+  %8 = getelementptr inbounds i8, i8* %4, i32 1
+  %9 = load i8, i8* %8, align 1
+  %10 = zext i8 %9 to i32
+  %11 = shl nuw nsw i32 %10, 16
+  %12 = or i32 %11, %7
+  %13 = getelementptr inbounds i8, i8* %3, i32 2
+  %14 = load i8, i8* %13, align 1
+  %15 = zext i8 %14 to i32
+  %16 = shl nuw nsw i32 %15, 8
+  %17 = or i32 %12, %16
+  %18 = getelementptr inbounds i8, i8* %3, i32 3
+  %19 = load i8, i8* %18, align 1
+  %20 = zext i8 %19 to i32
+  %21 = or i32 %17, %20
+  ret i32 %21
+}
+
+; Non-zero offsets are not supported for now
+; i8* p;
+; (i32) p[0] | ((i32) p[2] << 8) | ((i32) p[3] << 16) | ((i32) p[4] << 24)
+define i32 @load_i32_by_i8_unsupported_offset(i32*) {
+; CHECK-LABEL: load_i32_by_i8_unsupported_offset:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    movzbl (%eax), %ecx
+; CHECK-NEXT:    movzbl 2(%eax), %edx
+; CHECK-NEXT:    shll $8, %edx
+; CHECK-NEXT:    orl %ecx, %edx
+; CHECK-NEXT:    movzbl 3(%eax), %ecx
+; CHECK-NEXT:    shll $16, %ecx
+; CHECK-NEXT:    orl %edx, %ecx
+; CHECK-NEXT:    movzbl 4(%eax), %eax
+; CHECK-NEXT:    shll $24, %eax
+; CHECK-NEXT:    orl %ecx, %eax
+; CHECK-NEXT:    retl
+;
+; CHECK64-LABEL: load_i32_by_i8_unsupported_offset:
+; CHECK64:       # BB#0:
+; CHECK64-NEXT:    movzbl (%rdi), %eax
+; CHECK64-NEXT:    movzbl 2(%rdi), %ecx
+; CHECK64-NEXT:    shll $8, %ecx
+; CHECK64-NEXT:    orl %eax, %ecx
+; CHECK64-NEXT:    movzbl 3(%rdi), %edx
+; CHECK64-NEXT:    shll $16, %edx
+; CHECK64-NEXT:    orl %ecx, %edx
+; CHECK64-NEXT:    movzbl 4(%rdi), %eax
+; CHECK64-NEXT:    shll $24, %eax
+; CHECK64-NEXT:    orl %edx, %eax
+; CHECK64-NEXT:    retq
+
+  %2 = bitcast i32* %0 to i8*
+  %3 = getelementptr inbounds i8, i8* %2, i32 1
+  %4 = load i8, i8* %2, align 1
+  %5 = zext i8 %4 to i32
+  %6 = getelementptr inbounds i8, i8* %2, i32 2
+  %7 = load i8, i8* %6, align 1
+  %8 = zext i8 %7 to i32
+  %9 = shl nuw nsw i32 %8, 8
+  %10 = or i32 %9, %5
+  %11 = getelementptr inbounds i8, i8* %2, i32 3
+  %12 = load i8, i8* %11, align 1
+  %13 = zext i8 %12 to i32
+  %14 = shl nuw nsw i32 %13, 16
+  %15 = or i32 %10, %14
+  %16 = getelementptr inbounds i8, i8* %2, i32 4
+  %17 = load i8, i8* %16, align 1
+  %18 = zext i8 %17 to i32
+  %19 = shl nuw nsw i32 %18, 24
+  %20 = or i32 %15, %19
+  ret i32 %20
+}
+
+; i8* p; i32 i;
+; ((i32) p[i] << 24) | ((i32) p[i + 1] << 16) | ((i32) p[i + 2] << 8) | (i32) p[i + 3]
+define i32 @load_i32_by_i8_bswap_base_index_offset(i32*, i32) {
+; CHECK-LABEL: load_i32_by_i8_bswap_base_index_offset:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT:    movl (%ecx,%eax), %eax
+; CHECK-NEXT:    bswapl %eax
+; CHECK-NEXT:    retl
+;
+; CHECK64-LABEL: load_i32_by_i8_bswap_base_index_offset:
+; CHECK64:       # BB#0:
+; CHECK64-NEXT:    movslq %esi, %rax
+; CHECK64-NEXT:    movzbl (%rdi,%rax), %ecx
+; CHECK64-NEXT:    shll $24, %ecx
+; CHECK64-NEXT:    movzbl 1(%rdi,%rax), %edx
+; CHECK64-NEXT:    shll $16, %edx
+; CHECK64-NEXT:    orl %ecx, %edx
+; CHECK64-NEXT:    movzbl 2(%rdi,%rax), %ecx
+; CHECK64-NEXT:    shll $8, %ecx
+; CHECK64-NEXT:    orl %edx, %ecx
+; CHECK64-NEXT:    movzbl 3(%rdi,%rax), %eax
+; CHECK64-NEXT:    orl %ecx, %eax
+; CHECK64-NEXT:    retq
+; Currently we don't fold the pattern for x86-64 target because we don't see
+; that the loads are adjacent. It happens because BaseIndexOffset doesn't look
+; through zexts.
+
+  %3 = bitcast i32* %0 to i8*
+  %4 = getelementptr inbounds i8, i8* %3, i32 %1
+  %5 = load i8, i8* %4, align 1
+  %6 = zext i8 %5 to i32
+  %7 = shl nuw nsw i32 %6, 24
+  %8 = add nuw nsw i32 %1, 1
+  %9 = getelementptr inbounds i8, i8* %3, i32 %8
+  %10 = load i8, i8* %9, align 1
+  %11 = zext i8 %10 to i32
+  %12 = shl nuw nsw i32 %11, 16
+  %13 = or i32 %12, %7
+  %14 = add nuw nsw i32 %1, 2
+  %15 = getelementptr inbounds i8, i8* %3, i32 %14
+  %16 = load i8, i8* %15, align 1
+  %17 = zext i8 %16 to i32
+  %18 = shl nuw nsw i32 %17, 8
+  %19 = or i32 %13, %18
+  %20 = add nuw nsw i32 %1, 3
+  %21 = getelementptr inbounds i8, i8* %3, i32 %20
+  %22 = load i8, i8* %21, align 1
+  %23 = zext i8 %22 to i32
+  %24 = or i32 %19, %23
+  ret i32 %24
+}



