[llvm] r304313 - [PPC] Inline expansion of memcmp

Wed May 31 10:12:38 PDT 2017

Author: syzaara
Date: Wed May 31 12:12:38 2017
New Revision: 304313

URL: http://llvm.org/viewvc/llvm-project?rev=304313&view=rev
Log:
[PPC] Inline expansion of memcmp

This patch does an inline expansion of memcmp.
It changes the memcmp library call into an inline expansion when the size is
known at compile time and is under a target specified threshold.
This expansion is implemented in CodeGenPrepare and expands into straight line
code. The target specifies a maximum load size and the expansion works by using
this size to load the two sources, compare, and exit early if a difference is
found. It also has a special case when the memcmp result is used in a compare
to zero equality.

Differential Revision: https://reviews.llvm.org/D28637

Added:
    llvm/trunk/test/CodeGen/PowerPC/memCmpUsedInZeroEqualityComparison.ll
    llvm/trunk/test/CodeGen/PowerPC/memcmp.ll
    llvm/trunk/test/CodeGen/PowerPC/memcmpIR.ll
Modified:
    llvm/trunk/include/llvm/Analysis/TargetTransformInfo.h
    llvm/trunk/include/llvm/Analysis/TargetTransformInfoImpl.h
    llvm/trunk/include/llvm/Analysis/ValueTracking.h
    llvm/trunk/include/llvm/Target/TargetLowering.h
    llvm/trunk/lib/Analysis/TargetTransformInfo.cpp
    llvm/trunk/lib/Analysis/ValueTracking.cpp
    llvm/trunk/lib/CodeGen/CodeGenPrepare.cpp
    llvm/trunk/lib/CodeGen/TargetLoweringBase.cpp
    llvm/trunk/lib/Target/PowerPC/PPCISelLowering.cpp
    llvm/trunk/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
    llvm/trunk/lib/Target/PowerPC/PPCTargetTransformInfo.h
    llvm/trunk/lib/Transforms/Utils/SimplifyLibCalls.cpp

Modified: llvm/trunk/include/llvm/Analysis/TargetTransformInfo.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/include/llvm/Analysis/TargetTransformInfo.h?rev=304313&r1=304312&r2=304313&view=diff
==============================================================================

--- llvm/trunk/include/llvm/Analysis/TargetTransformInfo.h (original)
+++ llvm/trunk/include/llvm/Analysis/TargetTransformInfo.h Wed May 31 12:12:38 2017
@@ -454,6 +454,9 @@ public:
   /// \brief Don't restrict interleaved unrolling to small loops.
   bool enableAggressiveInterleaving(bool LoopHasReductions) const;
 
+  /// \brief Enable inline expansion of memcmp
+  bool expandMemCmp(Instruction *I, unsigned &MaxLoadSize) const;
+
   /// \brief Enable matching of interleaved access groups.
   bool enableInterleavedAccessVectorization() const;
 
@@ -828,6 +831,7 @@ public:
                                                     unsigned VF) = 0;
   virtual bool supportsEfficientVectorElementLoadStore() = 0;
   virtual bool enableAggressiveInterleaving(bool LoopHasReductions) = 0;
+  virtual bool expandMemCmp(Instruction *I, unsigned &MaxLoadSize) = 0;
   virtual bool enableInterleavedAccessVectorization() = 0;
   virtual bool isFPVectorizationPotentiallyUnsafe() = 0;
   virtual bool allowsMisalignedMemoryAccesses(LLVMContext &Context,
@@ -1047,6 +1051,9 @@ public:
   bool enableAggressiveInterleaving(bool LoopHasReductions) override {
     return Impl.enableAggressiveInterleaving(LoopHasReductions);
   }
+  bool expandMemCmp(Instruction *I, unsigned &MaxLoadSize) override {
+    return Impl.expandMemCmp(I, MaxLoadSize);
+  }
   bool enableInterleavedAccessVectorization() override {
     return Impl.enableInterleavedAccessVectorization();
   }

Modified: llvm/trunk/include/llvm/Analysis/TargetTransformInfoImpl.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/include/llvm/Analysis/TargetTransformInfoImpl.h?rev=304313&r1=304312&r2=304313&view=diff
==============================================================================
--- llvm/trunk/include/llvm/Analysis/TargetTransformInfoImpl.h (original)
+++ llvm/trunk/include/llvm/Analysis/TargetTransformInfoImpl.h Wed May 31 12:12:38 2017
@@ -274,6 +274,8 @@ public:
 
   bool enableAggressiveInterleaving(bool LoopHasReductions) { return false; }
 
+  bool expandMemCmp(Instruction *I, unsigned &MaxLoadSize) { return false; }
+
   bool enableInterleavedAccessVectorization() { return false; }
 
   bool isFPVectorizationPotentiallyUnsafe() { return false; }

Modified: llvm/trunk/include/llvm/Analysis/ValueTracking.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/include/llvm/Analysis/ValueTracking.h?rev=304313&r1=304312&r2=304313&view=diff
==============================================================================
--- llvm/trunk/include/llvm/Analysis/ValueTracking.h (original)
+++ llvm/trunk/include/llvm/Analysis/ValueTracking.h Wed May 31 12:12:38 2017
@@ -85,6 +85,8 @@ template <typename T> class ArrayRef;
                               const Instruction *CxtI = nullptr,
                               const DominatorTree *DT = nullptr);
 
+  bool isOnlyUsedInZeroEqualityComparison(const Instruction *CxtI);
+  
   /// Return true if the given value is known to be non-zero when defined. For
   /// vectors, return true if every element is known to be non-zero when
   /// defined. For pointers, if the context instruction and dominator tree are

Modified: llvm/trunk/include/llvm/Target/TargetLowering.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/include/llvm/Target/TargetLowering.h?rev=304313&r1=304312&r2=304313&view=diff
==============================================================================
--- llvm/trunk/include/llvm/Target/TargetLowering.h (original)
+++ llvm/trunk/include/llvm/Target/TargetLowering.h Wed May 31 12:12:38 2017
@@ -1143,6 +1143,16 @@ public:
     return OptSize ? MaxStoresPerMemcpyOptSize : MaxStoresPerMemcpy;
   }
 
+  /// Get maximum # of load operations permitted for memcmp
+  ///
+  /// This function returns the maximum number of load operations permitted
+  /// to replace a call to memcmp. The value is set by the target at the
+  /// performance threshold for such a replacement. If OptSize is true,
+  /// return the limit for functions that have OptSize attribute.
+  unsigned getMaxExpandSizeMemcmp(bool OptSize) const {
+    return OptSize ? MaxLoadsPerMemcmpOptSize : MaxLoadsPerMemcmp;
+  }
+
   /// \brief Get maximum # of store operations permitted for llvm.memmove
   ///
   /// This function returns the maximum number of store operations permitted
@@ -2330,6 +2340,8 @@ protected:
   /// Maximum number of store operations that may be substituted for a call to
   /// memcpy, used for functions with OptSize attribute.
   unsigned MaxStoresPerMemcpyOptSize;
+  unsigned MaxLoadsPerMemcmp;
+  unsigned MaxLoadsPerMemcmpOptSize;
 
   /// \brief Specify maximum bytes of store instructions per memmove call.
   ///

Modified: llvm/trunk/lib/Analysis/TargetTransformInfo.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Analysis/TargetTransformInfo.cpp?rev=304313&r1=304312&r2=304313&view=diff
==============================================================================
--- llvm/trunk/lib/Analysis/TargetTransformInfo.cpp (original)
+++ llvm/trunk/lib/Analysis/TargetTransformInfo.cpp Wed May 31 12:12:38 2017
@@ -215,6 +215,10 @@ bool TargetTransformInfo::enableAggressi
   return TTIImpl->enableAggressiveInterleaving(LoopHasReductions);
 }
 
+bool TargetTransformInfo::expandMemCmp(Instruction *I, unsigned &MaxLoadSize) const {
+  return TTIImpl->expandMemCmp(I, MaxLoadSize);
+}
+
 bool TargetTransformInfo::enableInterleavedAccessVectorization() const {
   return TTIImpl->enableInterleavedAccessVectorization();
 }

Modified: llvm/trunk/lib/Analysis/ValueTracking.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Analysis/ValueTracking.cpp?rev=304313&r1=304312&r2=304313&view=diff
==============================================================================
--- llvm/trunk/lib/Analysis/ValueTracking.cpp (original)
+++ llvm/trunk/lib/Analysis/ValueTracking.cpp Wed May 31 12:12:38 2017
@@ -172,6 +172,18 @@ bool llvm::haveNoCommonBitsSet(const Val
 }
 
 
+bool llvm::isOnlyUsedInZeroEqualityComparison(const Instruction *CxtI) {
+  for (const User *U : CxtI->users()) {
+    if (const ICmpInst *IC = dyn_cast<ICmpInst>(U))
+      if (IC->isEquality())
+        if (Constant *C = dyn_cast<Constant>(IC->getOperand(1)))
+          if (C->isNullValue())
+            continue;
+    return false;
+  }
+  return true;
+}
+
 static bool isKnownToBeAPowerOfTwo(const Value *V, bool OrZero, unsigned Depth,
                                    const Query &Q);
 

Modified: llvm/trunk/lib/CodeGen/CodeGenPrepare.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/CodeGenPrepare.cpp?rev=304313&r1=304312&r2=304313&view=diff
==============================================================================
--- llvm/trunk/lib/CodeGen/CodeGenPrepare.cpp (original)
+++ llvm/trunk/lib/CodeGen/CodeGenPrepare.cpp Wed May 31 12:12:38 2017
@@ -24,12 +24,13 @@
 #include "llvm/Analysis/CFG.h"
 #include "llvm/Analysis/InstructionSimplify.h"
 #include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/MemoryBuiltins.h"
 #include "llvm/Analysis/ProfileSummaryInfo.h"
 #include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/Analysis/ValueTracking.h"
-#include "llvm/Analysis/MemoryBuiltins.h"
 #include "llvm/CodeGen/Analysis.h"
+#include "llvm/CodeGen/Passes.h"
 #include "llvm/IR/CallSite.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DataLayout.h"
@@ -60,6 +61,7 @@
 #include "llvm/Transforms/Utils/Local.h"
 #include "llvm/Transforms/Utils/SimplifyLibCalls.h"
 #include "llvm/Transforms/Utils/ValueMapper.h"
+
 using namespace llvm;
 using namespace llvm::PatternMatch;
 
@@ -84,6 +86,12 @@ STATISTIC(NumDbgValueMoved, "Number of d
 STATISTIC(NumSelectsExpanded, "Number of selects turned into branches");
 STATISTIC(NumStoreExtractExposed, "Number of store(extractelement) exposed");
 
+STATISTIC(NumMemCmpCalls, "Number of memcmp calls");
+STATISTIC(NumMemCmpNotConstant, "Number of memcmp calls without constant size");
+STATISTIC(NumMemCmpGreaterThanMax,
+          "Number of memcmp calls with size greater than max size");
+STATISTIC(NumMemCmpInlined, "Number of inlined memcmp calls");
+
 static cl::opt<bool> DisableBranchOpts(
   "disable-cgp-branch-opts", cl::Hidden, cl::init(false),
   cl::desc("Disable branch optimizations in CodeGenPrepare"));
@@ -144,6 +152,11 @@ EnableTypePromotionMerge("cgp-type-promo
     cl::desc("Enable merging of redundant sexts when one is dominating"
     " the other."), cl::init(true));
 
+static cl::opt<unsigned> MemCmpNumLoadsPerBlock(
+    "memcmp-num-loads-per-block", cl::Hidden, cl::init(1),
+    cl::desc("The number of loads per basic block for inline expansion of "
+             "memcmp that is only being compared against zero."));
+
 namespace {
 typedef SmallPtrSet<Instruction *, 16> SetOfInstrs;
 typedef PointerIntPair<Type *, 1, bool> TypeIsSExt;
@@ -1629,6 +1642,593 @@ static bool despeculateCountZeros(Intrin
   return true;
 }
 
+// This class provides helper functions to expand a memcmp library call into an
+// inline expansion.
+class MemCmpExpansion {
+  struct ResultBlock {
+    BasicBlock *BB;
+    PHINode *PhiSrc1;
+    PHINode *PhiSrc2;
+    ResultBlock();
+  };
+
+  CallInst *CI;
+  ResultBlock ResBlock;
+  unsigned MaxLoadSize;
+  unsigned NumBlocks;
+  unsigned NumBlocksNonOneByte;
+  unsigned NumLoadsPerBlock;
+  std::vector<BasicBlock *> LoadCmpBlocks;
+  BasicBlock *EndBlock;
+  PHINode *PhiRes;
+  bool IsUsedForZeroCmp;
+  int calculateNumBlocks(unsigned Size);
+  void createLoadCmpBlocks();
+  void createResultBlock();
+  void setupResultBlockPHINodes();
+  void setupEndBlockPHINodes();
+  void emitLoadCompareBlock(unsigned Index, int LoadSize, int GEPIndex,
+                            bool IsLittleEndian);
+  void emitLoadCompareBlockMultipleLoads(unsigned Index, unsigned Size,
+                                         unsigned &NumBytesProcessed);
+  void emitLoadCompareByteBlock(unsigned Index, int GEPIndex);
+  void emitMemCmpResultBlock(bool IsLittleEndian);
+  Value *getMemCmpExpansionZeroCase(unsigned Size, bool IsLittleEndian);
+  unsigned getLoadSize(unsigned Size);
+  unsigned getNumLoads(unsigned Size);
+
+public:
+  MemCmpExpansion(CallInst *CI, unsigned MaxLoadSize,
+                  unsigned NumLoadsPerBlock);
+  Value *getMemCmpExpansion(bool IsLittleEndian);
+};
+
+MemCmpExpansion::ResultBlock::ResultBlock()
+    : BB(nullptr), PhiSrc1(nullptr), PhiSrc2(nullptr) {}
+
+// Initialize the basic block structure required for expansion of memcmp call
+// with given maximum load size and memcmp size parameter.
+// This structure includes:
+// 1. A list of load compare blocks - LoadCmpBlocks.
+// 2. An EndBlock, split from original instruction point, which is the block to
+// return from.
+// 3. ResultBlock, block to branch to for early exit when a
+// LoadCmpBlock finds a difference.
+MemCmpExpansion::MemCmpExpansion(CallInst *CI, unsigned MaxLoadSize,
+                                 unsigned NumLoadsPerBlock)
+    : CI(CI), MaxLoadSize(MaxLoadSize), NumLoadsPerBlock(NumLoadsPerBlock) {
+
+  IRBuilder<> Builder(CI->getContext());
+
+  BasicBlock *StartBlock = CI->getParent();
+  EndBlock = StartBlock->splitBasicBlock(CI, "endblock");
+  setupEndBlockPHINodes();
+  IsUsedForZeroCmp = isOnlyUsedInZeroEqualityComparison(CI);
+
+  ConstantInt *SizeCast = dyn_cast<ConstantInt>(CI->getArgOperand(2));
+  uint64_t Size = SizeCast->getZExtValue();
+
+  // Calculate how many load compare blocks are required for an expansion of
+  // given Size.
+  NumBlocks = calculateNumBlocks(Size);
+  createResultBlock();
+
+  // If return value of memcmp is not used in a zero equality, we need to
+  // calculate which source was larger. The calculation requires the
+  // two loaded source values of each load compare block.
+  // These will be saved in the phi nodes created by setupResultBlockPHINodes.
+  if (!IsUsedForZeroCmp)
+    setupResultBlockPHINodes();
+
+  // Create the number of required load compare basic blocks.
+  createLoadCmpBlocks();
+
+  // Update the terminator added by splitBasicBlock to branch to the first
+  // LoadCmpBlock.
+  Builder.SetCurrentDebugLocation(CI->getDebugLoc());
+  StartBlock->getTerminator()->setSuccessor(0, LoadCmpBlocks[0]);
+}
+
+void MemCmpExpansion::createLoadCmpBlocks() {
+  for (unsigned i = 0; i < NumBlocks; i++) {
+    BasicBlock *BB = BasicBlock::Create(CI->getContext(), "loadbb",
+                                        EndBlock->getParent(), EndBlock);
+    LoadCmpBlocks.push_back(BB);
+  }
+}
+
+void MemCmpExpansion::createResultBlock() {
+  ResBlock.BB = BasicBlock::Create(CI->getContext(), "res_block",
+                                   EndBlock->getParent(), EndBlock);
+}
+
+// This function creates the IR instructions for loading and comparing 1 byte.
+// It loads 1 byte from each source of the memcmp paramters with the given
+// GEPIndex. It then subtracts the two loaded values and adds this result to the
+// final phi node for selecting the memcmp result.
+void MemCmpExpansion::emitLoadCompareByteBlock(unsigned Index, int GEPIndex) {
+  IRBuilder<> Builder(CI->getContext());
+
+  Value *Source1 = CI->getArgOperand(0);
+  Value *Source2 = CI->getArgOperand(1);
+
+  Builder.SetInsertPoint(LoadCmpBlocks[Index]);
+  Type *LoadSizeType = Type::getInt8Ty(CI->getContext());
+  // Cast source to LoadSizeType*
+  if (Source1->getType() != LoadSizeType)
+    Source1 = Builder.CreateBitCast(Source1, LoadSizeType->getPointerTo());
+  if (Source2->getType() != LoadSizeType)
+    Source2 = Builder.CreateBitCast(Source2, LoadSizeType->getPointerTo());
+
+  // Get the base address using the GEPIndex
+  if (GEPIndex != 0) {
+    Source1 = Builder.CreateGEP(LoadSizeType, Source1,
+                                ConstantInt::get(LoadSizeType, GEPIndex));
+    Source2 = Builder.CreateGEP(LoadSizeType, Source2,
+                                ConstantInt::get(LoadSizeType, GEPIndex));
+  }
+
+  Value *LoadSrc1 = Builder.CreateLoad(LoadSizeType, Source1);
+  Value *LoadSrc2 = Builder.CreateLoad(LoadSizeType, Source2);
+
+  LoadSrc1 = Builder.CreateZExt(LoadSrc1, Type::getInt32Ty(CI->getContext()));
+  LoadSrc2 = Builder.CreateZExt(LoadSrc2, Type::getInt32Ty(CI->getContext()));
+  Value *Diff = Builder.CreateSub(LoadSrc1, LoadSrc2);
+
+  PhiRes->addIncoming(Diff, LoadCmpBlocks[Index]);
+
+  if (Index < (LoadCmpBlocks.size() - 1)) {
+    // Early exit branch if difference found to EndBlock, otherwise continue to
+    // next LoadCmpBlock
+
+    Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_NE, Diff,
+                                    ConstantInt::get(Diff->getType(), 0));
+    BranchInst *CmpBr =
+        BranchInst::Create(EndBlock, LoadCmpBlocks[Index + 1], Cmp);
+    Builder.Insert(CmpBr);
+  } else {
+    // The last block has an unconditional branch to EndBlock
+    BranchInst *CmpBr = BranchInst::Create(EndBlock);
+    Builder.Insert(CmpBr);
+  }
+}
+
+unsigned MemCmpExpansion::getNumLoads(unsigned Size) {
+  return (Size / MaxLoadSize) + countPopulation(Size % MaxLoadSize);
+}
+
+unsigned MemCmpExpansion::getLoadSize(unsigned Size) {
+  return MinAlign(PowerOf2Floor(Size), MaxLoadSize);
+}
+
+void MemCmpExpansion::emitLoadCompareBlockMultipleLoads(
+    unsigned Index, unsigned Size, unsigned &NumBytesProcessed) {
+
+  IRBuilder<> Builder(CI->getContext());
+
+  std::vector<Value *> XorList, OrList;
+  Value *Diff;
+
+  unsigned RemainingBytes = Size - NumBytesProcessed;
+  unsigned NumLoadsRemaining = getNumLoads(RemainingBytes);
+  unsigned NumLoads = std::min(NumLoadsRemaining, NumLoadsPerBlock);
+
+  Builder.SetInsertPoint(LoadCmpBlocks[Index]);
+
+  for (unsigned i = 0; i < NumLoads; ++i) {
+    unsigned LoadSize = getLoadSize(RemainingBytes);
+    unsigned GEPIndex = NumBytesProcessed / LoadSize;
+    NumBytesProcessed += LoadSize;
+    RemainingBytes -= LoadSize;
+
+    Type *LoadSizeType = IntegerType::get(CI->getContext(), LoadSize * 8);
+    Type *MaxLoadType = IntegerType::get(CI->getContext(), MaxLoadSize * 8);
+
+    Value *Source1 = CI->getArgOperand(0);
+    Value *Source2 = CI->getArgOperand(1);
+
+    // Cast source to LoadSizeType*
+    if (Source1->getType() != LoadSizeType)
+      Source1 = Builder.CreateBitCast(Source1, LoadSizeType->getPointerTo());
+    if (Source2->getType() != LoadSizeType)
+      Source2 = Builder.CreateBitCast(Source2, LoadSizeType->getPointerTo());
+
+    // Get the base address using the GEPIndex
+    if (GEPIndex != 0) {
+      Source1 = Builder.CreateGEP(LoadSizeType, Source1,
+                                  ConstantInt::get(LoadSizeType, GEPIndex));
+      Source2 = Builder.CreateGEP(LoadSizeType, Source2,
+                                  ConstantInt::get(LoadSizeType, GEPIndex));
+    }
+
+    // Load LoadSizeType from the base address
+    Value *LoadSrc1 = Builder.CreateLoad(LoadSizeType, Source1);
+    Value *LoadSrc2 = Builder.CreateLoad(LoadSizeType, Source2);
+    if (LoadSizeType != MaxLoadType) {
+      LoadSrc1 = Builder.CreateZExtOrTrunc(LoadSrc1, MaxLoadType);
+      LoadSrc2 = Builder.CreateZExtOrTrunc(LoadSrc2, MaxLoadType);
+    }
+    Diff = Builder.CreateXor(LoadSrc1, LoadSrc2);
+    Diff = Builder.CreateZExtOrTrunc(Diff, MaxLoadType);
+    XorList.push_back(Diff);
+  }
+
+  auto pairWiseOr = [&](std::vector<Value *> &InList) -> std::vector<Value *> {
+    std::vector<Value *> OutList;
+    for (unsigned i = 0; i < InList.size() - 1; i = i + 2) {
+      Value *Or = Builder.CreateOr(InList[i], InList[i + 1]);
+      OutList.push_back(Or);
+    }
+    if (InList.size() % 2 != 0)
+      OutList.push_back(InList.back());
+    return OutList;
+  };
+
+  // Pair wise OR the XOR results
+  OrList = pairWiseOr(XorList);
+
+  // Pair wise OR the OR results until one result left
+  while (OrList.size() != 1) {
+    OrList = pairWiseOr(OrList);
+  }
+
+  Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_NE, OrList[0],
+                                  ConstantInt::get(Diff->getType(), 0));
+  BasicBlock *NextBB = (Index == (LoadCmpBlocks.size() - 1))
+                           ? EndBlock
+                           : LoadCmpBlocks[Index + 1];
+  // Early exit branch if difference found to ResultBlock, otherwise continue to
+  // next LoadCmpBlock or EndBlock.
+  BranchInst *CmpBr = BranchInst::Create(ResBlock.BB, NextBB, Cmp);
+  Builder.Insert(CmpBr);
+
+  // Add a phi edge for the last LoadCmpBlock to Endblock with a value of 0
+  // since early exit to ResultBlock was not taken (no difference was found in
+  // any of the bytes)
+  if (Index == LoadCmpBlocks.size() - 1) {
+    Value *Zero = ConstantInt::get(Type::getInt32Ty(CI->getContext()), 0);
+    PhiRes->addIncoming(Zero, LoadCmpBlocks[Index]);
+  }
+}
+
+// This function creates the IR intructions for loading and comparing using the
+// given LoadSize. It loads the number of bytes specified by LoadSize from each
+// source of the memcmp parameters. It then does a subtract to see if there was
+// a difference in the loaded values. If a difference is found, it branches
+// with an early exit to the ResultBlock for calculating which source was
+// larger. Otherwise, it falls through to the either the next LoadCmpBlock or
+// the EndBlock if this is the last LoadCmpBlock. Loading 1 byte is handled with
+// a special case through emitLoadCompareByteBlock. The special handling can
+// simply subtract the loaded values and add it to the result phi node.
+void MemCmpExpansion::emitLoadCompareBlock(unsigned Index, int LoadSize,
+                                           int GEPIndex, bool IsLittleEndian) {
+  if (LoadSize == 1) {
+    MemCmpExpansion::emitLoadCompareByteBlock(Index, GEPIndex);
+    return;
+  }
+
+  IRBuilder<> Builder(CI->getContext());
+
+  Type *LoadSizeType = IntegerType::get(CI->getContext(), LoadSize * 8);
+  Type *MaxLoadType = IntegerType::get(CI->getContext(), MaxLoadSize * 8);
+
+  Value *Source1 = CI->getArgOperand(0);
+  Value *Source2 = CI->getArgOperand(1);
+
+  Builder.SetInsertPoint(LoadCmpBlocks[Index]);
+  // Cast source to LoadSizeType*
+  if (Source1->getType() != LoadSizeType)
+    Source1 = Builder.CreateBitCast(Source1, LoadSizeType->getPointerTo());
+  if (Source2->getType() != LoadSizeType)
+    Source2 = Builder.CreateBitCast(Source2, LoadSizeType->getPointerTo());
+
+  // Get the base address using the GEPIndex
+  if (GEPIndex != 0) {
+    Source1 = Builder.CreateGEP(LoadSizeType, Source1,
+                                ConstantInt::get(LoadSizeType, GEPIndex));
+    Source2 = Builder.CreateGEP(LoadSizeType, Source2,
+                                ConstantInt::get(LoadSizeType, GEPIndex));
+  }
+
+  // Load LoadSizeType from the base address
+  Value *LoadSrc1 = Builder.CreateLoad(LoadSizeType, Source1);
+  Value *LoadSrc2 = Builder.CreateLoad(LoadSizeType, Source2);
+
+  if (IsLittleEndian) {
+    Function *F = LoadCmpBlocks[Index]->getParent();
+
+    Function *Bswap = Intrinsic::getDeclaration(F->getParent(),
+                                                Intrinsic::bswap, LoadSizeType);
+    LoadSrc1 = Builder.CreateCall(Bswap, LoadSrc1);
+    LoadSrc2 = Builder.CreateCall(Bswap, LoadSrc2);
+  }
+
+  if (LoadSizeType != MaxLoadType) {
+    LoadSrc1 = Builder.CreateZExtOrTrunc(LoadSrc1, MaxLoadType);
+    LoadSrc2 = Builder.CreateZExtOrTrunc(LoadSrc2, MaxLoadType);
+  }
+
+  // Add the loaded values to the phi nodes for calculating memcmp result only
+  // if result is not used in a zero equality.
+  if (!IsUsedForZeroCmp) {
+    ResBlock.PhiSrc1->addIncoming(LoadSrc1, LoadCmpBlocks[Index]);
+    ResBlock.PhiSrc2->addIncoming(LoadSrc2, LoadCmpBlocks[Index]);
+  }
+
+  Value *Diff = Builder.CreateSub(LoadSrc1, LoadSrc2);
+
+  Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_NE, Diff,
+                                  ConstantInt::get(Diff->getType(), 0));
+  BasicBlock *NextBB = (Index == (LoadCmpBlocks.size() - 1))
+                           ? EndBlock
+                           : LoadCmpBlocks[Index + 1];
+  // Early exit branch if difference found to ResultBlock, otherwise continue to
+  // next LoadCmpBlock or EndBlock.
+  BranchInst *CmpBr = BranchInst::Create(ResBlock.BB, NextBB, Cmp);
+  Builder.Insert(CmpBr);
+
+  // Add a phi edge for the last LoadCmpBlock to Endblock with a value of 0
+  // since early exit to ResultBlock was not taken (no difference was found in
+  // any of the bytes)
+  if (Index == LoadCmpBlocks.size() - 1) {
+    Value *Zero = ConstantInt::get(Type::getInt32Ty(CI->getContext()), 0);
+    PhiRes->addIncoming(Zero, LoadCmpBlocks[Index]);
+  }
+}
+
+// This function populates the ResultBlock with a sequence to calculate the
+// memcmp result. It compares the two loaded source values and returns -1 if
+// src1 < src2 and 1 if src1 > src2.
+void MemCmpExpansion::emitMemCmpResultBlock(bool IsLittleEndian) {
+  IRBuilder<> Builder(CI->getContext());
+
+  // Special case: if memcmp result is used in a zero equality, result does not
+  // need to be calculated and can simply return 1.
+  if (IsUsedForZeroCmp) {
+    BasicBlock::iterator InsertPt = ResBlock.BB->getFirstInsertionPt();
+    Builder.SetInsertPoint(ResBlock.BB, InsertPt);
+    Value *Res = ConstantInt::get(Type::getInt32Ty(CI->getContext()), 1);
+    PhiRes->addIncoming(Res, ResBlock.BB);
+    BranchInst *NewBr = BranchInst::Create(EndBlock);
+    Builder.Insert(NewBr);
+    return;
+  }
+  BasicBlock::iterator InsertPt = ResBlock.BB->getFirstInsertionPt();
+  Builder.SetInsertPoint(ResBlock.BB, InsertPt);
+
+  Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_ULT, ResBlock.PhiSrc1,
+                                  ResBlock.PhiSrc2);
+
+  Value *Res =
+      Builder.CreateSelect(Cmp, ConstantInt::get(Builder.getInt32Ty(), -1),
+                           ConstantInt::get(Builder.getInt32Ty(), 1));
+
+  BranchInst *NewBr = BranchInst::Create(EndBlock);
+  Builder.Insert(NewBr);
+  PhiRes->addIncoming(Res, ResBlock.BB);
+}
+
+int MemCmpExpansion::calculateNumBlocks(unsigned Size) {
+  int NumBlocks = 0;
+  bool haveOneByteLoad = false;
+  unsigned RemainingSize = Size;
+  unsigned LoadSize = MaxLoadSize;
+  while (RemainingSize) {
+    if (LoadSize == 1)
+      haveOneByteLoad = true;
+    NumBlocks += RemainingSize / LoadSize;
+    RemainingSize = RemainingSize % LoadSize;
+    LoadSize = LoadSize / 2;
+  }
+  NumBlocksNonOneByte = haveOneByteLoad ? (NumBlocks - 1) : NumBlocks;
+
+  if (IsUsedForZeroCmp)
+    NumBlocks = NumBlocks / NumLoadsPerBlock +
+                (NumBlocks % NumLoadsPerBlock != 0 ? 1 : 0);
+
+  return NumBlocks;
+}
+
+void MemCmpExpansion::setupResultBlockPHINodes() {
+  IRBuilder<> Builder(CI->getContext());
+  Type *MaxLoadType = IntegerType::get(CI->getContext(), MaxLoadSize * 8);
+  Builder.SetInsertPoint(ResBlock.BB);
+  ResBlock.PhiSrc1 =
+      Builder.CreatePHI(MaxLoadType, NumBlocksNonOneByte, "phi.src1");
+  ResBlock.PhiSrc2 =
+      Builder.CreatePHI(MaxLoadType, NumBlocksNonOneByte, "phi.src2");
+}
+
+void MemCmpExpansion::setupEndBlockPHINodes() {
+  IRBuilder<> Builder(CI->getContext());
+
+  Builder.SetInsertPoint(&EndBlock->front());
+  PhiRes = Builder.CreatePHI(Type::getInt32Ty(CI->getContext()), 2, "phi.res");
+}
+
+Value *MemCmpExpansion::getMemCmpExpansionZeroCase(unsigned Size,
+                                                   bool IsLittleEndian) {
+  unsigned NumBytesProcessed = 0;
+  // This loop populates each of the LoadCmpBlocks with IR sequence to handle
+  // multiple loads per block
+  for (unsigned i = 0; i < NumBlocks; ++i) {
+    emitLoadCompareBlockMultipleLoads(i, Size, NumBytesProcessed);
+  }
+
+  emitMemCmpResultBlock(IsLittleEndian);
+  return PhiRes;
+}
+
+// This function expands the memcmp call into an inline expansion and returns
+// the memcmp result.
+Value *MemCmpExpansion::getMemCmpExpansion(bool IsLittleEndian) {
+
+  ConstantInt *SizeCast = dyn_cast<ConstantInt>(CI->getArgOperand(2));
+  uint64_t Size = SizeCast->getZExtValue();
+
+  int LoadSize = MaxLoadSize;
+  int NumBytesToBeProcessed = Size;
+
+  if (IsUsedForZeroCmp) {
+    return getMemCmpExpansionZeroCase(Size, IsLittleEndian);
+  }
+
+  unsigned Index = 0;
+  // This loop calls emitLoadCompareBlock for comparing SizeVal bytes of the two
+  // memcmp source. It starts with loading using the maximum load size set by
+  // the target. It processes any remaining bytes using a load size which is the
+  // next smallest power of 2.
+  while (NumBytesToBeProcessed) {
+    // Calculate how many blocks we can create with the current load size
+    int NumBlocks = NumBytesToBeProcessed / LoadSize;
+    int GEPIndex = (Size - NumBytesToBeProcessed) / LoadSize;
+    NumBytesToBeProcessed = NumBytesToBeProcessed % LoadSize;
+
+    // For each NumBlocks, populate the instruction sequence for loading and
+    // comparing LoadSize bytes
+    while (NumBlocks--) {
+      emitLoadCompareBlock(Index, LoadSize, GEPIndex, IsLittleEndian);
+      Index++;
+      GEPIndex++;
+    }
+    // Get the next LoadSize to use
+    LoadSize = LoadSize / 2;
+  }
+
+  emitMemCmpResultBlock(IsLittleEndian);
+  return PhiRes;
+}
+
+// This function checks to see if an expansion of memcmp can be generated.
+// It checks for constant compare size that is less than the max inline size.
+// If an expansion cannot occur, returns false to leave as a library call.
+// Otherwise, the library call is replaced wtih new IR instruction sequence.
+/// We want to transform:
+/// %call = call signext i32 @memcmp(i8* %0, i8* %1, i64 15)
+/// To:
+/// loadbb:
+///  %0 = bitcast i32* %buffer2 to i8*
+///  %1 = bitcast i32* %buffer1 to i8*
+///  %2 = bitcast i8* %1 to i64*
+///  %3 = bitcast i8* %0 to i64*
+///  %4 = load i64, i64* %2
+///  %5 = load i64, i64* %3
+///  %6 = call i64 @llvm.bswap.i64(i64 %4)
+///  %7 = call i64 @llvm.bswap.i64(i64 %5)
+///  %8 = sub i64 %6, %7
+///  %9 = icmp ne i64 %8, 0
+///  br i1 %9, label %res_block, label %loadbb1
+/// res_block:                                        ; preds = %loadbb2,
+/// %loadbb1, %loadbb
+///  %phi.src1 = phi i64 [ %6, %loadbb ], [ %22, %loadbb1 ], [ %36, %loadbb2 ]
+///  %phi.src2 = phi i64 [ %7, %loadbb ], [ %23, %loadbb1 ], [ %37, %loadbb2 ]
+///  %10 = icmp ult i64 %phi.src1, %phi.src2
+///  %11 = select i1 %10, i32 -1, i32 1
+///  br label %endblock
+/// loadbb1:                                          ; preds = %loadbb
+///  %12 = bitcast i32* %buffer2 to i8*
+///  %13 = bitcast i32* %buffer1 to i8*
+///  %14 = bitcast i8* %13 to i32*
+///  %15 = bitcast i8* %12 to i32*
+///  %16 = getelementptr i32, i32* %14, i32 2
+///  %17 = getelementptr i32, i32* %15, i32 2
+///  %18 = load i32, i32* %16
+///  %19 = load i32, i32* %17
+///  %20 = call i32 @llvm.bswap.i32(i32 %18)
+///  %21 = call i32 @llvm.bswap.i32(i32 %19)
+///  %22 = zext i32 %20 to i64
+///  %23 = zext i32 %21 to i64
+///  %24 = sub i64 %22, %23
+///  %25 = icmp ne i64 %24, 0
+///  br i1 %25, label %res_block, label %loadbb2
+/// loadbb2:                                          ; preds = %loadbb1
+///  %26 = bitcast i32* %buffer2 to i8*
+///  %27 = bitcast i32* %buffer1 to i8*
+///  %28 = bitcast i8* %27 to i16*
+///  %29 = bitcast i8* %26 to i16*
+///  %30 = getelementptr i16, i16* %28, i16 6
+///  %31 = getelementptr i16, i16* %29, i16 6
+///  %32 = load i16, i16* %30
+///  %33 = load i16, i16* %31
+///  %34 = call i16 @llvm.bswap.i16(i16 %32)
+///  %35 = call i16 @llvm.bswap.i16(i16 %33)
+///  %36 = zext i16 %34 to i64
+///  %37 = zext i16 %35 to i64
+///  %38 = sub i64 %36, %37
+///  %39 = icmp ne i64 %38, 0
+///  br i1 %39, label %res_block, label %loadbb3
+/// loadbb3:                                          ; preds = %loadbb2
+///  %40 = bitcast i32* %buffer2 to i8*
+///  %41 = bitcast i32* %buffer1 to i8*
+///  %42 = getelementptr i8, i8* %41, i8 14
+///  %43 = getelementptr i8, i8* %40, i8 14
+///  %44 = load i8, i8* %42
+///  %45 = load i8, i8* %43
+///  %46 = zext i8 %44 to i32
+///  %47 = zext i8 %45 to i32
+///  %48 = sub i32 %46, %47
+///  br label %endblock
+/// endblock:                                         ; preds = %res_block,
+/// %loadbb3
+///  %phi.res = phi i32 [ %48, %loadbb3 ], [ %11, %res_block ]
+///  ret i32 %phi.res
+static bool expandMemCmp(CallInst *CI, const TargetTransformInfo *TTI,
+                         const TargetLowering *TLI, const DataLayout *DL) {
+  NumMemCmpCalls++;
+  IRBuilder<> Builder(CI->getContext());
+
+  // TTI call to check if target would like to expand memcmp and get the
+  // MaxLoadSize
+  unsigned MaxLoadSize;
+  if (!TTI->expandMemCmp(CI, MaxLoadSize))
+    return false;
+
+  // Early exit from expansion if -Oz
+  if (CI->getParent()->getParent()->optForMinSize()) {
+    return false;
+  }
+
+  // Early exit from expansion if size is not a constant
+  ConstantInt *SizeCast = dyn_cast<ConstantInt>(CI->getArgOperand(2));
+  if (!SizeCast) {
+    NumMemCmpNotConstant++;
+    return false;
+  }
+
+  // Early exit from expansion if size greater than max bytes to load
+  uint64_t SizeVal = SizeCast->getZExtValue();
+
+  unsigned NumLoads = 0;
+  unsigned RemainingSize = SizeVal;
+  unsigned LoadSize = MaxLoadSize;
+  while (RemainingSize) {
+    NumLoads += RemainingSize / LoadSize;
+    RemainingSize = RemainingSize % LoadSize;
+    LoadSize = LoadSize / 2;
+  }
+
+  if (NumLoads >
+      TLI->getMaxExpandSizeMemcmp(CI->getParent()->getParent()->optForSize())) {
+    NumMemCmpGreaterThanMax++;
+    return false;
+  }
+
+  NumMemCmpInlined++;
+
+  // MemCmpHelper object, creates and sets up basic blocks required for
+  // expanding memcmp with size SizeVal
+  unsigned NumLoadsPerBlock = MemCmpNumLoadsPerBlock;
+  MemCmpExpansion MemCmpHelper(CI, MaxLoadSize, NumLoadsPerBlock);
+
+  Value *Res = MemCmpHelper.getMemCmpExpansion(DL->isLittleEndian());
+
+  // Replace call with result of expansion and erarse call.
+  CI->replaceAllUsesWith(Res);
+  CI->eraseFromParent();
+
+  return true;
+}
+
 bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool& ModifiedDT) {
   BasicBlock *BB = CI->getParent();
 
@@ -1780,6 +2380,15 @@ bool CodeGenPrepare::optimizeCallInst(Ca
     CI->eraseFromParent();
     return true;
   }
+
+  LibFunc Func;
+  if (TLInfo->getLibFunc(*CI->getCalledFunction(), Func) &&
+      Func == LibFunc_memcmp) {
+    if (expandMemCmp(CI, TTI, TLI, DL)) {
+      ModifiedDT = true;
+      return true;
+    }
+  }
   return false;
 }
 
@@ -4927,6 +5536,7 @@ bool CodeGenPrepare::optimizeSwitchInst(
   return true;
 }
 
+
 namespace {
 /// \brief Helper class to promote a scalar operation to a vector one.
 /// This class is used to move downward extractelement transition.

Modified: llvm/trunk/lib/CodeGen/TargetLoweringBase.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/CodeGen/TargetLoweringBase.cpp?rev=304313&r1=304312&r2=304313&view=diff
==============================================================================
--- llvm/trunk/lib/CodeGen/TargetLoweringBase.cpp (original)
+++ llvm/trunk/lib/CodeGen/TargetLoweringBase.cpp Wed May 31 12:12:38 2017
@@ -842,9 +842,10 @@ TargetLoweringBase::TargetLoweringBase(c
   initActions();
 
   // Perform these initializations only once.
-  MaxStoresPerMemset = MaxStoresPerMemcpy = MaxStoresPerMemmove = 8;
-  MaxStoresPerMemsetOptSize = MaxStoresPerMemcpyOptSize
-    = MaxStoresPerMemmoveOptSize = 4;
+  MaxStoresPerMemset = MaxStoresPerMemcpy = MaxStoresPerMemmove =
+      MaxLoadsPerMemcmp = 8;
+  MaxStoresPerMemsetOptSize = MaxStoresPerMemcpyOptSize =
+      MaxStoresPerMemmoveOptSize = MaxLoadsPerMemcmpOptSize = 4;
   UseUnderscoreSetJmp = false;
   UseUnderscoreLongJmp = false;
   HasMultipleConditionRegisters = false;

Modified: llvm/trunk/lib/Target/PowerPC/PPCISelLowering.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/PowerPC/PPCISelLowering.cpp?rev=304313&r1=304312&r2=304313&view=diff
==============================================================================
--- llvm/trunk/lib/Target/PowerPC/PPCISelLowering.cpp (original)
+++ llvm/trunk/lib/Target/PowerPC/PPCISelLowering.cpp Wed May 31 12:12:38 2017
@@ -1041,6 +1041,10 @@ PPCTargetLowering::PPCTargetLowering(con
     MaxStoresPerMemset = 128;
     MaxStoresPerMemcpy = 128;
     MaxStoresPerMemmove = 128;
+    MaxLoadsPerMemcmp = 128;
+  } else {
+    MaxLoadsPerMemcmp = 8;
+    MaxLoadsPerMemcmpOptSize = 4;
   }
 }
 

Modified: llvm/trunk/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/PowerPC/PPCTargetTransformInfo.cpp?rev=304313&r1=304312&r2=304313&view=diff
==============================================================================
--- llvm/trunk/lib/Target/PowerPC/PPCTargetTransformInfo.cpp (original)
+++ llvm/trunk/lib/Target/PowerPC/PPCTargetTransformInfo.cpp Wed May 31 12:12:38 2017
@@ -215,6 +215,11 @@ bool PPCTTIImpl::enableAggressiveInterle
   return LoopHasReductions;
 }
 
+bool PPCTTIImpl::expandMemCmp(Instruction *I, unsigned &MaxLoadSize) {
+  MaxLoadSize = 8;
+  return true;
+}
+
 bool PPCTTIImpl::enableInterleavedAccessVectorization() {
   return true;
 }

Modified: llvm/trunk/lib/Target/PowerPC/PPCTargetTransformInfo.h
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Target/PowerPC/PPCTargetTransformInfo.h?rev=304313&r1=304312&r2=304313&view=diff
==============================================================================
--- llvm/trunk/lib/Target/PowerPC/PPCTargetTransformInfo.h (original)
+++ llvm/trunk/lib/Target/PowerPC/PPCTargetTransformInfo.h Wed May 31 12:12:38 2017
@@ -60,6 +60,7 @@ public:
   /// @{
 
   bool enableAggressiveInterleaving(bool LoopHasReductions);
+  bool expandMemCmp(Instruction *I, unsigned &MaxLoadSize);
   bool enableInterleavedAccessVectorization();
   unsigned getNumberOfRegisters(bool Vector);
   unsigned getRegisterBitWidth(bool Vector);

Modified: llvm/trunk/lib/Transforms/Utils/SimplifyLibCalls.cpp
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/lib/Transforms/Utils/SimplifyLibCalls.cpp?rev=304313&r1=304312&r2=304313&view=diff
==============================================================================
--- llvm/trunk/lib/Transforms/Utils/SimplifyLibCalls.cpp (original)
+++ llvm/trunk/lib/Transforms/Utils/SimplifyLibCalls.cpp Wed May 31 12:12:38 2017
@@ -85,20 +85,6 @@ static bool isCallingConvCCompatible(Cal
   return false;
 }
 
-/// Return true if it only matters that the value is equal or not-equal to zero.
-static bool isOnlyUsedInZeroEqualityComparison(Value *V) {
-  for (User *U : V->users()) {
-    if (ICmpInst *IC = dyn_cast<ICmpInst>(U))
-      if (IC->isEquality())
-        if (Constant *C = dyn_cast<Constant>(IC->getOperand(1)))
-          if (C->isNullValue())
-            continue;
-    // Unknown instruction.
-    return false;
-  }
-  return true;
-}
-
 /// Return true if it is only used in equality comparisons with With.
 static bool isOnlyUsedInEqualityComparison(Value *V, Value *With) {
   for (User *U : V->users()) {

Added: llvm/trunk/test/CodeGen/PowerPC/memCmpUsedInZeroEqualityComparison.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/PowerPC/memCmpUsedInZeroEqualityComparison.ll?rev=304313&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/PowerPC/memCmpUsedInZeroEqualityComparison.ll (added)
+++ llvm/trunk/test/CodeGen/PowerPC/memCmpUsedInZeroEqualityComparison.ll Wed May 31 12:12:38 2017
@@ -0,0 +1,121 @@
+; RUN: llc -verify-machineinstrs -mcpu=pwr8 < %s | FileCheck %s
+target datalayout = "e-m:e-i64:64-n32:64"
+target triple = "powerpc64le-unknown-linux-gnu"
+
+ at zeroEqualityTest01.buffer1 = private unnamed_addr constant [3 x i32] [i32 1, i32 2, i32 4], align 4
+ at zeroEqualityTest01.buffer2 = private unnamed_addr constant [3 x i32] [i32 1, i32 2, i32 3], align 4
+ at zeroEqualityTest02.buffer1 = private unnamed_addr constant [4 x i32] [i32 4, i32 0, i32 0, i32 0], align 4
+ at zeroEqualityTest02.buffer2 = private unnamed_addr constant [4 x i32] [i32 3, i32 0, i32 0, i32 0], align 4
+ at zeroEqualityTest03.buffer1 = private unnamed_addr constant [4 x i32] [i32 0, i32 0, i32 0, i32 3], align 4
+ at zeroEqualityTest03.buffer2 = private unnamed_addr constant [4 x i32] [i32 0, i32 0, i32 0, i32 4], align 4
+ at zeroEqualityTest04.buffer1 = private unnamed_addr constant [15 x i32] [i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14], align 4
+ at zeroEqualityTest04.buffer2 = private unnamed_addr constant [15 x i32] [i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 13], align 4
+
+; Function Attrs: nounwind readonly
+declare signext i32 @memcmp(i8* nocapture, i8* nocapture, i64) local_unnamed_addr #1
+
+; Validate with if(memcmp())
+; Function Attrs: nounwind readonly
+define signext i32 @zeroEqualityTest01() local_unnamed_addr #0 {
+entry:
+  %call = tail call signext i32 @memcmp(i8* bitcast ([3 x i32]* @zeroEqualityTest01.buffer1 to i8*), i8* bitcast ([3 x i32]* @zeroEqualityTest01.buffer2 to i8*), i64 16)
+  %not.tobool = icmp ne i32 %call, 0
+  %. = zext i1 %not.tobool to i32
+  ret i32 %.
+
+  ; CHECK-LABEL: @zeroEqualityTest01
+  ; CHECK-LABEL: %res_block
+  ; CHECK: li 3, 1
+  ; CHECK-NEXT: clrldi
+  ; CHECK-NEXT: blr
+  ; CHECK: li 3, 0
+  ; CHECK-NEXT: clrldi
+  ; CHECK-NEXT: blr
+}
+
+; Validate with if(memcmp() == 0)
+; Function Attrs: nounwind readonly
+define signext i32 @zeroEqualityTest02() local_unnamed_addr #0 {
+entry:
+  %call = tail call signext i32 @memcmp(i8* bitcast ([4 x i32]* @zeroEqualityTest02.buffer1 to i8*), i8* bitcast ([4 x i32]* @zeroEqualityTest02.buffer2 to i8*), i64 16)
+  %not.cmp = icmp ne i32 %call, 0
+  %. = zext i1 %not.cmp to i32
+  ret i32 %.
+
+  ; CHECK-LABEL: @zeroEqualityTest02
+  ; CHECK-LABEL: %res_block
+  ; CHECK: li 3, 1
+  ; CHECK-NEXT: clrldi
+  ; CHECK-NEXT: blr
+  ; CHECK: li 3, 0
+  ; CHECK-NEXT: clrldi
+  ; CHECK-NEXT: blr
+}
+
+; Validate with > 0
+; Function Attrs: nounwind readonly
+define signext i32 @zeroEqualityTest03() local_unnamed_addr #0 {
+entry:
+  %call = tail call signext i32 @memcmp(i8* bitcast ([4 x i32]* @zeroEqualityTest02.buffer1 to i8*), i8* bitcast ([4 x i32]* @zeroEqualityTest02.buffer2 to i8*), i64 16)
+  %not.cmp = icmp slt i32 %call, 1
+  %. = zext i1 %not.cmp to i32
+  ret i32 %.
+
+  ; CHECK-LABEL: @zeroEqualityTest03
+  ; CHECK-LABEL: %res_block
+  ; CHECK: cmpld
+  ; CHECK-NEXT: li [[LI:[0-9]+]], 1
+  ; CHECK-NEXT: li [[LI2:[0-9]+]], -1
+  ; CHECK-NEXT: isel [[ISEL:[0-9]+]], [[LI2]], [[LI]], 0
+}
+
+; Validate with < 0
+; Function Attrs: nounwind readonly
+define signext i32 @zeroEqualityTest04() local_unnamed_addr #0 {
+entry:
+  %call = tail call signext i32 @memcmp(i8* bitcast ([4 x i32]* @zeroEqualityTest03.buffer1 to i8*), i8* bitcast ([4 x i32]* @zeroEqualityTest03.buffer2 to i8*), i64 16)
+  %call.lobit = lshr i32 %call, 31
+  %call.lobit.not = xor i32 %call.lobit, 1
+  ret i32 %call.lobit.not
+
+  ; CHECK-LABEL: @zeroEqualityTest04
+  ; CHECK-LABEL: %res_block
+  ; CHECK: cmpld
+  ; CHECK-NEXT: li [[LI:[0-9]+]], 1
+  ; CHECK-NEXT: li [[LI2:[0-9]+]], -1
+  ; CHECK-NEXT: isel [[ISEL:[0-9]+]], [[LI2]], [[LI]], 0
+}
+
+; Validate with memcmp()?:
+; Function Attrs: nounwind readonly
+define signext i32 @zeroEqualityTest05() local_unnamed_addr #0 {
+entry:
+  %call = tail call signext i32 @memcmp(i8* bitcast ([15 x i32]* @zeroEqualityTest04.buffer1 to i8*), i8* bitcast ([15 x i32]* @zeroEqualityTest04.buffer2 to i8*), i64 16)
+  %not.tobool = icmp eq i32 %call, 0
+  %cond = zext i1 %not.tobool to i32
+  ret i32 %cond
+
+  ; CHECK-LABEL: @zeroEqualityTest05
+  ; CHECK-LABEL: %res_block
+  ; CHECK: li 3, 1
+  ; CHECK: li 3, 0
+}
+
+; Validate with !memcmp()?:
+; Function Attrs: nounwind readonly
+define signext i32 @zeroEqualityTest06() local_unnamed_addr #0 {
+entry:
+  %call = tail call signext i32 @memcmp(i8* bitcast ([15 x i32]* @zeroEqualityTest04.buffer1 to i8*), i8* bitcast ([15 x i32]* @zeroEqualityTest04.buffer2 to i8*), i64 16)
+  %not.lnot = icmp ne i32 %call, 0
+  %cond = zext i1 %not.lnot to i32
+  ret i32 %cond
+
+  ; CHECK-LABEL: @zeroEqualityTest06
+  ; CHECK-LABEL: %res_block
+  ; CHECK: li 3, 1
+  ; CHECK-NEXT: clrldi
+  ; CHECK-NEXT: blr
+  ; CHECK: li 3, 0
+  ; CHECK-NEXT: clrldi
+  ; CHECK-NEXT: blr
+}

Added: llvm/trunk/test/CodeGen/PowerPC/memcmp.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/PowerPC/memcmp.ll?rev=304313&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/PowerPC/memcmp.ll (added)
+++ llvm/trunk/test/CodeGen/PowerPC/memcmp.ll Wed May 31 12:12:38 2017
@@ -0,0 +1,87 @@
+; RUN: llc -verify-machineinstrs -mcpu=pwr8 -mtriple=powerpc64le-unknown-gnu-linux  < %s | FileCheck %s -check-prefix=CHECK
+
+; Check size 8
+; Function Attrs: nounwind readonly
+define signext i32 @test1(i32* nocapture readonly %buffer1, i32* nocapture readonly %buffer2) local_unnamed_addr #0 {
+entry:
+  %0 = bitcast i32* %buffer1 to i8*
+  %1 = bitcast i32* %buffer2 to i8*
+  %call = tail call signext i32 @memcmp(i8* %0, i8* %1, i64 8) #2
+  ret i32 %call
+
+; CHECK-LABEL: @test1
+; CHECK: ldbrx [[LOAD1:[0-9]+]]
+; CHECK-NEXT: ldbrx [[LOAD2:[0-9]+]]
+; CHECK-NEXT: li [[LI:[0-9]+]], 1
+; CHECK-NEXT: cmpld [[CMPLD:[0-9]+]], [[LOAD1]], [[LOAD2]]
+; CHECK-NEXT: subf. [[SUB:[0-9]+]], [[LOAD2]], [[LOAD1]]
+; CHECK-NEXT: li [[LI2:[0-9]+]], -1
+; CHECK-NEXT: isel [[ISEL:[0-9]+]], [[LI2]], [[LI]], 4
+; CHECK-NEXT: isel [[ISEL2:[0-9]+]], 0, [[ISEL]], 2
+; CHECK-NEXT: extsw 3, [[ISEL2]]
+; CHECK-NEXT: blr
+}
+
+; Check size 4
+; Function Attrs: nounwind readonly
+define signext i32 @test2(i32* nocapture readonly %buffer1, i32* nocapture readonly %buffer2) local_unnamed_addr #0 {
+entry:
+  %0 = bitcast i32* %buffer1 to i8*
+  %1 = bitcast i32* %buffer2 to i8*
+  %call = tail call signext i32 @memcmp(i8* %0, i8* %1, i64 4) #2
+  ret i32 %call
+
+; CHECK-LABEL: @test2
+; CHECK: lwbrx [[LOAD1:[0-9]+]]
+; CHECK-NEXT: lwbrx [[LOAD2:[0-9]+]]
+; CHECK-NEXT: li [[LI:[0-9]+]], 1
+; CHECK-NEXT: cmpld [[CMPLD:[0-9]+]], [[LOAD1]], [[LOAD2]]
+; CHECK-NEXT: subf. [[SUB:[0-9]+]], [[LOAD2]], [[LOAD1]]
+; CHECK-NEXT: li [[LI2:[0-9]+]], -1
+; CHECK-NEXT: isel [[ISEL:[0-9]+]], [[LI2]], [[LI]], 4
+; CHECK-NEXT: isel [[ISEL2:[0-9]+]], 0, [[ISEL]], 2
+; CHECK-NEXT: extsw 3, [[ISEL2]]
+; CHECK-NEXT: blr
+}
+
+; Check size 2
+; Function Attrs: nounwind readonly
+define signext i32 @test3(i32* nocapture readonly %buffer1, i32* nocapture readonly %buffer2) local_unnamed_addr #0 {
+entry:
+  %0 = bitcast i32* %buffer1 to i8*
+  %1 = bitcast i32* %buffer2 to i8*
+  %call = tail call signext i32 @memcmp(i8* %0, i8* %1, i64 2) #2
+  ret i32 %call
+
+; CHECK-LABEL: @test3
+; CHECK: lhbrx [[LOAD1:[0-9]+]]
+; CHECK-NEXT: lhbrx [[LOAD2:[0-9]+]]
+; CHECK-NEXT: li [[LI:[0-9]+]], 1
+; CHECK-NEXT: cmpld [[CMPLD:[0-9]+]], [[LOAD1]], [[LOAD2]]
+; CHECK-NEXT: subf. [[SUB:[0-9]+]], [[LOAD2]], [[LOAD1]]
+; CHECK-NEXT: li [[LI2:[0-9]+]], -1
+; CHECK-NEXT: isel [[ISEL:[0-9]+]], [[LI2]], [[LI]], 4
+; CHECK-NEXT: isel [[ISEL2:[0-9]+]], 0, [[ISEL]], 2
+; CHECK-NEXT: extsw 3, [[ISEL2]]
+; CHECK-NEXT: blr
+}
+
+; Check size 1
+; Function Attrs: nounwind readonly
+define signext i32 @test4(i32* nocapture readonly %buffer1, i32* nocapture readonly %buffer2) local_unnamed_addr #0 {
+entry:
+  %0 = bitcast i32* %buffer1 to i8*
+  %1 = bitcast i32* %buffer2 to i8*
+  %call = tail call signext i32 @memcmp(i8* %0, i8* %1, i64 1) #2
+  ret i32 %call
+
+; CHECK-LABEL: @test4
+; CHECK: lbz [[LOAD1:[0-9]+]]
+; CHECK-NEXT: lbz [[LOAD2:[0-9]+]]
+; CHECK-NEXT: subf [[SUB:[0-9]+]], [[LOAD2]], [[LOAD1]]
+; CHECK-NEXT: extsw 3, [[SUB]]
+; CHECK-NEXT: blr
+}
+
+; Function Attrs: nounwind readonly
+declare signext i32 @memcmp(i8*, i8*, i64) #1

Added: llvm/trunk/test/CodeGen/PowerPC/memcmpIR.ll
URL: http://llvm.org/viewvc/llvm-project/llvm/trunk/test/CodeGen/PowerPC/memcmpIR.ll?rev=304313&view=auto
==============================================================================
--- llvm/trunk/test/CodeGen/PowerPC/memcmpIR.ll (added)
+++ llvm/trunk/test/CodeGen/PowerPC/memcmpIR.ll Wed May 31 12:12:38 2017
@@ -0,0 +1,194 @@
+; RUN: llc -o - -mtriple=powerpc64le-unknown-gnu-linux -stop-after codegenprepare %s | FileCheck %s
+; RUN: llc -o - -mtriple=powerpc64-unknown-gnu-linux -stop-after codegenprepare %s | FileCheck %s --check-prefix=CHECK-BE
+
+define signext i32 @test1(i32* nocapture readonly %buffer1, i32* nocapture readonly %buffer2)  {
+entry:
+  ; CHECK: [[LOAD1:%[0-9]+]] = load i64, i64*
+  ; CHECK-NEXT: [[LOAD2:%[0-9]+]] = load i64, i64*
+  ; CHECK-NEXT: [[BSWAP1:%[0-9]+]] = call i64 @llvm.bswap.i64(i64 [[LOAD1]])
+  ; CHECK-NEXT: [[BSWAP2:%[0-9]+]] = call i64 @llvm.bswap.i64(i64 [[LOAD2]])
+  ; CHECK-NEXT: [[SUB:%[0-9]+]] = sub i64 [[BSWAP1]], [[BSWAP2]]
+  ; CHECK-NEXT: [[ICMP:%[0-9]+]] = icmp ne i64 [[SUB]], 0
+  ; CHECK-NEXT:  br i1 [[ICMP]], label %res_block, label
+
+  ; CHECK-LABEL: res_block:{{.*}}
+  ; CHECK: [[ICMP2:%[0-9]+]] = icmp ult i64
+  ; CHECK-NEXT: [[SELECT:%[0-9]+]] = select i1 [[ICMP2]], i32 -1, i32 1
+  ; CHECK-NEXT: br label %endblock
+
+  ; CHECK: [[GEP1:%[0-9]+]] = getelementptr i64, i64* {{.*}}, i64 1
+  ; CHECK-NEXT: [[GEP2:%[0-9]+]] = getelementptr i64, i64* {{.*}}, i64 1
+  ; CHECK-NEXT: [[LOAD1:%[0-9]+]] = load i64, i64* [[GEP1]]
+  ; CHECK-NEXT: [[LOAD2:%[0-9]+]] = load i64, i64* [[GEP2]]
+  ; CHECK-NEXT: [[BSWAP1:%[0-9]+]] = call i64 @llvm.bswap.i64(i64 [[LOAD1]])
+  ; CHECK-NEXT: [[BSWAP2:%[0-9]+]] = call i64 @llvm.bswap.i64(i64 [[LOAD2]])
+  ; CHECK-NEXT: [[SUB:%[0-9]+]] = sub i64 [[BSWAP1]], [[BSWAP2]]
+  ; CHECK-NEXT: [[ICMP:%[0-9]+]] = icmp ne i64 [[SUB]], 0
+  ; CHECK-NEXT:  br i1 [[ICMP]], label %res_block, label %endblock
+
+
+  ; CHECK-BE: [[LOAD1:%[0-9]+]] = load i64, i64*
+  ; CHECK-BE-NEXT: [[LOAD2:%[0-9]+]] = load i64, i64*
+  ; CHECK-BE-NEXT: [[SUB:%[0-9]+]] = sub i64 [[LOAD1]], [[LOAD2]]
+  ; CHECK-BE-NEXT: [[ICMP:%[0-9]+]] = icmp ne i64 [[SUB]], 0
+  ; CHECK-BE-NEXT:  br i1 [[ICMP]], label %res_block, label
+
+  ; CHECK-BE-LABEL: res_block:{{.*}}
+  ; CHECK-BE: [[ICMP2:%[0-9]+]] = icmp ult i64
+  ; CHECK-BE-NEXT: [[SELECT:%[0-9]+]] = select i1 [[ICMP2]], i32 -1, i32 1
+  ; CHECK-BE-NEXT: br label %endblock
+
+  ; CHECK-BE: [[GEP1:%[0-9]+]] = getelementptr i64, i64* {{.*}}, i64 1
+  ; CHECK-BE-NEXT: [[GEP2:%[0-9]+]] = getelementptr i64, i64* {{.*}}, i64 1
+  ; CHECK-BE-NEXT: [[LOAD1:%[0-9]+]] = load i64, i64* [[GEP1]]
+  ; CHECK-BE-NEXT: [[LOAD2:%[0-9]+]] = load i64, i64* [[GEP2]]
+  ; CHECK-BE-NEXT: [[SUB:%[0-9]+]] = sub i64 [[LOAD1]], [[LOAD2]]
+  ; CHECK-BE-NEXT: [[ICMP:%[0-9]+]] = icmp ne i64 [[SUB]], 0
+  ; CHECK-BE-NEXT:  br i1 [[ICMP]], label %res_block, label %endblock
+
+  %0 = bitcast i32* %buffer1 to i8*
+  %1 = bitcast i32* %buffer2 to i8*
+  %call = tail call signext i32 @memcmp(i8* %0, i8* %1, i64 16)
+  ret i32 %call
+}
+
+declare signext i32 @memcmp(i8* nocapture, i8* nocapture, i64) local_unnamed_addr #1
+
+define signext i32 @test2(i32* nocapture readonly %buffer1, i32* nocapture readonly %buffer2)  {
+  ; CHECK: [[LOAD1:%[0-9]+]] = load i32, i32*
+  ; CHECK-NEXT: [[LOAD2:%[0-9]+]] = load i32, i32*
+  ; CHECK-NEXT: [[BSWAP1:%[0-9]+]] = call i32 @llvm.bswap.i32(i32 [[LOAD1]])
+  ; CHECK-NEXT: [[BSWAP2:%[0-9]+]] = call i32 @llvm.bswap.i32(i32 [[LOAD2]])
+  ; CHECK-NEXT: [[ZEXT1:%[0-9]+]] = zext i32 [[BSWAP1]] to i64
+  ; CHECK-NEXT: [[ZEXT2:%[0-9]+]] = zext i32 [[BSWAP2]] to i64
+  ; CHECK-NEXT: [[SUB:%[0-9]+]] = sub i64 [[ZEXT1]], [[ZEXT2]]
+  ; CHECK-NEXT: [[ICMP:%[0-9]+]] = icmp ne i64 [[SUB]], 0
+  ; CHECK-NEXT:  br i1 [[ICMP]], label %res_block, label %endblock
+
+  ; CHECK-LABEL: res_block:{{.*}}
+  ; CHECK: [[ICMP2:%[0-9]+]] = icmp ult i64
+  ; CHECK-NEXT: [[SELECT:%[0-9]+]] = select i1 [[ICMP2]], i32 -1, i32 1
+  ; CHECK-NEXT: br label %endblock
+
+  ; CHECK-BE: [[LOAD1:%[0-9]+]] = load i32, i32*
+  ; CHECK-BE-NEXT: [[LOAD2:%[0-9]+]] = load i32, i32*
+  ; CHECK-BE-NEXT: [[ZEXT1:%[0-9]+]] = zext i32 [[LOAD1]] to i64
+  ; CHECK-BE-NEXT: [[ZEXT2:%[0-9]+]] = zext i32 [[LOAD2]] to i64
+  ; CHECK-BE-NEXT: [[SUB:%[0-9]+]] = sub i64 [[ZEXT1]], [[ZEXT2]]
+  ; CHECK-BE-NEXT: [[ICMP:%[0-9]+]] = icmp ne i64 [[SUB]], 0
+  ; CHECK-BE-NEXT:  br i1 [[ICMP]], label %res_block, label %endblock
+
+  ; CHECK-BE-LABEL: res_block:{{.*}}
+  ; CHECK-BE: [[ICMP2:%[0-9]+]] = icmp ult i64
+  ; CHECK-BE-NEXT: [[SELECT:%[0-9]+]] = select i1 [[ICMP2]], i32 -1, i32 1
+  ; CHECK-BE-NEXT: br label %endblock
+
+entry:
+  %0 = bitcast i32* %buffer1 to i8*
+  %1 = bitcast i32* %buffer2 to i8*
+  %call = tail call signext i32 @memcmp(i8* %0, i8* %1, i64 4)
+  ret i32 %call
+}
+
+define signext i32 @test3(i32* nocapture readonly %buffer1, i32* nocapture readonly %buffer2)  {
+  ; CHECK: [[LOAD1:%[0-9]+]] = load i64, i64*
+  ; CHECK-NEXT: [[LOAD2:%[0-9]+]] = load i64, i64*
+  ; CHECK-NEXT: [[BSWAP1:%[0-9]+]] = call i64 @llvm.bswap.i64(i64 [[LOAD1]])
+  ; CHECK-NEXT: [[BSWAP2:%[0-9]+]] = call i64 @llvm.bswap.i64(i64 [[LOAD2]])
+  ; CHECK-NEXT: [[SUB:%[0-9]+]] = sub i64 [[BSWAP1]], [[BSWAP2]]
+  ; CHECK-NEXT: [[ICMP:%[0-9]+]] = icmp ne i64 [[SUB]], 0
+  ; CHECK-NEXT:  br i1 [[ICMP]], label %res_block, label
+
+  ; CHECK-LABEL: res_block:{{.*}}
+  ; CHECK: [[ICMP2:%[0-9]+]] = icmp ult i64
+  ; CHECK-NEXT: [[SELECT:%[0-9]+]] = select i1 [[ICMP2]], i32 -1, i32 1
+  ; CHECK-NEXT: br label %endblock
+
+  ; CHECK: [[LOAD1:%[0-9]+]] = load i32, i32*
+  ; CHECK-NEXT: [[LOAD2:%[0-9]+]] = load i32, i32*
+  ; CHECK-NEXT: [[BSWAP1:%[0-9]+]] = call i32 @llvm.bswap.i32(i32 [[LOAD1]])
+  ; CHECK-NEXT: [[BSWAP2:%[0-9]+]] = call i32 @llvm.bswap.i32(i32 [[LOAD2]])
+  ; CHECK-NEXT: [[ZEXT1:%[0-9]+]] = zext i32 [[BSWAP1]] to i64
+  ; CHECK-NEXT: [[ZEXT2:%[0-9]+]] = zext i32 [[BSWAP2]] to i64
+  ; CHECK-NEXT: [[SUB:%[0-9]+]] = sub i64 [[ZEXT1]], [[ZEXT2]]
+  ; CHECK-NEXT: [[ICMP:%[0-9]+]] = icmp ne i64 [[SUB]], 0
+  ; CHECK-NEXT:  br i1 [[ICMP]], label %res_block, label
+
+  ; CHECK: [[LOAD1:%[0-9]+]] = load i16, i16*
+  ; CHECK-NEXT: [[LOAD2:%[0-9]+]] = load i16, i16*
+  ; CHECK-NEXT: [[BSWAP1:%[0-9]+]] = call i16 @llvm.bswap.i16(i16 [[LOAD1]])
+  ; CHECK-NEXT: [[BSWAP2:%[0-9]+]] = call i16 @llvm.bswap.i16(i16 [[LOAD2]])
+  ; CHECK-NEXT: [[ZEXT1:%[0-9]+]] = zext i16 [[BSWAP1]] to i64
+  ; CHECK-NEXT: [[ZEXT2:%[0-9]+]] = zext i16 [[BSWAP2]] to i64
+  ; CHECK-NEXT: [[SUB:%[0-9]+]] = sub i64 [[ZEXT1]], [[ZEXT2]]
+  ; CHECK-NEXT: [[ICMP:%[0-9]+]] = icmp ne i64 [[SUB]], 0
+  ; CHECK-NEXT:  br i1 [[ICMP]], label %res_block, label
+
+  ; CHECK: [[LOAD1:%[0-9]+]] = load i8, i8*
+  ; CHECK-NEXT: [[LOAD2:%[0-9]+]] = load i8, i8*
+  ; CHECK-NEXT: [[ZEXT1:%[0-9]+]] = zext i8 [[LOAD1]] to i32
+  ; CHECK-NEXT: [[ZEXT2:%[0-9]+]] = zext i8 [[LOAD2]] to i32
+  ; CHECK-NEXT: [[SUB:%[0-9]+]] = sub i32 [[ZEXT1]], [[ZEXT2]]
+  ; CHECK-NEXT:  br label %endblock
+
+  ; CHECK-BE: [[LOAD1:%[0-9]+]] = load i64, i64*
+  ; CHECK-BE-NEXT: [[LOAD2:%[0-9]+]] = load i64, i64*
+  ; CHECK-BE-NEXT: [[SUB:%[0-9]+]] = sub i64 [[LOAD1]], [[LOAD2]]
+  ; CHECK-BE-NEXT: [[ICMP:%[0-9]+]] = icmp ne i64 [[SUB]], 0
+  ; CHECK-BE-NEXT:  br i1 [[ICMP]], label %res_block, label
+
+  ; CHECK-BE-LABEL: res_block:{{.*}}
+  ; CHECK-BE: [[ICMP2:%[0-9]+]] = icmp ult i64
+  ; CHECK-BE-NEXT: [[SELECT:%[0-9]+]] = select i1 [[ICMP2]], i32 -1, i32 1
+  ; CHECK-BE-NEXT: br label %endblock
+
+  ; CHECK-BE: [[LOAD1:%[0-9]+]] = load i32, i32*
+  ; CHECK-BE-NEXT: [[LOAD2:%[0-9]+]] = load i32, i32*
+  ; CHECK-BE-NEXT: [[ZEXT1:%[0-9]+]] = zext i32 [[LOAD1]] to i64
+  ; CHECK-BE-NEXT: [[ZEXT2:%[0-9]+]] = zext i32 [[LOAD2]] to i64
+  ; CHECK-BE-NEXT: [[SUB:%[0-9]+]] = sub i64 [[ZEXT1]], [[ZEXT2]]
+  ; CHECK-BE-NEXT: [[ICMP:%[0-9]+]] = icmp ne i64 [[SUB]], 0
+  ; CHECK-BE-NEXT:  br i1 [[ICMP]], label %res_block, label
+
+  ; CHECK-BE: [[LOAD1:%[0-9]+]] = load i16, i16*
+  ; CHECK-BE-NEXT: [[LOAD2:%[0-9]+]] = load i16, i16*
+  ; CHECK-BE-NEXT: [[ZEXT1:%[0-9]+]] = zext i16 [[LOAD1]] to i64
+  ; CHECK-BE-NEXT: [[ZEXT2:%[0-9]+]] = zext i16 [[LOAD2]] to i64
+  ; CHECK-BE-NEXT: [[SUB:%[0-9]+]] = sub i64 [[ZEXT1]], [[ZEXT2]]
+  ; CHECK-BE-NEXT: [[ICMP:%[0-9]+]] = icmp ne i64 [[SUB]], 0
+  ; CHECK-BE-NEXT:  br i1 [[ICMP]], label %res_block, label
+
+  ; CHECK-BE: [[LOAD1:%[0-9]+]] = load i8, i8*
+  ; CHECK-BE-NEXT: [[LOAD2:%[0-9]+]] = load i8, i8*
+  ; CHECK-BE-NEXT: [[ZEXT1:%[0-9]+]] = zext i8 [[LOAD1]] to i32
+  ; CHECK-BE-NEXT: [[ZEXT2:%[0-9]+]] = zext i8 [[LOAD2]] to i32
+  ; CHECK-BE-NEXT: [[SUB:%[0-9]+]] = sub i32 [[ZEXT1]], [[ZEXT2]]
+  ; CHECK-BE-NEXT:  br label %endblock
+
+entry:
+  %0 = bitcast i32* %buffer1 to i8*
+  %1 = bitcast i32* %buffer2 to i8*
+  %call = tail call signext i32 @memcmp(i8* %0, i8* %1, i64 15)
+  ret i32 %call
+}
+  ; CHECK: call = tail call signext i32 @memcmp
+  ; CHECK-BE: call = tail call signext i32 @memcmp
+define signext i32 @test4(i32* nocapture readonly %buffer1, i32* nocapture readonly %buffer2)  {
+
+entry:
+  %0 = bitcast i32* %buffer1 to i8*
+  %1 = bitcast i32* %buffer2 to i8*
+  %call = tail call signext i32 @memcmp(i8* %0, i8* %1, i64 65)
+  ret i32 %call
+}
+
+define signext i32 @test5(i32* nocapture readonly %buffer1, i32* nocapture readonly %buffer2, i32 signext %SIZE)  {
+  ; CHECK: call = tail call signext i32 @memcmp
+  ; CHECK-BE: call = tail call signext i32 @memcmp
+entry:
+  %0 = bitcast i32* %buffer1 to i8*
+  %1 = bitcast i32* %buffer2 to i8*
+  %conv = sext i32 %SIZE to i64
+  %call = tail call signext i32 @memcmp(i8* %0, i8* %1, i64 %conv)
+  ret i32 %call
+}